Skip to content

I/O — DataIterator & GFFWriter

DataIterator

Factory function that returns a streaming iterator over GFF3/GTF input (file path, URL, raw string with from_string=True, or an iterable of Feature objects).

gffbase.iterators.DataIterator

DataIterator(data, checklines: int = 10, transform=None, force_dialect_check: bool = False, from_string: bool = False, **kwargs) -> _DataIterator

Legacy factory. Returns an iterator yielding Feature.

Source code in python/gffbase/iterators.py
def DataIterator(
    data,
    checklines: int = 10,
    transform=None,
    force_dialect_check: bool = False,
    from_string: bool = False,
    **kwargs,
) -> _DataIterator:
    """Legacy factory kept for gffutils compatibility.

    Builds and returns a ``_DataIterator`` over *data* (path, URL, raw
    string, or iterable of features) that yields ``Feature`` objects.
    """
    # Collect the explicit options alongside any extra keyword arguments.
    # The explicit names can never appear in **kwargs (Python binds them
    # to the parameters above), so this update cannot clobber anything.
    options = dict(
        checklines=checklines,
        transform=transform,
        force_dialect_check=force_dialect_check,
        from_string=from_string,
    )
    options.update(kwargs)
    return _DataIterator(data, **options)

GFFWriter

gffbase.gffwriter.GFFWriter

GFFWriter(out: Union[str, 'os.PathLike', 'io.IOBase'], with_header: bool = True, in_place: bool = False)

Write Feature records back to a GFF/GTF file.

Source code in python/gffbase/gffwriter.py
def __init__(
    self,
    out: Union[str, "os.PathLike", "io.IOBase"],
    with_header: bool = True,
    in_place: bool = False,
):
    """Open the output destination and optionally emit the GFF3 header.

    Parameters
    ----------
    out : str, os.PathLike, or io.IOBase
        Destination path, or an already-open file-like object.  Anything
        with a ``write`` method is used as-is (ownership stays with the
        caller).
    with_header : bool
        When True, write ``##gff-version 3`` immediately after opening.
    in_place : bool
        When True (and ``out`` is a path), stage output into a temporary
        file in the same directory as the target; the swap onto
        ``out`` is presumably performed by ``close()`` — confirm there.
    """
    self.with_header = with_header
    self.in_place = in_place
    self._opened_path: Optional[str] = None   # path we opened ourselves
    self._target_path: Optional[str] = None   # final path for in-place mode

    if hasattr(out, "write"):
        # Caller-supplied handle: use directly, never close/replace it.
        self._fh = out
    elif in_place:
        # Atomic write via tempfile, swap on close.  The tempfile lives in
        # the target's directory so the final rename stays on one
        # filesystem.
        self._target_path = str(out)
        tmp = tempfile.NamedTemporaryFile(
            mode="w",
            delete=False,
            dir=os.path.dirname(self._target_path) or ".",
            suffix=".gffbase.tmp",
            encoding="utf-8",
        )
        # Keep the tempfile handle open and write through it directly.
        # (Previously the handle was closed and the path reopened, which
        # was redundant and briefly left ``_fh`` pointing at a closed
        # file object.)
        self._opened_path = tmp.name
        self._fh = tmp
    else:
        self._opened_path = str(out)
        self._fh = open(self._opened_path, "w", encoding="utf-8")

    if self.with_header:
        self._fh.write("##gff-version 3\n")

export_sqlite

Serialize a GFFBase DuckDB connection back into a legacy gffutils-compatible SQLite database.

gffbase.sqlite_export.export_sqlite

export_sqlite(con: DuckDBPyConnection, path: str, force: bool = False) -> str

Write a legacy SQLite .db from the given DuckDB connection.

Returns the absolute path on success.

Source code in python/gffbase/sqlite_export.py
def export_sqlite(con: duckdb.DuckDBPyConnection, path: str,
                  force: bool = False) -> str:
    """Write a legacy SQLite ``.db`` from the given DuckDB connection.

    Parameters
    ----------
    con : duckdb.DuckDBPyConnection
        Source connection; expected to expose ``features``, ``closure``,
        ``meta``, ``directives`` and (optionally) ``autoincrements``.
    path : str
        Destination file.  Must not already exist unless ``force`` is set.
    force : bool
        Overwrite an existing file at ``path``.

    Returns
    -------
    str
        The absolute path on success.

    Raises
    ------
    ValueError
        If ``path`` exists and ``force`` is False.
    """
    if os.path.exists(path):
        if not force:
            raise ValueError(f"{path} already exists; pass force=True to overwrite")
        os.unlink(path)

    sqlite_con = sqlite3.connect(path)
    try:
        sqlite_con.executescript(_LEGACY_SCHEMA)

        # Stream features in file order.
        rows = con.execute(
            """
            SELECT id, seqid, source, featuretype, start, "end",
                   score, strand, frame,
                   CAST(attributes_blob AS VARCHAR) AS attributes,
                   CAST(extra_blob      AS VARCHAR) AS extra
            FROM features
            ORDER BY file_order NULLS LAST, id
            """
        ).fetchall()

        # Compute UCSC bin in Python (DuckDB has no native equivalent and
        # legacy SQLite users rely on this for `region()` queries).
        export_rows = []
        for r in rows:
            (fid, seqid, source, featuretype, start, end, score, strand,
             frame, attributes, extra) = r
            # BUGFIX: compare against None explicitly.  The old truthiness
            # test (`if start and end`) dropped the bin whenever either
            # coordinate was 0 rather than only when it was missing.
            if start is not None and end is not None:
                ucsc_bin = bin_from_coords(start, end)
            else:
                ucsc_bin = None
            export_rows.append(
                (fid, seqid, source, featuretype, start, end, score, strand,
                 frame, attributes or "", extra or "", ucsc_bin)
            )
        sqlite_con.executemany(
            "INSERT INTO features VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
            export_rows,
        )

        # Closure → relations(parent, child, level=depth).
        rels = con.execute(
            "SELECT ancestor, descendant, depth FROM closure"
        ).fetchall()
        sqlite_con.executemany("INSERT INTO relations VALUES (?,?,?)", rels)

        # Meta — write the dialect (JSON) + version.
        meta = dict(con.execute("SELECT key, value FROM meta").fetchall())
        sqlite_con.execute(
            "INSERT INTO meta VALUES (?, ?)",
            (meta.get("dialect", json.dumps({"fmt": "gff3"})),
             "gffbase-export"),
        )

        # Directives.
        dirs = con.execute(
            "SELECT directive FROM directives ORDER BY seq"
        ).fetchall()
        sqlite_con.executemany("INSERT INTO directives VALUES (?)", dirs)

        # Autoincrements (typically empty in Phase 5); the table may not
        # exist at all, so best-effort is intentional here.
        try:
            ai = con.execute("SELECT base, n FROM autoincrements").fetchall()
            if ai:
                sqlite_con.executemany("INSERT INTO autoincrements VALUES (?, ?)", ai)
        except duckdb.Error:
            pass

        sqlite_con.commit()
    finally:
        sqlite_con.close()
    return os.path.abspath(path)

Low-level parser

gffbase.parser.parse_gff

parse_gff(path: str, *, checklines: int = 10, force_dialect_check: bool = False, force_gff: bool = False, strict: bool = True, engine: Optional[str] = 'auto') -> _Iterator

Parse a GFF3/GTF file (plain text or .gz).

Returns an iterator of ParsedFeature plus .dialect(), .directives(), and (Phase 16) .warnings accessors.

Parameters

strict : bool When True (default), the iterator raises GFFFormatError on the first malformed line. When False, malformed lines are skipped silently and recorded in iterator.warnings.

Source code in python/gffbase/parser.py
def parse_gff(
    path: str,
    *,
    checklines: int = 10,
    force_dialect_check: bool = False,
    force_gff: bool = False,
    strict: bool = True,
    engine: Optional[str] = "auto",
) -> _Iterator:
    """Parse a GFF3/GTF file (plain text or ``.gz``).

    Returns an iterator of ``ParsedFeature`` plus ``.dialect()``,
    ``.directives()``, and (Phase 16) ``.warnings`` accessors.

    Parameters
    ----------
    strict : bool
        When True (default), the iterator raises ``GFFFormatError`` on
        the first malformed line. When False, malformed lines are
        skipped silently and recorded in ``iterator.warnings``.
    """
    # Both backends take the same keyword options; build them once.
    options = dict(
        checklines=checklines,
        force_dialect_check=force_dialect_check,
        force_gff=force_gff,
        strict=strict,
    )
    use_native = _resolve_engine(engine) == "rust"
    if use_native:
        inner = _rust.parse_file(path, **options)  # type: ignore[union-attr]
    else:
        inner = _pyparser.parse_file(path, **options)
    return _Iterator(inner, native=use_native)

gffbase.parser.parse_bytes

parse_bytes(data: bytes, *, checklines: int = 10, force_dialect_check: bool = False, force_gff: bool = False, strict: bool = True, engine: Optional[str] = 'auto') -> _Iterator
Source code in python/gffbase/parser.py
def parse_bytes(
    data: bytes,
    *,
    checklines: int = 10,
    force_dialect_check: bool = False,
    force_gff: bool = False,
    strict: bool = True,
    engine: Optional[str] = "auto",
) -> _Iterator:
    """Parse GFF3/GTF content held in memory as ``bytes``.

    Mirrors :func:`parse_gff` but takes raw bytes rather than a path.
    """
    # Both backends take the same keyword options; build them once.
    options = dict(
        checklines=checklines,
        force_dialect_check=force_dialect_check,
        force_gff=force_gff,
        strict=strict,
    )
    use_native = _resolve_engine(engine) == "rust"
    if use_native:
        inner = _rust.parse_bytes(data, **options)  # type: ignore[union-attr]
    else:
        inner = _pyparser.parse_bytes(data, **options)
    return _Iterator(inner, native=use_native)

gffbase.parser.detect_dialect

detect_dialect(path: str, *, checklines: int = 10, engine: Optional[str] = 'auto') -> dict
Source code in python/gffbase/parser.py
def detect_dialect(
    path: str, *, checklines: int = 10, engine: Optional[str] = "auto"
) -> dict:
    """Sniff the GFF/GTF dialect of *path*, inspecting ``checklines`` lines.

    Dispatches to the compiled extension when the resolved engine is
    ``"rust"``, otherwise to the pure-Python parser.
    """
    if _resolve_engine(engine) == "rust":
        backend = _rust  # type: ignore[union-attr]
    else:
        backend = _pyparser
    return backend.detect_dialect(path, checklines=checklines)

gffbase.parser.native_available

native_available() -> bool

True if the compiled extension is importable.

Source code in python/gffbase/parser.py
def native_available() -> bool:
    """Report whether the compiled parser extension is importable.

    Simply exposes the module-level ``_NATIVE`` flag (presumably set
    once at import time — see the module's import logic).
    """
    return _NATIVE