Skip to content

FeatureDB

The query database. All methods are auto-rendered from the live docstrings.

gffbase.interface.FeatureDB

FeatureDB(dbfn, default_encoding: str = 'utf-8', keep_order: bool = False, pragmas: Optional[dict] = None, sort_attribute_values: bool = False, text_factory=str)

Drop-in successor to gffutils.FeatureDB.

Source code in python/gffbase/interface.py
def __init__(
    self,
    dbfn,
    default_encoding: str = "utf-8",
    keep_order: bool = False,
    pragmas: Optional[dict] = None,
    sort_attribute_values: bool = False,
    text_factory=str,
):
    """Open (or wrap) a DuckDB-backed feature database.

    Parameters
    ----------
    dbfn : str | duckdb.DuckDBPyConnection | tuple
        A filesystem path (opened read-write via ``duckdb.connect``), an
        already-open DuckDB connection, or a 2-tuple whose first element
        is a connection — the ``(con, IngestStats)`` shape produced
        internally by ``create_db``.
    default_encoding : str
        Stored on the instance; not consumed within ``__init__`` itself.
    keep_order : bool
        Stored on the instance (gffutils-compatible knob; not read here).
    pragmas : dict | None
        When truthy, forwarded to ``set_pragmas`` immediately after the
        connection is resolved.
    sort_attribute_values : bool
        Stored on the instance (gffutils-compatible knob; not read here).
    text_factory
        Stored on the instance; not consumed within ``__init__`` itself.

    Raises
    ------
    TypeError
        If ``dbfn`` is none of the three accepted shapes above.
    """
    self.default_encoding = default_encoding
    self.keep_order = keep_order
    self.sort_attribute_values = sort_attribute_values
    self.text_factory = text_factory
    # Lazily flipped elsewhere; just the initial state here.
    self._analyzed_flag = False

    # Resolve dbfn → connection.
    if isinstance(dbfn, duckdb.DuckDBPyConnection):
        self.conn = dbfn
        self.dbfn = ":existing-connection:"
    elif isinstance(dbfn, tuple) and len(dbfn) == 2 and isinstance(
        dbfn[0], duckdb.DuckDBPyConnection
    ):
        # (con, IngestStats) — used internally by `create_db`.
        self.conn = dbfn[0]
        self.dbfn = ":existing-connection:"
    elif isinstance(dbfn, str):
        self.dbfn = dbfn
        self.conn = duckdb.connect(dbfn, read_only=False)
    else:
        raise TypeError(
            f"dbfn must be a path, DuckDB connection, or (con, stats) tuple; got {type(dbfn)!r}"
        )

    # Apply caller pragmas before any metadata queries run.
    if pragmas:
        self.set_pragmas(pragmas)

    # Recover provenance from `meta`.
    meta = self._read_meta()
    self.dialect = self._parse_dialect(meta.get("dialect"))
    self.fmt = meta.get("fmt", "gff3")
    # `meta` stores booleans as strings; anything but "true" means False.
    self._rtree_built = meta.get("rtree_built", "false").lower() == "true"
    self._max_depth = int(meta.get("max_depth", "8"))
    # Defensive: confirm the R-tree index actually exists in this DB.
    if self._rtree_built:
        self._rtree_built = self._has_rtree_index()
    # If the R-tree was built at ingest time, the spatial extension's
    # functions (ST_Intersects, ST_MakeEnvelope, …) must be loaded into
    # the current connection — they are NOT auto-loaded by DuckDB just
    # because the index exists. If the load fails (offline HPC node, etc),
    # gracefully fall back to the multi-column B-tree path.
    if self._rtree_built:
        try:
            self.conn.execute("LOAD spatial")
        except duckdb.Error:
            # LOAD alone fails when the extension was never installed on
            # this host; try INSTALL + LOAD once before giving up.
            try:
                self.conn.execute("INSTALL spatial")
                self.conn.execute("LOAD spatial")
            except duckdb.Error:
                self._rtree_built = False

    # Phase 7 — closure-cache vs dynamic-CTE dispatcher. Read the corpus's
    # true hierarchy depth once at open; fall back to a live MAX(depth)
    # query if older DBs don't carry the meta row.
    cmd_meta = meta.get("closure_max_depth")
    if cmd_meta is not None:
        self._closure_max_depth = int(cmd_meta)
    else:
        try:
            row = self.conn.execute("SELECT MAX(depth) FROM closure").fetchone()
            # MAX over an empty table yields NULL → treat as depth 0.
            self._closure_max_depth = int(row[0]) if row and row[0] is not None else 0
        except duckdb.Error:
            # No closure table at all (oldest DBs) — depth 0 disables it.
            self._closure_max_depth = 0

    # Phase 7 — load the seqid → y-band map so `_region_sql_rtree` can
    # produce a tightly-bounded ST_MakeEnvelope at query time. Empty when
    # no R-tree was built (we fall back to the B-tree path anyway).
    self._seqid_y_map: dict = {}
    if self._rtree_built:
        try:
            rows = self.conn.execute(
                "SELECT seqid, seqid_y FROM seqid_map"
            ).fetchall()
            self._seqid_y_map = {s: int(y) for s, y in rows}
        except duckdb.Error:
            # Older DBs without the map — fall back to B-tree to be safe.
            self._rtree_built = False

    # Directives (e.g. GFF3 `##` header lines), preserved in file order.
    self.directives = [
        row[0] for row in self.conn.execute(
            "SELECT directive FROM directives ORDER BY seq"
        ).fetchall()
    ]

region_batched

region_batched(regions, featuretype: Optional[Union[str, List[str]]] = None, completely_within: bool = False, format: str = 'arrow')

Bulk overlap query. Performs a SINGLE spatial JOIN between every input region and the features table, returning a column-oriented result that maps each query (query_idx) back to its overlapping features.

Parameters

regions : iterable of (seqid, start, end) | str | Feature Each item is normalized through _normalize_region_args; the same four input shapes accepted by region() are accepted here. featuretype : str | list[str] | None Optional features.featuretype filter applied to all regions. completely_within : bool If True, only features fully contained in the region are returned (default False — overlap is sufficient). format : "arrow" | "df" | "polars" Return shape (default "arrow").

Result columns are query_idx, query_seqid, query_start, query_end, id, seqid, source, featuretype, start, end, score, strand, frame, file_order.

Source code in python/gffbase/interface.py
def region_batched(
    self,
    regions,
    featuretype: Optional[Union[str, List[str]]] = None,
    completely_within: bool = False,
    format: str = "arrow",
):
    """Bulk overlap query. Performs a SINGLE spatial JOIN between every
    input region and the features table, returning a column-oriented
    result that maps each query (`query_idx`) back to its overlapping
    features.

    Parameters
    ----------
    regions : iterable of (seqid, start, end) | str | Feature
        Each item is normalized through `_normalize_region_args`; the
        same four input shapes accepted by `region()` are accepted here.
    featuretype : str | list[str] | None
        Optional `features.featuretype` filter applied to all regions.
    completely_within : bool
        If True, only features fully contained in the region are returned
        (default False — overlap is sufficient).
    format : "arrow" | "df" | "polars"
        Return shape (default `"arrow"`).

    Result columns are query_idx, query_seqid, query_start,
    query_end, id, seqid, source, featuretype, start, end, score,
    strand, frame, file_order.
    """
    # Normalize every input shape into (seqid, start, end) triples;
    # items that normalize to a partial region are silently dropped.
    rows = []
    for r in regions:
        if isinstance(r, tuple) and len(r) == 3 and all(v is not None for v in r):
            seqid, rs, re_ = r[0], int(r[1]), int(r[2])
        else:
            seqid, rs, re_ = self._normalize_region_args(r, None, None, None)
        if seqid is None or rs is None or re_ is None:
            continue
        rows.append((seqid, int(rs), int(re_)))
    if not rows:
        return self._empty_region_batched(format)

    # Lazy import: pyarrow is only needed on the non-empty path.
    import pyarrow as pa
    regions_table = pa.table({
        "query_idx":   list(range(len(rows))),
        "query_seqid": [r[0] for r in rows],
        "query_start": [r[1] for r in rows],
        "query_end":   [r[2] for r in rows],
    })
    self.conn.register("__staging_regions", regions_table)
    try:
        # The R-tree path uses the seqid_y-encoded envelope so that
        # DuckDB's spatial index segregates chromosomes (Phase 7).
        # B-tree fallback is identical SQL minus the ST_Intersects
        # predicate.
        ft_where, ft_params = self._featuretype_filter(featuretype, qualifier="f")
        ft_clause = (" AND " + " AND ".join(ft_where)) if ft_where else ""
        within_clause = (
            ' AND f.start >= q.query_start AND f."end" <= q.query_end'
            if completely_within else ""
        )

        if self._rtree_built and self._seqid_y_map:
            # Inline the seqid → seqid_y map as a small VALUES table so
            # the JOIN can prune by chromosome inside the R-tree.
            # Build the `?` placeholders and their bound values from a
            # SINGLE pass over the map: DuckDB binds positionally, so the
            # placeholder order and the param order must agree exactly.
            # (Previously two separate iterations of the dict were relied
            # on to line up.)
            pair_sql = []
            seqid_y_params = []
            for s, y in self._seqid_y_map.items():
                pair_sql.append(f"(?, {y})")
                seqid_y_params.append(s)
            values_pairs = ",".join(pair_sql)
            sql = f"""
                WITH seqid_lookup(seqid, seqid_y) AS (
                    VALUES {values_pairs}
                )
                SELECT
                    q.query_idx, q.query_seqid, q.query_start, q.query_end,
                    f.id, f.seqid, f.source, f.featuretype,
                    f.start, f."end" AS "end",
                    f.score, f.strand, f.frame, f.file_order
                FROM __staging_regions q
                JOIN seqid_lookup s ON s.seqid = q.query_seqid
                JOIN features f
                  ON f.seqid = q.query_seqid
                  AND ST_Intersects(
                      f.bbox,
                      ST_MakeEnvelope(q.query_start, s.seqid_y,
                                      q.query_end,   s.seqid_y + 1))
                WHERE 1=1{within_clause}{ft_clause}
                ORDER BY q.query_idx, f.start
            """
            # CTE placeholders come first in SQL order, then the
            # featuretype-filter placeholders from the WHERE clause.
            params = seqid_y_params + list(ft_params)
        else:
            sql = f"""
                SELECT
                    q.query_idx, q.query_seqid, q.query_start, q.query_end,
                    f.id, f.seqid, f.source, f.featuretype,
                    f.start, f."end" AS "end",
                    f.score, f.strand, f.frame, f.file_order
                FROM __staging_regions q
                JOIN features f
                  ON f.seqid = q.query_seqid
                  AND f.start <= q.query_end
                  AND f."end"  >= q.query_start
                WHERE 1=1{within_clause}{ft_clause}
                ORDER BY q.query_idx, f.start
            """
            params = list(ft_params)

        return self._materialize_batched(sql, params, format=format)
    finally:
        # Best-effort cleanup of the registered Arrow staging table.
        try:
            self.conn.unregister("__staging_regions")
        except Exception:
            pass

children_batched

children_batched(feature_ids, level: Optional[int] = None, featuretype: Optional[Union[str, List[str]]] = None, format: str = 'arrow')

Bulk children lookup. Returns the descendants of ALL feature_ids in a single vectorized DuckDB query.

Parameters

feature_ids : iterable of str | Feature Anchors. May contain Feature objects or raw ID strings. level : int | None None → all descendants (closure cache when materialized, otherwise dynamic CTE). Integer → exact-depth point lookup. featuretype : str | list[str] | None Optional filter on features.featuretype. format : "arrow" | "df" | "polars" Return shape (default "arrow" — a pyarrow.Table).

Result columns are anchor (the parent ID supplied), descendant_id, seqid, source, featuretype, start, end, score, strand, frame, file_order, depth.

Source code in python/gffbase/interface.py
def children_batched(
    self,
    feature_ids,
    level: Optional[int] = None,
    featuretype: Optional[Union[str, List[str]]] = None,
    format: str = "arrow",
):
    """Vectorized descendants lookup: one DuckDB query resolves the
    children of every anchor in `feature_ids`.

    Parameters
    ----------
    feature_ids : iterable of str | Feature
        Anchor features; raw ID strings and `Feature` objects both work.
    level : int | None
        `None` walks the full descendant set (closure cache when it is
        materialized, dynamic CTE otherwise); an integer restricts the
        result to exactly that depth.
    featuretype : str | list[str] | None
        Optional `features.featuretype` restriction.
    format : "arrow" | "df" | "polars"
        Output container; `"arrow"` (a `pyarrow.Table`) by default.

    The result carries anchor (the supplied parent ID),
    descendant_id, seqid, source, featuretype, start, end, score,
    strand, frame, file_order, depth.
    """
    # All the real work lives in the shared dispatcher.
    return self._batched_relation(
        feature_ids,
        level=level,
        featuretype=featuretype,
        direction="children",
        format=format,
    )

parents_batched

parents_batched(feature_ids, level: Optional[int] = None, featuretype: Optional[Union[str, List[str]]] = None, format: str = 'arrow')

Bulk parents lookup. Mirrors children_batched but walks the closure / edges in the reverse direction.

Source code in python/gffbase/interface.py
def parents_batched(
    self,
    feature_ids,
    level: Optional[int] = None,
    featuretype: Optional[Union[str, List[str]]] = None,
    format: str = "arrow",
):
    """Vectorized ancestors lookup — the counterpart of
    `children_batched`, traversing the closure / edges upward instead of
    downward. Arguments and result shape are identical."""
    # Delegate to the shared dispatcher with the opposite direction.
    return self._batched_relation(
        feature_ids,
        level=level,
        featuretype=featuretype,
        direction="parents",
        format=format,
    )

execute

execute(query: str)

Execute arbitrary SQL. Returns DuckDB's relation cursor. SQLite-style queries against features_compat and relations_compat views are supported; see compat_views.sql.

Source code in python/gffbase/interface.py
def execute(self, query: str):
    """Run raw SQL on the underlying DuckDB connection and return its
    relation cursor.

    Trailing semicolons are stripped before execution. SQLite-style
    queries against the ``features_compat`` and ``relations_compat``
    views are supported; see ``compat_views.sql``.
    """
    sanitized = query.rstrip(";")
    return self.conn.execute(sanitized)