Usage Gallery — every public method, copy-pasteable¶
This is the cheat sheet. Every snippet below is runnable as-is against any GFFBase database. The cookbook pages walk through specific biological corpora; this page covers the API surface in isolation so you can grep for the method name you need and lift the snippet.
If you only read three sections, make them:
- Database initialization
- Vectorized ML API — the zero-copy PyArrow path
- Raw SQL escape hatch
1. Database initialization¶
1.1 Create a database from a file (most common)¶
from gffbase import create_db
# GTF or GFF3, plain or gzipped — auto-detected from contents.
db = create_db(
    "annotation.gff3.gz",   # input path
    "annotation.duckdb",    # output DB
    force=True,             # overwrite if it already exists
)
print(db.fmt) # 'gff3' or 'gtf'
print(db.count_features_of_type()) # total feature count
1.2 In-memory database (no disk artifact)¶
from gffbase import create_db
# Pass `:memory:` as the dbfn. The DB lives only for the lifetime
# of the FeatureDB object — useful for small files in scripts.
db = create_db("small_annotation.gff3", ":memory:")
1.3 Re-open an existing database¶
from gffbase import FeatureDB
# Cheap: no re-ingest. Only the meta + index DDL is read at open.
db = FeatureDB("annotation.duckdb")
print(db.schema()) # 'gff3' / 'gtf' / etc.
print(list(db.seqids())[:5]) # first 5 chromosomes
1.4 Ingest from a string instead of a file¶
from gffbase import create_db
text = (
    "##gff-version 3\n"
    "chr1\trs\tgene\t1\t1000\t.\t+\t.\tID=g1\n"
    "chr1\trs\texon\t100\t200\t.\t+\t.\tID=e1;Parent=g1\n"
)
db = create_db(text, ":memory:", from_string=True)
1.5 Tune ingest with max_depth and parallel threads¶
import os
from gffbase import ingest, FeatureDB
# `max_depth` controls how deep the materialized closure goes.
# Default is 8 — fine for GENCODE/Ensembl. Bump it if your hierarchy
# is deeper (e.g. nested gene clusters, custom annotations).
con, stats = ingest.from_file(
    "deep_annotation.gff3",
    dbfn="deep.duckdb",
    force=True,
    max_depth=16,          # default 8
    batch_size=100_000,    # default 50k — bigger uses more RAM
)
con.close()
print(f"closure rows: {stats.n_closure_rows:,}")
# Parallel ingest: DuckDB respects the `GFFUTILS2_THREADS` env var.
# Set it BEFORE create_db() / ingest.from_file().
os.environ["GFFUTILS2_THREADS"] = "8"
db = FeatureDB("deep.duckdb")
db.set_pragmas({"threads": 8}) # also tunable post-open
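To confirm a setting actually took effect, you can read it back through the raw connection (a quick sanity check; `current_setting()` is standard DuckDB SQL, and §7.5 shows that execute() passes SQL straight through):
# Read the live value back from DuckDB itself.
print(db.execute("SELECT current_setting('threads')").fetchone()[0])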
1.6 Strict vs non-strict parsing¶
from gffbase import create_db, GFFFormatError
# Strict (default) — first malformed line raises.
try:
    db = create_db("messy.gff3", "messy.duckdb", force=True)
except GFFFormatError as e:
    print(f"line {e.line_no}: [{e.kind}] {e.message}")
The line-by-line iterator API supports an opt-in non-strict mode (see §7.2).
2. Standard relational queries¶
2.1 children() — walk down the hierarchy¶
# Direct children only (level=1):
for tx in db.children("ENSG00000139618", level=1):
    print(tx.id, tx.featuretype, tx.start, tx.end)
# All descendants (level=None):
for f in db.children("ENSG00000139618", level=None):
    pass
# Filter by featuretype:
exons = list(db.children("ENST00000380152", featuretype="exon"))
2.2 parents() — walk up the hierarchy¶
# All ancestors of an exon: its mRNA + the gene above it.
for ancestor in db.parents("exon_id_42", level=None):
    print(ancestor.id, ancestor.featuretype)
# One level up only:
mrna = next(db.parents("exon_id_42", level=1))
2.3 features_of_type() — every feature of a kind¶
# Stream every gene:
for gene in db.features_of_type("gene"):
    print(gene.id, gene.seqid, gene.start, gene.end)
# Multiple types at once:
for f in db.features_of_type(["mRNA", "lncRNA", "miRNA"]):
    pass
# Region-clipped + ordered:
for exon in db.features_of_type(
    "exon",
    limit="chr17:43044295-43125483",
    order_by="start",
):
    pass
2.4 Random access by ID¶
# Square-bracket lookup is the fastest single-feature access:
gene = db["ENSG00000139618"] # raises FeatureNotFoundError on miss
# Membership check:
if "ENSG00000139618" in db:
pass
# Aggregate counts:
print(db.count_features_of_type()) # total
print(db.count_features_of_type("exon")) # just exons
print(list(db.featuretypes())) # distinct featuretype values
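For lookups that may legitimately miss, a try/except around the bracket access avoids a separate membership check. A small sketch (the import path for the exception is assumed here; the docs only name FeatureNotFoundError, as in §2.4 above):
from gffbase import FeatureNotFoundError  # import path assumed

def get_or_none(db, feature_id):
    # Return None on a miss instead of raising.
    try:
        return db[feature_id]
    except FeatureNotFoundError:
        return None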
3. Spatial queries¶
3.1 region() — overlap query¶
# String form: "seqid:start-end"
for f in db.region("chr17:43044295-43125483"):
print(f.featuretype, f.start, f.end, f.id)
# Keyword form (equivalent):
for f in db.region(seqid="chr17", start=43_044_295, end=43_125_483):
pass
# Filter by featuretype during the spatial query:
for exon in db.region(
"chr17:43044295-43125483",
featuretype="exon",
):
pass
# Filter by strand:
plus_strand = list(db.region(
seqid="chr1", start=1_000_000, end=2_000_000,
strand="+",
))
3.2 completely_within=True — strict containment¶
# Default (overlap): any feature touching the region.
overlapping = list(db.region("chr1:100-200"))
# Strict: only features whose [start, end] is fully inside [100, 200].
contained = list(db.region("chr1:100-200", completely_within=True))
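The two result sets compose: features that touch the region but spill over its edges are just the difference, e.g.:
# Overlapping-but-not-contained = features straddling a boundary.
contained_ids = {f.id for f in contained}
straddling = [f for f in overlapping if f.id not in contained_ids]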
3.3 Region from a Feature object (re-use coords)¶
gene = db["ENSG00000139618"]
# Pull every feature that overlaps this gene's footprint:
for f in db.region(gene):
    pass
4. Feature object — fields, attributes, mutation, export¶
4.1 Standard fields¶
gene = db["ENSG00000139618"]
print(gene.seqid, gene.start, gene.end) # chr13 32315474 32400266
print(gene.strand, gene.frame, gene.score)
print(gene.featuretype, gene.source)
print(gene.chrom, gene.stop) # aliases for seqid / end
4.2 Dynamic attributes (col-9)¶
# attributes is a multi-value mapping: each key maps to a list[str]
print(gene.attributes["gene_id"]) # ['ENSG00000139618']
print(gene.attributes.get("Note", [])) # safe default
# Multi-value example: NCBI Dbxref carries multiple cross-refs.
for xref in gene.attributes.get("Dbxref", []):
    print(xref)
# Dict-style materialization (handy for JSON/CSV export):
print(gene.attributes_dict())
# {'ID': ['gene-...'], 'Name': ['BRCA2'], 'gene_biotype': ['protein_coding'], ...}
4.3 Mutate + write back¶
# Edit in place — attributes is a mutable mapping.
gene.attributes["custom_tag"] = ["my_pipeline:v1"]
gene.start = gene.start - 1000 # extend by 1 kb upstream
gene.score = "0.95"
# Persist the change back to the DB:
db.update([gene])
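A quick way to verify the write-back is to re-fetch the feature by id (a sketch, reusing the random-access path from §2.4):
# The fresh copy should reflect every mutation above.
refreshed = db[gene.id]
assert refreshed.attributes["custom_tag"] == ["my_pipeline:v1"]
assert refreshed.score == "0.95"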
4.4 Export to a GFF/GTF line¶
# str(feature) emits a tab-separated GFF line, ready for I/O.
print(str(gene))
# chr13 HAVANA gene 32315474 32400266 . + . ID=...
# bytes / file-write friendly:
with open("out.gff3", "a") as fout:
fout.write(str(gene) + "\n")
4.5 Sequence extraction (FASTA)¶
# pyfaidx-style mapping (fasta[seqid][start:end]) works:
fasta = {"chr1": "AAAA" + "ACGT" * 100_000}
seq = gene.sequence(fasta) # forward
rev = gene.sequence(fasta, use_strand=True) # rev-comp on '-'
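For a real genome you would typically hand in a pyfaidx.Fasta object, which provides exactly this fasta[seqid][start:end] mapping over an indexed FASTA. A sketch (the genome.fa path is hypothetical):
from pyfaidx import Fasta   # pip install pyfaidx

genome = Fasta("genome.fa")  # hypothetical path; builds/uses a .fai index
gene_seq = gene.sequence(genome, use_strand=True)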
4.6 Dict-shape representation (for downstream tools)¶
# Construct your own dict if you want a JSON-friendly shape.
import json

record = {
    "seqid": gene.seqid,
    "start": gene.start, "end": gene.end,
    "strand": gene.strand, "featuretype": gene.featuretype,
    "attributes": gene.attributes_dict(),
}
print(json.dumps(record, indent=2))
5. Vectorized ML API — the ones that make GFFBase fast¶
The headline win: one SQL query per call, and the results land in a zero-copy pyarrow.Table (or a pandas.DataFrame / polars.DataFrame) without ever instantiating a Python Feature object.
5.1 children_batched() → PyArrow¶
# Pull every exon for 50 000 transcripts in one call.
transcript_ids = ["ENST00000380152", "ENST00000544455", ...] # any size
table = db.children_batched(
    transcript_ids,
    featuretype="exon",
    format="arrow",   # default
)
# Columns: anchor, descendant_id, seqid, source, featuretype,
# start, end, score, strand, frame, file_order, depth.
print(table.num_rows, "exon rows")
# Hand off to PyTorch / JAX without copying.
import torch
starts = torch.from_numpy(table.column("start").to_numpy())
5.2 children_batched() → pandas¶
df = db.children_batched(
    transcript_ids,
    featuretype="exon",
    format="df",      # pandas.DataFrame
)
# Group back to per-transcript exon counts via the `anchor` column —
# no need to re-issue N queries:
print(df.groupby("anchor").size().describe())
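From here, plain pandas gets you per-transcript interval lists without touching the database again. A sketch:
# Genome-ordered (start, end) pairs for each transcript.
intervals = (
    df.sort_values(["anchor", "start"])
      .groupby("anchor")[["start", "end"]]
      .apply(lambda g: list(zip(g["start"], g["end"])))
)
print(intervals.head())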
5.3 children_batched() → polars¶
# Polars is optional: `pip install polars`
plframe = db.children_batched(
    transcript_ids,
    featuretype="exon",
    format="polars",
)
print(plframe.group_by("anchor").len())
5.4 parents_batched() — symmetric upward walk¶
# For each exon id, get its mRNA + gene ancestors in one query.
exon_ids = [r[0] for r in db.execute(
    "SELECT id FROM features WHERE featuretype='exon' LIMIT 50000"
).fetchall()]
ancestors = db.parents_batched(exon_ids, format="arrow")
print(ancestors.column_names)
# ['anchor', 'descendant_id', ...] — 'anchor' is the input id;
# 'descendant_id' here is actually the ancestor (kept for column-
# stability across the children/parents directions).
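Because featuretype is one of the returned columns, collapsing each exon to its gene-level ancestor is a single filter. A sketch using to_pandas() on the Arrow table:
# exon id -> gene id, dropping the intermediate mRNA rows.
adf = ancestors.to_pandas()
gene_rows = adf[adf["featuretype"] == "gene"]
exon_to_gene = dict(zip(gene_rows["anchor"], gene_rows["descendant_id"]))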
5.5 region_batched() — bulk spatial overlap¶
# Common ML pattern: "for each ATAC-seq peak in this BED file,
# find every overlapping CDS." One SQL query.
peaks = [
    ("chr1", 100_000, 110_000),
    ("chr1", 200_000, 210_000),
    ("chr2", 1_500_000, 1_600_000),
    # ... up to millions of regions
]
overlaps = db.region_batched(peaks, featuretype="CDS", format="arrow")
# Columns: query_idx, query_seqid, query_start, query_end,
# id, seqid, source, featuretype, start, end, score,
# strand, frame, file_order.
print(overlaps.num_rows, "CDS overlaps across", len(peaks), "peaks")
# `query_idx` lets you rebuild per-peak hits without an N-query loop:
import pyarrow.compute as pc
hits_per_peak = pc.value_counts(overlaps.column("query_idx"))
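Note that peaks with zero hits never appear in query_idx at all; if you need them, take the complement (a sketch):
# Peaks absent from query_idx had no overlapping CDS.
hit_idx = set(overlaps.column("query_idx").to_pylist())
empty_peaks = [i for i in range(len(peaks)) if i not in hit_idx]
print(len(empty_peaks), "peaks with no CDS overlap")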
5.6 region_batched() — completely_within flag¶
contained = db.region_batched(
    peaks,
    featuretype="exon",
    completely_within=True,   # only fully-contained features
    format="df",
)
6. Synthesis & convenience methods¶
6.1 interfeatures() — derive intergenic regions¶
# Yield "gap" features sitting between consecutive features.
genes = sorted(db.features_of_type("gene"), key=lambda g: (g.seqid, g.start))
for gap in db.interfeatures(genes, new_featuretype="intergenic"):
    print(gap.seqid, gap.start, gap.end)
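The yielded gaps are ordinary Feature objects, so they can flow straight into GFFWriter (§7.1) to persist an intergenic track. A sketch:
from gffbase import GFFWriter

with GFFWriter("intergenic.gff3") as w:
    w.write_recs(db.interfeatures(genes, new_featuretype="intergenic"))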
6.2 merge() / merge_all() — collapse overlapping features¶
from gffbase import merge_criteria as mc
exons_chr1 = list(db.features_of_type("exon", limit="chr1"))
for merged in db.merge(exons_chr1, merge_criteria=(mc.overlap_end_inclusive,)):
    print(merged.start, merged.end, len(merged.children))
# Whole-DB sweep:
for merged in db.merge_all():
    pass
6.3 create_introns() / create_splice_sites()¶
# Synthesize intron features from existing exon hierarchy.
introns = list(db.create_introns(exon_featuretype="exon",
                                 grandparent_featuretype="gene"))
print(f"{len(introns):,} introns")
# Synthesize splice-site features.
sites = list(db.create_splice_sites(exon_featuretype="exon"))
6.4 bed12() — UCSC track export¶
# Emit a BED12 line for each transcript.
with open("transcripts.bed", "w") as fout:
for tx in db.features_of_type("transcript", limit="chr1"):
fout.write(db.bed12(tx, name_field="transcript_id") + "\n")
6.5 children_bp() — total length of nested children¶
# Total exonic basepairs in a gene.
gene = db["ENSG00000139618"]
total_bp = db.children_bp(gene, child_featuretype="exon")
print(f"{total_bp:,} bp of exonic sequence")
6.6 iter_by_parent_childs() — yield each parent + its children¶
# Stream gene + all-its-descendants groups in one pass.
for gene, *children in db.iter_by_parent_childs(featuretype="gene"):
    print(f"{gene.id}: {len(children):,} child features")
7. Advanced / escape hatches¶
7.1 GFFWriter — emit new annotation files¶
from gffbase import GFFWriter
with GFFWriter("output.gff3") as w:
    w.write_rec(db["ENSG00000139618"])
    w.write_recs(db.features_of_type("transcript", limit="chr13"))
    # Convenience helpers for hierarchy-walking output:
    w.write_gene_recs(db, "ENSG00000139618")      # gene + all descendants
    w.write_mRNA_children(db, "ENST00000380152")  # mRNA + level-1 children
    w.write_exon_children(db, "exon_id_42")       # exon + level-1 children
7.2 Iterate a raw file without building a database¶
from gffbase import parse_gff, GFFFormatError
# Stream parser — never materializes a DB.
for rec in parse_gff("annotation.gff3"):
    print(rec.seqid, rec.start, rec.end, rec.attributes_pairs)
# Non-strict mode: malformed lines are surfaced as warnings, valid
# records still flow.
it = parse_gff("messy.gff3", strict=False)
for rec in it:
    process(rec)
for w in it.warnings:
    print(w["line_no"], w["kind"], w["message"])
7.3 DataIterator — legacy-compatible streaming reader¶
from gffbase import DataIterator
# Read a whole file or a single string of GFF3 records.
for f in DataIterator("annotation.gff3"):
    print(f.id, f.attributes_dict())
print(DataIterator.__doc__)  # the docstring documents the legacy API verbatim
7.4 parse_bytes() — drive the parser from in-memory bytes¶
from gffbase import parse_bytes
raw = (b"##gff-version 3\n"
b"chr1\trs\texon\t1\t100\t.\t+\t.\tID=e1\n")
for rec in parse_bytes(raw):
print(rec.featuretype, rec.start, rec.end)
7.5 execute() — raw SQL escape hatch¶
# Drop into the underlying DuckDB connection for anything the API
# doesn't expose. Returns the same cursor DuckDB-Python would.
rows = db.execute("""
    SELECT f.id, f.seqid, f.start, f."end"
    FROM features f
    JOIN attributes a ON a.feature_id = f.id
    WHERE f.featuretype = 'gene'
      AND a.key = 'gene_biotype'
      AND a.value = 'protein_coding'
    ORDER BY f.seqid, f.start
""").fetchall()
print(f"{len(rows):,} protein-coding genes")
# Back-compat: queries written for legacy gffutils' SQLite schema can
# hit `features_compat` and `relations_compat` views verbatim.
db.execute("SELECT * FROM features_compat WHERE seqid = 'chr1' LIMIT 5"
).fetchall()
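Since execute() hands back the standard DuckDB cursor, its result-conversion helpers should also be available, e.g. .df() for pandas or .arrow() for a pyarrow.Table (assuming stock DuckDB-Python behavior, which the note above implies):
genes_df = db.execute(
    """SELECT id, seqid, start, "end" FROM features WHERE featuretype = 'gene'"""
).df()   # pandas.DataFrame straight from the cursor (DuckDB-Python helper)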
7.6 analyze() + set_pragmas() — DuckDB tuning¶
# After heavy mutation (delete/update), run ANALYZE so the planner
# picks the right join order. Idempotent — cheap to call.
db.analyze()
# Set DuckDB pragmas. Anything DuckDB rejects is silently skipped
# so legacy SQLite-style pragmas are safe to pass through.
db.set_pragmas({
    "threads": 8,
    "memory_limit": "4GB",
})
7.7 delete() / update() / add_relation()¶
# Remove features. Accepts ids, Feature objects, or another FeatureDB.
db.delete(["ENSG00000139618"])
db.delete([db["ENSG00000141510"]]) # by Feature
# db.delete(other_feature_db) # delete every id present in `other`
# Insert new features.
from gffbase import Feature
new_exon = Feature(
    seqid="chr1", source="custom", featuretype="exon",
    start=100, end=200, strand="+", frame=".",
    attributes={"ID": ["custom_exon_1"], "Parent": ["ENST00000000000"]},
)
db.update([new_exon])
# Wire two existing features into a parent/child edge.
db.add_relation("ENSG00000139618", "custom_exon_1", level=2)
7.8 export_sqlite() — produce a legacy-gffutils-readable file¶
from gffbase import export_sqlite
# Materialize the GFFBase data into a SQLite file with the legacy
# `features` / `relations` schema. Downstream tools that only know
# how to read `gffutils.FeatureDB("annotation.sqlite")` work
# unchanged.
out_path = export_sqlite(db.conn, "annotation.sqlite")
print(out_path)
# Round-trip:
import gffutils # pip install gffutils
legacy = gffutils.FeatureDB(out_path)
print(legacy.count_features_of_type())
8. Putting it all together — end-to-end ML pipeline¶
from gffbase import create_db
import torch
# 1. Ingest once.
db = create_db(
    "gencode.v49.basic.annotation.gtf.gz",
    "gencode.duckdb",
    force=True,
)
# 2. Pick the transcripts you care about (any predicate).
target_ids = [r[0] for r in db.execute("""
    SELECT f.id
    FROM features f
    JOIN attributes a ON a.feature_id = f.id
    WHERE f.featuretype = 'transcript'
      AND a.key = 'tag'
      AND a.value = 'MANE_Select'
""").fetchall()]
print(f"{len(target_ids):,} MANE_Select transcripts")
# 3. ONE batched call → pyarrow.Table sharing memory with DuckDB.
exons = db.children_batched(
    target_ids, featuretype="exon", format="arrow",
)
# 4. Tensor handoff — zero copy.
starts = torch.from_numpy(exons.column("start").to_numpy())
ends = torch.from_numpy(exons.column("end").to_numpy())
print(starts.shape, ends.shape)
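If your model wants fixed-shape input, the anchor column is what you group on before padding. A sketch of one common recipe, using pandas as glue and torch's pad_sequence for the ragged-to-dense step:
# 5. (Optional) Pad ragged per-transcript exon starts into a dense tensor.
from torch.nn.utils.rnn import pad_sequence

edf = exons.to_pandas()
per_tx = [
    torch.as_tensor(g["start"].to_numpy())
    for _, g in edf.groupby("anchor", sort=False)
]
padded = pad_sequence(per_tx, batch_first=True, padding_value=-1)
print(padded.shape)   # (n_transcripts, max_exons_per_transcript)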
That's the full GFFBase API. For real-corpus walkthroughs, see the Cookbooks. For numbers, see Performance Comparison.