GenbankAnnotationDb#

class GenbankAnnotationDb(*args: Any, **kwargs: Any)#

Support for annotations from Genbank files.

Attributes:

db
describe: top level description of the annotation db
schema_version: Return the schema version of this database.
table_names

Methods

`add_feature`(*, seqid, biotype, name, spans)	adds a record to user table
`biotype_counts`()	return counts of biological types across all tables and seqids
`close`()	closes the db
`compatible`(other_db[, symmetric])	checks whether table_names are compatible
`count_distinct`(*[, seqid, biotype, name])	return table of counts of distinct values
`from_file`(path)	Load an annotation database from a file.
`get_feature_children`(name[, biotype, ...])	yields children of name
`get_feature_parent`(name[, exclude_biotype, ...])	yields parents of name
`get_records_matching`(*[, biotype, seqid, ...])	return all fields for matching records
`make_indexes`()	adds db indexes for core attributes
`num_matches`(*[, seqid, biotype, name, ...])	return the number of records matching condition
`subset`(*[, source, biotype, seqid, name, ...])	returns a new db instance with records matching the provided conditions
`to_rich_dict`()	returns a dict suitable for json serialisation
`union`(annot_db)	returns a new instance with merged records with other
`update`(annot_db[, seqids])	update records with those from an instance of the same type
`write`(path)	writes db as bytes to path

add_records
from_dict
get_features_matching
to_json

Notes

Extended attributes are stored as JSON in the attributes column of the gb table.

StrOrBool = str | bool#

add_feature(*, seqid, biotype, name, spans, ...)#: adds a record to the user table

Parameters:

seqid: name of the sequence the feature resides on
biotype: biological type of the record
name: the name of a record, an identifier
spans: the spans of the feature; these will be sorted
strand: either ‘+’ or ‘-’. Defaults to ‘+’
attributes: additional attributes as a string
on_alignment: whether the annotation is an alignment annotation

add_records(records: Iterable[dict[str, Any]], seqid: str | None = None, **kwargs: Any) → None#

biotype_counts() → dict[str, int]#: return counts of biological types across all tables and seqids

close() → None#: closes the db

compatible(other_db: AnnotationDbABC, symmetric: bool = True) → bool#

checks whether table_names are compatible

Parameters:

other_db: the other annotation db instance
symmetric: checks only that the tables of other_db are equal to, or are a subset of, mine

count_distinct(*, seqid: StrOrBool = False, biotype: StrOrBool = False, name: StrOrBool = False) → Table | None#

return table of counts of distinct values

Parameters:

seqid, biotype, name: if a string, selects the subset of rows matching the provided values and counts distinct values for the other fields whose value is True.

Returns:

Table with columns corresponding to the arguments whose value was True

Examples

To compute copy number by gene name within each genome

>>> counts_table = db.count_distinct(seqid=True, biotype="gene", name=True)

property db: Connection#

property describe: Table#: top level description of the annotation db

classmethod from_dict(data: dict[str, Any]) → Self#

classmethod from_file(path: str | PathLike[Any] | PurePath | Path) → Self#

Load an annotation database from a file.

Parameters:

path: Path to the saved database file. Must have suffix matching the class’s _suffix attribute.

Returns:

An instance of the annotation database class with the loaded data.

Raises:

ValueError: If the file suffix doesn’t match the expected suffix for this class.
OSError: If the file doesn’t exist.

get_feature_children(name: str, biotype: str | None = None, exclude_biotype: str | None = None, start: int | None = None, stop: int | None = None, **kwargs: Any) → Iterator[FeatureDataType]#: yields children of name

get_feature_parent(name: str, exclude_biotype: str | None = None, start: int | None = None, stop: int | None = None, **kwargs: Any) → Iterator[FeatureDataType]#: yields parents of name

get_features_matching(*, biotype: str | tuple[str, ...] | list[str] | set[str] | None = None, seqid: str | None = None, name: str | None = None, start: int | None = None, stop: int | None = None, strand: str | None = None, attributes: str | None = None, on_alignment: bool | None = None, allow_partial: bool = False) → Iterator[FeatureDataType]#

get_records_matching(*, biotype: str | None = None, seqid: str | None = None, name: str | None = None, start: int | None = None, stop: int | None = None, strand: str | None = None, attributes: str | None = None, on_alignment: bool | None = None, allow_partial: bool = False) → Iterator[dict[str, Any]]#: return all fields for matching records

make_indexes() → None#: adds db indexes for core attributes

num_matches(*, seqid: str | None = None, biotype: str | tuple[str, ...] | list[str] | set[str] | None = None, name: str | None = None, strand: str | None = None, attributes: str | None = None, on_alignment: bool | None = None) → int#: return the number of records matching condition

property schema_version: int#: Return the schema version of this database.

source: str | PathLike[Any] | PurePath | Path#

subset(*, source: str | PathLike[Any] | PurePath | Path = ':memory:', biotype: str | None = None, seqid: str | None = None, name: str | None = None, start: int | None = None, stop: int | None = None, strand: str | None = None, attributes: str | None = None, allow_partial: bool = False) → Self#: returns a new db instance with records matching the provided conditions

property table_names: tuple[str, ...]#

to_json() → str#

to_rich_dict() → dict[str, Any]#: returns a dict suitable for json serialisation

union(annot_db: AnnotationDbABC) → Self#

returns a new instance whose records are merged with those of annot_db

Parameters:

annot_db: an annotation db whose schema is either a subset or a superset of that of self

Returns:

The class whose schema contains the other

update(annot_db: AnnotationDbABC, seqids: str | Sequence[str] | None = None, **kwargs: Any) → None#: update records with those from an instance of the same type

write(path: str | PathLike[Any] | PurePath | Path) → None#

writes db as bytes to path

Parameters:

path: Path to write the database. Must have suffix matching the class’s _suffix attribute.

Raises:

ValueError: If the file suffix doesn’t match the expected suffix for this class.