SequenceCollection#

class SequenceCollection(*, seqs_data: SeqsDataABC, moltype: MolType, info: dict | Info | None = None, source: str | Path | None = None, annotation_db: SupportsFeatures | None = None, name_map: dict | None = None, is_reversed: bool = False)#

A container of unaligned sequences.

Attributes:
annotation_db
named_seqs
names
num_seqs
seqs

Methods

add_feature(*, seqid, biotype, name, spans)

add feature on named sequence

add_seqs(seqs, **kwargs)

Returns new collection with additional sequences.

apply_pssm([pssm, path, background, ...])

scores sequences using the specified pssm

copy_annotations(seq_db)

copy annotations into attached annotation db

count_ambiguous_per_seq()

Counts of ambiguous characters per sequence.

counts([motif_length, include_ambiguity, ...])

counts of motifs

counts_per_seq([motif_length, ...])

counts of motifs per sequence

degap()

Returns new collection in which sequences have no gaps or missing characters.

distance_matrix([calc])

Estimated pairwise distance between sequences

dotplot([name1, name2, window, threshold, ...])

make a dotplot between specified sequences.

entropy_per_seq([motif_length, ...])

Returns the Shannon entropy per sequence.

from_rich_dict(data)

returns a new instance from a rich dict

get_ambiguous_positions()

Returns dict of seq:{position:char} for ambiguous chars.

get_features(*[, seqid, biotype, name, ...])

yields Feature instances

get_identical_sets([mask_degen])

returns sets of names for sequences that are identical

get_lengths([include_ambiguity, allow_gap])

returns sequence lengths as a dict of {seqid: length}

get_motif_probs([alphabet, ...])

Return a dictionary of motif probs, calculated as the averaged frequency across sequences.

get_seq(seqname[, copy_annotations])

Return a sequence object for the specified seqname.

get_seq_names_if(f[, negate])

Returns list of names of seqs where f(seq) is True.

get_similar(target[, min_similarity, ...])

Returns new SequenceCollection containing sequences similar to target.

get_translation([gc, incomplete_ok, ...])

translate sequences from nucleic acid to protein

has_terminal_stop([gc, strict])

Returns True if any sequence has a terminal stop codon.

iter_seqs([seq_order])

Iterates over sequences in the collection, in order.

make_feature(*, feature)

create a feature on named sequence, or on the collection itself

pad_seqs([pad_length])

Returns copy in which sequences are padded with the gap character to same length.

probs_per_seq([motif_length, ...])

return frequency array of motifs per sequence

rc()

Returns the reverse complement of all sequences in the collection.

rename_seqs(renamer)

Returns new collection with renamed sequences.

reverse_complement()

Returns the reverse complement of all sequences in the collection.

set_repr_policy([num_seqs, num_pos, ...])

specify policy for repr(self)

strand_symmetry([motif_length])

returns dict of strand symmetry test results per seq

take_seqs(names[, negate, copy_annotations])

Returns new collection containing only specified seqs.

take_seqs_if(f[, negate])

Returns new collection containing seqs where f(seq) is True.

to_dict([as_array])

Return a dictionary of sequences.

to_dna()

returns copy of self as a collection of DNA moltype seqs

to_fasta([block_size])

Return collection in Fasta format.

to_html([name_order, wrap, limit, colors, ...])

returns html with embedded styles for sequence colouring

to_json()

returns json formatted string

to_moltype(moltype)

returns copy of self with changed moltype

to_phylip()

Return collection in PHYLIP format and mapping to sequence ids

to_rich_dict()

returns a json serialisable dict

to_rna()

returns copy of self as a collection of RNA moltype seqs

trim_stop_codons([gc, strict])

Removes any terminal stop codons from the sequences

write(filename[, file_format])

Write the sequences to a file, preserving order of sequences.

is_ragged

Notes

Should be constructed using make_unaligned_seqs().

add_feature(*, seqid: str, biotype: str, name: str, spans: list[tuple[int, int]], parent_id: str | None = None, strand: str = '+') Feature#

add feature on named sequence

Parameters:
seqid

seq name to associate with

parent_id

name of the parent feature

biotype

biological type

name

feature name

spans

plus strand coordinates

strand

either ‘+’ or ‘-’

Returns:
Feature
add_seqs(seqs: dict[str, str | bytes | ndarray[int]] | SeqsData | list, **kwargs) SequenceCollection#

Returns new collection with additional sequences.

Parameters:
seqs

sequences to add

property annotation_db#
apply_pssm(pssm: PSSM = None, path: str | None = None, background: ndarray = None, pseudocount: int = 0, names: list | None = None, ui=None) array#

scores sequences using the specified pssm

Parameters:
pssm

A profile.PSSM instance, if not provided, will be loaded from path

path

path to either a jaspar or cisbp matrix (path must end have a suffix matching the format).

background

background frequencies distribution

pseudocount

adjustment for zero in matrix

names

returns only scores for these sequences and in the name order

Returns:
numpy array of log2 based scores at every position
copy_annotations(seq_db: SupportsFeatures) None#

copy annotations into attached annotation db

Parameters:
seq_db

compatible annotation db

Notes

Only copies annotations for records with seqid in self.names

count_ambiguous_per_seq() DictArray#

Counts of ambiguous characters per sequence.

counts(motif_length: int = 1, include_ambiguity: bool = False, allow_gap: bool = False, exclude_unobserved: bool = False) MotifCountsArray#

counts of motifs

Parameters:
motif_length

number of elements per character.

include_ambiguity

if True, motifs containing ambiguous characters from the seq moltype are included. No expansion of those is attempted.

allow_gap

if True, motifs containing a gap character are included.

exclude_unobserved

if True, unobserved motif combinations are excluded.

Notes

only non-overlapping motifs are counted

counts_per_seq(motif_length: int = 1, include_ambiguity: bool = False, allow_gap: bool = False, exclude_unobserved: bool = False, warn: bool = False) MotifCountsArray#

counts of motifs per sequence

Parameters:
motif_length

number of characters per tuple.

include_ambiguity

if True, motifs containing ambiguous characters from the seq moltype are included. No expansion of those is attempted.

allow_gap

if True, motifs containing a gap character are included.

warn

warns if motif_length > 1 and collection trimmed to produce motif columns.

Notes

only non-overlapping motifs are counted

degap() SequenceCollection#

Returns new collection in which sequences have no gaps or missing characters.

Notes

The returned collection will not retain an annotation_db if present.

distance_matrix(calc: str = 'pdist')#

Estimated pairwise distance between sequences

Parameters:
calc

The distance calculation method to use, either “pdist” or “jc69”. - “pdist” is an approximation of the proportion sites different. - “jc69” is an approximation of the Jukes Cantor distance.

Returns:
DistanceMatrix

Estimated pairwise distances between sequences in the collection

Notes

pdist approximates the proportion sites different from the Jaccard distance. Coefficients for the approximation were derived from a polynomial fit between Jaccard distance of kmers with k=10 and the proportion of sites different using mammalian 106 protein coding gene DNA sequence alignments.

jc69 approximates the Jukes Cantor distance using the approximated proportion sites different, i.e., a transformation of the above.

dotplot(name1: str | None = None, name2: str | None = None, window: int = 20, threshold: int | None = None, k: int | None = None, min_gap: int = 0, width: int = 500, title: str | None = None, rc: bool = False, biotype: str | tuple[str] = 'gene', show_progress: bool = False)#

make a dotplot between specified sequences. Random sequences chosen if names not provided.

Parameters:
name1, name2

names of sequences – if not provided, a random choice is made

window

segment size for comparison between sequences

threshold

windows where the sequences are identical >= threshold are a match

k

size of k-mer to break sequences into. Larger values increase speed but reduce resolution. If not specified, and window == threshold, then k is set to window. Otherwise, it is computed as the maximum of {threshold // (window - threshold), 5}.

min_gap

permitted gap for joining adjacent line segments, default is no gap joining

width

figure width. Figure height is computed based on the ratio of len(seq1) / len(seq2)

title

title for the plot

rc

include dotplot of reverse compliment also. Only applies to Nucleic acids moltypes

biotype

if selected sequences are annotated, display only these biotypes

Returns:
a Drawable or AnnotatedDrawable
entropy_per_seq(motif_length: int = 1, include_ambiguity: bool = False, allow_gap: bool = False, exclude_unobserved: bool = True, warn: bool = False) ndarray#

Returns the Shannon entropy per sequence.

Parameters:
motif_length: int

number of characters per tuple.

include_ambiguity: bool

if True, motifs containing ambiguous characters from the seq moltype are included. No expansion of those is attempted.

allow_gap: bool

if True, motifs containing a gap character are included.

exclude_unobserved: bool

if True, unobserved motif combinations are excluded.

warn

warns if motif_length > 1 and alignment trimmed to produce motif columns

Notes

For motif_length > 1, it’s advisable to specify exclude_unobserved=True, this avoids unnecessary calculations.

classmethod from_rich_dict(data: dict[str, str | dict[str, str]]) SequenceCollection#

returns a new instance from a rich dict

get_ambiguous_positions()#

Returns dict of seq:{position:char} for ambiguous chars.

Used in likelihood calculations.

get_features(*, seqid: str | Iterator[str] | None = None, biotype: str | None = None, name: str | None = None, start: int | None = None, stop: int | None = None, allow_partial: bool = False, **kwargs) Iterator[Feature]#

yields Feature instances

Parameters:
seqid

limit search to features on this named sequence, defaults to search all

biotype

biotype of the feature, e.g. CDS, gene

name

name of the feature

start

start position of the feature (not inclusive)

stop

stop position of the feature (inclusive)

allow_partial

allow features partially overlaping self

kwargs

additional keyword arguments to query the annotation db

Notes

  • When dealing with a nucleic acid moltype, the returned features will

yield a sequence segment that is consistently oriented irrespective of strand of the current instance. - start is non-inclusive, so if allow_partial is False, only features strictly starting after start will be returned.

get_identical_sets(mask_degen: bool = False) list[set]#

returns sets of names for sequences that are identical

Parameters:
mask_degen

if True, degenerate characters are ignored

get_lengths(include_ambiguity: bool = False, allow_gap: bool = False) dict[str, int]#

returns sequence lengths as a dict of {seqid: length}

Parameters:
include_ambiguity

if True, motifs containing ambiguous characters from the seq moltype are included. No expansion of those is attempted.

allow_gap

if True, motifs containing a gap character are included.

get_motif_probs(alphabet: AlphabetABC = None, include_ambiguity: bool = False, exclude_unobserved: bool = False, allow_gap: bool = False, pseudocount: int = 0) dict#

Return a dictionary of motif probs, calculated as the averaged frequency across sequences.

Parameters:
alphabet

alphabet to use for motifs

include_ambiguity

if True resolved ambiguous codes are included in estimation of frequencies.

exclude_unobserved

if True, motifs that are not present in the alignment are excluded from the returned dictionary.

allow_gap

allow gap motif

pseudocount

value to add to each count

Notes

only non-overlapping motifs are counted

get_seq(seqname: str, copy_annotations: bool = False) Sequence#

Return a sequence object for the specified seqname.

Parameters:
seqname

name of the sequence to return

copy_annotations

if True, only annotations from the selected seq are copied to the annotation_db of the new collection, the same annotation db is used.

get_seq_names_if(f: Callable[[Sequence], bool], negate: bool = False)#

Returns list of names of seqs where f(seq) is True.

Parameters:
f

function that takes a sequence object and returns True or False

negate

select all sequences EXCEPT those where f(seq) is True

Notes

Sequence objects can be converted into strings or numpy arrays using str() and numpy.array() respectively.

get_similar(target: ~cogent3.core.new_sequence.Sequence, min_similarity: float = 0.0, max_similarity: float = 1.0, metric: ~collections.abc.Callable[[~cogent3.core.new_sequence.Sequence, ~cogent3.core.new_sequence.Sequence], float] = <cogent3.util.transform.for_seq object>, transform: bool | None = None) SequenceCollection#

Returns new SequenceCollection containing sequences similar to target.

Parameters:
target

sequence object to compare to. Can be in the collection.

min_similarity

minimum similarity that will be kept. Default 0.0.

max_similarity

maximum similarity that will be kept. Default 1.0.

metric

a similarity function to use. Must be f(first_seq, second_seq). The default metric is fraction similarity, ranging from 0.0 (0% identical) to 1.0 (100% identical). The Sequence class have lots of methods that can be passed in as unbound methods to act as the metric, e.g. frac_same_gaps.

transform

transformation function to use on the sequences before the metric is calculated. If None, uses the whole sequences in each case. A frequent transformation is a function that returns a specified range of a sequence, e.g. eliminating the ends. Note that the transform applies to both the real sequence and the target sequence.

Notes

both min_similarity and max_similarity are inclusive.

get_translation(gc: int = 1, incomplete_ok: bool = False, include_stop: bool = False, trim_stop: bool = True, **kwargs)#

translate sequences from nucleic acid to protein

Parameters:
gc

genetic code, either the number or name (use cogent3.core.genetic_code.available_codes)

incomplete_ok

codons that are mixes of nucleotide and gaps converted to ‘?’. raises a ValueError if False

include_stop

whether to allow a stops in the translated sequence

trim_stop

exclude terminal stop codons if they exist

kwargs

related to construction of the resulting object

Returns:
A new instance of self translated into protein

Notes

Translating will break the relationship to an annotation_db if present.

has_terminal_stop(gc: Any = None, strict: bool = False) bool#

Returns True if any sequence has a terminal stop codon.

Parameters:
gc

valid input to cogent3.get_code(), a genetic code object, number or name

strict

If True, raises an exception if a seq length not divisible by 3

is_ragged() bool#
iter_seqs(seq_order: list | None = None) Iterator[Sequence | SeqViewABC]#

Iterates over sequences in the collection, in order.

Parameters:
seq_order:

list of seqids giving the order in which seqs will be returned. Defaults to self.names

make_feature(*, feature: FeatureDataType) Feature#

create a feature on named sequence, or on the collection itself

Parameters:
feature

a dict with all the necessary data to construct a feature

Returns:
Feature

Notes

To get a feature AND add it to annotation_db, use add_feature().

property named_seqs: SeqsDataABC#
property names: list#
property num_seqs: int#
pad_seqs(pad_length: int | None = None)#

Returns copy in which sequences are padded with the gap character to same length.

Parameters:
pad_length

Length all sequences are to be padded to. Will pad to max sequence length if pad_length is None or less than max length.

probs_per_seq(motif_length: int = 1, include_ambiguity: bool = False, allow_gap: bool = False, exclude_unobserved: bool = False, warn: bool = False) MotifFreqsArray#

return frequency array of motifs per sequence

Parameters:
motif_length

number of characters per motif

include_ambiguity

if True, include motifs containing ambiguous characters

allow_gap

if True, include motifs containing a gap character

exclude_unobserved

if True, exclude motifs not present in the sequences in the resulting array

warn

warns if motif_length > 1 and collection trimmed to produce motif columns.

rc()#

Returns the reverse complement of all sequences in the collection. A synonym for reverse_complement.

rename_seqs(renamer: Callable[[str], str])#

Returns new collection with renamed sequences.

reverse_complement()#

Returns the reverse complement of all sequences in the collection. A synonym for rc.

property seqs: _IndexableSeqs#
set_repr_policy(num_seqs: int | None = None, num_pos: int | None = None, ref_name: int | None = None, wrap: int | None = None) None#

specify policy for repr(self)

Parameters:
num_seqs

number of sequences to include in represented display.

num_pos

length of sequences to include in represented display.

ref_name

name of sequence to be placed first, or “longest” (default). If latter, indicates longest sequence will be chosen.

wrap

number of printed bases per row

strand_symmetry(motif_length: int = 1)#

returns dict of strand symmetry test results per seq

take_seqs(names: str | Sequence[str], negate: bool = False, copy_annotations: bool = False, **kwargs)#

Returns new collection containing only specified seqs.

Parameters:
names

sequences to select (or exclude if negate=True)

negate

select all sequences EXCEPT names

kwargs

keyword arguments to be passed to the constructor of the new collection

copy_annotations

if True, only annotations from selected seqs are copied to the annotation_db of the new collection

take_seqs_if(f: Callable[[Sequence], bool], negate: bool = False)#

Returns new collection containing seqs where f(seq) is True.

Parameters:
f

function that takes a sequence object and returns True or False

negate

select all sequences EXCEPT those where f(seq) is True

Notes

Sequence objects can be converted into strings or numpy arrays using str() and numpy.array() respectively.

to_dict(as_array: bool = False) dict[str, str | ndarray]#

Return a dictionary of sequences.

Parameters:
as_array

if True, sequences are returned as numpy arrays, otherwise as strings

to_dna()#

returns copy of self as a collection of DNA moltype seqs

to_fasta(block_size: int = 60) str#

Return collection in Fasta format.

Parameters:
block_size

the sequence length to write to each line, by default 60

Returns:
The collection in Fasta format.
to_html(name_order: Sequence[str] | None = None, wrap: int = 60, limit: int | None = None, colors: Mapping[str, str] | None = None, font_size: int = 12, font_family: str = 'Lucida Console') str#

returns html with embedded styles for sequence colouring

Parameters:
name_order

order of names for display.

wrap

number of columns per row

limit

truncate view of collection to this length

colors

{character moltype.

font_size

in points. Affects labels and sequence and line spacing (proportional to value)

font_family

string denoting font family

Examples

In a jupyter notebook, this code is used to provide the representation.

seq_col  # is rendered by jupyter

You can directly use the result for display in a notebook as

from IPython.core.display import HTML

HTML(seq_col.to_html())
to_json()#

returns json formatted string

to_moltype(moltype: str | MolType) SequenceCollection#

returns copy of self with changed moltype

Parameters:
moltype

name of the new moltype, e.g, ‘dna’, ‘rna’.

Notes

Cannot convert from nucleic acids to proteins. Use get_translation() for that.

to_phylip()#

Return collection in PHYLIP format and mapping to sequence ids

Notes

raises exception if sequences do not all have the same length

to_rich_dict() dict[str, str | dict[str, str]]#

returns a json serialisable dict

Notes

Deserialisation the object produced by this method will not include the annotation_db if present.

to_rna()#

returns copy of self as a collection of RNA moltype seqs

trim_stop_codons(gc: Any = 1, strict: bool = False)#

Removes any terminal stop codons from the sequences

Parameters:
gc

valid input to cogent3.get_code(), a genetic code object, number or name, defaults to standard code

strict

If True, raises an exception if a seq length not divisible by 3

write(filename: str, file_format: str | None = None, **kwargs) None#

Write the sequences to a file, preserving order of sequences.

Parameters:
filename

name of the sequence file

file_format

format of the sequence file

Notes

If file_format is None, will attempt to infer format from the filename suffix.