Sequences#

Note

These docs now use the new_type core objects via the following setting.

import os

# using new types without requiring an explicit argument
os.environ["COGENT3_NEW_TYPE"] = "1"

The Sequence object provides generic biological sequence manipulation functions, plus functions that are critical for the evolve module calculations.

Generic molecular types#

Sequence properties are affected by the moltype you specify. The default type for a sequence is "text".

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.moltype.label
'dna'
my_seq
0
NoneAGTACACTGGT

DnaSequence, length=11

In some circumstances you can also have a "bytes" moltype, which is explicitly constructed here.

my_seq = make_seq("AGTACACTGGT", moltype="bytes")
my_seq.moltype.label
'bytes'
my_seq
0
NoneAGTACACTGGT

ByteSequence, length=11

DNA and RNA sequences#

Creating a DNA sequence from a string#

Sequence properties are affected by the moltype you specify. Here we specify the DNA molecular type.

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq
0
NoneAGTACACTGGT

DnaSequence, length=11

Creating an RNA sequence from a string#

from cogent3 import make_seq

rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")

Converting to FASTA format#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq
0
NoneAGTACACTGGT

DnaSequence, length=11

Convert an RNA sequence to FASTA format#

from cogent3 import make_seq

rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")
rnaseq
0
NoneACGUACGUACGUACGU

RnaSequence, length=16

Creating a named sequence#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", "my_gene", moltype="dna")
my_seq
type(my_seq)
cogent3.core.new_sequence.DnaSequence

Setting or changing the name of a sequence#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.name = "my_gene"
my_seq
0
my_geneAGTACACTGGT

DnaSequence, length=11

Complementing a DNA sequence#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.complement()
0
NoneTCATGTGACCA

DnaSequence, length=11

Reverse complementing a DNA sequence#

my_seq.rc()
0
NoneACCAGTGTACT

DnaSequence, length=11

Translate a sequence to protein#

from cogent3 import make_seq

my_seq = make_seq("GCTTGGGAAAGTCAAATGGAA", name="s1", moltype="dna")
pep = my_seq.get_translation()
type(pep)
cogent3.core.new_sequence.ProteinSequence
pep
0
s1AWESQME

ProteinSequence, length=7

The default is to trim a terminating stop if it exists. If you set trim_stop=False and there is a terminating stop, an AlphabetError is raised.

from cogent3 import make_seq

my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
my_seq.get_translation(trim_stop=False)
---------------------------------------------------------------------------
AlphabetError                             Traceback (most recent call last)
Cell In[17], line 4
      1 from cogent3 import make_seq
      3 my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
----> 4 my_seq.get_translation(trim_stop=False)

File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.12/site-packages/cogent3/core/new_sequence.py:2199, in NucleicAcidSequenceMixin.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
   2197 if not include_stop and "*" in pep:
   2198     msg = f"{self.name!r} has a stop codon in the translation"
-> 2199     raise new_alphabet.AlphabetError(msg)
   2201 if not incomplete_ok and "X" in pep:
   2202     msg = (
   2203         f"{self.name!r} has an incomplete codon or contains an ambiguity, set incomplete_ok=True to "
   2204         "allow translation"
   2205     )

AlphabetError: 'my_gene' has a stop codon in the translation

You can also specify the genetic code.

my_seq.get_translation(gc="Vertebrate Mitochondrial") # or gc=2
0
my_geneMHW

ProteinSequence, length=3

Translating a DNA sequence containing stop codons#

By default, get_translation() will fail if there are any in-frame stop codons in the sequence, other than a terminating stop, which is trimmed by default. You can allow translation in these cases by setting the optional argument include_stop=True.

from cogent3 import make_seq

seq = make_seq("ATGTGATGGTAA", name="s1", moltype="dna")
pep = seq.get_translation(include_stop=True)
pep
0
s1M*W

ProteinWithStopSequence, length=3

Converting a DNA sequence to RNA#

from cogent3 import make_seq

my_seq = make_seq("ACGTACGTACGTACGT", moltype="dna")
rnaseq = my_seq.to_rna()
rnaseq
0
NoneACGUACGUACGUACGU

RnaSequence, length=16

Convert an RNA sequence to DNA#

from cogent3 import make_seq

rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")
dnaseq = rnaseq.to_dna()
dnaseq
0
NoneACGTACGTACGTACGT

DnaSequence, length=16

Testing complementarity#

from cogent3 import make_seq

a = make_seq("AGTACACTGGT", moltype="dna")
a.can_pair(a.complement())
False
a.can_pair(a.rc())
True

Joining two DNA sequences#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
extra_seq = make_seq("CTGAC", moltype="dna")
long_seq = my_seq + extra_seq
long_seq
0
NoneAGTACACTGGTCTGAC

DnaSequence, length=16

Getting all k-mers from a sequence#

from cogent3 import make_seq

my_seq = make_seq("AGTACACTGGT", moltype="dna")
list(my_seq.iter_kmers(k=2))
['AG', 'GT', 'TA', 'AC', 'CA', 'AC', 'CT', 'TG', 'GG', 'GT']

Note

By default, any k-mer that contains an ambiguity code is excluded from the output.

You can include ALL k-mers by setting strict=False.

my_seq = make_seq("AGTANACTGGT", moltype="dna")
list(my_seq.iter_kmers(k=2, strict=False))
['AG', 'GT', 'TA', 'AN', 'NA', 'AC', 'CT', 'TG', 'GG', 'GT']

Slicing DNA sequences#

my_seq[1:6]
0
NoneGTANA

DnaSequence, length=5

Obtaining the codons from a DnaSequence object#

Use the get_in_motif_size() method with a motif size of 3.

from cogent3 import make_seq

my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
codons = my_seq.get_in_motif_size(3)
codons
['ATG', 'CAC', 'TGG', 'TAA']

Getting 3rd positions from codons#

from cogent3 import make_seq

seq = make_seq("ATGATGATGATG", moltype="dna")
pos3 = seq[2::3]
assert str(pos3) == "GGGG"

Getting 1st and 2nd positions from codons#

In this instance we can use features.

from cogent3 import make_seq

seq = make_seq("ATGATGATGATG", moltype="dna")
indices = [(i, i + 2) for i in range(len(seq))[::3]]
pos12 = seq.add_feature(biotype="pos12", name="pos12", spans=indices)
pos12 = pos12.get_slice()
assert str(pos12) == "ATATATAT"

Return a randomised version of the sequence#

rnaseq.shuffle()
0
NoneGGCUCUAUGCGAAUAC

RnaSequence, length=16

Remove gaps from a sequence#

from cogent3 import make_seq

s = make_seq("--AUUAUGCUAU-UAU--", moltype="rna")
s.degap()
0
NoneAUUAUGCUAUUAU

RnaSequence, length=13