Sequences#
Note
These docs now use the new_type
core objects via the following setting.
import os
# using new types without requiring an explicit argument
os.environ["COGENT3_NEW_TYPE"] = "1"
The Sequence object provides generic biological sequence manipulation functions, plus functions that are critical for calculations in the evolve module.
Generic molecular types#
Sequence properties are affected by the moltype you specify. The default type for a sequence is "text".
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.moltype.label
'dna'
my_seq
0 | |
None | AGTACACTGGT |
DnaSequence, length=11
In some circumstances you can also have a "bytes" moltype, which is explicitly constructed here.
my_seq = make_seq("AGTACACTGGT", moltype="bytes")
my_seq.moltype.label
'bytes'
my_seq
0 | |
None | AGTACACTGGT |
ByteSequence, length=11
DNA and RNA sequences#
Creating a DNA sequence from a string#
Sequence properties are affected by the moltype you specify. Here we specify the DNA
molecular type.
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq
0 | |
None | AGTACACTGGT |
DnaSequence, length=11
Creating an RNA sequence from a string#
from cogent3 import make_seq
rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")
Converting to FASTA format#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq
0 | |
None | AGTACACTGGT |
DnaSequence, length=11
Convert an RNA sequence to FASTA format#
from cogent3 import make_seq
rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")
rnaseq
0 | |
None | ACGUACGUACGUACGU |
RnaSequence, length=16
Creating a named sequence#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", "my_gene", moltype="dna")
my_seq
type(my_seq)
cogent3.core.new_sequence.DnaSequence
Setting or changing the name of a sequence#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.name = "my_gene"
my_seq
0 | |
my_gene | AGTACACTGGT |
DnaSequence, length=11
Complementing a DNA sequence#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
my_seq.complement()
0 | |
None | TCATGTGACCA |
DnaSequence, length=11
Reverse complementing a DNA sequence#
my_seq.rc()
0 | |
None | ACCAGTGTACT |
DnaSequence, length=11
Translate a sequence to protein#
from cogent3 import make_seq
my_seq = make_seq("GCTTGGGAAAGTCAAATGGAA", name="s1", moltype="dna")
pep = my_seq.get_translation()
type(pep)
cogent3.core.new_sequence.ProteinSequence
pep
0 | |
s1 | AWESQME |
ProteinSequence, length=7
The default is to trim a terminating stop codon if one exists. If you set trim_stop=False and there is a terminating stop codon, an AlphabetError is raised.
from cogent3 import make_seq
my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
my_seq.get_translation(trim_stop=False)
---------------------------------------------------------------------------
AlphabetError Traceback (most recent call last)
Cell In[17], line 4
1 from cogent3 import make_seq
3 my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
----> 4 my_seq.get_translation(trim_stop=False)
File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.12/site-packages/cogent3/core/new_sequence.py:2199, in NucleicAcidSequenceMixin.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
2197 if not include_stop and "*" in pep:
2198 msg = f"{self.name!r} has a stop codon in the translation"
-> 2199 raise new_alphabet.AlphabetError(msg)
2201 if not incomplete_ok and "X" in pep:
2202 msg = (
2203 f"{self.name!r} has an incomplete codon or contains an ambiguity, set incomplete_ok=True to "
2204 "allow translation"
2205 )
AlphabetError: 'my_gene' has a stop codon in the translation
You can also specify the genetic code.
my_seq.get_translation(gc="Vertebrate Mitochondrial") # or gc=2
0 | |
my_gene | MHW |
ProteinSequence, length=3
Translating a DNA sequence containing stop codons#
By default, get_translation() will fail if there are any in-frame stop codons in the sequence. You can allow translation in these cases by setting the optional argument include_stop=True.
from cogent3 import make_seq
seq = make_seq("ATGTGATGGTAA", name="s1", moltype="dna")
pep = seq.get_translation(include_stop=True)
pep
0 | |
s1 | M*W |
ProteinWithStopSequence, length=3
Converting a DNA sequence to RNA#
from cogent3 import make_seq
my_seq = make_seq("ACGTACGTACGTACGT", moltype="dna")
rnaseq = my_seq.to_rna()
rnaseq
0 | |
None | ACGUACGUACGUACGU |
RnaSequence, length=16
Convert an RNA sequence to DNA#
from cogent3 import make_seq
rnaseq = make_seq("ACGUACGUACGUACGU", moltype="rna")
dnaseq = rnaseq.to_dna()
dnaseq
0 | |
None | ACGTACGTACGTACGT |
DnaSequence, length=16
Testing complementarity#
from cogent3 import make_seq
a = make_seq("AGTACACTGGT", moltype="dna")
a.can_pair(a.complement())
False
a.can_pair(a.rc())
True
Joining two DNA sequences#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
extra_seq = make_seq("CTGAC", moltype="dna")
long_seq = my_seq + extra_seq
long_seq
0 | |
None | AGTACACTGGTCTGAC |
DnaSequence, length=16
Getting all k-mers from a sequence#
from cogent3 import make_seq
my_seq = make_seq("AGTACACTGGT", moltype="dna")
list(my_seq.iter_kmers(k=2))
['AG', 'GT', 'TA', 'AC', 'CA', 'AC', 'CT', 'TG', 'GG', 'GT']
Note
By default, any k-mer that contains an ambiguity code is excluded from the output. You can include ALL k-mers by setting strict=False.
my_seq = make_seq("AGTANACTGGT", moltype="dna")
list(my_seq.iter_kmers(k=2, strict=False))
['AG', 'GT', 'TA', 'AN', 'NA', 'AC', 'CT', 'TG', 'GG', 'GT']
Slicing DNA sequences#
my_seq[1:6]
0 | |
None | GTANA |
DnaSequence, length=5
Obtaining the codons from a DnaSequence object#
Use the get_in_motif_size method to split the sequence into codons.
from cogent3 import make_seq
my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
codons = my_seq.get_in_motif_size(3)
codons
['ATG', 'CAC', 'TGG', 'TAA']
Getting 3rd positions from codons#
from cogent3 import make_seq
seq = make_seq("ATGATGATGATG", moltype="dna")
pos3 = seq[2::3]
assert str(pos3) == "GGGG"
Getting 1st and 2nd positions from codons#
In this instance we can use features.
from cogent3 import make_seq
seq = make_seq("ATGATGATGATG", moltype="dna")
indices = [(i, i + 2) for i in range(len(seq))[::3]]
pos12 = seq.add_feature(biotype="pos12", name="pos12", spans=indices)
pos12 = pos12.get_slice()
assert str(pos12) == "ATATATAT"
Return a randomised version of the sequence#
rnaseq.shuffle()
0 | |
None | GGCUCUAUGCGAAUAC |
RnaSequence, length=16
Remove gaps from a sequence#
from cogent3 import make_seq
s = make_seq("--AUUAUGCUAU-UAU--", moltype="rna")
s.degap()
0 | |
None | AUUAUGCUAUUAU |
RnaSequence, length=13