Using genetic codes#

Selecting codes in methods that support them#

In cases where a cogent3 object method has a gc argument, you can just use the number under the “Code ID” column of the table of available genetic codes.

For example, I’ve created a partial codon (one containing gap characters) in "s1":

from cogent3 import make_aligned_seqs

data = {
    "s1": "GCTCATGCCAGCTCTTTACAGCATGAGAACA--AGT",
    "s2": "ACTCATGCCAACTCATTACAGCATGAGAACAGCAGT",
    "s3": "ACTCATGCCAGCTCATTACAGCATGAGAACAGCAGT",
    "s4": "ACTCATGCCAGCTCATTACAGCATGAGAACAGCAGT",
    "s5": "ACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGT",
}

nt_seqs = make_aligned_seqs(data=data, moltype="dna")
nt_seqs
      0
s2    ACTCATGCCAACTCATTACAGCATGAGAACAGCAGT
s1    G.........G...T................--...
s3    ..........G.........................
s4    ..........G.........................
s5    ..........G....G....................

5 x 36 dna alignment

We specify the genetic code, and we allow incomplete codons. In this case, if a codon contains a gap, it is converted to ? in the translation.

nt_seqs.get_translation(gc=1, incomplete_ok=True)
      0
s2    THANSLQHENSS
s1    A..S......?.
s3    ...S........
s4    ...S........
s5    ...S.V......

5 x 12 protein alignment

Translate DNA sequences#

from cogent3 import get_code

standard_code = get_code(1)
standard_code.translate("TTTGCAAAC")
'FAN'

Conversion to a ProteinSequence from a DnaSequence is shown in Translate a DnaSequence to protein.

Translate all six frames#

from cogent3 import get_code, make_seq

standard_code = get_code(1)
seq = make_seq("ATGCTAACATAAA", moltype="dna")
translations = standard_code.sixframes(seq)
print(translations)
['MLT*', 'C*HK', 'ANI', 'FMLA', 'LC*H', 'YVS']

Find out how many stops are in a frame#

from cogent3 import get_code, make_seq

standard_code = get_code(1)
seq = make_seq("ATGCTAACATAAA", moltype="dna")
stops_frame1 = standard_code.get_stop_indices(seq, start=0)
stops_frame1
[9]
stop_index = stops_frame1[0]
seq[stop_index : stop_index + 3]
        0
None    TAA

DnaSequence, length=3

Translate a codon#

from cogent3 import get_code, make_seq

standard_code = get_code(1)
standard_code["TTT"]
'F'

Or get the codons for a single amino acid:

standard_code["A"]
['GCT', 'GCC', 'GCA', 'GCG']

Look up the amino acid corresponding to a single codon#

from cogent3 import get_code

standard_code = get_code(1)
standard_code["TTT"]
'F'

Get all the codons for one amino acid#

from cogent3 import get_code

standard_code = get_code(1)
standard_code["A"]
['GCT', 'GCC', 'GCA', 'GCG']

Get all the codons for a group of amino acids#

targets = ["A", "C"]
codons = [standard_code[aa] for aa in targets]
codons
[['GCT', 'GCC', 'GCA', 'GCG'], ['TGT', 'TGC']]
flat_list = sum(codons, [])
flat_list
['GCT', 'GCC', 'GCA', 'GCG', 'TGT', 'TGC']

Converting the CodonAlphabet to codon series#

from cogent3 import get_code

gc = get_code(1)
alphabet = gc.get_alphabet()
print(alphabet)
('TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TGT', 'TGC', 'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC', 'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT', 'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG', 'GGT', 'GGC', 'GGA', 'GGG')

Obtaining the codons from a DnaSequence object#

Use the method get_in_motif_size with a motif size of 3 to split the sequence into codons.

from cogent3 import make_seq

my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
codons = my_seq.get_in_motif_size(3)
codons
['ATG', 'CAC', 'TGG', 'TAA']

Translating a DNA sequence#

The defaults for get_translation() include using the standard genetic code and trimming a terminating stop if it exists.

pep = my_seq.get_translation()
pep
           0
my_gene    MHW

ProteinSequence, length=3

Translating a DNA sequence containing stop codons#

First, we make a sequence that contains both internal and terminating stop codons.

from cogent3 import make_seq

seq = make_seq("ATGTGATGGTAA", name="s1", moltype="dna")

Translating this will fail with default settings.

pep = seq.get_translation()
---------------------------------------------------------------------------
AlphabetError                             Traceback (most recent call last)
File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.12/site-packages/cogent3/core/sequence.py:1807, in NucleicAcidSequence.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
   1806 try:
-> 1807     resolved = moltype.resolve_ambiguity(
   1808         orig_codon, alphabet=codon_alphabet
   1809     )
   1810 except AlphabetError:

File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.12/site-packages/cogent3/core/moltype.py:1356, in MolType.resolve_ambiguity(self, ambig_motif, alphabet, allow_gap)
   1355 if not result:
-> 1356     raise AlphabetError(ambig_motif)
   1358 return result

AlphabetError: TGA

During handling of the above exception, another exception occurred:

AlphabetError                             Traceback (most recent call last)
Cell In[18], line 1
----> 1 pep = seq.get_translation()

File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.12/site-packages/cogent3/core/sequence.py:1812, in NucleicAcidSequence.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
   1810 except AlphabetError:
   1811     if not incomplete_ok or "-" not in orig_codon:
-> 1812         raise AlphabetError(
   1813             f"unresolvable codon {orig_codon!r} in {self.name}"
   1814         )
   1815     resolved = (orig_codon,)
   1816 trans = []

AlphabetError: unresolvable codon 'TGA' in s1

The translation succeeds if you explicitly allow stop codons by setting include_stop=True.

pep = seq.get_translation(include_stop=True)
pep
      0
s1    M*W*

ProteinWithStopSequence, length=4