Using genetic codes#
Selecting codes in methods that support them#
In cases where a cogent3
object method has a gc
argument, you can just use the number under the “Code ID” column.
For example, I’ve created a partial codon (a two-base gap) near the end of "s1":
from cogent3 import make_aligned_seqs
data = {
"s1": "GCTCATGCCAGCTCTTTACAGCATGAGAACA--AGT",
"s2": "ACTCATGCCAACTCATTACAGCATGAGAACAGCAGT",
"s3": "ACTCATGCCAGCTCATTACAGCATGAGAACAGCAGT",
"s4": "ACTCATGCCAGCTCATTACAGCATGAGAACAGCAGT",
"s5": "ACTCATGCCAGCTCAGTACAGCATGAGAACAGCAGT",
}
nt_seqs = make_aligned_seqs(data=data, moltype="dna")
nt_seqs
0 | |
s2 | ACTCATGCCAACTCATTACAGCATGAGAACAGCAGT |
s1 | G.........G...T................--... |
s3 | ..........G......................... |
s4 | ..........G......................... |
s5 | ..........G....G.................... |
5 x 36 dna alignment
We specify the genetic code and allow incomplete codons. In this case, any codon that contains a gap is converted to ?
in the translation.
nt_seqs.get_translation(gc=1, incomplete_ok=True)
0 | |
s2 | THANSLQHENSS |
s1 | A..S......?. |
s3 | ...S........ |
s4 | ...S........ |
s5 | ...S.V...... |
5 x 12 protein alignment
Translate DNA sequences#
from cogent3 import get_code
standard_code = get_code(1)
standard_code.translate("TTTGCAAAC")
'FAN'
Conversion to a ProteinSequence
from a DnaSequence
is shown in Translate a DnaSequence to protein.
Translate all six frames#
from cogent3 import get_code, make_seq
standard_code = get_code(1)
seq = make_seq("ATGCTAACATAAA", moltype="dna")
translations = standard_code.sixframes(seq)
print(translations)
['MLT*', 'C*HK', 'ANI', 'FMLA', 'LC*H', 'YVS']
Find out how many stops in a frame#
from cogent3 import get_code, make_seq
standard_code = get_code(1)
seq = make_seq("ATGCTAACATAAA", moltype="dna")
stops_frame1 = standard_code.get_stop_indices(seq, start=0)
stops_frame1
[9]
stop_index = stops_frame1[0]
seq[stop_index : stop_index + 3]
0 | |
None | TAA |
DnaSequence, length=3
Translate a codon#
from cogent3 import get_code, make_seq
standard_code = get_code(1)
standard_code["TTT"]
'F'
Or get the codons for a single amino acid:
standard_code["A"]
['GCT', 'GCC', 'GCA', 'GCG']
Look up the amino acid corresponding to a single codon#
from cogent3 import get_code
standard_code = get_code(1)
standard_code["TTT"]
'F'
Get all the codons for one amino acid#
from cogent3 import get_code
standard_code = get_code(1)
standard_code["A"]
['GCT', 'GCC', 'GCA', 'GCG']
Get all the codons for a group of amino acids#
targets = ["A", "C"]
codons = [standard_code[aa] for aa in targets]
codons
[['GCT', 'GCC', 'GCA', 'GCG'], ['TGT', 'TGC']]
flat_list = sum(codons, [])
flat_list
['GCT', 'GCC', 'GCA', 'GCG', 'TGT', 'TGC']
Converting the CodonAlphabet
to codon series#
from cogent3 import get_code
gc = get_code(1)
alphabet = gc.get_alphabet()
print(alphabet)
('TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TGT', 'TGC', 'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC', 'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT', 'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG', 'GGT', 'GGC', 'GGA', 'GGG')
Obtaining the codons from a DnaSequence
object#
Use the method get_in_motif_size:
from cogent3 import make_seq
my_seq = make_seq("ATGCACTGGTAA", name="my_gene", moltype="dna")
codons = my_seq.get_in_motif_size(3)
codons
['ATG', 'CAC', 'TGG', 'TAA']
Translating a DNA sequence#
The defaults for get_translation()
include using the standard genetic code and trimming a terminating stop if it exists.
pep = my_seq.get_translation()
pep
0 | |
my_gene | MHW |
ProteinSequence, length=3
Translating a DNA sequence containing stop codons#
Making a sequence that contains both internal and terminating stop codons.
from cogent3 import make_seq
seq = make_seq("ATGTGATGGTAA", name="s1", moltype="dna")
Translating this will fail with default settings.
pep = seq.get_translation()
---------------------------------------------------------------------------
AlphabetError Traceback (most recent call last)
File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.11/site-packages/cogent3/core/sequence.py:1757, in NucleicAcidSequence.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
1756 try:
-> 1757 resolved = moltype.resolve_ambiguity(
1758 orig_codon, alphabet=codon_alphabet
1759 )
1760 except AlphabetError:
File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.11/site-packages/cogent3/core/moltype.py:1335, in MolType.resolve_ambiguity(self, ambig_motif, alphabet, allow_gap)
1334 if not result:
-> 1335 raise AlphabetError(ambig_motif)
1337 return result
AlphabetError: TGA
During handling of the above exception, another exception occurred:
AlphabetError Traceback (most recent call last)
Cell In[18], line 1
----> 1 pep = seq.get_translation()
File ~/work/cogent3.github.io/cogent3.github.io/.venv/lib/python3.11/site-packages/cogent3/core/sequence.py:1762, in NucleicAcidSequence.get_translation(self, gc, incomplete_ok, include_stop, trim_stop)
1760 except AlphabetError:
1761 if not incomplete_ok or "-" not in orig_codon:
-> 1762 raise AlphabetError(
1763 f"unresolvable codon {orig_codon!r} in {self.name}"
1764 )
1765 resolved = (orig_codon,)
1766 trans = []
AlphabetError: unresolvable codon 'TGA' in s1
The translation succeeds if you explicitly allow stop codons with include_stop=True:
pep = seq.get_translation(include_stop=True)
pep
0 | |
s1 | M*W* |
ProteinWithStopSequence, length=4