fork download
  1. #!/usr/bin/env python
  2. # coding=utf-8
  3.  
  4. # Open Reading Frames
  5. # ===================
  6. #
  7. # Either strand of a DNA double helix can serve as the coding strand for RNA
  8. # transcription. Hence, a given DNA string implies six total reading frames, or
  9. # ways in which the same region of DNA can be translated into amino acids: three
  10. # reading frames result from reading the string itself, whereas three more
  11. # result from reading its reverse complement.
  12. #
  13. # An open reading frame (ORF) is one which starts at a start codon and ends
  14. # at a stop codon, without any other stop codons in between. Thus, a candidate
  15. # protein string is derived by translating an open reading frame into amino
  16. # acids until a stop codon is reached.
  17. #
  18. # Given: A DNA string s of length at most 1 kbp.
  19. #
  20. # Return: Every distinct candidate protein string that can be translated from
  21. # ORFs of s. Strings can be returned in any order.
  22. #
  23. # Sample Dataset
  24. # --------------
  25. # AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
  26. #
  27. # Sample Output
  28. # -------------
  29. # MLLGSFRLIPKETLIQVAGSSPCNLS
  30. # M
  31. # MGMTPRLGLESLLE
  32. # MTPRLGLESLLE
  33.  
  34.  
  # Maps every three-letter DNA codon to its one-letter amino-acid code.
  # The three stop codons (TAA, TAG, TGA) map to the marker string 'Stop'.
  # (Listing line numbers that had been fused into each row were removed so
  # the literal is valid Python again.)
  DNA_CODON_TABLE = {
      'TTT': 'F',     'CTT': 'L',     'ATT': 'I',     'GTT': 'V',
      'TTC': 'F',     'CTC': 'L',     'ATC': 'I',     'GTC': 'V',
      'TTA': 'L',     'CTA': 'L',     'ATA': 'I',     'GTA': 'V',
      'TTG': 'L',     'CTG': 'L',     'ATG': 'M',     'GTG': 'V',
      'TCT': 'S',     'CCT': 'P',     'ACT': 'T',     'GCT': 'A',
      'TCC': 'S',     'CCC': 'P',     'ACC': 'T',     'GCC': 'A',
      'TCA': 'S',     'CCA': 'P',     'ACA': 'T',     'GCA': 'A',
      'TCG': 'S',     'CCG': 'P',     'ACG': 'T',     'GCG': 'A',
      'TAT': 'Y',     'CAT': 'H',     'AAT': 'N',     'GAT': 'D',
      'TAC': 'Y',     'CAC': 'H',     'AAC': 'N',     'GAC': 'D',
      'TAA': 'Stop',  'CAA': 'Q',     'AAA': 'K',     'GAA': 'E',
      'TAG': 'Stop',  'CAG': 'Q',     'AAG': 'K',     'GAG': 'E',
      'TGT': 'C',     'CGT': 'R',     'AGT': 'S',     'GGT': 'G',
      'TGC': 'C',     'CGC': 'R',     'AGC': 'S',     'GGC': 'G',
      'TGA': 'Stop',  'CGA': 'R',     'AGA': 'R',     'GGA': 'G',
      'TGG': 'W',     'CGG': 'R',     'AGG': 'R',     'GGG': 'G',
  }
  53.  
  54.  
  def translate_codon(codon):
      """Translate one DNA codon via DNA_CODON_TABLE.

      Returns the table entry for `codon` (a one-letter amino-acid code, or
      'Stop' for a stop codon). Returns None when `codon` is not exactly
      three characters long or is not a key of the table.
      """
      # Length guard first, so short trailing fragments never reach the lookup.
      if len(codon) != 3:
          return None
      # dict.get works on Python 2 and 3; dict.has_key exists only on Python 2.
      return DNA_CODON_TABLE.get(codon)
  60.  
  61.  
  def reverse_complement(dna):
      """Return the reverse complement of the DNA string `dna`.

      The input is read back to front and each base is replaced by its
      pairing partner (A<->T, G<->C). A character other than the uppercase
      bases A, C, G, T raises KeyError.
      """
      lookup = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
      return ''.join(lookup[c] for c in reversed(dna))
  65.  
  66.  
  def possible_protein_strings(s):
      """Return the candidate protein strings readable from `s` as given.

      Every position whose codon translates to 'M' is treated as a start.
      From each start, codons are translated three bases at a time; the
      accumulated string is kept only if a stop codon is reached. Only the
      strand passed in is scanned, and duplicates are not removed.
      """
      results = []
      length = len(s)

      # Collect every start-codon position, in any reading frame.
      starts = []
      for i in range(length):
          if translate_codon(s[i:i + 3]) == 'M':
              starts.append(i)

      for start in starts:
          found_stop = False
          protein_string = ''

          for j in range(start, length, 3):
              protein = translate_codon(s[j:j + 3])

              if not protein:
                  # Incomplete or unknown codon: the frame ended without a stop.
                  break

              if protein == 'Stop':
                  found_stop = True
                  break

              protein_string += protein

          # Only frames closed by a stop codon count as ORFs.
          if found_stop:
              results.append(protein_string)

      return results
  97.  
  98.  
  if __name__ == "__main__":

      small_dataset = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

      # Scan the strand as given and its reverse complement, so that all six
      # reading frames are covered.
      possible_a = possible_protein_strings(small_dataset)
      possible_b = possible_protein_strings(reverse_complement(small_dataset))
      # set() removes duplicate proteins; the output order is unspecified.
      # The call form of print with one argument works on Python 2 and 3.
      print("\n".join(set(possible_a + possible_b)))
Success #stdin #stdout 0.04s 25928KB
stdin
Standard input is empty
stdout
# coding=utf-8

# Open Reading Frames
# ===================
# 
# Either strand of a DNA double helix can serve as the coding strand for RNA
# transcription. Hence, a given DNA string implies six total reading frames, or
# ways in which the same region of DNA can be translated into amino acids: three
# reading frames result from reading the string itself, whereas three more
# result from reading its reverse complement.
# 
# An open reading frame (ORF) is one which starts at a start codon and ends
# at a stop codon, without any other stop codons in between. Thus, a candidate
# protein string is derived by translating an open reading frame into amino
# acids until a stop codon is reached.
# 
# Given: A DNA string s of length at most 1 kbp.
# 
# Return: Every distinct candidate protein string that can be translated from
# ORFs of s. Strings can be returned in any order.
# 
# Sample Dataset
# --------------
# AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
# 
# Sample Output
# -------------
# MLLGSFRLIPKETLIQVAGSSPCNLS
# M
# MGMTPRLGLESLLE
# MTPRLGLESLLE


# Maps every three-letter DNA codon to its one-letter amino-acid code.
# The three stop codons (TAA, TAG, TGA) map to the marker string 'Stop'.
# Rows are grouped by the first two bases; within a row the third base
# runs T, C, A, G.
DNA_CODON_TABLE = {
    # codons starting with T
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Stop', 'TAG': 'Stop',
    'TGT': 'C', 'TGC': 'C', 'TGA': 'Stop', 'TGG': 'W',
    # codons starting with C
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    # codons starting with A
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    # codons starting with G
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate_codon(codon):
    """Translate one DNA codon via DNA_CODON_TABLE.

    Returns the table entry for `codon` (a one-letter amino-acid code, or
    'Stop' for a stop codon). Returns None when `codon` is not exactly
    three characters long or is not a key of the table.
    """
    # Length guard first, so short trailing fragments never reach the lookup.
    if len(codon) != 3:
        return None
    # dict.get works on Python 2 and 3; dict.has_key exists only on Python 2.
    return DNA_CODON_TABLE.get(codon)


def reverse_complement(dna):
    """Return the reverse complement of the DNA string `dna`.

    The input is walked back to front and each base is swapped for its
    pairing partner (A<->T, G<->C). A character other than the uppercase
    bases A, C, G, T raises KeyError.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    complemented = []
    for base in reversed(dna):
        complemented.append(pairing[base])
    return ''.join(complemented)


def possible_protein_strings(s):
    """Return the candidate protein strings readable from `s` as given.

    Every position whose codon translates to 'M' is treated as a start.
    From each start, codons are translated three bases at a time; the
    accumulated string is kept only if a stop codon is reached. Only the
    strand passed in is scanned, and duplicates are not removed.
    """
    length = len(s)

    # Start positions in every reading frame, in ascending order.
    starts = [pos for pos in range(length)
              if translate_codon(s[pos:pos + 3]) == 'M']

    candidates = []
    for start in starts:
        residues = []
        for offset in range(start, length, 3):
            amino = translate_codon(s[offset:offset + 3])
            if not amino:
                # Incomplete or unknown codon: the frame ended without a
                # stop, so this start yields no candidate.
                break
            if amino == 'Stop':
                # A stop codon closes the ORF; keep what was translated.
                candidates.append(''.join(residues))
                break
            residues.append(amino)

    return candidates


if __name__ == "__main__":

    small_dataset = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

    # Scan the strand as given and its reverse complement, so that all six
    # reading frames are covered.
    possible_a = possible_protein_strings(small_dataset)
    possible_b = possible_protein_strings(reverse_complement(small_dataset))
    # set() removes duplicate proteins; the output order is unspecified.
    # The call form of print with one argument works on Python 2 and 3.
    print("\n".join(set(possible_a + possible_b)))