[60] | 1 | #!/usr/bin/env python
|
---|
| 2 |
|
---|
| 3 | # http://en.wikipedia.org/wiki/Stop_codon
|
---|
| 4 | # http://en.wikipedia.org/wiki/Escherichia_coli
|
---|
| 5 | # http://en.wikipedia.org/wiki/Open_reading_frame
|
---|
[61] | 6 | # http://nl.wikipedia.org/wiki/Genetische_code
|
---|
[60] | 7 |
|
---|
| 8 | import sys
|
---|
[61] | 9 | import csv
|
---|
| 10 | import string
|
---|
[60] | 11 |
|
---|
[63] | 12 | # http://ghmm.sourceforge.net/
|
---|
| 13 | import ghmm
|
---|
| 14 |
|
---|
| 15 | # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
|
---|
| 16 | fasta_translate = {
|
---|
| 17 | 'r' : 'ga', # purine
|
---|
| 18 | 'y' : 'tc', # pyrimide
|
---|
| 19 | 'k' : 'gt', # keto
|
---|
| 20 | 'm' : 'ac', # amino
|
---|
| 21 | 's' : 'gc', # strong
|
---|
| 22 | 'w' : 'at', # weak
|
---|
| 23 | 'b' : 'gtc',
|
---|
| 24 | 'd' : 'gat',
|
---|
| 25 | 'h' : 'act',
|
---|
| 26 | 'v' : 'gca',
|
---|
| 27 | }
|
---|
| 28 |
|
---|
| 29 | DEBUG = True
|
---|
| 30 |
|
---|
[62] | 31 | dna_flip = string.maketrans('atgc','tacg')
|
---|
| 32 | def get_codon(seq, position, order):
|
---|
| 33 | codon = seq[position:position+3]
|
---|
| 34 | if not order:
|
---|
| 35 | # When living on the other side of the string make sure to flip
|
---|
| 36 | # around before comparison
|
---|
| 37 | codon = codon[::-1].translate(dna_flip)
|
---|
| 38 | return(codon)
|
---|
[60] | 39 |
|
---|
[62] | 40 |
|
---|
| 41 |
|
---|
[63] | 42 | def ecoli_hmm():
|
---|
[60] | 43 | """Try to find genes inside e sequence using a HMM"""
|
---|
| 44 | # Model 4 bases A C G T and unknown state N
|
---|
| 45 | sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
|
---|
| 46 | print sigma
|
---|
| 47 |
|
---|
| 48 | # XXX: Proper values, based of statistics
|
---|
| 49 | # The transition matrix A is chosen such that it reflects the statistics
|
---|
| 50 | # Part one will try to get us into a gene, part two will tell us when to
|
---|
| 51 | # get out of it
|
---|
| 52 | A = [[0.6,0.4], [0.3, 0.7]]
|
---|
| 53 |
|
---|
| 54 | # The emission probabilities matrix is modeled after the statistics with a
|
---|
| 55 | # total of 1
|
---|
[61] | 56 | etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
|
---|
[60] | 57 | eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
|
---|
| 58 |
|
---|
| 59 | B = [etogene,eingene]
|
---|
| 60 |
|
---|
| 61 | # Initial distribution favors outside
|
---|
| 62 | pi = [0.9, 0.1]
|
---|
| 63 |
|
---|
| 64 | m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
|
---|
| 65 | print "Initial HMM", m
|
---|
| 66 |
|
---|
| 67 | obs_seq = m.sampleSingle(20)
|
---|
| 68 | print "Observation sequence : ", obs_seq
|
---|
| 69 | obs = map(sigma.external, obs_seq)
|
---|
| 70 | print "Observations : ", ''.join(obs)
|
---|
| 71 |
|
---|
[61] | 72 | # Start codons
|
---|
| 73 | # atg, (small chance) gtg
|
---|
| 74 | # Stop codons
|
---|
| 75 | # taa, tga
|
---|
| 76 | # tag
|
---|
| 77 | # lend = Left End
|
---|
| 78 | # rend = Right End
|
---|
| 79 |
|
---|
[62] | 80 | # Training and testing
|
---|
[61] | 81 | # A -- T, G -- C
|
---|
[62] | 82 | start_codons = ['atg', 'gtg']
|
---|
| 83 | stop_codons = ['taa', 'tga', 'tag']
|
---|
[61] | 84 | edl_file = 'data/edl_genes.csv'
|
---|
[62] | 85 |
|
---|
| 86 | # Counter limit of how many 'hard' errors are allowed before bailing out
|
---|
[63] | 87 | stop_limit = 100
|
---|
[62] | 88 |
|
---|
| 89 | # Current shifting offset
|
---|
| 90 | base_shift = 0
|
---|
| 91 | stop_counter = 0
|
---|
[63] | 92 |
|
---|
| 93 | # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
|
---|
| 94 | contig_seq = {}
|
---|
[62] | 95 |
|
---|
[61] | 96 | reader = csv.reader(open(edl_file,"rU"))
|
---|
| 97 | reader.next()
|
---|
| 98 | try:
|
---|
| 99 | for row in reader:
|
---|
[62] | 100 | finished = False
|
---|
| 101 | while not finished:
|
---|
| 102 | (featureType, zNumber, contig, lend, rend, orientation,
|
---|
| 103 | segmentType, oIslandNumber, geneName, note, function, product,
|
---|
| 104 | translationNotes) = row
|
---|
| 105 | lend = int(lend)
|
---|
| 106 | rend = int(rend)
|
---|
[63] | 107 | if not contig_seq.has_key(contig):
|
---|
| 108 | # Load data
|
---|
| 109 | try:
|
---|
| 110 | handle = open(contig + ".raw","rU")
|
---|
| 111 | contig_seq[contig] = handle.read()
|
---|
| 112 | handle.close()
|
---|
| 113 | except IOError:
|
---|
| 114 | print "Unable to open '%s'" % (sys.argv[1])
|
---|
| 115 | sys.exit(64)
|
---|
| 116 | seq = contig_seq[contig]
|
---|
| 117 |
|
---|
[62] | 118 | # Make a forward orientation mark as positive
|
---|
| 119 | if orientation == '>':
|
---|
| 120 | order = True
|
---|
| 121 | else:
|
---|
| 122 | order = False
|
---|
[61] | 123 |
|
---|
[62] | 124 | # Living the world upside down to order is off
|
---|
| 125 | if order:
|
---|
| 126 | start_pos = base_shift + lend - 1
|
---|
| 127 | stop_pos = base_shift + rend - 3
|
---|
| 128 | else:
|
---|
| 129 | start_pos = base_shift + rend -3
|
---|
| 130 | stop_pos = base_shift + lend - 1
|
---|
| 131 |
|
---|
| 132 | start_codon = get_codon(seq,start_pos,order)
|
---|
| 133 | stop_codon = get_codon(seq,stop_pos,order)
|
---|
| 134 | start_match = start_codon in start_codons
|
---|
| 135 | stop_match = stop_codon in stop_codons
|
---|
[61] | 136 |
|
---|
[63] | 137 | print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
|
---|
[62] | 138 | print "%s, %s" % (start_match, stop_match)
|
---|
[63] | 139 | if not start_match or not stop_match:
|
---|
| 140 | print "### Function (comment):", function
|
---|
[62] | 141 |
|
---|
| 142 | # Check for fucked up offsets, shifts
|
---|
[63] | 143 | start_shift = []
|
---|
| 144 | stop_shift = []
|
---|
[62] | 145 | # Technically speaking should the cope only the reading frames,
|
---|
[63] | 146 | # but what the heck looking if 'free', but make sure to include the original frames as well
|
---|
| 147 | search_range = abs(base_shift) + 3
|
---|
[62] | 148 | if not start_match:
|
---|
| 149 | # Somewhere else in the reading frame?
|
---|
[63] | 150 | matches = []
|
---|
[62] | 151 | for r in range(-search_range,search_range+1):
|
---|
| 152 | l = start_pos + r
|
---|
| 153 | t = get_codon(seq,l,order)
|
---|
| 154 | new_match = t in start_codons
|
---|
| 155 | if new_match:
|
---|
[63] | 156 | matches.append("%i:%s" % (r, t))
|
---|
| 157 | start_shift.append(r)
|
---|
| 158 | if start_shift:
|
---|
| 159 | print "# Start codon reading frame matches: ", ",".join(matches)
|
---|
[62] | 160 | if not stop_match:
|
---|
| 161 | # Somewhere else in the reading frame?
|
---|
[63] | 162 | matches = []
|
---|
[62] | 163 | for r in range(-search_range,search_range+1):
|
---|
| 164 | l = stop_pos + r
|
---|
| 165 | t = get_codon(seq,l,order)
|
---|
| 166 | new_match = t in stop_codons
|
---|
| 167 | if new_match:
|
---|
[63] | 168 | stop_shift.append(r)
|
---|
| 169 | matches.append("%i:%s" % (r, t))
|
---|
| 170 | if stop_shift:
|
---|
| 171 | print "# Stop codon reading frame matches: ", ",".join(matches)
|
---|
[61] | 172 |
|
---|
[62] | 173 |
|
---|
| 174 | # Both wrong is something screwy in the data, check offset fix
|
---|
| 175 | if not start_match and not stop_match:
|
---|
[63] | 176 | common_shift = list(set(start_shift) & set(stop_shift))
|
---|
| 177 | if common_shift:
|
---|
| 178 | print "# Matching shifts: " + ",".join(map(str,common_shift))
|
---|
| 179 | # Get the value closest to 0 for shifting purposes
|
---|
| 180 | # Currently asume no negative shifts
|
---|
| 181 | base_shift += min(common_shift)
|
---|
[62] | 182 | finished = False
|
---|
| 183 | continue
|
---|
| 184 | # Exercise left to the reader
|
---|
| 185 | if (stop_counter > stop_limit):
|
---|
| 186 | return
|
---|
| 187 | else:
|
---|
| 188 | stop_counter += 1
|
---|
| 189 | finished = True
|
---|
| 190 | else:
|
---|
| 191 | stop_counter = 0
|
---|
| 192 | finished = True
|
---|
| 193 |
|
---|
[61] | 194 | except csv.Error, e:
|
---|
| 195 | sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
|
---|
| 196 |
|
---|
| 197 |
|
---|
| 198 |
|
---|
| 199 | test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
|
---|
| 200 | v = m.viterbi(test_seq)
|
---|
| 201 | #print "fairness of test_seq: ", v
|
---|
| 202 |
|
---|
[60] | 203 |
|
---|
[61] | 204 | # Validation
|
---|
| 205 | val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
|
---|
| 206 | v = m.viterbi(val_seq)
|
---|
| 207 | #print "Test sequence: ", v
|
---|
[60] | 208 |
|
---|
| 209 | # XXX: Results
|
---|
| 210 |
|
---|
| 211 |
|
---|
| 212 |
|
---|
| 213 | if __name__ == "__main__":
|
---|
[63] | 214 | ecoli_hmm()
|
---|