[60] | 1 | #!/usr/bin/env python
|
---|
| 2 |
|
---|
| 3 | # http://en.wikipedia.org/wiki/Stop_codon
|
---|
| 4 | # http://en.wikipedia.org/wiki/Escherichia_coli
|
---|
| 5 | # http://en.wikipedia.org/wiki/Open_reading_frame
|
---|
[61] | 6 | # http://nl.wikipedia.org/wiki/Genetische_code
|
---|
[60] | 7 |
|
---|
| 8 | import sys
|
---|
[61] | 9 | import csv
|
---|
| 10 | import string
|
---|
[60] | 11 |
|
---|
[63] | 12 | # http://ghmm.sourceforge.net/
|
---|
| 13 | import ghmm
|
---|
| 14 |
|
---|
[64] | 15 | from MultiReplace import MultiReplace
|
---|
| 16 |
|
---|
[63] | 17 | # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
|
---|
| 18 | fasta_translate = {
|
---|
| 19 | 'r' : 'ga', # purine
|
---|
| 20 | 'y' : 'tc', # pyrimide
|
---|
| 21 | 'k' : 'gt', # keto
|
---|
| 22 | 'm' : 'ac', # amino
|
---|
| 23 | 's' : 'gc', # strong
|
---|
| 24 | 'w' : 'at', # weak
|
---|
| 25 | 'b' : 'gtc',
|
---|
| 26 | 'd' : 'gat',
|
---|
| 27 | 'h' : 'act',
|
---|
| 28 | 'v' : 'gca',
|
---|
| 29 | }
|
---|
| 30 |
|
---|
[66] | 31 | # Transform the state to something human readable
|
---|
[64] | 32 | dna_ascii_translate = {
|
---|
[66] | 33 | '0' : '.',
|
---|
[64] | 34 | '1' : '<',
|
---|
| 35 | '2' : '<',
|
---|
| 36 | '3' : '<',
|
---|
[66] | 37 | '4' : '|',
|
---|
[64] | 38 | '5' : '>',
|
---|
| 39 | '6' : '>',
|
---|
| 40 | '7' : '>',
|
---|
| 41 | }
|
---|
[63] | 42 |
|
---|
[64] | 43 | dna_ascii = MultiReplace(dna_ascii_translate)
|
---|
[60] | 44 |
|
---|
[66] | 45 | def pretty_print(test_seq, ans_seq, v, length=100, parts=10, seperator=''):
|
---|
[64] | 46 | """ Pretty printing of output for verification purposes """
|
---|
| 47 | for i in range(0,len(v[0]),length):
|
---|
| 48 | seq = []
|
---|
| 49 | ans = []
|
---|
| 50 | result = []
|
---|
| 51 | for j in range(0,length,parts):
|
---|
| 52 | t = i + j
|
---|
| 53 | seq.append(test_seq[t:t+parts])
|
---|
| 54 | ans.append(ans_seq[t:t+parts])
|
---|
| 55 | result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
|
---|
[62] | 56 |
|
---|
[66] | 57 | print "ORG: " + seperator.join(seq)
|
---|
| 58 | print "CSV: " + seperator.join(ans)
|
---|
| 59 | print "HMM: " + seperator.join(result)
|
---|
[64] | 60 | print ''
|
---|
| 61 | print "fairness of test_seq: ", v[1]
|
---|
[62] | 62 |
|
---|
[64] | 63 |
|
---|
[63] | 64 | def ecoli_hmm():
|
---|
[60] | 65 | """Try to find genes inside e sequence using a HMM"""
|
---|
| 66 | # Model 4 bases A C G T and unknown state N
|
---|
| 67 | sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
|
---|
| 68 | print sigma
|
---|
| 69 |
|
---|
| 70 | # XXX: Proper values, based of statistics
|
---|
| 71 | # The transition matrix A is chosen such that it reflects the statistics
|
---|
[64] | 72 | # Probalities from moving from one state to an other
|
---|
| 73 | # 0) Outer-gene : will try to get us into a gene
|
---|
| 74 | # 1) Start-codon : beginning of gene - part 1
|
---|
| 75 | # 2) Start-codon : beginning of gene - part 2
|
---|
| 76 | # 3) Start-codon : beginning of gene - part 3
|
---|
| 77 | # 4) Inside-gene : in the gene
|
---|
| 78 | # 5) Stop-codon : end of gene - part 1
|
---|
| 79 | # 6) Stop-codon : end of gene - part 2
|
---|
| 80 | # 7) Stop-codon : end of gene - part 3
|
---|
| 81 | A = [
|
---|
[66] | 82 | [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 0
|
---|
| 83 | [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 1
|
---|
| 84 | [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], # 2
|
---|
| 85 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], # 3
|
---|
| 86 | [0.0, 0.0, 0.0, 0.0, 0.9, 0.1, 0.0, 0.0], # 4
|
---|
| 87 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], # 5
|
---|
| 88 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # 6
|
---|
| 89 | [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 7
|
---|
[64] | 90 | ]
|
---|
[60] | 91 |
|
---|
[64] | 92 | # XXX: Proper values, based of statistics
|
---|
| 93 | # The emission probabilities matrix is modeled after the statistics
|
---|
| 94 | # (['a', 'c', 'g', 't', 'n' ]
|
---|
| 95 | B = [
|
---|
| 96 | # e.g. state 0 -> emission probability
|
---|
[66] | 97 | [0.2, 0.2, 0.2, 0.2, 0.2] , # 0
|
---|
| 98 | [0.9, 0.0, 0.1, 0.0, 0.0] , # 1
|
---|
| 99 | [0.0, 0.0, 0.0, 1.0, 0.0] , # 2
|
---|
| 100 | [0.0, 0.0, 1.0, 0.0, 0.0] , # 3
|
---|
| 101 | [0.2, 0.2, 0.2, 0.2, 0.2] , # 4
|
---|
| 102 | [0.0, 0.0, 0.0, 1.0, 0.0] , # 5
|
---|
| 103 | [0.7, 0.0, 0.3, 0.0, 0.0] , # 6
|
---|
| 104 | [0.7, 0.0, 0.3, 0.0, 0.0] , # 7
|
---|
[64] | 105 | ]
|
---|
[60] | 106 |
|
---|
| 107 | # Initial distribution favors outside
|
---|
[64] | 108 | pi = [0.9] + [0.1/7] * 7
|
---|
[60] | 109 |
|
---|
| 110 | m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
|
---|
[64] | 111 | print "Initial HMM"
|
---|
[66] | 112 | print m.verboseStr()
|
---|
[60] | 113 |
|
---|
| 114 | obs_seq = m.sampleSingle(20)
|
---|
| 115 | print "Observation sequence : ", obs_seq
|
---|
| 116 | obs = map(sigma.external, obs_seq)
|
---|
| 117 | print "Observations : ", ''.join(obs)
|
---|
| 118 |
|
---|
[64] | 119 | answer = {}
|
---|
| 120 | handle = open('AE005174v2-2-gene.raw', 'rU')
|
---|
| 121 | answer['AE005174v2-2'] = handle.read()
|
---|
| 122 | handle.close()
|
---|
[61] | 123 |
|
---|
[64] | 124 | contig_seq = {}
|
---|
| 125 | handle = open('AE005174v2-2.raw', 'rU')
|
---|
| 126 | contig_seq['AE005174v2-2'] = handle.read()
|
---|
| 127 | handle.close()
|
---|
[62] | 128 |
|
---|
[64] | 129 | handle = open('AE005174v2-1.raw', 'rU')
|
---|
| 130 | contig_seq['AE005174v2-1'] = handle.read()
|
---|
| 131 | handle.close()
|
---|
[62] | 132 |
|
---|
[66] | 133 | test_size = 1000;
|
---|
| 134 | test_seq = contig_seq['AE005174v2-2'][0:test_size]
|
---|
| 135 | ans_seq = answer['AE005174v2-2'][0:test_size]
|
---|
[64] | 136 | test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
|
---|
| 137 | v = m.viterbi(test_eseq)
|
---|
| 138 | pretty_print(test_seq, ans_seq, v)
|
---|
[62] | 139 |
|
---|
[61] | 140 |
|
---|
[64] | 141 | # Train sequence
|
---|
| 142 | print "Training baumWelch"
|
---|
[66] | 143 | train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'][0:10000]))
|
---|
[64] | 144 | v = m.baumWelch(train_seq)
|
---|
[66] | 145 | print m.verboseStr()
|
---|
[62] | 146 |
|
---|
[61] | 147 |
|
---|
[64] | 148 | print "Results after training sequence..."
|
---|
| 149 | v = m.viterbi(test_eseq)
|
---|
| 150 | pretty_print(test_seq, ans_seq, v)
|
---|
[60] | 151 |
|
---|
| 152 |
|
---|
| 153 | # XXX: Results
|
---|
| 154 |
|
---|
| 155 | if __name__ == "__main__":
|
---|
[63] | 156 | ecoli_hmm()
|
---|