1 | #!/usr/bin/env python
|
---|
2 |
|
---|
3 | # http://en.wikipedia.org/wiki/Stop_codon
|
---|
4 | # http://en.wikipedia.org/wiki/Escherichia_coli
|
---|
5 | # http://en.wikipedia.org/wiki/Open_reading_frame
|
---|
6 |
|
---|
7 | import ghmm
|
---|
8 | import sys
|
---|
9 |
|
---|
10 |
|
---|
11 | def ecoli_hmm(seq):
|
---|
12 | """Try to find genes inside e sequence using a HMM"""
|
---|
13 | # Model 4 bases A C G T and unknown state N
|
---|
14 | sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
|
---|
15 | print sigma
|
---|
16 |
|
---|
17 | # XXX: Proper values, based of statistics
|
---|
18 | # The transition matrix A is chosen such that it reflects the statistics
|
---|
19 | # Part one will try to get us into a gene, part two will tell us when to
|
---|
20 | # get out of it
|
---|
21 | A = [[0.6,0.4], [0.3, 0.7]]
|
---|
22 |
|
---|
23 | # The emission probabilities matrix is modeled after the statistics with a
|
---|
24 | # total of 1
|
---|
25 | etogene = [ 0.2, 0.3, 0.3, 0.1, 0.2 ]
|
---|
26 | eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
|
---|
27 |
|
---|
28 | B = [etogene,eingene]
|
---|
29 |
|
---|
30 | # Initial distribution favors outside
|
---|
31 | pi = [0.9, 0.1]
|
---|
32 |
|
---|
33 | m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
|
---|
34 | print "Initial HMM", m
|
---|
35 |
|
---|
36 | obs_seq = m.sampleSingle(20)
|
---|
37 | print "Observation sequence : ", obs_seq
|
---|
38 | obs = map(sigma.external, obs_seq)
|
---|
39 | print "Observations : ", ''.join(obs)
|
---|
40 |
|
---|
41 | # XXX: Training
|
---|
42 |
|
---|
43 | # XXX: Validation
|
---|
44 |
|
---|
45 | # XXX: Results
|
---|
46 |
|
---|
47 |
|
---|
48 |
|
---|
49 | if __name__ == "__main__":
|
---|
50 | # Load data
|
---|
51 | try:
|
---|
52 | handle = open(sys.argv[1],"rU")
|
---|
53 | except IndexError:
|
---|
54 | print "Usage %s <data_file>" % (sys.argv[0])
|
---|
55 | sys.exit(64)
|
---|
56 | except IOError:
|
---|
57 | print "Unable to open '%s'" % (sys.argv[1])
|
---|
58 | sys.exit(64)
|
---|
59 |
|
---|
60 | seq = handle.read()
|
---|
61 | handle.close()
|
---|
62 | ecoli_hmm(seq)
|
---|