source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 60

Last change on this file since 60 was 60, checked in by Rick van der Zwet, 15 years ago

Playing around with GHMM and seems to work pretty well

  • Property svn:executable set to *
File size: 1.6 KB
Line 
1#!/usr/bin/env python
2
3# http://en.wikipedia.org/wiki/Stop_codon
4# http://en.wikipedia.org/wiki/Escherichia_coli
5# http://en.wikipedia.org/wiki/Open_reading_frame
6
7import ghmm
8import sys
9
10
11def ecoli_hmm(seq):
12 """Try to find genes inside e sequence using a HMM"""
13 # Model 4 bases A C G T and unknown state N
14 sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
15 print sigma
16
17 # XXX: Proper values, based of statistics
18 # The transition matrix A is chosen such that it reflects the statistics
19 # Part one will try to get us into a gene, part two will tell us when to
20 # get out of it
21 A = [[0.6,0.4], [0.3, 0.7]]
22
23 # The emission probabilities matrix is modeled after the statistics with a
24 # total of 1
25 etogene = [ 0.2, 0.3, 0.3, 0.1, 0.2 ]
26 eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
27
28 B = [etogene,eingene]
29
30 # Initial distribution favors outside
31 pi = [0.9, 0.1]
32
33 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
34 print "Initial HMM", m
35
36 obs_seq = m.sampleSingle(20)
37 print "Observation sequence : ", obs_seq
38 obs = map(sigma.external, obs_seq)
39 print "Observations : ", ''.join(obs)
40
41 # XXX: Training
42
43 # XXX: Validation
44
45 # XXX: Results
46
47
48
49if __name__ == "__main__":
50 # Load data
51 try:
52 handle = open(sys.argv[1],"rU")
53 except IndexError:
54 print "Usage %s <data_file>" % (sys.argv[0])
55 sys.exit(64)
56 except IOError:
57 print "Unable to open '%s'" % (sys.argv[1])
58 sys.exit(64)
59
60 seq = handle.read()
61 handle.close()
62 ecoli_hmm(seq)
Note: See TracBrowser for help on using the repository browser.