#!/usr/bin/env python
#
# Tutorial on using GHMM with Python
# http://ghmm.sourceforge.net/ghmm-python-tutorial.html
# Ported to python code for lazy programmers ;-)
#
# In the following, we assume that you have installed GHMM including the Python
# bindings. Download the UnfairCasino.py-file.
#
# You might have seen the unfair casino example (cf. Biological Sequence
# Analysis, Durbin et al., 1998), where a dealer in a casino occasionally
# exchanges a fair die for a loaded one. We do not know if and when this
# exchange happens; we can only observe the throws. We will model this
# stochastic process with an HMM.

import ghmm
print "For additional help use: help('ghmm')"

# Creating the first model:

# There are two states in our example. Either the die is fair (state 0; Python
# indexes arrays like C and C++ from 0) or it is loaded (state 1). We will
# define the transition and emission matrices explicitly.

# To save us some typing (namely ghmm. before any class or function from GHMM),
# we will import all symbols from ghmm directly.

from ghmm import *

# Our alphabet is the set of numbers on the die, {1,2,3,4,5,6}. To be consistent
# with Python's range function, the upper limit is not part of a range.

sigma = IntegerRange(1,7)

# The transition matrix A is chosen such that we will stay longer in the fair
# state than in the loaded one. The emission probabilities are set accordingly
# in B --- guessing equal probabilities for the fair die and higher
# probabilities for ones and twos etc. for the loaded die --- and the initial
# state distribution pi has no preference for either state.

A = [[0.9, 0.1], [0.3, 0.7]]
efair = [1.0 / 6] * 6
print "Fair probabilities:", efair

eloaded = [3.0 / 13, 3.0 / 13, 2.0 / 13, 2.0 / 13, 2.0 / 13, 1.0 / 13]
B = [efair, eloaded]
pi = [0.5] * 2
m = HMMFromMatrices(sigma, DiscreteDistribution(sigma), A, B, pi)
print "Initial HMM:", m

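# A quick sanity check before we move on: A, B and pi have to be stochastic,
# i.e. every row of A and B and the vector pi must sum to 1. This is a plain
# Python sketch and uses no GHMM calls.

for name, rows in [("A", A), ("B", B), ("pi", [pi])]:
    for row in rows:
        assert abs(sum(row) - 1.0) < 1e-9, name + " has a row that does not sum to 1"
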
# Generating sequences

# At this time GHMM still exposes some internals. For example, symbols from
# discrete alphabets are internally represented as 0, ..., |alphabet|-1, and the
# alphabet is responsible for translating between internal and external
# representations.

# First we sample from the model, specifying the length of the sequence, and
# then transfer the observations to the external representation.

obs_seq = m.sampleSingle(20)
print "Observation sequence : ", obs_seq
obs = map(sigma.external, obs_seq)
print "Observations : ", obs

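# The translation also works in the other direction. As a small illustrative
# sketch (assuming, as used above, that the alphabet offers internal() as the
# inverse of external()), we can map a thrown die face to its internal code:

print "Internal code of a thrown 6 :", sigma.internal(6)   # 0-based, so 5
print "External symbol of code 0   :", sigma.external(0)   # the die face 1
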
# Learning from data

# In a real application we would want to train the model. That is, we would
# like to estimate the parameters of the model from data with the goal of
# maximizing the likelihood of that data given the model. For HMMs this is done
# with the Baum-Welch algorithm, which is an instance of the well-known
# Expectation-Maximization procedure for missing-data problems in statistics.
# The process is iterative and hence sometimes called re-estimation.

# We can use m as a starting point and use some 'real' data to train it. The
# variable train_seq is an EmissionSequence or SequenceSet object.


from UnfairCasino import train_seq
m.baumWelch(train_seq)
print "Trained using baumWelch and 'train_seq'"
print m

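# To see what training bought us, we can look at the log-likelihood of the
# training data under the trained model (this assumes the HMM object exposes a
# loglikelihood() method, as the GHMM Python bindings describe).

print "log P(train_seq | trained model) =", m.loglikelihood(train_seq)
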
# If you want more fine-grained control over the learning procedure, you can do
# single steps and monitor the relevant diagnostics yourself, or employ
# meta-heuristics such as noise injection to avoid getting stuck in local
# maxima. [Currently for continuous emissions only]
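
# As a hedged sketch of such fine-grained control for our discrete model: some
# GHMM versions let baumWelch take optional nrSteps and loglikelihoodCutoff
# arguments, so one re-estimation step can be run at a time while the
# log-likelihood is monitored. Treat the argument names as an assumption about
# your installation and uncomment to try it:

# for step in range(5):
#     m.baumWelch(train_seq, 1, 0.0)   # a single Baum-Welch iteration
#     print "step", step, "log-likelihood:", m.loglikelihood(train_seq)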

# Computing a Viterbi-path

# A Viterbi-path is a sequence of states Q maximizing the probability
# P[Q|observations, model]. If we compute a Viterbi-path for an unfair casino
# sequence of throws, the state sequence tells us when the casino was fair
# (state 0) and when it was unfair (state 1).

from UnfairCasino import test_seq
v = m.viterbi(test_seq)
print "fairness of test_seq: ", v

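# For a single EmissionSequence, viterbi returns the state path together with
# its log-probability (treat the exact return shape as an assumption about your
# GHMM version). A small sketch that unpacks the pair and counts how many
# throws were attributed to the loaded die:

path, logp = v
print "throws attributed to the loaded die:", path.count(1)
print "log-probability of the Viterbi path:", logp
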
# You can investigate how successful your model is at detecting cheating. Just
# create artificial observations and see if you can detect them.

my_seq = EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print "artificial sequence : ", my_seq
print "artificial observation : ", m.viterbi(my_seq)

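# To see where in the artificial sequence the model suspects the loaded die, we
# can print the decoded states next to the throws. A small sketch, reusing the
# assumption above that viterbi returns a (state path, log-probability) pair:

art_path, art_logp = m.viterbi(my_seq)
art_obs = map(sigma.external, my_seq)
print "throw/state pairs :", zip(art_obs, art_path)
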
print '''
#
# XXX: Some error on closing data, safe to ignore as we are finished anyway
#
'''

# EOF