#!/usr/bin/env python
#
# Tutorial on using GHMM with Python
# http://ghmm.sourceforge.net/ghmm-python-tutorial.html
# Ported to python code for lazy programmers ;-)
#
# In the following, we assume that you have installed GHMM including the Python
# bindings. Download the UnfairCasino.py-file.
#
# You might have seen the unfair casino example (see Biological Sequence
# Analysis, Durbin et al., 1998), where a dealer in a casino occasionally
# exchanges a fair dice for a loaded one. We do not know if and when this
# exchange happens; we can only observe the throws. We will model this
# stochastic process with an HMM.

import ghmm
print "For additional help use: help('ghmm')"

# Creating the first model

# There are two states in our example. Either the dice is fair (state 0; Python
# indexes arrays like C and C++ from 0) or it is loaded (state 1). We will
# define the transition and emission matrices explicitly.

# To save us some typing (namely ghmm. before any class or function from GHMM),
# we will import all symbols from ghmm directly.

from ghmm import *

# Our alphabet is the numbers on the dice {1,2,3,4,5,6}. To be consistent with
# Python's range function the upper limit is not part of the range.

sigma = IntegerRange(1, 7)

# The transition matrix A is chosen such that we will stay longer in the fair
# than in the loaded state. The emission probabilities are set accordingly in B
# (equal probabilities for the fair dice, higher probabilities for ones and
# twos for the loaded dice), and the initial state distribution pi has no
# preference for either state.

A = [[0.9, 0.1], [0.3, 0.7]]
efair = [1.0 / 6] * 6
print "Fair probabilities:", efair

eloaded = [3.0 / 13, 3.0 / 13, 2.0 / 13, 2.0 / 13, 2.0 / 13, 1.0 / 13]
B = [efair, eloaded]
pi = [0.5] * 2
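
# Optional sanity check, not part of the original tutorial: every row of A and
# B, as well as pi, must be a probability distribution summing to 1. This uses
# plain Python only, no GHMM calls.
for name, rows in [("A", A), ("B", B), ("pi", [pi])]:
    for row in rows:
        assert abs(sum(row) - 1.0) < 1e-9, "%s row does not sum to 1: %r" % (name, row)
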
m = HMMFromMatrices(sigma, DiscreteDistribution(sigma), A, B, pi)
print "Initial HMM", m

# Generating sequences

# At this time GHMM still exposes some internals. For example, symbols from
# discrete alphabets are internally represented as 0, ..., |alphabet|-1 and the
# alphabet is responsible for translating between internal and external
# representations.
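
# As a quick illustration of that split, the alphabet object translates both
# ways: the external symbol 6 maps to an internal index and back. This snippet
# is an addition to the tutorial and assumes the alphabet offers internal() as
# the counterpart of the external() method used below; this may differ between
# GHMM versions.
internal_six = sigma.internal(6)
print "external 6 -> internal", internal_six
print "internal", internal_six, "-> external", sigma.external(internal_six)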

# First we sample from the model, specifying the length of the sequence, and
# then transfer the observations to the external representation.

obs_seq = m.sampleSingle(20)
print "Observation sequence : ", obs_seq
obs = map(sigma.external, obs_seq)
print "Observations : ", obs

# Learning from data

# In a real application we would want to train the model. That is, we would
# like to estimate the parameters of the model from data with the goal of
# maximizing the likelihood of that data given the model. This is done for HMMs
# with the Baum-Welch algorithm, which is actually an instance of the well-known
# Expectation-Maximization procedure for missing-data problems in statistics.
# The process is iterative and hence sometimes called re-estimation.

# We can use m as a starting point and use some 'real' data to train it. The
# variable train_seq is an EmissionSequence or SequenceSet object.

from UnfairCasino import train_seq
m.baumWelch(train_seq)
print "Trained using baumWelch and 'train_seq'"
print m
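
# To see how well the trained model fits the training data, we can ask for the
# log-likelihood of train_seq under the trained model. This check is an
# addition to the tutorial; it assumes the loglikelihood() method of GHMM's HMM
# objects. Calling it before baumWelch as well would let you measure the
# improvement gained by training.
print "log P(train_seq | trained model) =", m.loglikelihood(train_seq)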

# If you want more fine-grained control over the learning procedure, you can do
# single steps and monitor the relevant diagnostics yourself, or employ
# meta-heuristics such as noise injection to avoid getting stuck in local
# maxima (currently for continuous emissions only).
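
# A rough sketch of such step-wise training, added here for illustration: run
# Baum-Welch one iteration at a time and watch the log-likelihood after each
# step. It assumes your GHMM build accepts the optional nrSteps and
# loglikelihoodCutoff arguments to baumWelch(); if it does not, the plain call
# above is all you need, and the fallback below simply skips this.
try:
    for step in range(3):
        m.baumWelch(train_seq, 1, 0.0)  # assumed signature: one re-estimation step
        print "log-likelihood after step", step + 1, ":", m.loglikelihood(train_seq)
except TypeError:
    # Bindings without the optional arguments end up here; nothing else to do.
    pass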

# Computing a Viterbi-path

# A Viterbi-path is a sequence of states Q maximizing the probability
# P[Q|observations, model]. If we compute a Viterbi-path for an unfair casino
# sequence of throws, the state sequence tells us when the casino was fair
# (state 0) and when it was unfair (state 1).

from UnfairCasino import test_seq
v = m.viterbi(test_seq)
print "fairness of test_seq: ", v

# You can investigate how successful your model is at detecting cheating. Just
# create artificial observations and see if you can detect them.

my_seq = EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print "artificial sequence : ", my_seq
print "artificial observation : ", m.viterbi(my_seq)
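
# To make the detection easier to read, we can unpack the Viterbi result and
# count how many throws were attributed to the loaded state. This is an extra
# illustration, assuming viterbi() returns a (state path, log probability)
# pair for a single EmissionSequence.
path, logp = m.viterbi(my_seq)
print "throws attributed to the loaded state:", path.count(1), "of", len(path)
print "log probability of that path:", logp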

print '''
#
# XXX: Some error on closing data, safe to ignore as we are finished anyway
#
'''

# EOF