Context Navigation

← Previous Change
Next Change →

Changeset 64 for liacs

Timestamp:

Dec 27, 2009, 7:57:56 PM (15 years ago)

Author:

Rick van der Zwet

Message:

Sample GHMM, still needs statistics and verifying

Location:

liacs/dbdm/dbdm_4

Files:

: 2 added
: 2 edited

Legend:

: Unmodified
: Added
: Removed

liacs/dbdm/dbdm_4/ecoli_hmm.py

-              r63
+              r64
 # http://ghmm.sourceforge.net/
 import ghmm
+from MultiReplace  import MultiReplace
 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distinction
 …
+    }
+DEBUG = True
+dna_ascii_translate = {
+    '0' : '*',
+    '1' : '<',
+    '2' : '<',
+    '3' : '<',
+    '4' : '-',
+    '5' : '>',
+    '6' : '>',
+    '7' : '>',
+    }
+dna_flip = string.maketrans('atgc','tacg')
+def get_codon(seq, position, order):
+    codon = seq[position:position+3]
+    if not order:
+        # When living on the other side of the string make sure to flip
+        # around before comparison
+        codon = codon[::-1].translate(dna_flip)
+    return(codon)
+dna_ascii = MultiReplace(dna_ascii_translate)
+def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''):
+    """ Pretty printing of output for verification purposes """
+    for i in range(0,len(v[0]),length):
+        seq = []
+        ans = []
+        result = []
+        for j in range(0,length,parts):
+            t = i + j
+            seq.append(test_seq[t:t+parts])
+            ans.append(ans_seq[t:t+parts])
+            result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts]))))
+        print seperator.join(seq)
+        print seperator.join(ans)
+        print seperator.join(result)
+        print ''
+    print "fairness of test_seq: ", v[1]
 …
     # XXX: Proper values, based on statistics
     # The transition matrix A is chosen such that it reflects the statistics
+    # Part one will try to get us into a gene, part two will tell us when to
+    # get out of it
+    A = [[0.6,0.4], [0.3, 0.7]]
+    # Probabilities of moving from one state to another
+    # 0) Outer-gene  : will try to get us into a gene
+    # 1) Start-codon : beginning of gene - part 1
+    # 2) Start-codon : beginning of gene - part 2
+    # 3) Start-codon : beginning of gene - part 3
+    # 4) Inside-gene : in the gene
+    # 5) Stop-codon  : end of gene - part 1
+    # 6) Stop-codon  : end of gene - part 2
+    # 7) Stop-codon  : end of gene - part 3
+    A = [
+            [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+            [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        ]
+    # The emission probabilities matrix is modeled after the statistics with a
+    # total of 1
+    etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
+    eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
+    B = [etogene,eingene]
+    # XXX: Proper values, based on statistics
+    # The emission probabilities matrix is modeled after the statistics
+    # (['a', 'c', 'g', 't', 'n' ]
+    B = [
+            # e.g. state 0 -> emission probability
+            [0.2, 0.2, 0.2, 0.2, 0.2] ,
+            [0.9, 0.0, 0.1, 0.0, 0.0] ,
+            [0.0, 0.0, 0.0, 1.0, 0.0] ,
+            [0.0, 0.0, 1.0, 0.0, 0.0] ,
+            [0.2, 0.2, 0.2, 0.2, 0.2] ,
+            [0.0, 0.0, 0.0, 1.0, 0.0] ,
+            [0.7, 0.0, 0.3, 0.0, 0.0] ,
+            [0.7, 0.0, 0.3, 0.0, 0.0] ,
+        ]
     # Initial distribution favors outside
     pi  = [0.9, 0.1]
+    pi  = [0.9] + [0.1/7] * 7
     m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
+    print "Initial HMM", m
+    print "Initial HMM"
+    print m
     obs_seq = m.sampleSingle(20)
 …
     print "Observations         : ", ''.join(obs)
+    # Start codons
+    # atg, (small chance) gtg
+    # Stop codons
+    # taa, tga
+    # tag
+    # lend = Left End
+    # rend = Right End
+    answer = {}
+    handle = open('AE005174v2-2-gene.raw', 'rU')
+    answer['AE005174v2-2'] = handle.read()
+    handle.close()
+    # Training and testing
+    # A -- T, G -- C
+    start_codons = ['atg', 'gtg']
+    stop_codons = ['taa', 'tga', 'tag']
+    edl_file = 'data/edl_genes.csv'
+    contig_seq = {}
+    handle = open('AE005174v2-2.raw', 'rU')
+    contig_seq['AE005174v2-2'] = handle.read()
+    handle.close()
+    # Counter limit of how many 'hard' errors are allowed before bailing out
+    stop_limit = 100
+    handle = open('AE005174v2-1.raw', 'rU')
+    contig_seq['AE005174v2-1'] = handle.read()
+    handle.close()
+    # Current shifting offset
+    base_shift = 0
+    stop_counter = 0
+    # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
+    contig_seq = {}
+    reader = csv.reader(open(edl_file,"rU"))
+    reader.next()
+    try:
+        for row in reader:
+            finished = False
+            while not finished:
+                (featureType, zNumber, contig, lend, rend, orientation,
+                segmentType, oIslandNumber, geneName, note, function, product,
+                translationNotes) = row
+                lend = int(lend)
+                rend = int(rend)
+                if not contig_seq.has_key(contig):
+                    # Load data
+                    try:
+                        handle = open(contig + ".raw","rU")
+                        contig_seq[contig] = handle.read()
+                        handle.close()
+                    except IOError:
+                        print "Unable to open '%s'" % (sys.argv[1])
+                        sys.exit(64)
+                seq = contig_seq[contig]
+                # Make a forward orientation mark as positive
+                if orientation == '>':
+                    order = True
+                else:
+                    order = False
+                # Living in the world upside down, so order is off
+                if order:
+                    start_pos = base_shift + lend - 1
+                    stop_pos = base_shift + rend - 3
+                else:
+                    start_pos = base_shift + rend -3
+                    stop_pos = base_shift + lend - 1
+                start_codon = get_codon(seq,start_pos,order)
+                stop_codon = get_codon(seq,stop_pos,order)
+                start_match = start_codon in start_codons
+                stop_match = stop_codon in stop_codons
+                print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
+                print "%s, %s" % (start_match, stop_match)
+                if not start_match or not stop_match:
+                    print "### Function (comment):", function
+                # Check for fucked up offsets, shifts
+                start_shift = []
+                stop_shift = []
+                # Technically speaking the scope should be only the reading frames,
+                # but what the heck, looking is 'free'; make sure to include the original frames as well
+                search_range = abs(base_shift) + 3
+                if not start_match:
+                    # Somewhere else in the reading frame?
+                    matches = []
+                    for r in range(-search_range,search_range+1):
+                        l = start_pos + r
+                        t = get_codon(seq,l,order)
+                        new_match = t in start_codons
+                        if new_match:
+                            matches.append("%i:%s" % (r, t))
+                            start_shift.append(r)
+                    if start_shift:
+                        print "# Start codon reading frame matches: ", ",".join(matches)
+                if not stop_match:
+                    # Somewhere else in the reading frame?
+                    matches = []
+                    for r in range(-search_range,search_range+1):
+                        l = stop_pos + r
+                        t = get_codon(seq,l,order)
+                        new_match = t in stop_codons
+                        if new_match:
+                            stop_shift.append(r)
+                            matches.append("%i:%s" % (r, t))
+                    if stop_shift:
+                        print "# Stop codon reading frame matches: ", ",".join(matches)
+    test_seq = contig_seq['AE005174v2-2'][0:490]
+    ans_seq = answer['AE005174v2-2'][0:490]
+    test_eseq=ghmm.EmissionSequence(sigma,list(test_seq))
+    v = m.viterbi(test_eseq)
+    pretty_print(test_seq, ans_seq, v)
+                # Both wrong is something screwy in the data, check offset fix
+                if not start_match and not stop_match:
+                    common_shift = list(set(start_shift) & set(stop_shift))
+                    if common_shift:
+                        print "# Matching shifts: " + ",".join(map(str,common_shift))
+                        # Get the value closest to 0 for shifting purposes
+                        # Currently assume no negative shifts
+                        base_shift += min(common_shift)
+                        finished = False
+                        continue
+                    # Exercise left to the reader
+                    if (stop_counter > stop_limit):
+                        return
+                    else:
+                        stop_counter += 1
+                        finished = True
+                else:
+                    stop_counter = 0
+                    finished = True
+    # Train sequence
+    print "Training baumWelch"
+    train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1']))
+    v = m.baumWelch(train_seq)
+    print m
+    except csv.Error, e:
+        sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
+    test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
+    v = m.viterbi(test_seq)
+    #print "fairness of test_seq: ", v
+    print "Results after training sequence..."
+    v = m.viterbi(test_eseq)
+    pretty_print(test_seq, ans_seq, v)
-    # Validation
-    val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
-    v = m.viterbi(val_seq)
-    #print "Test sequence: ", v
     # XXX: Results
 if __name__ == "__main__":
     ecoli_hmm()

liacs/dbdm/dbdm_4/parse_fasta.py

-              r63
+              r64
 from reading_frames import reading_frames
+# The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
+# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
+# case we map them  all to unknown
 fasta_translate = {
     'r' : 'ga', # purine
     'y' : 'tc', # pyrimide
     'k' : 'gt', # keto
     'm' : 'ac', # amino
     's' : 'gc', # strong
     'w' : 'at', # weak
     'b' : 'gtc',
     'd' : 'gat',
     'h' : 'act',
     'v' : 'gca',
+    'r' : 'n', # purine
+    'y' : 'n', # pyrimide
+    'k' : 'n', # keto
+    'm' : 'n', # amino
+    's' : 'n', # strong
+    'w' : 'n', # weak
+    'b' : 'n',
+    'd' : 'n',
+    'h' : 'n',
+    'v' : 'n',
+    }
 …
 seq2 = parse_file(file2)
 # Wrong assumption, replace is not possible as the real value is not known yet
 # seq1 = fasta.replace(seq1)
 # seq2 = fasta.replace(seq2)
+# Simplify answers
+seq1 = fasta.replace(seq1)
+seq2 = fasta.replace(seq2)
 # Find overlap

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 64 for liacs

Legend:

liacs/dbdm/dbdm_4/ecoli_hmm.py

liacs/dbdm/dbdm_4/parse_fasta.py

Download in other formats: