Context Navigation

← Previous Change
Next Change →

ecoli_hmm.py

Timestamp:

Dec 27, 2009, 1:42:15 PM (15 years ago)

Author:

Rick van der Zwet

Message:

Finally got all the matching right

File:

: 1 edited

liacs/dbdm/dbdm_4/ecoli_hmm.py (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

liacs/dbdm/dbdm_4/ecoli_hmm.py

-              r62
+              r63
 # http://nl.wikipedia.org/wiki/Genetische_code
-import ghmm
 import sys
 import csv
 import string
+# http://ghmm.sourceforge.net/
+import ghmm
+# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distinction
+fasta_translate = {
+    'r' : 'ga', # purine
+    'y' : 'tc', # pyrimidine
+    'k' : 'gt', # keto
+    'm' : 'ac', # amino
+    's' : 'gc', # strong
+    'w' : 'at', # weak
+    'b' : 'gtc',
+    'd' : 'gat',
+    'h' : 'act',
+    'v' : 'gca',
+    }
+DEBUG = True
 dna_flip = string.maketrans('atgc','tacg')
 …
 def ecoli_hmm(seq):
+def ecoli_hmm():
     """Try to find genes inside a sequence using an HMM"""
     # Model 4 bases A C G T and unknown state N
 …
     # Counter limit of how many 'hard' errors are allowed before bailing out
     stop_limit = 5
+    stop_limit = 100
     # Current shifting offset
     base_shift = 0
     stop_counter = 0
+    # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
+    contig_seq = {}
     reader = csv.reader(open(edl_file,"rU"))
 …
                 lend = int(lend)
                 rend = int(rend)
+                if not contig_seq.has_key(contig):
+                    # Load data
+                    try:
+                        handle = open(contig + ".raw","rU")
+                        contig_seq[contig] = handle.read()
+                        handle.close()
+                    except IOError:
+                        print "Unable to open '%s'" % (sys.argv[1])
+                        sys.exit(64)
+                seq = contig_seq[contig]
                 # Make a forward orientation mark as positive
                 if orientation == '>':
 …
                 stop_match = stop_codon in stop_codons
                 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation),
+                print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
                 print "%s, %s" % (start_match, stop_match)
+                if not start_match or not stop_match:
+                    print "### Function (comment):", function
                 # Check for fucked up offsets, shifts
                 start_shift = None
                 stop_shift = None
+                start_shift = []
+                stop_shift = []
                 # Technically speaking the scope should be only the reading frames,
                 # but what the heck looking if 'free'
                 search_range = 10
+                # but what the heck looking if 'free', but make sure to include the original frames as well
+                search_range = abs(base_shift) + 3
                 if not start_match:
                     # Somewhere else in the reading frame?
+                    matches = []
                     for r in range(-search_range,search_range+1):
                         l = start_pos + r
 …
                         new_match = t in start_codons
                         if new_match:
+                            start_shift = r
+                            print "# Start codon reading frame match - shift %i: " % r,
+                            print "%i, %s: %s" % (l,t,new_match)
+                            matches.append("%i:%s" % (r, t))
+                            start_shift.append(r)
+                    if start_shift:
+                        print "# Start codon reading frame matches: ", ",".join(matches)
                 if not stop_match:
                     # Somewhere else in the reading frame?
+                    matches = []
                     for r in range(-search_range,search_range+1):
                         l = stop_pos + r
 …
                         new_match = t in stop_codons
                         if new_match:
+                            stop_shift = r
+                            print "# Stop codon reading frame match - shift %i: " % r,
+                            print "%i, %s: %s" % (l,t,new_match)
+                            stop_shift.append(r)
+                            matches.append("%i:%s" % (r, t))
+                    if stop_shift:
+                        print "# Stop codon reading frame matches: ", ",".join(matches)
                 # Both wrong is something screwy in the data, check offset fix
                 if not start_match and not stop_match:
+                    if stop_shift != None and start_shift == stop_shift:
+                        print "# Matching shift %i" % start_shift
+                        base_shift += start_shift
+                    common_shift = list(set(start_shift) & set(stop_shift))
+                    if common_shift:
+                        print "# Matching shifts: " + ",".join(map(str,common_shift))
+                        # Get the value closest to 0 for shifting purposes
+                        # Currently assume no negative shifts
+                        base_shift += min(common_shift)
                         finished = False
                         continue
 …
 if __name__ == "__main__":
+    # Load data
+    try:
+        handle = open(sys.argv[1],"rU")
+    except IndexError:
+        print "Usage %s <data_file>" % (sys.argv[0])
+        sys.exit(64)
+    except IOError:
+        print "Unable to open '%s'" % (sys.argv[1])
+        sys.exit(64)
+    seq = handle.read()
+    handle.close()
+    ecoli_hmm(seq)
+    ecoli_hmm()

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 63 for liacs/dbdm/dbdm_4/ecoli_hmm.py

Legend:

liacs/dbdm/dbdm_4/ecoli_hmm.py

Download in other formats: