Changeset 63 for liacs/dbdm/dbdm_4/ecoli_hmm.py
- Timestamp:
- Dec 27, 2009, 1:42:15 PM (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
liacs/dbdm/dbdm_4/ecoli_hmm.py
r62 r63 6 6 # http://nl.wikipedia.org/wiki/Genetische_code 7 7 8 import ghmm9 8 import sys 10 9 import csv 11 10 import string 11 12 # http://ghmm.sourceforge.net/ 13 import ghmm 14 15 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion 16 fasta_translate = { 17 'r' : 'ga', # purine 18 'y' : 'tc', # pyrimide 19 'k' : 'gt', # keto 20 'm' : 'ac', # amino 21 's' : 'gc', # strong 22 'w' : 'at', # weak 23 'b' : 'gtc', 24 'd' : 'gat', 25 'h' : 'act', 26 'v' : 'gca', 27 } 28 29 DEBUG = True 12 30 13 31 dna_flip = string.maketrans('atgc','tacg') … … 22 40 23 41 24 def ecoli_hmm( seq):42 def ecoli_hmm(): 25 43 """Try to find genes inside e sequence using a HMM""" 26 44 # Model 4 bases A C G T and unknown state N … … 67 85 68 86 # Counter limit of how many 'hard' errors are allowed before bailing out 69 stop_limit = 587 stop_limit = 100 70 88 71 89 # Current shifting offset 72 90 base_shift = 0 73 91 stop_counter = 0 92 93 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser 94 contig_seq = {} 74 95 75 96 reader = csv.reader(open(edl_file,"rU")) … … 84 105 lend = int(lend) 85 106 rend = int(rend) 107 if not contig_seq.has_key(contig): 108 # Load data 109 try: 110 handle = open(contig + ".raw","rU") 111 contig_seq[contig] = handle.read() 112 handle.close() 113 except IOError: 114 print "Unable to open '%s'" % (sys.argv[1]) 115 sys.exit(64) 116 seq = contig_seq[contig] 117 86 118 # Make a forward orientation mark as positive 87 119 if orientation == '>': … … 103 135 stop_match = stop_codon in stop_codons 104 136 105 print "%6s, %s, %s, %s, %s, %s, " % (geneName,lend,rend, start_codon, stop_codon, orientation),137 print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift), 106 138 print "%s, %s" % (start_match, stop_match) 139 if not start_match or not stop_match: 140 print "### Function (comment):", function 107 141 108 142 # Check for fucked up offsets, shifts 109 start_shift = None110 stop_shift = None143 start_shift = [] 144 stop_shift = [] 111 145 # Technically speaking should the cope only the reading frames, 112 # but what the heck looking if 'free' 113 search_range = 10146 # but what the heck looking if 'free', but make sure to include the original frames as well 147 search_range = abs(base_shift) + 3 114 148 if not start_match: 115 149 # Somewhere else in the reading frame? 150 matches = [] 116 151 for r in range(-search_range,search_range+1): 117 152 l = start_pos + r … … 119 154 new_match = t in start_codons 120 155 if new_match: 121 start_shift = r 122 print "# Start codon reading frame match - shift %i: " % r, 123 print "%i, %s: %s" % (l,t,new_match) 156 matches.append("%i:%s" % (r, t)) 157 start_shift.append(r) 158 if start_shift: 159 print "# Start codon reading frame matches: ", ",".join(matches) 124 160 if not stop_match: 125 161 # Somewhere else in the reading frame? 162 matches = [] 126 163 for r in range(-search_range,search_range+1): 127 164 l = stop_pos + r … … 129 166 new_match = t in stop_codons 130 167 if new_match: 131 stop_shift = r 132 print "# Stop codon reading frame match - shift %i: " % r, 133 print "%i, %s: %s" % (l,t,new_match) 168 stop_shift.append(r) 169 matches.append("%i:%s" % (r, t)) 170 if stop_shift: 171 print "# Stop codon reading frame matches: ", ",".join(matches) 134 172 135 173 136 174 # Both wrong is something screwy in the data, check offset fix 137 175 if not start_match and not stop_match: 138 if stop_shift != None and start_shift == stop_shift: 139 print "# Matching shift %i" % start_shift 140 base_shift += start_shift 176 common_shift = list(set(start_shift) & set(stop_shift)) 177 if common_shift: 178 print "# Matching shifts: " + ",".join(map(str,common_shift)) 179 # Get the value closest to 0 for shifting purposes 180 # Currently asume no negative shifts 181 base_shift += min(common_shift) 141 182 finished = False 142 183 continue … … 171 212 172 213 if __name__ == "__main__": 173 # Load data 174 try: 175 handle = open(sys.argv[1],"rU") 176 except IndexError: 177 print "Usage %s <data_file>" % (sys.argv[0]) 178 sys.exit(64) 179 except IOError: 180 print "Unable to open '%s'" % (sys.argv[1]) 181 sys.exit(64) 182 183 seq = handle.read() 184 handle.close() 185 ecoli_hmm(seq) 214 ecoli_hmm()
Note:
See TracChangeset
for help on using the changeset viewer.