Changeset 64 for liacs/dbdm/dbdm_4
- Timestamp:
- Dec 27, 2009, 7:57:56 PM (15 years ago)
- Location:
- liacs/dbdm/dbdm_4
- Files:
-
- 2 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
liacs/dbdm/dbdm_4/ecoli_hmm.py
r63 r64 12 12 # http://ghmm.sourceforge.net/ 13 13 import ghmm 14 15 from MultiReplace import MultiReplace 14 16 15 17 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion … … 27 29 } 28 30 29 DEBUG = True 31 dna_ascii_translate = { 32 '0' : '*', 33 '1' : '<', 34 '2' : '<', 35 '3' : '<', 36 '4' : '-', 37 '5' : '>', 38 '6' : '>', 39 '7' : '>', 40 } 30 41 31 dna_flip = string.maketrans('atgc','tacg') 32 def get_codon(seq, position, order): 33 codon = seq[position:position+3] 34 if not order: 35 # When living on the other side of the string make sure to flip 36 # around before comparison 37 codon = codon[::-1].translate(dna_flip) 38 return(codon) 42 dna_ascii = MultiReplace(dna_ascii_translate) 39 43 44 def pretty_print(test_seq, ans_seq, v, length=70, parts=10, seperator=''): 45 """ Pretty printing of output for verification purposes """ 46 for i in range(0,len(v[0]),length): 47 seq = [] 48 ans = [] 49 result = [] 50 for j in range(0,length,parts): 51 t = i + j 52 seq.append(test_seq[t:t+parts]) 53 ans.append(ans_seq[t:t+parts]) 54 result.append(dna_ascii.replace(''.join(map(str,v[0][t:t+parts])))) 55 56 print seperator.join(seq) 57 print seperator.join(ans) 58 print seperator.join(result) 59 print '' 60 print "fairness of test_seq: ", v[1] 40 61 41 62 … … 48 69 # XXX: Proper values, based of statistics 49 70 # The transition matrix A is chosen such that it reflects the statistics 50 # Part one will try to get us into a gene, part two will tell us when to 51 # get out of it 52 A = [[0.6,0.4], [0.3, 0.7]] 71 # Probalities from moving from one state to an other 72 # 0) Outer-gene : will try to get us into a gene 73 # 1) Start-codon : beginning of gene - part 1 74 # 2) Start-codon : beginning of gene - part 2 75 # 3) Start-codon : beginning of gene - part 3 76 # 4) Inside-gene : in the gene 77 # 5) Stop-codon : end of gene - part 1 78 # 6) Stop-codon : end of gene - part 2 79 # 7) Stop-codon : end of gene - part 3 80 A = [ 81 [0.8, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 82 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], 83 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], 84 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], 85 [0.0, 0.0, 0.0, 0.0, 0.7, 0.3, 0.0, 0.0], 86 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 87 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], 88 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 89 ] 53 90 54 # The emission probabilities matrix is modeled after the statistics with a 55 # total of 1 56 etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ] 57 eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ] 58 59 B = [etogene,eingene] 91 # XXX: Proper values, based of statistics 92 # The emission probabilities matrix is modeled after the statistics 93 # (['a', 'c', 'g', 't', 'n' ] 94 B = [ 95 # e.g. state 0 -> emission probability 96 [0.2, 0.2, 0.2, 0.2, 0.2] , 97 [0.9, 0.0, 0.1, 0.0, 0.0] , 98 [0.0, 0.0, 0.0, 1.0, 0.0] , 99 [0.0, 0.0, 1.0, 0.0, 0.0] , 100 [0.2, 0.2, 0.2, 0.2, 0.2] , 101 [0.0, 0.0, 0.0, 1.0, 0.0] , 102 [0.7, 0.0, 0.3, 0.0, 0.0] , 103 [0.7, 0.0, 0.3, 0.0, 0.0] , 104 ] 60 105 61 106 # Initial distribution favors outside 62 pi = [0.9 , 0.1]107 pi = [0.9] + [0.1/7] * 7 63 108 64 109 m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi) 65 print "Initial HMM", m 110 print "Initial HMM" 111 print m 66 112 67 113 obs_seq = m.sampleSingle(20) … … 70 116 print "Observations : ", ''.join(obs) 71 117 72 # Start codons 73 # atg, (small chance) gtg 74 # Stop codons 75 # taa, tga 76 # tag 77 # lend = Left End 78 # rend = Right End 118 answer = {} 119 handle = open('AE005174v2-2-gene.raw', 'rU') 120 answer['AE005174v2-2'] = handle.read() 121 handle.close() 79 122 80 # Training and testing 81 # A -- T, G -- C 82 start_codons = ['atg', 'gtg'] 83 stop_codons = ['taa', 'tga', 'tag'] 84 edl_file = 'data/edl_genes.csv' 123 contig_seq = {} 124 handle = open('AE005174v2-2.raw', 'rU') 125 contig_seq['AE005174v2-2'] = handle.read() 126 handle.close() 85 127 86 # Counter limit of how many 'hard' errors are allowed before bailing out 87 stop_limit = 100 128 handle = open('AE005174v2-1.raw', 'rU') 129 contig_seq['AE005174v2-1'] = handle.read() 130 handle.close() 88 131 89 # Current shifting offset 90 base_shift = 0 91 stop_counter = 0 92 93 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser 94 contig_seq = {} 95 96 reader = csv.reader(open(edl_file,"rU")) 97 reader.next() 98 try: 99 for row in reader: 100 finished = False 101 while not finished: 102 (featureType, zNumber, contig, lend, rend, orientation, 103 segmentType, oIslandNumber, geneName, note, function, product, 104 translationNotes) = row 105 lend = int(lend) 106 rend = int(rend) 107 if not contig_seq.has_key(contig): 108 # Load data 109 try: 110 handle = open(contig + ".raw","rU") 111 contig_seq[contig] = handle.read() 112 handle.close() 113 except IOError: 114 print "Unable to open '%s'" % (sys.argv[1]) 115 sys.exit(64) 116 seq = contig_seq[contig] 117 118 # Make a forward orientation mark as positive 119 if orientation == '>': 120 order = True 121 else: 122 order = False 123 124 # Living the world upside down to order is off 125 if order: 126 start_pos = base_shift + lend - 1 127 stop_pos = base_shift + rend - 3 128 else: 129 start_pos = base_shift + rend -3 130 stop_pos = base_shift + lend - 1 131 132 start_codon = get_codon(seq,start_pos,order) 133 stop_codon = get_codon(seq,stop_pos,order) 134 start_match = start_codon in start_codons 135 stop_match = stop_codon in stop_codons 136 137 print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift), 138 print "%s, %s" % (start_match, stop_match) 139 if not start_match or not stop_match: 140 print "### Function (comment):", function 141 142 # Check for fucked up offsets, shifts 143 start_shift = [] 144 stop_shift = [] 145 # Technically speaking should the cope only the reading frames, 146 # but what the heck looking if 'free', but make sure to include the original frames as well 147 search_range = abs(base_shift) + 3 148 if not start_match: 149 # Somewhere else in the reading frame? 150 matches = [] 151 for r in range(-search_range,search_range+1): 152 l = start_pos + r 153 t = get_codon(seq,l,order) 154 new_match = t in start_codons 155 if new_match: 156 matches.append("%i:%s" % (r, t)) 157 start_shift.append(r) 158 if start_shift: 159 print "# Start codon reading frame matches: ", ",".join(matches) 160 if not stop_match: 161 # Somewhere else in the reading frame? 162 matches = [] 163 for r in range(-search_range,search_range+1): 164 l = stop_pos + r 165 t = get_codon(seq,l,order) 166 new_match = t in stop_codons 167 if new_match: 168 stop_shift.append(r) 169 matches.append("%i:%s" % (r, t)) 170 if stop_shift: 171 print "# Stop codon reading frame matches: ", ",".join(matches) 132 test_seq = contig_seq['AE005174v2-2'][0:490] 133 ans_seq = answer['AE005174v2-2'][0:490] 134 test_eseq=ghmm.EmissionSequence(sigma,list(test_seq)) 135 v = m.viterbi(test_eseq) 136 pretty_print(test_seq, ans_seq, v) 172 137 173 138 174 # Both wrong is something screwy in the data, check offset fix 175 if not start_match and not stop_match: 176 common_shift = list(set(start_shift) & set(stop_shift)) 177 if common_shift: 178 print "# Matching shifts: " + ",".join(map(str,common_shift)) 179 # Get the value closest to 0 for shifting purposes 180 # Currently asume no negative shifts 181 base_shift += min(common_shift) 182 finished = False 183 continue 184 # Exercise left to the reader 185 if (stop_counter > stop_limit): 186 return 187 else: 188 stop_counter += 1 189 finished = True 190 else: 191 stop_counter = 0 192 finished = True 139 # Train sequence 140 print "Training baumWelch" 141 train_seq = ghmm.EmissionSequence(sigma,list(contig_seq['AE005174v2-1'])) 142 v = m.baumWelch(train_seq) 143 print m 144 193 145 194 except csv.Error, e: 195 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e)) 196 197 198 199 test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)])) 200 v = m.viterbi(test_seq) 201 #print "fairness of test_seq: ", v 202 146 print "Results after training sequence..." 147 v = m.viterbi(test_eseq) 148 pretty_print(test_seq, ans_seq, v) 203 149 204 # Validation205 val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))206 v = m.viterbi(val_seq)207 #print "Test sequence: ", v208 150 209 151 # XXX: Results 210 152 211 212 213 153 if __name__ == "__main__": 214 154 ecoli_hmm() -
liacs/dbdm/dbdm_4/parse_fasta.py
r63 r64 11 11 from reading_frames import reading_frames 12 12 13 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a' 13 # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our 14 # case we map them all to unknown 15 14 16 fasta_translate = { 15 'r' : ' ga', # purine16 'y' : ' tc', # pyrimide17 'k' : ' gt', # keto18 'm' : ' ac', # amino19 's' : ' gc', # strong20 'w' : ' at', # weak21 'b' : ' gtc',22 'd' : ' gat',23 'h' : ' act',24 'v' : ' gca',17 'r' : 'n', # purine 18 'y' : 'n', # pyrimide 19 'k' : 'n', # keto 20 'm' : 'n', # amino 21 's' : 'n', # strong 22 'w' : 'n', # weak 23 'b' : 'n', 24 'd' : 'n', 25 'h' : 'n', 26 'v' : 'n', 25 27 } 26 28 … … 83 85 seq2 = parse_file(file2) 84 86 85 # Wrong assumption, replace is not possible as the real value is not know yet86 #seq1 = fasta.replace(seq1)87 #seq2 = fasta.replace(seq2)87 # Simplify answers 88 seq1 = fasta.replace(seq1) 89 seq2 = fasta.replace(seq2) 88 90 89 91 # Find overlap
Note:
See TracChangeset
for help on using the changeset viewer.