- Timestamp:
- Dec 23, 2009, 2:35:48 PM (15 years ago)
- Location:
- liacs/dbdm/dbdm_4
- Files:
-
- 2 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
liacs/dbdm/dbdm_4/ecoli_hmm.py
r61 r62 10 10 import csv 11 11 import string 12 13 dna_flip = string.maketrans('atgc','tacg') 14 def get_codon(seq, position, order): 15 codon = seq[position:position+3] 16 if not order: 17 # When living on the other side of the string make sure to flip 18 # around before comparison 19 codon = codon[::-1].translate(dna_flip) 20 return(codon) 21 12 22 13 23 … … 50 60 # rend = Right End 51 61 52 # Training 62 # Training and testing 53 63 # A -- T, G -- C 54 start_condons = ['atg', 'gtg'] 55 stop_condons = ['taa', 'tga', 'tag'] 56 dna_flip = string.maketrans('atgc','tacg') 64 start_codons = ['atg', 'gtg'] 65 stop_codons = ['taa', 'tga', 'tag'] 57 66 edl_file = 'data/edl_genes.csv' 67 68 # Counter limit of how many 'hard' errors are allowed before bailing out 69 stop_limit = 5 70 71 # Current shifting offset 72 base_shift = 0 73 stop_counter = 0 74 58 75 reader = csv.reader(open(edl_file,"rU")) 59 76 reader.next() 60 77 try: 61 78 for row in reader: 62 (featureType, zNumber, contig, lend, rend, orientation, 63 segmentType, oIslandNumber, geneName, note, function, product, 64 translationNotes) = row 65 lend = int(lend) - 1 66 rend = int(rend) 67 # Make a forward orientation as positive 68 if orientation == '>': 69 order = True 70 else: 71 order = False 79 finished = False 80 while not finished: 81 (featureType, zNumber, contig, lend, rend, orientation, 82 segmentType, oIslandNumber, geneName, note, function, product, 83 translationNotes) = row 84 lend = int(lend) 85 rend = int(rend) 86 # Make a forward orientation mark as positive 87 if orientation == '>': 88 order = True 89 else: 90 order = False 72 91 73 if order: 74 start_codon = seq[lend:lend+3] 75 stop_codon = seq[rend-3:rend] 76 else: 77 start_codon = seq[rend-3:rend].translate(dna_flip)[::-1] 78 stop_codon = seq[lend:lend+3].translate(dna_flip)[::-1] 92 # Living the world upside down to order is off 93 if order: 94 start_pos = base_shift + lend - 1 95 stop_pos = base_shift + rend - 3 96 else: 97 start_pos = base_shift + rend -3 98 stop_pos = base_shift + lend - 1 99 100 start_codon = get_codon(seq,start_pos,order) 101 stop_codon = get_codon(seq,stop_pos,order) 102 start_match = start_codon in start_codons 103 stop_match = stop_codon in stop_codons 79 104 80 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation), 105 print "%6s, %s, %s, %s, %s, %s," % (geneName,lend,rend, start_codon, stop_codon, orientation), 106 print "%s, %s" % (start_match, stop_match) 107 108 # Check for fucked up offsets, shifts 109 start_shift = None 110 stop_shift = None 111 # Technically speaking should the cope only the reading frames, 112 # but what the heck looking if 'free' 113 search_range = 10 114 if not start_match: 115 # Somewhere else in the reading frame? 116 for r in range(-search_range,search_range+1): 117 l = start_pos + r 118 t = get_codon(seq,l,order) 119 new_match = t in start_codons 120 if new_match: 121 start_shift = r 122 print "# Start codon reading frame match - shift %i: " % r, 123 print "%i, %s: %s" % (l,t,new_match) 124 if not stop_match: 125 # Somewhere else in the reading frame? 126 for r in range(-search_range,search_range+1): 127 l = stop_pos + r 128 t = get_codon(seq,l,order) 129 new_match = t in stop_codons 130 if new_match: 131 stop_shift = r 132 print "# Stop codon reading frame match - shift %i: " % r, 133 print "%i, %s: %s" % (l,t,new_match) 81 134 82 print "%s, %s" % (start_codon in start_condons, stop_codon in stop_condons) 83 if not start_codon in start_condons or not stop_codon in stop_condons: 84 return 135 136 # Both wrong is something screwy in the data, check offset fix 137 if not start_match and not stop_match: 138 if stop_shift != None and start_shift == stop_shift: 139 print "# Matching shift %i" % start_shift 140 base_shift += start_shift 141 finished = False 142 continue 143 # Exercise left to the reader 144 if (stop_counter > stop_limit): 145 return 146 else: 147 stop_counter += 1 148 finished = True 149 else: 150 stop_counter = 0 151 finished = True 152 85 153 except csv.Error, e: 86 154 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
Note:
See TracChangeset
for help on using the changeset viewer.