source: liacs/dbdm/dbdm_4/gene_detect.py@ 67

Last change on this file since 67 was 66, checked in by Rick van der Zwet, 15 years ago

Annotations and readablity fixes

  • Property svn:executable set to *
File size: 6.4 KB
RevLine 
[64]1#!/usr/bin/env python
2#
3# Rick van der Zwet
4import sys
5import csv
6import string
7
8from MultiReplace import MultiReplace
9
10DEBUG = False
11
12# gtg is small chance
13start_codons = ['atg', 'gtg']
14stop_codons = ['taa', 'tga', 'tag']
15
16dna_flip = string.maketrans('atgc','tacg')
17def get_codon(seq, position, order):
18 codon = seq[position:position+3]
19 if not order:
20 # When living on the other side of the string make sure to flip
21 # around before comparison
22 codon = codon[::-1].translate(dna_flip)
23 return(codon)
24
25def gene_detect():
26
27 # Training and testing
28 # A -- T, G -- C
29 # lend = Left End
30 # rend = Right End
31 edl_file = 'data/edl_genes.csv'
32
33 # Counter limit of how many 'hard' errors are allowed before bailing out
34 stop_limit = 100
35
36 # Current shifting offset
37 base_shift = 0
38 stop_counter = 0
39
40 # Sequences of contig, XXX: Make it a FASTA / BioPython Parser
41 contig_seq = {}
42 abstract_contig = {}
43
44 try:
45 reader = csv.reader(open(edl_file,"rU"))
46 reader.next()
47 for row in reader:
48 finished = False
49 while not finished:
50 (featureType, zNumber, contig, lend, rend, orientation,
51 segmentType, oIslandNumber, geneName, note, function, product,
52 translationNotes) = row
53 lend = int(lend)
54 rend = int(rend)
55 if not contig_seq.has_key(contig):
56 # Load data
57 try:
58 handle = open(contig + ".raw","rU")
59 contig_seq[contig] = handle.read()
[66]60 abstract_contig[contig] = list('.' * len(contig_seq[contig]))
[64]61 handle.close()
62 except IOError:
63 print "Unable to open '%s'" % (sys.argv[1])
64 sys.exit(64)
65 seq = contig_seq[contig]
66
67 # Make a forward orientation mark as positive
68 if orientation == '>':
69 order = True
70 else:
71 order = False
72
73 # Living the world upside down to order is off
74 if order:
75 start_pos = base_shift + lend - 1
76 stop_pos = base_shift + rend - 3
77 else:
78 start_pos = base_shift + rend -3
79 stop_pos = base_shift + lend - 1
80
81 start_codon = get_codon(seq,start_pos,order)
82 stop_codon = get_codon(seq,stop_pos,order)
83 start_match = start_codon in start_codons
84 stop_match = stop_codon in stop_codons
85
86 if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \
87 (geneName,lend,rend,start_pos, stop_pos, contig,
88 start_codon, stop_codon, orientation, base_shift),
89 if DEBUG: print "%s, %s" % (start_match, stop_match)
90
91 if start_match and stop_match:
92 # Used for training DNA sequences and overlaps
93 print "Gene found %s, %s" % (start_pos, stop_pos)
94 if start_pos < stop_pos:
95 abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3
[66]96 abstract_contig[contig][start_pos+3:stop_pos] = ['|'] * (stop_pos - (start_pos + 3))
[64]97 abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3
98 break
99
100 if not start_match or not stop_match:
101 if DEBUG: print "### Function (comment):", function
102
103 # Check for fucked up offsets, shifts
104 start_shift = []
105 stop_shift = []
106 # Technically speaking should the cope only the reading frames,
107 # but what the heck looking if 'free', but make sure to include
108 # the original frames as well
109 search_range = abs(base_shift) + 3
110 if not start_match:
111 # Somewhere else in the reading frame?
112 matches = []
113 for r in range(-search_range,search_range+1):
114 l = start_pos + r
115 t = get_codon(seq,l,order)
116 new_match = t in start_codons
117 if new_match:
118 matches.append("%i:%s" % (r, t))
119 start_shift.append(r)
120 if start_shift:
121 if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches)
122 if not stop_match:
123 # Somewhere else in the reading frame?
124 matches = []
125 for r in range(-search_range,search_range+1):
126 l = stop_pos + r
127 t = get_codon(seq,l,order)
128 new_match = t in stop_codons
129 if new_match:
130 stop_shift.append(r)
131 matches.append("%i:%s" % (r, t))
132 if stop_shift:
133 if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches)
134
135
136 # Both wrong is something screwy in the data, check offset fix
137 if not start_match and not stop_match:
138 common_shift = list(set(start_shift) & set(stop_shift))
139 if common_shift:
140 if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift))
141 # Get the value closest to 0 for shifting purposes
142 # Currently asume no negative shifts
143 base_shift += min(common_shift)
144 finished = False
145 continue
146 # Exercise left to the reader
147 if (stop_counter > stop_limit):
148 return
149 else:
150 stop_counter += 1
151 finished = True
152 else:
153 stop_counter = 0
154 finished = True
155
156 except csv.Error, e:
157 sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
158
159 for contig in abstract_contig.keys():
160 handle = open(contig + "-gene.raw", "w")
161 handle.write(''.join(abstract_contig[contig]))
162 handle.close()
163
164if __name__ == "__main__":
165 gene_detect()
Note: See TracBrowser for help on using the repository browser.