Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: liacs/dbdm/dbdm_4/gene_detect.py@ 65

Last change on this file since 65 was 64, checked in by Rick van der Zwet, 15 years ago
Sample GHMM, needs statistics and verifing still
Property svn:executable set to ``*
File size: 6.4 KB

Line
1	#!/usr/bin/env python
2	#
3	# Rick van der Zwet
4	import sys
5	import csv
6	import string
7
8	from MultiReplace import MultiReplace
9
10	DEBUG = False
11
12	# gtg is small chance
13	start_codons = ['atg', 'gtg']
14	stop_codons = ['taa', 'tga', 'tag']
15
16	dna_flip = string.maketrans('atgc','tacg')
17	def get_codon(seq, position, order):
18	codon = seq[position:position+3]
19	if not order:
20	# When living on the other side of the string make sure to flip
21	# around before comparison
22	codon = codon[::-1].translate(dna_flip)
23	return(codon)
24
25	def gene_detect():
26
27	# Training and testing
28	# A -- T, G -- C
29	# lend = Left End
30	# rend = Right End
31	edl_file = 'data/edl_genes.csv'
32
33	# Counter limit of how many 'hard' errors are allowed before bailing out
34	stop_limit = 100
35
36	# Current shifting offset
37	base_shift = 0
38	stop_counter = 0
39
40	# Sequences of contig, XXX: Make it a FASTA / BioPython Parser
41	contig_seq = {}
42	abstract_contig = {}
43
44	try:
45	reader = csv.reader(open(edl_file,"rU"))
46	reader.next()
47	for row in reader:
48	finished = False
49	while not finished:
50	(featureType, zNumber, contig, lend, rend, orientation,
51	segmentType, oIslandNumber, geneName, note, function, product,
52	translationNotes) = row
53	lend = int(lend)
54	rend = int(rend)
55	if not contig_seq.has_key(contig):
56	# Load data
57	try:
58	handle = open(contig + ".raw","rU")
59	contig_seq[contig] = handle.read()
60	abstract_contig[contig] = list('' len(contig_seq[contig]))
61	handle.close()
62	except IOError:
63	print "Unable to open '%s'" % (sys.argv[1])
64	sys.exit(64)
65	seq = contig_seq[contig]
66
67	# Make a forward orientation mark as positive
68	if orientation == '>':
69	order = True
70	else:
71	order = False
72
73	# Living the world upside down to order is off
74	if order:
75	start_pos = base_shift + lend - 1
76	stop_pos = base_shift + rend - 3
77	else:
78	start_pos = base_shift + rend -3
79	stop_pos = base_shift + lend - 1
80
81	start_codon = get_codon(seq,start_pos,order)
82	stop_codon = get_codon(seq,stop_pos,order)
83	start_match = start_codon in start_codons
84	stop_match = stop_codon in stop_codons
85
86	if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \
87	(geneName,lend,rend,start_pos, stop_pos, contig,
88	start_codon, stop_codon, orientation, base_shift),
89	if DEBUG: print "%s, %s" % (start_match, stop_match)
90
91	if start_match and stop_match:
92	# Used for training DNA sequences and overlaps
93	print "Gene found %s, %s" % (start_pos, stop_pos)
94	if start_pos < stop_pos:
95	abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3
96	abstract_contig[contig][start_pos+3:stop_pos] = ['-'] * (stop_pos - (start_pos + 3))
97	abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3
98	break
99
100	if not start_match or not stop_match:
101	if DEBUG: print "### Function (comment):", function
102
103	# Check for fucked up offsets, shifts
104	start_shift = []
105	stop_shift = []
106	# Technically speaking should the cope only the reading frames,
107	# but what the heck looking if 'free', but make sure to include
108	# the original frames as well
109	search_range = abs(base_shift) + 3
110	if not start_match:
111	# Somewhere else in the reading frame?
112	matches = []
113	for r in range(-search_range,search_range+1):
114	l = start_pos + r
115	t = get_codon(seq,l,order)
116	new_match = t in start_codons
117	if new_match:
118	matches.append("%i:%s" % (r, t))
119	start_shift.append(r)
120	if start_shift:
121	if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches)
122	if not stop_match:
123	# Somewhere else in the reading frame?
124	matches = []
125	for r in range(-search_range,search_range+1):
126	l = stop_pos + r
127	t = get_codon(seq,l,order)
128	new_match = t in stop_codons
129	if new_match:
130	stop_shift.append(r)
131	matches.append("%i:%s" % (r, t))
132	if stop_shift:
133	if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches)
134
135
136	# Both wrong is something screwy in the data, check offset fix
137	if not start_match and not stop_match:
138	common_shift = list(set(start_shift) & set(stop_shift))
139	if common_shift:
140	if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift))
141	# Get the value closest to 0 for shifting purposes
142	# Currently asume no negative shifts
143	base_shift += min(common_shift)
144	finished = False
145	continue
146	# Exercise left to the reader
147	if (stop_counter > stop_limit):
148	return
149	else:
150	stop_counter += 1
151	finished = True
152	else:
153	stop_counter = 0
154	finished = True
155
156	except csv.Error, e:
157	sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
158
159	for contig in abstract_contig.keys():
160	handle = open(contig + "-gene.raw", "w")
161	handle.write(''.join(abstract_contig[contig]))
162	handle.close()
163
164	if __name__ == "__main__":
165	gene_detect()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: