Context Navigation

gene_detect.py@ 149

Last change on this file since 149 was 66, checked in by Rick van der Zwet, 15 years ago
Annotations and readablity fixes
Property svn:executable set to ``*
File size: 6.4 KB

Rev	Line
[64]	1	#!/usr/bin/env python
	2	#
	3	# Rick van der Zwet
	4	import sys
	5	import csv
	6	import string
	7
	8	from MultiReplace import MultiReplace
	9
	10	DEBUG = False
	11
	12	# gtg is small chance
	13	start_codons = ['atg', 'gtg']
	14	stop_codons = ['taa', 'tga', 'tag']
	15
	16	dna_flip = string.maketrans('atgc','tacg')
	17	def get_codon(seq, position, order):
	18	codon = seq[position:position+3]
	19	if not order:
	20	# When living on the other side of the string make sure to flip
	21	# around before comparison
	22	codon = codon[::-1].translate(dna_flip)
	23	return(codon)
	24
	25	def gene_detect():
	26
	27	# Training and testing
	28	# A -- T, G -- C
	29	# lend = Left End
	30	# rend = Right End
	31	edl_file = 'data/edl_genes.csv'
	32
	33	# Counter limit of how many 'hard' errors are allowed before bailing out
	34	stop_limit = 100
	35
	36	# Current shifting offset
	37	base_shift = 0
	38	stop_counter = 0
	39
	40	# Sequences of contig, XXX: Make it a FASTA / BioPython Parser
	41	contig_seq = {}
	42	abstract_contig = {}
	43
	44	try:
	45	reader = csv.reader(open(edl_file,"rU"))
	46	reader.next()
	47	for row in reader:
	48	finished = False
	49	while not finished:
	50	(featureType, zNumber, contig, lend, rend, orientation,
	51	segmentType, oIslandNumber, geneName, note, function, product,
	52	translationNotes) = row
	53	lend = int(lend)
	54	rend = int(rend)
	55	if not contig_seq.has_key(contig):
	56	# Load data
	57	try:
	58	handle = open(contig + ".raw","rU")
	59	contig_seq[contig] = handle.read()
[66]	60	abstract_contig[contig] = list('.' * len(contig_seq[contig]))
[64]	61	handle.close()
	62	except IOError:
	63	print "Unable to open '%s'" % (sys.argv[1])
	64	sys.exit(64)
	65	seq = contig_seq[contig]
	66
	67	# Make a forward orientation mark as positive
	68	if orientation == '>':
	69	order = True
	70	else:
	71	order = False
	72
	73	# Living the world upside down to order is off
	74	if order:
	75	start_pos = base_shift + lend - 1
	76	stop_pos = base_shift + rend - 3
	77	else:
	78	start_pos = base_shift + rend -3
	79	stop_pos = base_shift + lend - 1
	80
	81	start_codon = get_codon(seq,start_pos,order)
	82	stop_codon = get_codon(seq,stop_pos,order)
	83	start_match = start_codon in start_codons
	84	stop_match = stop_codon in stop_codons
	85
	86	if DEBUG: print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % \
	87	(geneName,lend,rend,start_pos, stop_pos, contig,
	88	start_codon, stop_codon, orientation, base_shift),
	89	if DEBUG: print "%s, %s" % (start_match, stop_match)
	90
	91	if start_match and stop_match:
	92	# Used for training DNA sequences and overlaps
	93	print "Gene found %s, %s" % (start_pos, stop_pos)
	94	if start_pos < stop_pos:
	95	abstract_contig[contig][start_pos:start_pos+3] = ['<'] * 3
[66]	96	abstract_contig[contig][start_pos+3:stop_pos] = ['\|'] * (stop_pos - (start_pos + 3))
[64]	97	abstract_contig[contig][stop_pos:stop_pos+3] = ['>'] * 3
	98	break
	99
	100	if not start_match or not stop_match:
	101	if DEBUG: print "### Function (comment):", function
	102
	103	# Check for fucked up offsets, shifts
	104	start_shift = []
	105	stop_shift = []
	106	# Technically speaking should the cope only the reading frames,
	107	# but what the heck looking if 'free', but make sure to include
	108	# the original frames as well
	109	search_range = abs(base_shift) + 3
	110	if not start_match:
	111	# Somewhere else in the reading frame?
	112	matches = []
	113	for r in range(-search_range,search_range+1):
	114	l = start_pos + r
	115	t = get_codon(seq,l,order)
	116	new_match = t in start_codons
	117	if new_match:
	118	matches.append("%i:%s" % (r, t))
	119	start_shift.append(r)
	120	if start_shift:
	121	if DEBUG: print "# Start codon reading frame matches: ", ",".join(matches)
	122	if not stop_match:
	123	# Somewhere else in the reading frame?
	124	matches = []
	125	for r in range(-search_range,search_range+1):
	126	l = stop_pos + r
	127	t = get_codon(seq,l,order)
	128	new_match = t in stop_codons
	129	if new_match:
	130	stop_shift.append(r)
	131	matches.append("%i:%s" % (r, t))
	132	if stop_shift:
	133	if DEBUG: print "# Stop codon reading frame matches: ", ",".join(matches)
	134
	135
	136	# Both wrong is something screwy in the data, check offset fix
	137	if not start_match and not stop_match:
	138	common_shift = list(set(start_shift) & set(stop_shift))
	139	if common_shift:
	140	if DEBUG: print "# Matching shifts: " + ",".join(map(str,common_shift))
	141	# Get the value closest to 0 for shifting purposes
	142	# Currently asume no negative shifts
	143	base_shift += min(common_shift)
	144	finished = False
	145	continue
	146	# Exercise left to the reader
	147	if (stop_counter > stop_limit):
	148	return
	149	else:
	150	stop_counter += 1
	151	finished = True
	152	else:
	153	stop_counter = 0
	154	finished = True
	155
	156	except csv.Error, e:
	157	sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
	158
	159	for contig in abstract_contig.keys():
	160	handle = open(contig + "-gene.raw", "w")
	161	handle.write(''.join(abstract_contig[contig]))
	162	handle.close()
	163
	164	if __name__ == "__main__":
	165	gene_detect()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: liacs/dbdm/dbdm_4/gene_detect.py@ 149

Download in other formats: