Context Navigation

source: liacs/dbdm/dbdm_4/ecoli_hmm.py@ 63

Last change on this file since 63 was 63, checked in by Rick van der Zwet, 15 years ago
Finally got all the matching right
Property svn:executable set to ``*
File size: 7.3 KB

Rev	Line
[60]	1	#!/usr/bin/env python
	2
	3	# http://en.wikipedia.org/wiki/Stop_codon
	4	# http://en.wikipedia.org/wiki/Escherichia_coli
	5	# http://en.wikipedia.org/wiki/Open_reading_frame
[61]	6	# http://nl.wikipedia.org/wiki/Genetische_code
[60]	7
	8	import sys
[61]	9	import csv
	10	import string
[60]	11
[63]	12	# http://ghmm.sourceforge.net/
	13	import ghmm
	14
	15	# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', without any clear distintion
	16	fasta_translate = {
	17	'r' : 'ga', # purine
	18	'y' : 'tc', # pyrimide
	19	'k' : 'gt', # keto
	20	'm' : 'ac', # amino
	21	's' : 'gc', # strong
	22	'w' : 'at', # weak
	23	'b' : 'gtc',
	24	'd' : 'gat',
	25	'h' : 'act',
	26	'v' : 'gca',
	27	}
	28
	29	DEBUG = True
	30
[62]	31	dna_flip = string.maketrans('atgc','tacg')
	32	def get_codon(seq, position, order):
	33	codon = seq[position:position+3]
	34	if not order:
	35	# When living on the other side of the string make sure to flip
	36	# around before comparison
	37	codon = codon[::-1].translate(dna_flip)
	38	return(codon)
[60]	39
[62]	40
	41
[63]	42	def ecoli_hmm():
[60]	43	"""Try to find genes inside e sequence using a HMM"""
	44	# Model 4 bases A C G T and unknown state N
	45	sigma = ghmm.Alphabet(['a', 'c', 'g', 't', 'n' ])
	46	print sigma
	47
	48	# XXX: Proper values, based of statistics
	49	# The transition matrix A is chosen such that it reflects the statistics
	50	# Part one will try to get us into a gene, part two will tell us when to
	51	# get out of it
	52	A = [[0.6,0.4], [0.3, 0.7]]
	53
	54	# The emission probabilities matrix is modeled after the statistics with a
	55	# total of 1
[61]	56	etogene = [ 0.1, 0.1, 0.1, 0.1, 0.6 ]
[60]	57	eingene = [ 0.2, 0.3, 0.3, 0.2, 0.1 ]
	58
	59	B = [etogene,eingene]
	60
	61	# Initial distribution favors outside
	62	pi = [0.9, 0.1]
	63
	64	m = ghmm.HMMFromMatrices(sigma,ghmm.DiscreteDistribution(sigma),A ,B, pi)
	65	print "Initial HMM", m
	66
	67	obs_seq = m.sampleSingle(20)
	68	print "Observation sequence : ", obs_seq
	69	obs = map(sigma.external, obs_seq)
	70	print "Observations : ", ''.join(obs)
	71
[61]	72	# Start codons
	73	# atg, (small chance) gtg
	74	# Stop codons
	75	# taa, tga
	76	# tag
	77	# lend = Left End
	78	# rend = Right End
	79
[62]	80	# Training and testing
[61]	81	# A -- T, G -- C
[62]	82	start_codons = ['atg', 'gtg']
	83	stop_codons = ['taa', 'tga', 'tag']
[61]	84	edl_file = 'data/edl_genes.csv'
[62]	85
	86	# Counter limit of how many 'hard' errors are allowed before bailing out
[63]	87	stop_limit = 100
[62]	88
	89	# Current shifting offset
	90	base_shift = 0
	91	stop_counter = 0
[63]	92
	93	# Sequences of contig, XXX: Make it a FASTA / BioPython Parser
	94	contig_seq = {}
[62]	95
[61]	96	reader = csv.reader(open(edl_file,"rU"))
	97	reader.next()
	98	try:
	99	for row in reader:
[62]	100	finished = False
	101	while not finished:
	102	(featureType, zNumber, contig, lend, rend, orientation,
	103	segmentType, oIslandNumber, geneName, note, function, product,
	104	translationNotes) = row
	105	lend = int(lend)
	106	rend = int(rend)
[63]	107	if not contig_seq.has_key(contig):
	108	# Load data
	109	try:
	110	handle = open(contig + ".raw","rU")
	111	contig_seq[contig] = handle.read()
	112	handle.close()
	113	except IOError:
	114	print "Unable to open '%s'" % (sys.argv[1])
	115	sys.exit(64)
	116	seq = contig_seq[contig]
	117
[62]	118	# Make a forward orientation mark as positive
	119	if orientation == '>':
	120	order = True
	121	else:
	122	order = False
[61]	123
[62]	124	# Living the world upside down to order is off
	125	if order:
	126	start_pos = base_shift + lend - 1
	127	stop_pos = base_shift + rend - 3
	128	else:
	129	start_pos = base_shift + rend -3
	130	stop_pos = base_shift + lend - 1
	131
	132	start_codon = get_codon(seq,start_pos,order)
	133	stop_codon = get_codon(seq,stop_pos,order)
	134	start_match = start_codon in start_codons
	135	stop_match = stop_codon in stop_codons
[61]	136
[63]	137	print "%6s, %s, %s, %s, %s, %s, %s, %s, %s, %s," % (geneName,lend,rend,start_pos, stop_pos, contig, start_codon, stop_codon, orientation, base_shift),
[62]	138	print "%s, %s" % (start_match, stop_match)
[63]	139	if not start_match or not stop_match:
	140	print "### Function (comment):", function
[62]	141
	142	# Check for fucked up offsets, shifts
[63]	143	start_shift = []
	144	stop_shift = []
[62]	145	# Technically speaking should the cope only the reading frames,
[63]	146	# but what the heck looking if 'free', but make sure to include the original frames as well
	147	search_range = abs(base_shift) + 3
[62]	148	if not start_match:
	149	# Somewhere else in the reading frame?
[63]	150	matches = []
[62]	151	for r in range(-search_range,search_range+1):
	152	l = start_pos + r
	153	t = get_codon(seq,l,order)
	154	new_match = t in start_codons
	155	if new_match:
[63]	156	matches.append("%i:%s" % (r, t))
	157	start_shift.append(r)
	158	if start_shift:
	159	print "# Start codon reading frame matches: ", ",".join(matches)
[62]	160	if not stop_match:
	161	# Somewhere else in the reading frame?
[63]	162	matches = []
[62]	163	for r in range(-search_range,search_range+1):
	164	l = stop_pos + r
	165	t = get_codon(seq,l,order)
	166	new_match = t in stop_codons
	167	if new_match:
[63]	168	stop_shift.append(r)
	169	matches.append("%i:%s" % (r, t))
	170	if stop_shift:
	171	print "# Stop codon reading frame matches: ", ",".join(matches)
[61]	172
[62]	173
	174	# Both wrong is something screwy in the data, check offset fix
	175	if not start_match and not stop_match:
[63]	176	common_shift = list(set(start_shift) & set(stop_shift))
	177	if common_shift:
	178	print "# Matching shifts: " + ",".join(map(str,common_shift))
	179	# Get the value closest to 0 for shifting purposes
	180	# Currently asume no negative shifts
	181	base_shift += min(common_shift)
[62]	182	finished = False
	183	continue
	184	# Exercise left to the reader
	185	if (stop_counter > stop_limit):
	186	return
	187	else:
	188	stop_counter += 1
	189	finished = True
	190	else:
	191	stop_counter = 0
	192	finished = True
	193
[61]	194	except csv.Error, e:
	195	sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
	196
	197
	198
	199	test_seq=ghmm.EmissionSequence(sigma,list(seq[0:(len(seq) / 3)]))
	200	v = m.viterbi(test_seq)
	201	#print "fairness of test_seq: ", v
	202
[60]	203
[61]	204	# Validation
	205	val_seq=ghmm.EmissionSequence(sigma,list(seq[10:2000]))
	206	v = m.viterbi(val_seq)
	207	#print "Test sequence: ", v
[60]	208
	209	# XXX: Results
	210
	211
	212
	213	if __name__ == "__main__":
[63]	214	ecoli_hmm()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: