Context Navigation

parse_fasta.py@ 341

Last change on this file since 341 was 64, checked in by Rick van der Zwet, 15 years ago
Sample GHMM, needs statistics and verifing still
Property svn:executable set to ``*
File size: 2.9 KB

Rev	Line
[41]	1	#!/usr/bin/env python
[55]	2	#
	3	# Parse 2 FASTA files and print statistics
	4	# BSDLicence
	5	# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]	6	from Bio import SeqIO,Seq
	7	from Bio import Alphabet
	8	from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
	9	import Bio.Data.CodonTable
[53]	10	from MultiReplace import MultiReplace
[59]	11	from reading_frames import reading_frames
[41]	12
[64]	13	# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
	14	# case we map them all to unknown
	15
[54]	16	fasta_translate = {
[64]	17	'r' : 'n', # purine
	18	'y' : 'n', # pyrimide
	19	'k' : 'n', # keto
	20	'm' : 'n', # amino
	21	's' : 'n', # strong
	22	'w' : 'n', # weak
	23	'b' : 'n',
	24	'd' : 'n',
	25	'h' : 'n',
	26	'v' : 'n',
[54]	27	}
	28
	29	fasta = MultiReplace(fasta_translate)
	30
[53]	31	def parse_file(file):
[54]	32	handle = open(file, "rU")
[53]	33	for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
	34	# How to translate damm thing into plain nucleic acid codes
	35	# http://en.wikipedia.org/wiki/FASTA_format
[54]	36	retval = seq_record.seq.__str__()
[53]	37
[54]	38	handle.close()
	39	return(retval)
	40
	41	def stats(seq):
	42	pdict = {}
	43	for n in range(1, len(seq)):
	44	protein = seq[n]
	45	if not pdict.has_key(protein):
	46	pdict[protein] = 1
	47	else:
	48	pdict[protein] += 1
[53]	49
[54]	50	print pdict
[41]	51
[54]	52
	53	def concat(head,tail,max_length=1000):
	54	"Concat two strings together removing common parts"
	55	l_head = len(head)
	56	l_tail = len(tail)
	57
	58	# Start/Stop at the right time
	59	start = 1
	60	if (l_head < l_tail):
	61	stop = l_head + 1
	62	else:
	63	stop = l_tail + 1
	64
	65	# Make sure not to run for-ever
	66	if (stop > max_length):
	67	stop = max_length
	68
	69	# Find largest common part
	70	# XXX: Not very effient (on very large overlap sets
	71	for i in reversed(range(start,stop)):
	72	#print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
	73	if head[-i:] == tail[0:i]:
	74	return((i,(tail[0:i]),head + tail[i:]))
	75
	76	# None found return full part
	77	return(-1,'',head + tail)
	78
[58]	79	# File names
	80	file1 = "data/AE005174v2-1.fas"
	81	file2 = "data/AE005174v2-2.fas"
[54]	82
[55]	83	# Get data
[58]	84	seq1 = parse_file(file1)
[63]	85	seq2 = parse_file(file2)
[55]	86
[64]	87	# Simplify answers
	88	seq1 = fasta.replace(seq1)
	89	seq2 = fasta.replace(seq2)
[55]	90
	91	# Find overlap
[58]	92	(retval, common, result) = concat(seq2,seq1)
[54]	93	print retval, common
[55]	94
[58]	95	print file1
	96	stats(seq1)
[59]	97	reading_frames(seq1)
	98	print file2
	99	stats(seq2)
	100	reading_frames(seq2)
[58]	101
[59]	102
[58]	103	# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
	104	# seq2, so lets' put that into the the statistics as well. Due to circular
[55]	105	# nature, does not matter wether we add it in the beginning or in the end
[58]	106	print "Total (inc 4 kbs gap (n))"
[55]	107	result = result + "n" * 4000;
[54]	108	stats(result)
[59]	109	reading_frames(result)
[54]	110
[55]	111	# Write to file for later further processing
[63]	112	out = open("AE005174v2-1.raw","w")
	113	out.write(seq1)
	114	out.close()
	115
	116	out = open("AE005174v2-2.raw","w")
	117	out.write(seq2)
	118	out.close()
	119
[54]	120	out = open("full_contig.raw","w")
	121	out.write(result)
	122	out.close()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: liacs/dbdm/dbdm_4/parse_fasta.py@ 341

Download in other formats: