Context Navigation

source: liacs/dbdm/dbdm_4/parse-fasta.py@ 55

Last change on this file since 55 was 55, checked in by Rick van der Zwet, 15 years ago
Make it 'production' ready
Property svn:executable set to ``*
File size: 2.4 KB

Rev	Line
[41]	1	#!/usr/bin/env python
[55]	2	#
	3	# Parse 2 FASTA files and print statistics
	4	# BSDLicence
	5	# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]	6	from Bio import SeqIO,Seq
	7	from Bio import Alphabet
	8	from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
	9	import Bio.Data.CodonTable
[53]	10	from MultiReplace import MultiReplace
[41]	11
[54]	12	fasta_translate = {
	13	'r' : 'ga', # purine
	14	'y' : 'tc', # pyrimide
	15	'k' : 'gt', # keto
	16	'm' : 'ac', # amino
	17	's' : 'gc', # strong
	18	'w' : 'at', # weak
	19	'b' : 'gtc',
	20	'd' : 'gat',
	21	'h' : 'act',
	22	'v' : 'gca',
	23	}
	24
	25	fasta = MultiReplace(fasta_translate)
	26
[53]	27	def parse_file(file):
[54]	28	handle = open(file, "rU")
[53]	29	for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
	30	# How to translate damm thing into plain nucleic acid codes
	31	# http://en.wikipedia.org/wiki/FASTA_format
[54]	32	retval = seq_record.seq.__str__()
[53]	33
[54]	34	handle.close()
	35	return(retval)
	36
	37	def stats(seq):
	38	pdict = {}
	39	for n in range(1, len(seq)):
	40	protein = seq[n]
	41	if not pdict.has_key(protein):
	42	pdict[protein] = 1
	43	else:
	44	pdict[protein] += 1
[53]	45
[54]	46	print pdict
[41]	47
[54]	48
	49	def concat(head,tail,max_length=1000):
	50	"Concat two strings together removing common parts"
	51	l_head = len(head)
	52	l_tail = len(tail)
	53
	54	# Start/Stop at the right time
	55	start = 1
	56	if (l_head < l_tail):
	57	stop = l_head + 1
	58	else:
	59	stop = l_tail + 1
	60
	61	# Make sure not to run for-ever
	62	if (stop > max_length):
	63	stop = max_length
	64
	65	# Find largest common part
	66	# XXX: Not very effient (on very large overlap sets
	67	for i in reversed(range(start,stop)):
	68	#print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
	69	if head[-i:] == tail[0:i]:
	70	return((i,(tail[0:i]),head + tail[i:]))
	71
	72	# None found return full part
	73	return(-1,'',head + tail)
	74
	75
[55]	76	# Get data
[53]	77	file1 = parse_file("data/AE005174v2-1.fas")
	78	file2 = parse_file("data/AE005174v2-2.fas")
[55]	79
[54]	80	file1 = fasta.replace(file1)
	81	file2 = fasta.replace(file2)
[55]	82
	83	# Find overlap
[54]	84	(retval, common, result) = concat(file2,file1)
	85	print retval, common
[55]	86
	87	# Strictly speaking there is a gap of about 4 kbs (4000 bs) between file1 and
	88	# file2, so lets' put that into the the statistics as well. Due to circular
	89	# nature, does not matter wether we add it in the beginning or in the end
	90	result = result + "n" * 4000;
[54]	91	stats(result)
	92
[55]	93	# Write to file for later further processing
[54]	94	out = open("full_contig.raw","w")
	95	out.write(result)
	96	out.close()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: