Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: liacs/dbdm/dbdm_4/parse_fasta.py@ 60

Last change on this file since 60 was 59, checked in by Rick van der Zwet, 15 years ago
Do the reading_frames as well
Property svn:executable set to ``*
File size: 2.7 KB

Rev	Line
[41]	1	#!/usr/bin/env python
[55]	2	#
	3	# Parse 2 FASTA files and print statistics
	4	# BSDLicence
	5	# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]	6	from Bio import SeqIO,Seq
	7	from Bio import Alphabet
	8	from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
	9	import Bio.Data.CodonTable
[53]	10	from MultiReplace import MultiReplace
[59]	11	from reading_frames import reading_frames
[41]	12
[54]	13	fasta_translate = {
	14	'r' : 'ga', # purine
	15	'y' : 'tc', # pyrimide
	16	'k' : 'gt', # keto
	17	'm' : 'ac', # amino
	18	's' : 'gc', # strong
	19	'w' : 'at', # weak
	20	'b' : 'gtc',
	21	'd' : 'gat',
	22	'h' : 'act',
	23	'v' : 'gca',
	24	}
	25
	26	fasta = MultiReplace(fasta_translate)
	27
[53]	28	def parse_file(file):
[54]	29	handle = open(file, "rU")
[53]	30	for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
	31	# How to translate damm thing into plain nucleic acid codes
	32	# http://en.wikipedia.org/wiki/FASTA_format
[54]	33	retval = seq_record.seq.__str__()
[53]	34
[54]	35	handle.close()
	36	return(retval)
	37
	38	def stats(seq):
	39	pdict = {}
	40	for n in range(1, len(seq)):
	41	protein = seq[n]
	42	if not pdict.has_key(protein):
	43	pdict[protein] = 1
	44	else:
	45	pdict[protein] += 1
[53]	46
[54]	47	print pdict
[41]	48
[54]	49
	50	def concat(head,tail,max_length=1000):
	51	"Concat two strings together removing common parts"
	52	l_head = len(head)
	53	l_tail = len(tail)
	54
	55	# Start/Stop at the right time
	56	start = 1
	57	if (l_head < l_tail):
	58	stop = l_head + 1
	59	else:
	60	stop = l_tail + 1
	61
	62	# Make sure not to run for-ever
	63	if (stop > max_length):
	64	stop = max_length
	65
	66	# Find largest common part
	67	# XXX: Not very effient (on very large overlap sets
	68	for i in reversed(range(start,stop)):
	69	#print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
	70	if head[-i:] == tail[0:i]:
	71	return((i,(tail[0:i]),head + tail[i:]))
	72
	73	# None found return full part
	74	return(-1,'',head + tail)
	75
[58]	76	# File names
	77	file1 = "data/AE005174v2-1.fas"
	78	file2 = "data/AE005174v2-2.fas"
[54]	79
[55]	80	# Get data
[58]	81	seq1 = parse_file(file1)
	82	seq2 = parse_file(file1)
[55]	83
[58]	84	seq1 = fasta.replace(seq1)
	85	seq2 = fasta.replace(seq2)
[55]	86
	87	# Find overlap
[58]	88	(retval, common, result) = concat(seq2,seq1)
[54]	89	print retval, common
[55]	90
[58]	91	print file1
	92	stats(seq1)
[59]	93	reading_frames(seq1)
	94	print file2
	95	stats(seq2)
	96	reading_frames(seq2)
[58]	97
[59]	98
[58]	99	# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
	100	# seq2, so lets' put that into the the statistics as well. Due to circular
[55]	101	# nature, does not matter wether we add it in the beginning or in the end
[58]	102	print "Total (inc 4 kbs gap (n))"
[55]	103	result = result + "n" * 4000;
[54]	104	stats(result)
[59]	105	reading_frames(result)
[54]	106
[55]	107	# Write to file for later further processing
[54]	108	out = open("full_contig.raw","w")
	109	out.write(result)
	110	out.close()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: