Context Navigation

source: liacs/dbdm/dbdm_4/parse_fasta.py@ 63

Last change on this file since 63 was 63, checked in by Rick van der Zwet, 15 years ago
Finally got all the matching right
Property svn:executable set to ``*
File size: 2.9 KB

Line
1	#!/usr/bin/env python
2	#
3	# Parse 2 FASTA files and print statistics
4	# BSDLicence
5	# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
6	from Bio import SeqIO,Seq
7	from Bio import Alphabet
8	from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9	import Bio.Data.CodonTable
10	from MultiReplace import MultiReplace
11	from reading_frames import reading_frames
12
13	# The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
14	fasta_translate = {
15	'r' : 'ga', # purine
16	'y' : 'tc', # pyrimide
17	'k' : 'gt', # keto
18	'm' : 'ac', # amino
19	's' : 'gc', # strong
20	'w' : 'at', # weak
21	'b' : 'gtc',
22	'd' : 'gat',
23	'h' : 'act',
24	'v' : 'gca',
25	}
26
27	fasta = MultiReplace(fasta_translate)
28
29	def parse_file(file):
30	handle = open(file, "rU")
31	for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
32	# How to translate damm thing into plain nucleic acid codes
33	# http://en.wikipedia.org/wiki/FASTA_format
34	retval = seq_record.seq.__str__()
35
36	handle.close()
37	return(retval)
38
39	def stats(seq):
40	pdict = {}
41	for n in range(1, len(seq)):
42	protein = seq[n]
43	if not pdict.has_key(protein):
44	pdict[protein] = 1
45	else:
46	pdict[protein] += 1
47
48	print pdict
49
50
51	def concat(head,tail,max_length=1000):
52	"Concat two strings together removing common parts"
53	l_head = len(head)
54	l_tail = len(tail)
55
56	# Start/Stop at the right time
57	start = 1
58	if (l_head < l_tail):
59	stop = l_head + 1
60	else:
61	stop = l_tail + 1
62
63	# Make sure not to run for-ever
64	if (stop > max_length):
65	stop = max_length
66
67	# Find largest common part
68	# XXX: Not very effient (on very large overlap sets
69	for i in reversed(range(start,stop)):
70	#print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
71	if head[-i:] == tail[0:i]:
72	return((i,(tail[0:i]),head + tail[i:]))
73
74	# None found return full part
75	return(-1,'',head + tail)
76
77	# File names
78	file1 = "data/AE005174v2-1.fas"
79	file2 = "data/AE005174v2-2.fas"
80
81	# Get data
82	seq1 = parse_file(file1)
83	seq2 = parse_file(file2)
84
85	# Wrong assumption, replace is not possible as the real value is not know yet
86	# seq1 = fasta.replace(seq1)
87	# seq2 = fasta.replace(seq2)
88
89	# Find overlap
90	(retval, common, result) = concat(seq2,seq1)
91	print retval, common
92
93	print file1
94	stats(seq1)
95	reading_frames(seq1)
96	print file2
97	stats(seq2)
98	reading_frames(seq2)
99
100
101	# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
102	# seq2, so lets' put that into the the statistics as well. Due to circular
103	# nature, does not matter wether we add it in the beginning or in the end
104	print "Total (inc 4 kbs gap (n))"
105	result = result + "n" * 4000;
106	stats(result)
107	reading_frames(result)
108
109	# Write to file for later further processing
110	out = open("AE005174v2-1.raw","w")
111	out.write(seq1)
112	out.close()
113
114	out = open("AE005174v2-2.raw","w")
115	out.write(seq2)
116	out.close()
117
118	out = open("full_contig.raw","w")
119	out.write(result)
120	out.close()

Note: See TracBrowser for help on using the repository browser.

Download in other formats: