source: liacs/dbdm/dbdm_4/parse_fasta.py@ 63

Last change on this file since 63 was 63, checked in by Rick van der Zwet, 15 years ago

Finally got all the matching right

  • Property svn:executable set to *
File size: 2.9 KB
Line 
1#!/usr/bin/env python
2#
3# Parse 2 FASTA files and print statistics
4# BSDLicence
5# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
6from Bio import SeqIO,Seq
7from Bio import Alphabet
8from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9import Bio.Data.CodonTable
10from MultiReplace import MultiReplace
11from reading_frames import reading_frames
12
13# The mapping is kind of odd, as 'r' could mean either 'g' or 'a'
14fasta_translate = {
15 'r' : 'ga', # purine
16 'y' : 'tc', # pyrimide
17 'k' : 'gt', # keto
18 'm' : 'ac', # amino
19 's' : 'gc', # strong
20 'w' : 'at', # weak
21 'b' : 'gtc',
22 'd' : 'gat',
23 'h' : 'act',
24 'v' : 'gca',
25 }
26
27fasta = MultiReplace(fasta_translate)
28
29def parse_file(file):
30 handle = open(file, "rU")
31 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
32 # How to translate damm thing into plain nucleic acid codes
33 # http://en.wikipedia.org/wiki/FASTA_format
34 retval = seq_record.seq.__str__()
35
36 handle.close()
37 return(retval)
38
39def stats(seq):
40 pdict = {}
41 for n in range(1, len(seq)):
42 protein = seq[n]
43 if not pdict.has_key(protein):
44 pdict[protein] = 1
45 else:
46 pdict[protein] += 1
47
48 print pdict
49
50
51def concat(head,tail,max_length=1000):
52 "Concat two strings together removing common parts"
53 l_head = len(head)
54 l_tail = len(tail)
55
56 # Start/Stop at the right time
57 start = 1
58 if (l_head < l_tail):
59 stop = l_head + 1
60 else:
61 stop = l_tail + 1
62
63 # Make sure not to run for-ever
64 if (stop > max_length):
65 stop = max_length
66
67 # Find largest common part
68 # XXX: Not very effient (on very large overlap sets
69 for i in reversed(range(start,stop)):
70 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
71 if head[-i:] == tail[0:i]:
72 return((i,(tail[0:i]),head + tail[i:]))
73
74 # None found return full part
75 return(-1,'',head + tail)
76
77# File names
78file1 = "data/AE005174v2-1.fas"
79file2 = "data/AE005174v2-2.fas"
80
81# Get data
82seq1 = parse_file(file1)
83seq2 = parse_file(file2)
84
85# Wrong assumption, replace is not possible as the real value is not know yet
86# seq1 = fasta.replace(seq1)
87# seq2 = fasta.replace(seq2)
88
89# Find overlap
90(retval, common, result) = concat(seq2,seq1)
91print retval, common
92
93print file1
94stats(seq1)
95reading_frames(seq1)
96print file2
97stats(seq2)
98reading_frames(seq2)
99
100
101# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
102# seq2, so lets' put that into the the statistics as well. Due to circular
103# nature, does not matter wether we add it in the beginning or in the end
104print "Total (inc 4 kbs gap (n))"
105result = result + "n" * 4000;
106stats(result)
107reading_frames(result)
108
109# Write to file for later further processing
110out = open("AE005174v2-1.raw","w")
111out.write(seq1)
112out.close()
113
114out = open("AE005174v2-2.raw","w")
115out.write(seq2)
116out.close()
117
118out = open("full_contig.raw","w")
119out.write(result)
120out.close()
Note: See TracBrowser for help on using the repository browser.