source: liacs/dbdm/dbdm_4/parse_fasta.py@ 405

Last change on this file since 405 was 64, checked in by Rick van der Zwet, 15 years ago

Sample GHMM, needs statistics and verifing still

  • Property svn:executable set to *
File size: 2.9 KB
Line 
1#!/usr/bin/env python
2#
3# Parse 2 FASTA files and print statistics
4# BSDLicence
5# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
6from Bio import SeqIO,Seq
7from Bio import Alphabet
8from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9import Bio.Data.CodonTable
10from MultiReplace import MultiReplace
11from reading_frames import reading_frames
12
13# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
14# case we map them all to unknown
15
16fasta_translate = {
17 'r' : 'n', # purine
18 'y' : 'n', # pyrimide
19 'k' : 'n', # keto
20 'm' : 'n', # amino
21 's' : 'n', # strong
22 'w' : 'n', # weak
23 'b' : 'n',
24 'd' : 'n',
25 'h' : 'n',
26 'v' : 'n',
27 }
28
29fasta = MultiReplace(fasta_translate)
30
31def parse_file(file):
32 handle = open(file, "rU")
33 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
34 # How to translate damm thing into plain nucleic acid codes
35 # http://en.wikipedia.org/wiki/FASTA_format
36 retval = seq_record.seq.__str__()
37
38 handle.close()
39 return(retval)
40
41def stats(seq):
42 pdict = {}
43 for n in range(1, len(seq)):
44 protein = seq[n]
45 if not pdict.has_key(protein):
46 pdict[protein] = 1
47 else:
48 pdict[protein] += 1
49
50 print pdict
51
52
53def concat(head,tail,max_length=1000):
54 "Concat two strings together removing common parts"
55 l_head = len(head)
56 l_tail = len(tail)
57
58 # Start/Stop at the right time
59 start = 1
60 if (l_head < l_tail):
61 stop = l_head + 1
62 else:
63 stop = l_tail + 1
64
65 # Make sure not to run for-ever
66 if (stop > max_length):
67 stop = max_length
68
69 # Find largest common part
70 # XXX: Not very effient (on very large overlap sets
71 for i in reversed(range(start,stop)):
72 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
73 if head[-i:] == tail[0:i]:
74 return((i,(tail[0:i]),head + tail[i:]))
75
76 # None found return full part
77 return(-1,'',head + tail)
78
79# File names
80file1 = "data/AE005174v2-1.fas"
81file2 = "data/AE005174v2-2.fas"
82
83# Get data
84seq1 = parse_file(file1)
85seq2 = parse_file(file2)
86
87# Simplify answers
88seq1 = fasta.replace(seq1)
89seq2 = fasta.replace(seq2)
90
91# Find overlap
92(retval, common, result) = concat(seq2,seq1)
93print retval, common
94
95print file1
96stats(seq1)
97reading_frames(seq1)
98print file2
99stats(seq2)
100reading_frames(seq2)
101
102
103# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
104# seq2, so lets' put that into the the statistics as well. Due to circular
105# nature, does not matter wether we add it in the beginning or in the end
106print "Total (inc 4 kbs gap (n))"
107result = result + "n" * 4000;
108stats(result)
109reading_frames(result)
110
111# Write to file for later further processing
112out = open("AE005174v2-1.raw","w")
113out.write(seq1)
114out.close()
115
116out = open("AE005174v2-2.raw","w")
117out.write(seq2)
118out.close()
119
120out = open("full_contig.raw","w")
121out.write(result)
122out.close()
Note: See TracBrowser for help on using the repository browser.