source: liacs/dbdm/dbdm_4/parse_fasta.py@ 246

Last change on this file since 246 was 64, checked in by Rick van der Zwet, 15 years ago

Sample GHMM, needs statistics and verifing still

  • Property svn:executable set to *
File size: 2.9 KB
RevLine 
[41]1#!/usr/bin/env python
[55]2#
3# Parse 2 FASTA files and print statistics
4# BSDLicence
5# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]6from Bio import SeqIO,Seq
7from Bio import Alphabet
8from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9import Bio.Data.CodonTable
[53]10from MultiReplace import MultiReplace
[59]11from reading_frames import reading_frames
[41]12
[64]13# The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
14# case we map them all to unknown
15
[54]16fasta_translate = {
[64]17 'r' : 'n', # purine
18 'y' : 'n', # pyrimide
19 'k' : 'n', # keto
20 'm' : 'n', # amino
21 's' : 'n', # strong
22 'w' : 'n', # weak
23 'b' : 'n',
24 'd' : 'n',
25 'h' : 'n',
26 'v' : 'n',
[54]27 }
28
29fasta = MultiReplace(fasta_translate)
30
[53]31def parse_file(file):
[54]32 handle = open(file, "rU")
[53]33 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
34 # How to translate damm thing into plain nucleic acid codes
35 # http://en.wikipedia.org/wiki/FASTA_format
[54]36 retval = seq_record.seq.__str__()
[53]37
[54]38 handle.close()
39 return(retval)
40
41def stats(seq):
42 pdict = {}
43 for n in range(1, len(seq)):
44 protein = seq[n]
45 if not pdict.has_key(protein):
46 pdict[protein] = 1
47 else:
48 pdict[protein] += 1
[53]49
[54]50 print pdict
[41]51
[54]52
53def concat(head,tail,max_length=1000):
54 "Concat two strings together removing common parts"
55 l_head = len(head)
56 l_tail = len(tail)
57
58 # Start/Stop at the right time
59 start = 1
60 if (l_head < l_tail):
61 stop = l_head + 1
62 else:
63 stop = l_tail + 1
64
65 # Make sure not to run for-ever
66 if (stop > max_length):
67 stop = max_length
68
69 # Find largest common part
70 # XXX: Not very effient (on very large overlap sets
71 for i in reversed(range(start,stop)):
72 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
73 if head[-i:] == tail[0:i]:
74 return((i,(tail[0:i]),head + tail[i:]))
75
76 # None found return full part
77 return(-1,'',head + tail)
78
[58]79# File names
80file1 = "data/AE005174v2-1.fas"
81file2 = "data/AE005174v2-2.fas"
[54]82
[55]83# Get data
[58]84seq1 = parse_file(file1)
[63]85seq2 = parse_file(file2)
[55]86
[64]87# Simplify answers
88seq1 = fasta.replace(seq1)
89seq2 = fasta.replace(seq2)
[55]90
91# Find overlap
[58]92(retval, common, result) = concat(seq2,seq1)
[54]93print retval, common
[55]94
[58]95print file1
96stats(seq1)
[59]97reading_frames(seq1)
98print file2
99stats(seq2)
100reading_frames(seq2)
[58]101
[59]102
[58]103# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
104# seq2, so lets' put that into the the statistics as well. Due to circular
[55]105# nature, does not matter wether we add it in the beginning or in the end
[58]106print "Total (inc 4 kbs gap (n))"
[55]107result = result + "n" * 4000;
[54]108stats(result)
[59]109reading_frames(result)
[54]110
[55]111# Write to file for later further processing
[63]112out = open("AE005174v2-1.raw","w")
113out.write(seq1)
114out.close()
115
116out = open("AE005174v2-2.raw","w")
117out.write(seq2)
118out.close()
119
[54]120out = open("full_contig.raw","w")
121out.write(result)
122out.close()
Note: See TracBrowser for help on using the repository browser.