source: liacs/dbdm/dbdm_4/parse_fasta.py@ 60

Last change on this file since 60 was 59, checked in by Rick van der Zwet, 15 years ago

Do the reading_frames as well

  • Property svn:executable set to *
File size: 2.7 KB
RevLine 
[41]1#!/usr/bin/env python
[55]2#
3# Parse 2 FASTA files and print statistics
4# BSDLicence
5# Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
[41]6from Bio import SeqIO,Seq
7from Bio import Alphabet
8from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
9import Bio.Data.CodonTable
[53]10from MultiReplace import MultiReplace
[59]11from reading_frames import reading_frames
[41]12
[54]13fasta_translate = {
14 'r' : 'ga', # purine
15 'y' : 'tc', # pyrimide
16 'k' : 'gt', # keto
17 'm' : 'ac', # amino
18 's' : 'gc', # strong
19 'w' : 'at', # weak
20 'b' : 'gtc',
21 'd' : 'gat',
22 'h' : 'act',
23 'v' : 'gca',
24 }
25
26fasta = MultiReplace(fasta_translate)
27
[53]28def parse_file(file):
[54]29 handle = open(file, "rU")
[53]30 for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
31 # How to translate damm thing into plain nucleic acid codes
32 # http://en.wikipedia.org/wiki/FASTA_format
[54]33 retval = seq_record.seq.__str__()
[53]34
[54]35 handle.close()
36 return(retval)
37
38def stats(seq):
39 pdict = {}
40 for n in range(1, len(seq)):
41 protein = seq[n]
42 if not pdict.has_key(protein):
43 pdict[protein] = 1
44 else:
45 pdict[protein] += 1
[53]46
[54]47 print pdict
[41]48
[54]49
50def concat(head,tail,max_length=1000):
51 "Concat two strings together removing common parts"
52 l_head = len(head)
53 l_tail = len(tail)
54
55 # Start/Stop at the right time
56 start = 1
57 if (l_head < l_tail):
58 stop = l_head + 1
59 else:
60 stop = l_tail + 1
61
62 # Make sure not to run for-ever
63 if (stop > max_length):
64 stop = max_length
65
66 # Find largest common part
67 # XXX: Not very effient (on very large overlap sets
68 for i in reversed(range(start,stop)):
69 #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
70 if head[-i:] == tail[0:i]:
71 return((i,(tail[0:i]),head + tail[i:]))
72
73 # None found return full part
74 return(-1,'',head + tail)
75
[58]76# File names
77file1 = "data/AE005174v2-1.fas"
78file2 = "data/AE005174v2-2.fas"
[54]79
[55]80# Get data
[58]81seq1 = parse_file(file1)
82seq2 = parse_file(file1)
[55]83
[58]84seq1 = fasta.replace(seq1)
85seq2 = fasta.replace(seq2)
[55]86
87# Find overlap
[58]88(retval, common, result) = concat(seq2,seq1)
[54]89print retval, common
[55]90
[58]91print file1
92stats(seq1)
[59]93reading_frames(seq1)
94print file2
95stats(seq2)
96reading_frames(seq2)
[58]97
[59]98
[58]99# Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
100# seq2, so lets' put that into the the statistics as well. Due to circular
[55]101# nature, does not matter wether we add it in the beginning or in the end
[58]102print "Total (inc 4 kbs gap (n))"
[55]103result = result + "n" * 4000;
[54]104stats(result)
[59]105reading_frames(result)
[54]106
[55]107# Write to file for later further processing
[54]108out = open("full_contig.raw","w")
109out.write(result)
110out.close()
Note: See TracBrowser for help on using the repository browser.