[41] | 1 | #!/usr/bin/env python
|
---|
[55] | 2 | #
|
---|
| 3 | # Parse 2 FASTA files and print statistics
|
---|
| 4 | # BSDLicence
|
---|
| 5 | # Rick van der Zwet - 0433373 - <info@rickvaderzwet.nl>
|
---|
[41] | 6 | from Bio import SeqIO,Seq
|
---|
| 7 | from Bio import Alphabet
|
---|
| 8 | from Bio.Alphabet.IUPAC import ambiguous_dna,unambiguous_dna
|
---|
| 9 | import Bio.Data.CodonTable
|
---|
[53] | 10 | from MultiReplace import MultiReplace
|
---|
[59] | 11 | from reading_frames import reading_frames
|
---|
[41] | 12 |
|
---|
[64] | 13 | # The mapping is kind of odd, as 'r' could mean either 'g' or 'a', but in our
|
---|
| 14 | # case we map them all to unknown
|
---|
| 15 |
|
---|
[54] | 16 | fasta_translate = {
|
---|
[64] | 17 | 'r' : 'n', # purine
|
---|
| 18 | 'y' : 'n', # pyrimide
|
---|
| 19 | 'k' : 'n', # keto
|
---|
| 20 | 'm' : 'n', # amino
|
---|
| 21 | 's' : 'n', # strong
|
---|
| 22 | 'w' : 'n', # weak
|
---|
| 23 | 'b' : 'n',
|
---|
| 24 | 'd' : 'n',
|
---|
| 25 | 'h' : 'n',
|
---|
| 26 | 'v' : 'n',
|
---|
[54] | 27 | }
|
---|
| 28 |
|
---|
| 29 | fasta = MultiReplace(fasta_translate)
|
---|
| 30 |
|
---|
[53] | 31 | def parse_file(file):
|
---|
[54] | 32 | handle = open(file, "rU")
|
---|
[53] | 33 | for seq_record in SeqIO.parse(handle, "fasta",ambiguous_dna):
|
---|
| 34 | # How to translate damm thing into plain nucleic acid codes
|
---|
| 35 | # http://en.wikipedia.org/wiki/FASTA_format
|
---|
[54] | 36 | retval = seq_record.seq.__str__()
|
---|
[53] | 37 |
|
---|
[54] | 38 | handle.close()
|
---|
| 39 | return(retval)
|
---|
| 40 |
|
---|
| 41 | def stats(seq):
|
---|
| 42 | pdict = {}
|
---|
| 43 | for n in range(1, len(seq)):
|
---|
| 44 | protein = seq[n]
|
---|
| 45 | if not pdict.has_key(protein):
|
---|
| 46 | pdict[protein] = 1
|
---|
| 47 | else:
|
---|
| 48 | pdict[protein] += 1
|
---|
[53] | 49 |
|
---|
[54] | 50 | print pdict
|
---|
[41] | 51 |
|
---|
[54] | 52 |
|
---|
| 53 | def concat(head,tail,max_length=1000):
|
---|
| 54 | "Concat two strings together removing common parts"
|
---|
| 55 | l_head = len(head)
|
---|
| 56 | l_tail = len(tail)
|
---|
| 57 |
|
---|
| 58 | # Start/Stop at the right time
|
---|
| 59 | start = 1
|
---|
| 60 | if (l_head < l_tail):
|
---|
| 61 | stop = l_head + 1
|
---|
| 62 | else:
|
---|
| 63 | stop = l_tail + 1
|
---|
| 64 |
|
---|
| 65 | # Make sure not to run for-ever
|
---|
| 66 | if (stop > max_length):
|
---|
| 67 | stop = max_length
|
---|
| 68 |
|
---|
| 69 | # Find largest common part
|
---|
| 70 | # XXX: Not very effient (on very large overlap sets
|
---|
| 71 | for i in reversed(range(start,stop)):
|
---|
| 72 | #print "tail[0:%i] '%s' == '%s'" % (i, head[-i:], tail[0:i])
|
---|
| 73 | if head[-i:] == tail[0:i]:
|
---|
| 74 | return((i,(tail[0:i]),head + tail[i:]))
|
---|
| 75 |
|
---|
| 76 | # None found return full part
|
---|
| 77 | return(-1,'',head + tail)
|
---|
| 78 |
|
---|
[58] | 79 | # File names
|
---|
| 80 | file1 = "data/AE005174v2-1.fas"
|
---|
| 81 | file2 = "data/AE005174v2-2.fas"
|
---|
[54] | 82 |
|
---|
[55] | 83 | # Get data
|
---|
[58] | 84 | seq1 = parse_file(file1)
|
---|
[63] | 85 | seq2 = parse_file(file2)
|
---|
[55] | 86 |
|
---|
[64] | 87 | # Simplify answers
|
---|
| 88 | seq1 = fasta.replace(seq1)
|
---|
| 89 | seq2 = fasta.replace(seq2)
|
---|
[55] | 90 |
|
---|
| 91 | # Find overlap
|
---|
[58] | 92 | (retval, common, result) = concat(seq2,seq1)
|
---|
[54] | 93 | print retval, common
|
---|
[55] | 94 |
|
---|
[58] | 95 | print file1
|
---|
| 96 | stats(seq1)
|
---|
[59] | 97 | reading_frames(seq1)
|
---|
| 98 | print file2
|
---|
| 99 | stats(seq2)
|
---|
| 100 | reading_frames(seq2)
|
---|
[58] | 101 |
|
---|
[59] | 102 |
|
---|
[58] | 103 | # Strictly speaking there is a gap of about 4 kbs (4000 bs) between seq1 and
|
---|
| 104 | # seq2, so lets' put that into the the statistics as well. Due to circular
|
---|
[55] | 105 | # nature, does not matter wether we add it in the beginning or in the end
|
---|
[58] | 106 | print "Total (inc 4 kbs gap (n))"
|
---|
[55] | 107 | result = result + "n" * 4000;
|
---|
[54] | 108 | stats(result)
|
---|
[59] | 109 | reading_frames(result)
|
---|
[54] | 110 |
|
---|
[55] | 111 | # Write to file for later further processing
|
---|
[63] | 112 | out = open("AE005174v2-1.raw","w")
|
---|
| 113 | out.write(seq1)
|
---|
| 114 | out.close()
|
---|
| 115 |
|
---|
| 116 | out = open("AE005174v2-2.raw","w")
|
---|
| 117 | out.write(seq2)
|
---|
| 118 | out.close()
|
---|
| 119 |
|
---|
[54] | 120 | out = open("full_contig.raw","w")
|
---|
| 121 | out.write(result)
|
---|
| 122 | out.close()
|
---|