#!/usr/bin/env perl
#
# Little algorithm to determine the Word Error Rate (WER) of the output of a
# speech recognition system.
#
# Usage: <script> <original> <new>
#   <original>  reference transcript
#   <new>       transcript produced by the recogniser
#
# Rick van der Zwet <info@rickvanderzwet.nl>
# Licence: BSDlike - http://rickvanderzwet.nl/license

use strict;
use warnings;

# While processing, look this many reference words ahead for a potential
# match. Roughly speaking this limits the number of insertions and
# substitutions that can be detected in one detection cycle to LOOKFORWARD.
our $LOOKFORWARD = 10;

# Normalise a text and split it into a list of words.
#   $text   - raw text (lines already joined with a blank)
# Returns the list of lower-cased words. The characters . : , ; [ ] and all
# CR/LF characters are removed first; empty tokens (caused by leading
# whitespace) are dropped so they are never counted as words.
sub tokenize {
    my ($text) = @_;

    # Ignore case for the time being.
    $text = lc $text;

    # Currently ignoring punctuation and newlines. The square brackets are
    # listed explicitly because the original character list stripped them too.
    $text =~ tr{[].:,;\r\n}{}d;

    return grep { length } split /[\t ]+/, $text;
}

# Read a file and return its words (see tokenize).
#   $path   - file to read
# Dies with a descriptive message when the file cannot be opened.
sub read_words {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "$0: cannot open '$path': $!\n";
    # Lines are joined with a blank so that a line break still separates
    # words after tokenize has removed the newline characters.
    my $text = join ' ', <$fh>;
    close $fh;

    return tokenize($text);
}

# Compare the recognised words with the reference words.
#   $original    - array ref, reference words
#   $new         - array ref, recognised words
#   $lookforward - optional search window, defaults to $LOOKFORWARD
# Returns a hash ref with the keys insert, delete, substitution (counts) and
# matched (array ref with the words that were found in both lists, in order).
sub count_errors {
    my ($original, $new, $lookforward) = @_;
    $lookforward = $LOOKFORWARD unless defined $lookforward;

    my $last_orig = $#{$original};
    my $last_new  = $#{$new};

    my ($insert, $delete, $substitution) = (0, 0, 0);
    my @matched;

    # Positions of the most recent match in both lists.
    my $lasthit_orig = -1;
    my $lasthit_new  = -1;

    for my $np (0 .. $last_new) {
        my $newhit_orig = -1;
        my $newhit_new  = -1;

        # Search only the reference words *after* the previous match;
        # starting at the previous match itself would allow one reference
        # word to be matched twice (and index -1 on the very first pass).
        my $end = $lasthit_orig + $lookforward;
        $end = $last_orig if $end > $last_orig;
        for my $op ($lasthit_orig + 1 .. $end) {
            next if $new->[$np] ne $original->[$op];
            $newhit_orig = $op;
            $newhit_new  = $np;
            push @matched, $new->[$np];
            last;
        }

        # When the last recognised word has no match, pretend there is a
        # match just past the end of both lists so the remaining words are
        # still accounted for.
        if ($np == $last_new and $newhit_orig == -1) {
            $newhit_orig = $last_orig + 1;
            $newhit_new  = $last_new + 1;
        }

        next if $newhit_orig < 0;

        # Number of unmatched words between the previous hit and this one
        # (exclusive on both sides).
        my $diff_orig  = $newhit_orig - $lasthit_orig - 1;
        my $diff_new   = $newhit_new - $lasthit_new - 1;
        my $diff_words = $diff_new - $diff_orig;

        # More recognised words means insertions, fewer means deletions.
        if ($diff_words > 0) {
            $insert += $diff_words;
        }
        else {
            $delete += abs $diff_words;
        }

        # The smaller gap defines the number of substitutions.
        $substitution += $diff_new < $diff_orig ? $diff_new : $diff_orig;

        $lasthit_orig = $newhit_orig;
        $lasthit_new  = $newhit_new;
    }

    # Reference words after the final match were never seen by the loop:
    # they are deletions. Nothing is added when the loop already consumed
    # the tail (the pretend match above sits past the end of the list).
    my $trailing = $last_orig - $lasthit_orig;
    $delete += $trailing if $trailing > 0;

    return {
        insert       => $insert,
        delete       => $delete,
        substitution => $substitution,
        matched      => \@matched,
    };
}

# Command line entry point.
#   @args - the command line arguments (<original> <new>)
# Returns the process exit status: 0 on success, 64 on wrong usage, 65 when
# the reference text contains no words.
sub main {
    my @args = @_;

    # Correct user if needed.
    if (@args < 2) {
        print STDERR "Usage: $0 <orignal> <new>\n";
        return 64;
    }

    my @original = read_words($args[0]);
    my @new      = read_words($args[1]);

    # The rate is relative to the number of reference words, so it is
    # undefined for an empty reference.
    if (!@original) {
        print STDERR "$0: no words found in '$args[0]', "
            . "the error rate is undefined\n";
        return 65;
    }

    my $result = count_errors(\@original, \@new);
    my $errors = $result->{insert} + $result->{delete}
        + $result->{substitution};

    print "------------------\n";
    # XXX: Some way of pretty printing the actual text hits/misses
    print "$_ " for @{ $result->{matched} };
    print "\n";
    print "------------------\n";
    print "Total orig : ", scalar(@original), "\n";
    print "Total new : ", scalar(@new), "\n";
    print "------------------\n";
    print "Insert : $result->{insert}\n";
    print "Delete : $result->{delete}\n";
    print "Subsitute : $result->{substitution}\n";
    print "------------------\n";
    printf "WeR : %.3f\n", $errors / scalar(@original);

    return 0;
}

# Run only when executed as a script, so the subs above can be loaded and
# tested without side effects.
exit main(@ARGV) unless caller;

1;