1 | #!/usr/bin/env perl -w
|
---|
2 | #
|
---|
3 | # Little algoritm to determine the Word Error Rate of speech recognition
|
---|
4 | # systems detection.
|
---|
5 | #
|
---|
6 | # Rick van der Zwet <info@rickvanderzwet.nl>
|
---|
7 | # Licence: BSDlike - http://rickvanderzwet.nl/license
|
---|
8 |
|
---|
9 |
|
---|
10 | # While processing look this many words ahead for potential matches. Roughly
|
---|
11 | # spoken this number will limit your ability to detect at maximum LOOKFORWARD
|
---|
12 | # number of insertions and substitution in one detection cycle.
|
---|
13 | $LOOKFORWARD = 10;
|
---|
14 |
|
---|
15 | # Correct user if needed.
|
---|
16 | if ($#ARGV < 1) {
|
---|
17 | print STDERR "Usage: $ARGV[0] <orignal> <new>\n";
|
---|
18 | exit 64;
|
---|
19 | }
|
---|
20 |
|
---|
21 | open ORGINAL, "<$ARGV[0]";
|
---|
22 | open NEW, "<$ARGV[1]";
|
---|
23 |
|
---|
24 | # Sanitize input, put into array
|
---|
25 | $tmp = join(" ",<ORGINAL>);
|
---|
26 | # Ignore case for the time beeing
|
---|
27 | $tmp = lc($tmp);
|
---|
28 | # Currently ignoring all punctations and newlines.
|
---|
29 | $tmp =~ tr/[\.:,;\n\r]//d;
|
---|
30 | @original = split(/[\t\ ]+/,$tmp);
|
---|
31 |
|
---|
32 | # Sanitize input, put into array
|
---|
33 | $tmp = join(" ",<NEW>);
|
---|
34 | $tmp = lc($tmp);
|
---|
35 | $tmp =~ tr/[\.:,;\n\r]//d;
|
---|
36 | @new = split(/[\t\ ]+/, $tmp);
|
---|
37 |
|
---|
38 | close ORGINAL;
|
---|
39 | close NEW;
|
---|
40 |
|
---|
41 | # Bookkeeping values
|
---|
42 | $insert = 0;
|
---|
43 | $delete = 0;
|
---|
44 | $substitution = 0;
|
---|
45 |
|
---|
46 | print "------------------\n";
|
---|
47 | # XXX: Some way of pretty print the actual text hits/misses
|
---|
48 | # Walktrough list comparing matches
|
---|
49 | $lasthit_orig = -1;
|
---|
50 | $lasthit_new = -1;
|
---|
51 | $newhit_orig = -1;
|
---|
52 | $newhit_new = -1;
|
---|
53 | foreach $np (0 .. $#new) {
|
---|
54 | # Try to find word hit in next words
|
---|
55 | $newhit_orig = -1;
|
---|
56 | $end = $lasthit_orig + $LOOKFORWARD;
|
---|
57 | $end = $#original if $end > $#original;
|
---|
58 | foreach $op ($lasthit_orig .. $end) {
|
---|
59 | if ( $new[$np] eq $original[$op] ) {
|
---|
60 | $newhit_orig = $op;
|
---|
61 | $newhit_new = $np;
|
---|
62 | print "$new[$np] ";
|
---|
63 | last;
|
---|
64 | }
|
---|
65 | }
|
---|
66 |
|
---|
67 | #Little hack to force processing the last words, on end of array, if none
|
---|
68 | #found, deleting does not work properly, will be handled seperately
|
---|
69 | if ($np == $#new and $newhit_orig == -1) {
|
---|
70 | $newhit_orig = $#original + 1;
|
---|
71 | $newhit_new = $#new + 1;
|
---|
72 | }
|
---|
73 |
|
---|
74 | # We got a hit
|
---|
75 | if ($newhit_orig >= 0) {
|
---|
76 | # Calculate given diffences till (no including!) last hit
|
---|
77 | $diff_orig = $newhit_orig - $lasthit_orig - 1;
|
---|
78 | $diff_new = $newhit_new - $lasthit_new - 1;
|
---|
79 | $diff_words = $diff_new - $diff_orig;
|
---|
80 |
|
---|
81 | # More means inserts, less deleting
|
---|
82 | if ($diff_words > 0) {
|
---|
83 | $insert += $diff_words;
|
---|
84 | } else {
|
---|
85 | $delete += abs($diff_words);
|
---|
86 | }
|
---|
87 |
|
---|
88 | # The smallest number defines the substitutions
|
---|
89 | if ($diff_new < $diff_orig) {
|
---|
90 | $substitution += $diff_new;
|
---|
91 | } else {
|
---|
92 | $substitution += $diff_orig;
|
---|
93 | }
|
---|
94 |
|
---|
95 | # Bookkeeping
|
---|
96 | $lasthit_orig = $newhit_orig;
|
---|
97 | $lasthit_new = $newhit_new;
|
---|
98 | }
|
---|
99 | }
|
---|
100 |
|
---|
101 | # Make sure to process last deletions
|
---|
102 | if ($#new < $#original) {
|
---|
103 | $delete += $#original - $#new;
|
---|
104 | }
|
---|
105 |
|
---|
106 | print "\n";
|
---|
107 | print "------------------\n";
|
---|
108 | print "Total orig : $#original\n";
|
---|
109 | print "Total new : $#new\n";
|
---|
110 | print "------------------\n";
|
---|
111 | print "Insert : $insert\n";
|
---|
112 | print "Delete : $delete\n";
|
---|
113 | print "Subsitute : $substitution\n";
|
---|
114 | print "------------------\n";
|
---|
115 | printf "WeR : %.3f\n", ($insert + $delete + $substitution) / $#original, "\n";
|
---|
116 |
|
---|
117 | exit 0;
|
---|
118 |
|
---|
119 |
|
---|