source: liacs/mss/assignment2/wer.pl@ 5

Last change on this file since 5 was 2, checked in by Rick van der Zwet, 15 years ago

Initial import of data of old repository ('data') worth keeping (e.g. tracking
means of URL access statistics)

  • Property svn:executable set to *
File size: 3.1 KB
RevLine 
[2]1#!/usr/bin/env perl -w
2#
3# Little algoritm to determine the Word Error Rate of speech recognition
4# systems detection.
5#
6# Rick van der Zwet <info@rickvanderzwet.nl>
7# Licence: BSDlike - http://rickvanderzwet.nl/license
8
9
10# While processing look this many words ahead for potential matches. Roughly
11# spoken this number will limit your ability to detect at maximum LOOKFORWARD
12# number of insertions and substitution in one detection cycle.
13$LOOKFORWARD = 10;
14
15# Correct user if needed.
16if ($#ARGV < 1) {
17 print STDERR "Usage: $ARGV[0] <orignal> <new>\n";
18 exit 64;
19}
20
21open ORGINAL, "<$ARGV[0]";
22open NEW, "<$ARGV[1]";
23
24# Sanitize input, put into array
25$tmp = join(" ",<ORGINAL>);
26# Ignore case for the time beeing
27$tmp = lc($tmp);
28# Currently ignoring all punctations and newlines.
29$tmp =~ tr/[\.:,;\n\r]//d;
30@original = split(/[\t\ ]+/,$tmp);
31
32# Sanitize input, put into array
33$tmp = join(" ",<NEW>);
34$tmp = lc($tmp);
35$tmp =~ tr/[\.:,;\n\r]//d;
36@new = split(/[\t\ ]+/, $tmp);
37
38close ORGINAL;
39close NEW;
40
41# Bookkeeping values
42$insert = 0;
43$delete = 0;
44$substitution = 0;
45
46print "------------------\n";
47# XXX: Some way of pretty print the actual text hits/misses
48# Walktrough list comparing matches
49$lasthit_orig = -1;
50$lasthit_new = -1;
51$newhit_orig = -1;
52$newhit_new = -1;
53foreach $np (0 .. $#new) {
54 # Try to find word hit in next words
55 $newhit_orig = -1;
56 $end = $lasthit_orig + $LOOKFORWARD;
57 $end = $#original if $end > $#original;
58 foreach $op ($lasthit_orig .. $end) {
59 if ( $new[$np] eq $original[$op] ) {
60 $newhit_orig = $op;
61 $newhit_new = $np;
62 print "$new[$np] ";
63 last;
64 }
65 }
66
67 #Little hack to force processing the last words, on end of array, if none
68 #found, deleting does not work properly, will be handled seperately
69 if ($np == $#new and $newhit_orig == -1) {
70 $newhit_orig = $#original + 1;
71 $newhit_new = $#new + 1;
72 }
73
74 # We got a hit
75 if ($newhit_orig >= 0) {
76 # Calculate given diffences till (no including!) last hit
77 $diff_orig = $newhit_orig - $lasthit_orig - 1;
78 $diff_new = $newhit_new - $lasthit_new - 1;
79 $diff_words = $diff_new - $diff_orig;
80
81 # More means inserts, less deleting
82 if ($diff_words > 0) {
83 $insert += $diff_words;
84 } else {
85 $delete += abs($diff_words);
86 }
87
88 # The smallest number defines the substitutions
89 if ($diff_new < $diff_orig) {
90 $substitution += $diff_new;
91 } else {
92 $substitution += $diff_orig;
93 }
94
95 # Bookkeeping
96 $lasthit_orig = $newhit_orig;
97 $lasthit_new = $newhit_new;
98 }
99}
100
101# Make sure to process last deletions
102if ($#new < $#original) {
103 $delete += $#original - $#new;
104}
105
106print "\n";
107print "------------------\n";
108print "Total orig : $#original\n";
109print "Total new : $#new\n";
110print "------------------\n";
111print "Insert : $insert\n";
112print "Delete : $delete\n";
113print "Subsitute : $substitution\n";
114print "------------------\n";
115printf "WeR : %.3f\n", ($insert + $delete + $substitution) / $#original, "\n";
116
117exit 0;
118
119
Note: See TracBrowser for help on using the repository browser.