source: liacs/mms/assignment2/wer.pl@ 352

Last change on this file since 352 was 2, checked in by Rick van der Zwet, 15 years ago

Initial import of data of old repository ('data') worth keeping (e.g. tracking
means of URL access statistics)

  • Property svn:executable set to *
File size: 3.1 KB
Line 
1#!/usr/bin/env perl -w
2#
3# Little algoritm to determine the Word Error Rate of speech recognition
4# systems detection.
5#
6# Rick van der Zwet <info@rickvanderzwet.nl>
7# Licence: BSDlike - http://rickvanderzwet.nl/license
8
9
10# While processing look this many words ahead for potential matches. Roughly
11# spoken this number will limit your ability to detect at maximum LOOKFORWARD
12# number of insertions and substitution in one detection cycle.
13$LOOKFORWARD = 10;
14
15# Correct user if needed.
16if ($#ARGV < 1) {
17 print STDERR "Usage: $ARGV[0] <orignal> <new>\n";
18 exit 64;
19}
20
21open ORGINAL, "<$ARGV[0]";
22open NEW, "<$ARGV[1]";
23
24# Sanitize input, put into array
25$tmp = join(" ",<ORGINAL>);
26# Ignore case for the time beeing
27$tmp = lc($tmp);
28# Currently ignoring all punctations and newlines.
29$tmp =~ tr/[\.:,;\n\r]//d;
30@original = split(/[\t\ ]+/,$tmp);
31
32# Sanitize input, put into array
33$tmp = join(" ",<NEW>);
34$tmp = lc($tmp);
35$tmp =~ tr/[\.:,;\n\r]//d;
36@new = split(/[\t\ ]+/, $tmp);
37
38close ORGINAL;
39close NEW;
40
41# Bookkeeping values
42$insert = 0;
43$delete = 0;
44$substitution = 0;
45
46print "------------------\n";
47# XXX: Some way of pretty print the actual text hits/misses
48# Walktrough list comparing matches
49$lasthit_orig = -1;
50$lasthit_new = -1;
51$newhit_orig = -1;
52$newhit_new = -1;
53foreach $np (0 .. $#new) {
54 # Try to find word hit in next words
55 $newhit_orig = -1;
56 $end = $lasthit_orig + $LOOKFORWARD;
57 $end = $#original if $end > $#original;
58 foreach $op ($lasthit_orig .. $end) {
59 if ( $new[$np] eq $original[$op] ) {
60 $newhit_orig = $op;
61 $newhit_new = $np;
62 print "$new[$np] ";
63 last;
64 }
65 }
66
67 #Little hack to force processing the last words, on end of array, if none
68 #found, deleting does not work properly, will be handled seperately
69 if ($np == $#new and $newhit_orig == -1) {
70 $newhit_orig = $#original + 1;
71 $newhit_new = $#new + 1;
72 }
73
74 # We got a hit
75 if ($newhit_orig >= 0) {
76 # Calculate given diffences till (no including!) last hit
77 $diff_orig = $newhit_orig - $lasthit_orig - 1;
78 $diff_new = $newhit_new - $lasthit_new - 1;
79 $diff_words = $diff_new - $diff_orig;
80
81 # More means inserts, less deleting
82 if ($diff_words > 0) {
83 $insert += $diff_words;
84 } else {
85 $delete += abs($diff_words);
86 }
87
88 # The smallest number defines the substitutions
89 if ($diff_new < $diff_orig) {
90 $substitution += $diff_new;
91 } else {
92 $substitution += $diff_orig;
93 }
94
95 # Bookkeeping
96 $lasthit_orig = $newhit_orig;
97 $lasthit_new = $newhit_new;
98 }
99}
100
101# Make sure to process last deletions
102if ($#new < $#original) {
103 $delete += $#original - $#new;
104}
105
106print "\n";
107print "------------------\n";
108print "Total orig : $#original\n";
109print "Total new : $#new\n";
110print "------------------\n";
111print "Insert : $insert\n";
112print "Delete : $delete\n";
113print "Subsitute : $substitution\n";
114print "------------------\n";
115printf "WeR : %.3f\n", ($insert + $delete + $substitution) / $#original, "\n";
116
117exit 0;
118
119
Note: See TracBrowser for help on using the repository browser.