#!/usr/bin/env perl
#
# Little algorithm to determine the Word Error Rate (WER) of the output of a
# speech recognition system.
#
# Usage: <script> <original> <new>
#   <original>  text the errors are measured against
#   <new>       text that is being scored
#
# Both files are lower-cased and stripped of the punctuation characters
# [ ] . : , ; before being split into words. The script prints the words it
# managed to align, the insert/delete/substitute counts and the resulting
# WER = (inserts + deletes + substitutions) / number of original words.
#
# Exit status: 0 on success, 64 on wrong usage, 65 when <original> holds no
# words (the WER is undefined in that case).
#
# Rick van der Zwet
# Licence: BSDlike - http://rickvanderzwet.nl/license

use strict;
use warnings;

# While processing look this many words ahead for potential matches. Roughly
# spoken this number limits the ability to detect at most LOOKFORWARD
# insertions and substitutions in one detection cycle.
my $LOOKFORWARD = 10;

# Read the file at $path and return its words as a list.
# Case is ignored (everything is lower-cased) and the characters
# [ ] . : , ; as well as CR/LF are removed before splitting on whitespace.
# Dies with a message when the file cannot be opened or closed.
sub read_words {
    my ($path) = @_;

    open my $fh, '<', $path or die "Cannot open '$path': $!\n";
    my $text = lc(join(' ', <$fh>));
    close $fh or die "Cannot close '$path': $!\n";

    # Currently ignoring all punctuation and newlines. Lines were joined with
    # a space above, so dropping the line endings does not glue words together.
    $text =~ tr/[].:,;\n\r//d;

    # split ' ' skips leading whitespace, so no empty first "word" is produced.
    return split ' ', $text;
}

# Align the words of @$new against @$original and count the differences.
#
#   $original     array ref, words of the original text
#   $new          array ref, words of the text being scored
#   $lookforward  how many original words past the previous match are searched
#
# Returns the list ($insert, $delete, $substitution, \@hits), where @hits holds
# the matched words in the order they were found.
sub count_errors {
    my ($original, $new, $lookforward) = @_;

    my ($insert, $delete, $substitution) = (0, 0, 0);
    my @hits;

    # Index of the most recent match in each list; -1 means "before the start".
    my $lasthit_orig = -1;
    my $lasthit_new  = -1;

    for my $np (0 .. $#{$new}) {
        my $newhit_orig = -1;
        my $newhit_new  = -1;

        # Try to find the word among the next words of the original. The
        # search starts one past the previous match: starting on the previous
        # match itself would re-use an already aligned word, and on the first
        # pass index -1 would address the last element of the array.
        my $end = $lasthit_orig + $lookforward;
        $end = $#{$original} if $end > $#{$original};
        for my $op ($lasthit_orig + 1 .. $end) {
            next if $new->[$np] ne $original->[$op];
            $newhit_orig = $op;
            $newhit_new  = $np;
            push @hits, $new->[$np];
            last;
        }

        # Little hack to force processing of the last words: when the final
        # new word has no match, pretend there is a match just past the end
        # of both lists so everything still pending gets accounted for.
        if ($np == $#{$new} and $newhit_orig == -1) {
            $newhit_orig = $#{$original} + 1;
            $newhit_new  = $#{$new} + 1;
        }

        # No hit: keep scanning, the gap is settled at the next hit.
        next if $newhit_orig < 0;

        # Number of unmatched words on each side since (not including) the
        # last hit.
        my $diff_orig  = $newhit_orig - $lasthit_orig - 1;
        my $diff_new   = $newhit_new - $lasthit_new - 1;
        my $diff_words = $diff_new - $diff_orig;

        # More new words means inserts, fewer means deletions.
        if ($diff_words > 0) {
            $insert += $diff_words;
        }
        else {
            $delete += -$diff_words;
        }

        # The smallest of both gaps defines the number of substitutions.
        $substitution += $diff_new < $diff_orig ? $diff_new : $diff_orig;

        # Bookkeeping
        $lasthit_orig = $newhit_orig;
        $lasthit_new  = $newhit_new;
    }

    # Original words after the last match have no counterpart left in the new
    # text. Count exactly those as deletions; gaps before the last match were
    # already booked inside the loop.
    my $trailing = $#{$original} - $lasthit_orig;
    $delete += $trailing if $trailing > 0;

    return ($insert, $delete, $substitution, \@hits);
}

# Correct user if needed.
if (@ARGV < 2) {
    print STDERR "Usage: $0 <original> <new>\n";
    exit 64;
}

# Sanitize input, put into arrays.
my @original = read_words($ARGV[0]);
my @new      = read_words($ARGV[1]);

my $total_orig = scalar @original;
my $total_new  = scalar @new;

# The WER is a ratio over the number of original words; refuse to divide by 0.
if ($total_orig == 0) {
    print STDERR "$ARGV[0]: no words found, word error rate is undefined\n";
    exit 65;
}

my ($insert, $delete, $substitution, $hits) =
    count_errors(\@original, \@new, $LOOKFORWARD);

print "------------------\n";
# XXX: Some way of pretty printing the actual text hits/misses
print "$_ " for @{$hits};
print "\n";
print "------------------\n";
print "Total orig : $total_orig\n";
print "Total new : $total_new\n";
print "------------------\n";
print "Insert : $insert\n";
print "Delete : $delete\n";
print "Subsitute : $substitution\n";
print "------------------\n";
printf "WeR : %.3f\n", ($insert + $delete + $substitution) / $total_orig;

exit 0;