#!/usr/bin/perl

# Program: validate.pl
# Written by: dave graff
# Purpose: scan both source-text and human-translation-text
# files to validate the latter against the former.

# For the MT data creation project, a set of text files in a given
# language (Chinese, Arabic, ...) is selected and formatted for
# presentation to several selected translation services. Each service
# is expected to return English translations of the complete text set,
# preserving the structural divisions of text (headlines, paragraphs
# and "segments") that were explicitly marked in the source-language
# text files.

# This program assumes that the source-language texts are in a
# directory called "source" and the translations are in a directory
# called "translation", which has separate subdirectories for each
# source of translation data (each translation "system"). The
# "source" directory has subdirectories to partition the data files,
# typically by data provider; within these partition directories, each
# individual data file contains one story (DOC unit). Each of the
# "system" subdirectories should reflect the contents of "source"
# exactly, the only difference being the content of the data files.

# NOTE(review): this copy of the script looks damaged.  Text appears to be
# missing wherever the original contained a "<" followed by a word and a
# later ">" (for example markup tags inside patterns and shell commands).
# Each visible gap is marked below; the code between the gaps is reproduced
# exactly as found and will not compile until the missing text is restored
# from a pristine copy.

# Stop immediately unless both expected directories exist in the current
# directory; $ENV{PWD} only decorates the error message.
die "Can't find 'source/' and 'translation/' in CWD ($ENV{PWD})\n"
    unless ( -d "source" && -d "translation" );

# make sure there is an up-to-date file list, showing number of
# segments in each file, for both source and translation sets:

# Remove a cached source list if find reports any file under source/ that
# is newer than the list itself.
if ( -e "filelist.source" ) {
    @newer = `find source -type f -newer filelist.source`;
    unlink "filelist.source" if ( @newer );
}

# If the cached list survived the check above, load it.  Each line is split
# on whitespace into a file name and a segment count, and the count is
# stored in $ref{$file}{NSEG}.
if ( -e "filelist.source" ) {
    @srcs = `cat filelist.source`;
    map {
        ($file,$scnt) = split;
        $ref{$file}{NSEG} = $scnt;
        # NOTE(review): GAP.  The condition on the next line is cut off after
        # "$nlen"; presumably it compared $nlen with length($file) to track
        # the longest file name -- confirm against the original.  The rest of
        # this map block, everything that followed it in the main program,
        # and the opening of the next subroutine are not present; "$: );"
        # looks like the tail of some later statement.
        $nlen = length($file) if ($nlen$: );

    # NOTE(review): the statements below appear to be the tail of a
    # subroutine whose header is missing.  Only what is visible is described:
    # for each index from 1 up to (but not including) $#line, the element of
    # @line is examined and per-segment figures are stored through $hash.
    for ( $i=1; $i<$#line; $i++ ) {
        # Accept an element that, after optional leading whitespace, starts
        # with a non-space character followed by characters other than "<",
        # with only whitespace after it.
        if ( $line[$i] =~ m%^\s*(\S[^<]+)\s*$% ) {
            $txt = $1;
            $txt =~ s/\s*$//;      # drop trailing whitespace kept by the capture
            $txt =~ s/\s+/ /g;     # collapse whitespace runs to single spaces
            # split in scalar context yields the number of fields, i.e. the
            # count of space-separated tokens.
            $hash->{TKNS}[$i] = scalar( split( / /, $txt ));
            $hash->{SLEN}[$i] = length( $txt );   # length of the normalised text
        } else {
            # NOTE(review): as it stands this pattern matches every string,
            # so $err is always "no text"; the pattern has presumably lost
            # its tail (see the damage note at the top) -- confirm.
            $err = ( $line[$i] =~ m%^\s*% ) ? "no text" : "bad format";
            warn "$path: $err at segment $i\n";
            # Record zero tokens and zero length for an unusable segment.
            $hash->{TKNS}[$i] = $hash->{SLEN}[$i] = 0;
        }
    }
}   # presumably closes the subroutine whose opening is missing
}

sub mkSrcInfo  # generate reference list for source files (filename, number of segments)
{
    # List every regular file under source/ (following symlinks) and open
    # filelist.source for writing.
    # NOTE(review): the result of open() is not checked.
    @srcfiles = `find source -follow -type f`;
    open( L, ">filelist.source" );
    foreach ( sort @srcfiles ) {
        chop;   # removes the last character of $_ (the newline from find's output)
        # Split the path on "/" and keep the first two components.
        # NOTE(review): with the layout described in the header
        # (source/<partition>/<file>) the second component would be the
        # partition directory, not the file -- confirm this is intended.
        ( $s, $fid ) = split( /\// );
        # NOTE(review): GAP.  The grep command on the next line is cut off
        # after its opening quote.  The remainder of this subroutine and the
        # start of what looks like a second, parallel routine that writes
        # "filelist.translation" are missing; only the end of that routine's
        # open() call survives on the same line.
        $ns = `grep -c 'filelist.translation" );

    # NOTE(review): from here on the code presumably belongs to that second
    # routine: the same loop shape, but each path is split into three
    # components ($t, $sysid, $fid).
    foreach ( sort @srcfiles ) {
        chop;   # removes the last character of $_
        ( $t, $sysid, $fid ) = split( /\// );
        # NOTE(review): GAP.  The file ends inside this command; the rest of
        # the loop and the closing of the routine are missing.
        $ns = `grep -c '