#!/usr/bin/perl
##
## Usage:   wsdOut.pl <in_dir> <out_dir> <scoring_table_file>
##
#---------------------------------------------------------------------------------------#
##
## INPUT ARGUMENTS:
##
##   <in_dir>  - has to contain the directories which contain the files
##               to be disambiguated, in "nam+id" format.
##               OBLIGATORY TO FOLLOW THE ORIGINAL DIRECTORY STRUCTURE AND FILE FORMAT
##
##   <out_dir> - folder for the output of the disambiguated files.
##               THE OUTPUT WILL FOLLOW THE ORIGINAL DIRECTORY STRUCTURE
##
##   <scoring_table_file> - a file where instances are disambiguated in
##               Senseval Scorer Format:
##
##                   word.pos occurrenceID (synset score)+
##               -------------------------------------------------
##
##   EXAMPLE:
##
##     village.n village.n.5-71-143.12 06107243-n 3.7281746031746
##     village.n village.n.5-71-279.21 06630722-n 1 06107243-n 2.7281746031746
##     ...
##     leader.n leader.n.9-134-37.4 06950891-n 3.7281746031746
##     leader.n leader.n.9-135-78.4 06950891-n 3.7281746031746
##     ...
##     lean.v lean.v.1-24-121.11 01106486-v 0.625 01392754-v 3.1031746031746
##     lean.v lean.v.15-91-235.20 01859256-v 0.714285714285714 00464659-v 0.555555555555556 01392754-v 2.45833333333333
##
#---------------------------------------------------------------------------------------#

use strict;
use warnings;

## Normalized scoring table, filled by main() and read by print_wsd_file():
##   $id2scores{instance_id}{synset} = normalized score
our %id2scores;

## Run the driver only when executed as a script, so the subs below can be
## loaded (e.g. from a test) without touching @ARGV or the file system.
main() unless caller;

#######
## Driver: loads the scoring table, mirrors the directory layout of <in_dir>
## under <out_dir> (adding a "cache_en" folder per sub-directory) and writes
## one ".wsd" file per ".nam+id" input file.
#
sub main {
    die "Usage: wsdOut.pl <in_dir> <out_dir> <scoring_table_file>"
        if not defined $ARGV[2];

    my ($in_dir, $out_dir, $sc_file) = @ARGV[0 .. 2];

    print "\ngetting scoring table...\n";
    %id2scores = get_scoring_table($sc_file);
    print " ... OK\n\n";

    ## Make directory for the output path
    if (not -e $out_dir) {
        mkdir($out_dir, 0770) or die "cannot create directory $out_dir: $!";
    }

    opendir(my $top_dh, $in_dir) or die "cannot open DIR: $in_dir";
    my @dirs = sort readdir($top_dh);
    closedir($top_dh);

    foreach my $dir (@dirs) {
        ## Skip the "." and ".." entries and anything that is not a directory
        ## (a stray plain file would otherwise abort the run at opendir).
        next if $dir eq '.' or $dir eq '..';
        next if not -d "$in_dir/$dir";

        my $sub_out   = "$out_dir/$dir";
        my $cache_out = "$sub_out/cache_en";
        foreach my $path ($sub_out, $cache_out) {
            next if -e $path;
            mkdir($path, 0770) or die "cannot create directory $path: $!";
        }

        ## Read all input files. Only "nam+id" files
        opendir(my $dh, "$in_dir/$dir")
            or die "cannot open DIR(2): $in_dir/$dir\n";
        my @files = sort grep { /\.nam\+id$/ } readdir($dh);
        closedir($dh);

        ## Print each file in the directory
        print "Printing files in:\t$cache_out ...\n";
        foreach my $file (@files) {
            (my $base = $file) =~ s/\.nam\+id$//;    ## remove extension (nam+id)
            print_wsd_file("$in_dir/$dir", $base, $cache_out);
        }
    }
    print " ... OK.\n";
    return;
}

#---------------------------------------------------------------------------------------#

#######
## IN:  path of the file with all the disambiguated instances in Senseval
##      Scorer Format (word.pos occurrenceID followed by synset/score pairs).
## OUT: normalized scores as a hash of hashes:
##          $hash{instance_id}{synset} => score / (sum of the scores on the line)
##
## The instance id is the "dN.sN.tN" code when the line contains one
## (senseval2/3), otherwise the third dot-separated field of occurrenceID
## (clef).  Blank lines are ignored; lines without a usable id are skipped
## with a warning.  When the scores on a line sum to 0 every synset gets 0
## instead of dying on a division by zero.  Dies if the file cannot be opened.
#
sub get_scoring_table {
    my ($table_f) = @_;
    my %hscores;

    open(my $in, '<', $table_f)
        or die "cannot open FILE (scores file): $table_f\n";

    LINE:
    while (my $line = <$in>) {
        chomp $line;
        next LINE if $line !~ /\S/;

        my (undef, $ex_c, @score) = split ' ', $line;

        my $id;
        if ($line =~ /(d\d+\.s\d+\.t\d+)/) {
            # senseval2/3 example code
            $id = $1;
        }
        elsif (defined $ex_c) {
            # clef example code: lemma.pos.id.s
            (undef, undef, $id) = split /\./, $ex_c;
        }
        if (not defined $id) {
            warn "skipping line $. of $table_f: no instance id found\n";
            next LINE;
        }

        ## NORMALIZE THE SCORES:
        # @score alternates synset, score; a trailing unpaired item is ignored.
        my @pair_start = grep { $_ % 2 == 0 } 0 .. $#score - 1;

        # Compute the total sum of the scores
        my $total = 0;
        $total += $score[ $_ + 1 ] for @pair_start;

        # Normalize the scores
        my %norm;
        foreach my $i (@pair_start) {
            $norm{ $score[$i] } = $total ? $score[ $i + 1 ] / $total : 0;
        }
        $hscores{$id} = \%norm;
    }
    close($in);

    return %hscores;
}

#######
## IN:  directory of the original "nam+id" file, its base name (without the
##      ".nam+id" extension) and the output directory.
## OUT: writes "<out_dir>/<file>.wsd".
##
## The first input line must contain an XML declaration and is copied as is.
## Every following line is escaped (bare "&" => "&amp;"), scanned chunk by
## chunk, and each chunk ending in "</WF></WRD>" gets sense markup inserted
## before "</WRD>": one entry per synset found in %id2scores for the word's
## id (highest score first), or NO-SENSE when the id is unknown.
## Dies if a file cannot be opened, the declaration is missing, or the
## output cannot be flushed.
#
sub print_wsd_file {
    my ($target_dir, $file, $out_dir) = @_;

    my $in_path  = "$target_dir/$file.nam+id";
    my $out_path = "$out_dir/$file.wsd";

    open(my $in, '<', $in_path)
        or die "Cannot open file: $in_path\n";
    open(my $out, '>', $out_path)
        or die "cannot open out_file: $out_path\n";

    ## Print the first encoding line.
    ## NOTE(review): the text between "\t" and "\n" in the error message is
    ## empty; presumably it once showed the expected declaration - confirm.
    my $line = <$in>;
    if (defined $line and $line =~ m/<\?xml.*\?>/) {
        print {$out} $line;
    }
    else {
        die "ERROR: expecting xml encoding:\n\t\n File: $in_path\n";
    }

    while ($line = <$in>) {
        chomp $line;

        ### convert entity: & => &amp;
        ### Only ampersands that do not already start an entity or character
        ### reference are escaped, so existing "&amp;" / "&lt;" stay intact.
        $line =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;

        ## NOTE(review): the patterns below are kept exactly as found.  They
        ## look as if opening-tag parts (and the capture group that should
        ## feed the word id) were lost; as written there is no capture group
        ## in the second pattern, so $id is always undefined and every word
        ## is tagged NO-SENSE.  The inserted markup likewise has a closing
        ## </SYNSET> without a visible opening tag.  Restore from the
        ## upstream version before relying on the output.
        ## Text matching none of the alternatives is skipped by the scan.
        my $out_line = "";
        while ($line =~ m/(^[^<]+|[^<]*<\/WF><\/WRD>|[^<]*<\/PHR>|[^<]*$|<[^>]+>)/g) {
            my $chunk = $1;

            if ($chunk =~ m/[^<]*<\/WF><\/WRD>/) {
                my $id = $1;

                if (not defined $id or not defined $id2scores{$id}) {
                    $chunk =~ s/<\/WF><\/WRD>/<\/WF>NO-SENSE<\/SYNSET><\/WRD>/;
                }
                else {
                    # Highest score first, ties broken by synset, so that
                    # repeated runs produce identical files.
                    my $score_of = $id2scores{$id};
                    my @senses   = sort {
                        $score_of->{$b} <=> $score_of->{$a} or $a cmp $b
                    } keys %{$score_of};

                    my $new = join "", map { "$_</SYNSET>" } @senses;
                    $chunk =~ s/<\/WF><\/WRD>/<\/WF>$new<\/WRD>/;
                }
            }
            $out_line .= $chunk;
        }
        print {$out} "$out_line\n";
    }

    close($in);
    close($out) or die "cannot close out_file: $out_path: $!\n";
    return;
}