From e3b806fa3c2a5bbf2580030131fb79a05c3f13ab Mon Sep 17 00:00:00 2001
From: Pawel Orlowicz <porlowicz@gmail.com>
Date: Fri, 30 Mar 2012 12:30:39 +0200
Subject: [PATCH] Two scripts for corpus manipulation: split.pl - separate one
 ccl+rel file into files ccl and rel get_held_out.pl - divide corpus of ccl
 and rel files into two sets in given proportion

---
 corpus2tools/get_held_out.pl | 104 +++++++++++++++++++++++++++++++++++
 corpus2tools/split.pl        |  62 +++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100755 corpus2tools/get_held_out.pl
 create mode 100755 corpus2tools/split.pl

diff --git a/corpus2tools/get_held_out.pl b/corpus2tools/get_held_out.pl
new file mode 100755
index 0000000..62b7baf
--- /dev/null
+++ b/corpus2tools/get_held_out.pl
@@ -0,0 +1,104 @@
+#!/usr/bin/perl
+
+#Usage: ./get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]
+#
+#Assume that SOURCE_DIR contains pairs of files ccl-NAME.xml.* + rel-NAME.xml
+#(1 document == 1 pair). Copy randomly chosen documents into HELD_OUT_PART_DIR
+#and all the rest into REST_PART_DIR.
+#The number of documents to hold out is given either by the option
+#-n NUMBER_OF_DOCUMENTS_TO_HOLD_OUT or by RATIO, the fraction (between 0 and 1)
+#of all documents, rounded up. When both are given, -n wins.
+#Files without their ccl/rel counterpart are reported and skipped.
+
+use strict;
+use warnings;
+
+use Getopt::Std;
+use File::Find;
+use File::Basename;
+use File::Copy;
+use Cwd 'abs_path';
+use List::Util 'shuffle';
+
+my %opt;
+getopt('n', \%opt);
+
+my ($source_dir, $target_dir, $rest_dir, $ratio) = @ARGV;
+my $count = $opt{n};
+my $has_count = (defined $count and $count ne "");
+#Without RATIO (and without -n) nothing is held out.
+$ratio = 0 unless defined $ratio and $ratio ne "";
+
+#Print the given message to STDERR and stop with a non-zero exit status.
+sub usage{
+    print STDERR @_;
+    exit 1;
+}
+
+#Validate the command line before any file is touched.
+#RATIO only matters (and is only checked) when -n is not given.
+if( !$has_count and ($ratio !~ /^(?:\d+\.?\d*|\.\d+)\z/ or $ratio > 1)){
+    usage("USAGE: Ratio denotes fraction of documents for held_out.It is the value between 0 and 1.\nYou can also use option -n to determine the exact number of documents to hold out.\n");
+}
+if( @ARGV < 3 or not -d $source_dir or not -d $target_dir or not -d $rest_dir){
+    usage("USAGE: ./get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]\n");
+}
+if( $has_count and $count !~ /^\d+\z/){
+    die "ERROR: Option -n expects a non-negative integer.\n";
+}
+
+#Index the corpus: common part of the filename (NAME) => full path of the file.
+my $source_path = abs_path($source_dir);
+my (%ccl_of, %rel_of);
+opendir(my $dir, $source_path) or die "ERROR: Cannot read $source_path: $!\n";
+while( defined(my $name = readdir($dir))){
+    my $path = "$source_path/$name";
+    next unless -f $path;
+    if($name =~ /^ccl-(.+)\.xml.+/){
+        $ccl_of{$1} = $path;
+    }elsif($name =~ /^rel-(.+)\.xml\z/){
+        $rel_of{$1} = $path;
+    }
+}
+closedir($dir);
+
+#A document is a NAME that has both files. Sorting makes the list independent
+#of the order in which readdir returned the files.
+my @documents = grep { exists $rel_of{$_} } sort keys %ccl_of;
+for my $name (sort keys %ccl_of){
+    warn "WARNING: No rel file for $ccl_of{$name}, skipping.\n" unless exists $rel_of{$name};
+}
+for my $name (sort keys %rel_of){
+    warn "WARNING: No ccl file for $rel_of{$name}, skipping.\n" unless exists $ccl_of{$name};
+}
+
+if($has_count){
+    if($count > scalar(@documents)){
+        die "ERROR: You want to hold out more files than are available.\n";
+    }
+}else{
+    #A fractional number of documents is rounded up.
+    my $exact = scalar(@documents) * $ratio;
+    $count = int($exact);
+    $count++ if $count < $exact;
+}
+
+#Copy both files of the document NAME into the directory DIR, keeping the file
+#names; die on the first failure so that no half-copied pair goes unnoticed.
+sub copy_document{
+    my ($name, $dir) = @_;
+    for my $from ($ccl_of{$name}, $rel_of{$name}){
+        my $to = "$dir/".basename($from);
+        copy($from, $to) or die "ERROR: Cannot copy $from to $to: $!\n";
+    }
+}
+
+#A shuffled list gives a random selection without repetition: the first
+#$count documents are held out, all the others form the rest.
+my @shuffled = shuffle(@documents);
+for my $name (@shuffled[0 .. $count - 1]){
+    copy_document($name, $target_dir);
+}
+for my $name (@shuffled[$count .. $#shuffled]){
+    copy_document($name, $rest_dir);
+}
diff --git a/corpus2tools/split.pl b/corpus2tools/split.pl
new file mode 100755
index 0000000..7b9dd87
--- /dev/null
+++ b/corpus2tools/split.pl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+
+#Process all xml files in SOURCE_DIR assuming that it contains files which consist of ccl+rel part.
+#Separate each file into ccl-file and rel-file and place them into TARGET_DIR
+#(don't empty the TARGET_DIR, just add new files; it's up to the user to empty the TARGET_DIR)
+
+use strict;
+use warnings;
+use File::Find;
+use Cwd 'abs_path';
+
+#Absolute path of TARGET_DIR; read by wanted(), which File::Find calls without arguments.
+my $targetdir;
+
+if( @ARGV >= 2 and -d $ARGV[0] and -d $ARGV[1]){
+    $targetdir = abs_path($ARGV[1]);
+    find(\&wanted, abs_path($ARGV[0]));
+}else{
+    print STDERR "Usage: ./split.pl SOURCE_DIR TARGET_DIR\n";
+    exit 1;
+}
+
+#File::Find callback: split the file named by $_ (relative to the directory find
+#has changed into) into TARGET_DIR/ccl-NAME and TARGET_DIR/rel-NAME.
+#Everything before the line with <relations> goes to the ccl part; from that line on
+#everything goes to the rel part, except the closing </chunkList> line, which follows
+#the relations in the source file but belongs to the ccl part.
+sub wanted{
+    my $filename = $_;
+    #Only regular files can be split; a directory name may match the pattern as well.
+    return unless $filename =~ /xml/ and -f $filename;
+
+    open(my $in, '<', $filename) or die $!."file:".$File::Find::name;
+    my (@cclPart, @relPart);
+    my $isCclPart = 1;
+    while(my $line = <$in>){
+        if($isCclPart){
+            if($line =~ /<relations>/){
+                push(@relPart, $line);
+                $isCclPart = 0;
+            }else{
+                push(@cclPart, $line);
+            }
+        }elsif($line =~ /<\/chunkList>/){
+            push(@cclPart, $line);
+        }else{
+            push(@relPart, $line);
+        }
+    }
+    close($in);
+
+    write_lines("$targetdir/ccl-".$filename, \@cclPart);
+    write_lines("$targetdir/rel-".$filename, \@relPart);
+}
+
+#Write the lines into the file at $path as a list, so that nothing is inserted between them.
+sub write_lines{
+    my ($path, $lines) = @_;
+    open(my $out, '>', $path) or die $!."dir:".$path;
+    print {$out} @$lines;
+    close($out) or die $!."dir:".$path;
+}
-- 
GitLab