Skip to content
Snippets Groups Projects
Commit e3b806fa authored by Pawel Orlowicz's avatar Pawel Orlowicz
Browse files

Two scripts for corpus manipulation: split.pl - separate one ccl+rel file into files ccl and rel

get_held_out.pl - divide corpus of ccl and rel files into two sets in given proportion
parent f40942c1
Branches
No related merge requests found
#!/usr/bin/perl
#Assume that SOURCE_DIR contains pairs of files: ccl-NAME.xml.* + rel-NAME.xml
#Depending on the measure given (option -n NUMBER_OF_DOCUMENTS_TO_HOLD_OUT, or RATIO, the fraction between 0 and 1 of all documents to be held out),
#copy the appropriate number of documents (1 document == 1 pair: ccl-NAME.xml.* + rel-NAME.xml) into HELD_OUT_PART_DIR
#and all the rest into REST_PART_DIR
use Getopt::Std;
use File::Find;
use File::Basename;
use File::Copy;
use Cwd 'abs_path';
# Split a corpus of (ccl-NAME.xml.*, rel-NAME.xml) pairs into a held-out set and a rest set.
#
# Command line: [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]
#   -n N    exact number of documents (pairs) to hold out; takes priority over RATIO
#   RATIO   fraction (0..1) of the documents to hold out; treated as 0 when omitted
#
# Files are copied, never moved, so SOURCE_DIR is left untouched.
# Dies when the source directory cannot be read, when -n is invalid or larger than
# the number of available documents, and when any copy fails.
{
    # Strictures are confined to this bare block so they only govern this script's code.
    use strict;
    use warnings;
    use List::Util qw(shuffle);
    use Scalar::Util qw(looks_like_number);

    my %opts;
    getopt('n', \%opts);    # -n takes a value
    my ($source_dir, $target_dir, $rest_dir, $ratio) = @ARGV;

    my $requested = $opts{n};
    my $has_count = (defined $requested and $requested ne "");
    if ($has_count and $requested !~ /\A\d+\z/) {
        die "ERROR: Option -n expects a non-negative integer.\n";
    }

    # A missing RATIO keeps its historical meaning of "hold out nothing".
    $ratio = 0 unless defined $ratio and $ratio ne "";
    my $ratio_ok = (looks_like_number($ratio) and $ratio >= 0 and $ratio <= 1);

    # RATIO only matters when no explicit document count was requested.
    if (!$has_count and !$ratio_ok) {
        print "USAGE: Ratio denotes fraction of documents for held_out.It is the value between 0 and 1.\nYou can also use option -n to determine the exact number of documents to hold out.";
    }
    elsif (@ARGV >= 3 and -d $source_dir and -d $target_dir and -d $rest_dir) {
        opendir(my $dh, $source_dir)
            or die "ERROR: Cannot open directory $source_dir: $!\n";

        # Common NAME part as key, full file name (with absolute path) as value.
        my (%ccl_for, %rel_for);
        while (defined(my $entry = readdir($dh))) {
            my $path = abs_path("$source_dir/$entry");
            next unless defined $path;    # e.g. a dangling symlink

            # Anchor on the last path component so that a parent directory
            # called "ccl-..." or "rel-..." can never be mistaken for a document.
            if ($path =~ m{\A(.*/ccl-([^/]+)\.xml[^/]+)\z}s) {
                $ccl_for{$2} = $1;
            }
            elsif ($path =~ m{\A(.*/rel-([^/]+)\.xml)\z}s) {
                $rel_for{$2} = $1;
            }
        }
        closedir($dh);

        # A document is a complete pair; report and skip anything unpaired.
        for my $name (sort keys %ccl_for) {
            warn "WARNING: No rel file for $ccl_for{$name}, skipping.\n"
                unless exists $rel_for{$name};
        }
        for my $name (sort keys %rel_for) {
            warn "WARNING: No ccl file for $rel_for{$name}, skipping.\n"
                unless exists $ccl_for{$name};
        }
        my @documents = grep { exists $rel_for{$_} } sort keys %ccl_for;
        my $total     = scalar @documents;

        my $hold_count;
        if ($has_count) {
            if ($requested > $total) {
                die "ERROR: You want to hold out more files than are available.\n";
            }
            $hold_count = $requested;
        }
        else {
            # Round a fractional share upwards (e.g. 2.3 documents -> 3).
            my $share = $total * $ratio;
            $hold_count = int($share);
            $hold_count++ if $hold_count < $share;
        }

        # Copy both files of one document into the given directory.
        my $copy_pair = sub {
            my ($name, $dest_dir) = @_;
            for my $from ($ccl_for{$name}, $rel_for{$name}) {
                my $to = "$dest_dir/" . basename($from);
                copy($from, $to)
                    or die "ERROR: Cannot copy $from to $to: $!\n";
            }
            return;
        };

        # A shuffled order gives a uniformly random held-out selection and
        # always terminates, unlike retrying random indices until one is free.
        my @order = shuffle(@documents);
        my @held  = splice(@order, 0, $hold_count);

        $copy_pair->($_, $target_dir) for @held;
        $copy_pair->($_, $rest_dir)   for @order;
    }
    else {
        print "USAGE: ./get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]\n";
    }
}
#!/usr/bin/perl
#Process all xml files in SOURCE_DIR assuming that it contains files which consist of ccl+rel part.
#Separate each file into ccl-file and rel-file and place them into TARGET_DIR
#(don't empty the TARGET_DIR, just add new files; it's up to the user to empty the TARGET_DIR)
use File::Find;
use Cwd 'abs_path';
use strict;
use warnings;

# Absolute path of the output directory; assigned below before find() runs
# and read by wanted().
my $targetdir;

# File::Find callback: split one combined ccl+rel file into two files,
# "$targetdir/ccl-NAME" and "$targetdir/rel-NAME".
#
# File::Find sets $_ to the entry's name within the directory it has changed into.
# Every line up to (not including) the first line containing <relations> goes to the
# ccl file; from there on lines go to the rel file, except lines containing
# </chunkList>, which are sent to the ccl file.
# Dies when the input cannot be read or an output file cannot be written.
# NOTE: inputs with the same name in different subdirectories overwrite each other.
sub wanted {
    my $filename = $_;

    # Same selection as before (any name containing "xml"), but only regular
    # files: a directory that happens to match must not produce empty outputs.
    return unless $filename =~ /xml/;
    return unless -f $filename;

    my $ccl_filename = "$targetdir/ccl-" . $filename;
    my $rel_filename = "$targetdir/rel-" . $filename;

    open(my $in, '<', $filename)
        or die "Cannot read $File::Find::name: $!\n";

    my (@ccl_part, @rel_part);
    my $in_ccl_part = 1;
    # A named line variable keeps File::Find's $_ intact.
    while (defined(my $line = <$in>)) {
        if ($in_ccl_part) {
            if ($line =~ /<relations>/) {
                push(@rel_part, $line);
                $in_ccl_part = 0;
            }
            else {
                push(@ccl_part, $line);
            }
        }
        elsif ($line =~ m{</chunkList>}) {
            push(@ccl_part, $line);
        }
        else {
            push(@rel_part, $line);
        }
    }
    close($in);

    open(my $ccl_out, '>', $ccl_filename)
        or die "Cannot write $ccl_filename: $!\n";
    open(my $rel_out, '>', $rel_filename)
        or die "Cannot write $rel_filename: $!\n";

    # Print the lists themselves: interpolating an array into a string joins
    # its elements with a space, which would prefix every line after the first.
    print {$ccl_out} @ccl_part;
    print {$rel_out} @rel_part;

    # Buffered write errors only surface on close.
    close($ccl_out) or die "Cannot write $ccl_filename: $!\n";
    close($rel_out) or die "Cannot write $rel_filename: $!\n";
    return;
}

if (@ARGV >= 2 and -d $ARGV[0] and -d $ARGV[1]) {
    my $srcdir = abs_path($ARGV[0]);
    $targetdir = abs_path($ARGV[1]);
    find(\&wanted, $srcdir);
}
else {
    print "Usage: ./split.pl SOURCE_DIR TARGET_DIR\n";
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment