From e3b806fa3c2a5bbf2580030131fb79a05c3f13ab Mon Sep 17 00:00:00 2001
From: Pawel Orlowicz <porlowicz@gmail.com>
Date: Fri, 30 Mar 2012 12:30:39 +0200
Subject: [PATCH] Two scripts for corpus manipulation: split.pl - separate one
 ccl+rel file into a ccl file and a rel file; get_held_out.pl - divide a corpus
 of ccl and rel files into two sets in a given proportion

---
 corpus2tools/get_held_out.pl | 104 +++++++++++++++++++++++++++++++++++
 corpus2tools/split.pl        |  62 +++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100755 corpus2tools/get_held_out.pl
 create mode 100755 corpus2tools/split.pl

diff --git a/corpus2tools/get_held_out.pl b/corpus2tools/get_held_out.pl
new file mode 100755
index 0000000..62b7baf
--- /dev/null
+++ b/corpus2tools/get_held_out.pl
@@ -0,0 +1,104 @@
+#!/usr/bin/perl
+
+# Split a corpus of paired files into a randomly chosen held-out part and the rest.
+#
+# SOURCE_DIR is expected to hold one pair of files per document:
+# ccl-NAME.xml.* (".xml" must be followed by a non-empty suffix) and rel-NAME.xml.
+# The number of documents to hold out is given either exactly with
+# -n NUMBER_OF_DOCUMENTS or as RATIO, a fraction (0..1) of all documents, rounded up.
+# The chosen pairs are copied into HELD_OUT_PART_DIR and all remaining pairs
+# into REST_PART_DIR. Files are copied, so SOURCE_DIR itself is not modified.
+
+use strict;
+use warnings;
+
+use Getopt::Std;
+use File::Find;
+use File::Basename;
+use File::Copy;
+use Cwd 'abs_path';
+use List::Util 'shuffle';
+use POSIX 'ceil';
+
+# -n takes a value; the positional arguments follow the options.
+my %opts;
+getopt('n', \%opts);
+
+my ($source_dir, $target_dir, $rest_dir, $ratio) = @ARGV;
+my $have_count = (defined $opts{n} and $opts{n} ne "");
+
+# With neither -n nor RATIO nothing is held out: every pair goes to REST_PART_DIR.
+$ratio = 0 unless defined $ratio;
+
+# RATIO only matters, and is only validated, when -n does not override it.
+if (!$have_count and ($ratio > 1 or $ratio < 0)) {
+    print "USAGE: Ratio denotes fraction of documents for held_out.It is the value between 0 and 1.\nYou can also use option -n to determine the exact number of documents to hold out.\n";
+    exit;
+}
+
+if (grep { !defined $_ or !-d $_ } $source_dir, $target_dir, $rest_dir) {
+    print "USAGE: ./get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]\n";
+    exit;
+}
+
+# Index SOURCE_DIR: document NAME => path of its ccl file / of its rel file.
+my %ccl_path_of;
+my %rel_path_of;
+opendir(my $dir, $source_dir) or die "ERROR: Cannot read directory $source_dir: $!\n";
+while (defined(my $entry = readdir($dir))) {
+    my $path = "$source_dir/$entry";
+    next unless -f $path;
+    if ($entry =~ /\Accl-(.+)\.xml.+\z/) {
+        $ccl_path_of{$1} = $path;
+    } elsif ($entry =~ /\Arel-(.+)\.xml\z/) {
+        $rel_path_of{$1} = $path;
+    }
+}
+closedir($dir);
+
+# Only complete ccl+rel pairs count as documents; an unpaired file cannot be
+# placed consistently, so it is reported and left out of both parts.
+my %kinds_found;
+$kinds_found{$_}++ for keys %ccl_path_of, keys %rel_path_of;
+my @documents;
+for my $name (sort keys %kinds_found) {
+    if ($kinds_found{$name} == 2) {
+        push @documents, $name;
+    } else {
+        warn "WARNING: '$name' has no matching ccl+rel pair, skipping it.\n";
+    }
+}
+
+# Work out how many documents to hold out.
+my $total = scalar @documents;
+my $held_count;
+if ($have_count) {
+    if ($opts{n} > $total) {
+        die "ERROR: You want to hold out more files than are available.\n";
+    }
+    $held_count = ceil($opts{n});
+} else {
+    # Round up, but tolerate floating-point noise: 100 * 0.07 is
+    # 7.000000000000001 and must still mean 7 documents, not 8.
+    $held_count = ceil($total * $ratio - 1e-9);
+}
+$held_count = 0 if $held_count < 0;
+
+# A single shuffle gives a random selection without any retry loop:
+# the first $held_count documents are held out, the others form the rest.
+my @rest = shuffle(@documents);
+my @held_out = splice(@rest, 0, $held_count);
+
+copy_pair($_, $target_dir) for @held_out;
+copy_pair($_, $rest_dir) for @rest;
+
+# Copy both files of document $name into $dest_dir, keeping their file names.
+# Dies on the first failed copy so that a half-copied pair never goes unnoticed.
+sub copy_pair {
+    my ($name, $dest_dir) = @_;
+    for my $from ($ccl_path_of{$name}, $rel_path_of{$name}) {
+        my $to = "$dest_dir/" . basename($from);
+        copy($from, $to) or die "ERROR: Cannot copy $from to $to: $!\n";
+    }
+    return;
+}
diff --git a/corpus2tools/split.pl b/corpus2tools/split.pl
new file mode 100755
index 0000000..7b9dd87
--- /dev/null
+++ b/corpus2tools/split.pl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl
+
+# Split every combined ccl+rel file found under SOURCE_DIR (searched recursively)
+# into a ccl file and a rel file, written to TARGET_DIR as ccl-NAME and rel-NAME.
+# TARGET_DIR is not emptied first: new files are just added (files of the same
+# name are overwritten); emptying TARGET_DIR beforehand is up to the user.
+
+use strict;
+use warnings;
+
+use File::Find;
+use Cwd 'abs_path';
+
+my ($source_arg, $target_arg) = @ARGV;
+if (grep { !defined $_ or !-d $_ } $source_arg, $target_arg) {
+    print "Usage: ./split.pl SOURCE_DIR TARGET_DIR\n";
+    exit;
+}
+
+# Absolute paths, because find() changes the working directory while it runs.
+my $srcdir = abs_path($source_arg);
+my $targetdir = abs_path($target_arg);
+
+find(\&wanted, $srcdir);
+
+# File::Find callback: $_ is the bare name of the current entry and the working
+# directory is the one containing it. Splits one combined file line by line:
+#   - lines before the first line containing <relations> go to the ccl file,
+#   - that line and the lines after it go to the rel file,
+#   - except later lines containing </chunkList>, which go to the ccl file.
+sub wanted {
+    my $filename = $_;
+    # NOTE(review): any name containing "xml" is accepted, not only *.xml; kept
+    # as is because the naming scheme of the input corpus is not visible here.
+    return unless $filename =~ /xml/;
+    # Directories can match the pattern as well, but hold nothing to split.
+    return unless -f $filename;
+
+    my $ccl_filename = "$targetdir/ccl-$filename";
+    my $rel_filename = "$targetdir/rel-$filename";
+    open(my $in, '<', $filename)
+        or do { warn "Cannot read $File::Find::name, skipping it: $!\n"; return };
+    open(my $ccl_out, '>', $ccl_filename) or die $! . "dir:" . $ccl_filename;
+    open(my $rel_out, '>', $rel_filename) or die $! . "dir:" . $rel_filename;
+
+    my $in_ccl_part = 1;
+    while (my $line = <$in>) {
+        my $out = $rel_out;
+        if ($in_ccl_part) {
+            $in_ccl_part = 0 if $line =~ /<relations>/;
+            $out = $ccl_out if $in_ccl_part;
+        } elsif ($line =~ m{</chunkList>}) {
+            $out = $ccl_out;
+        }
+        # Lines are written unchanged, with no separator inserted between them.
+        print {$out} $line;
+    }
+
+    close($in);
+    close($ccl_out) or die $! . "dir:" . $ccl_filename;
+    close($rel_out) or die $! . "dir:" . $rel_filename;
+}
-- 
GitLab