#!/usr/bin/perl

# Assumes that SOURCE_DIR contains pairs of files: ccl-NAME.xml.* + rel-NAME.xml.
# Depending on the mode (option -n NUMBER_OF_DOCUMENTS_TO_HOLD_OUT, or RATIO, the fraction between 0 and 1 of all documents to be held out),
# copies the appropriate number of documents (1 document == 1 pair: ccl-NAME.xml.* + rel-NAME.xml) into HELD_OUT_PART_DIR
# and all the rest into REST_PART_DIR.

use Cwd 'abs_path';
use File::Basename;
use File::Copy;
use File::Find;
use Getopt::Std;
use List::Util qw(shuffle);
use Scalar::Util qw(looks_like_number);

use strict;
use warnings;

# Splits a directory of paired documents into a randomly chosen held-out part
# and the remaining part.
#
# Command line:
#   get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]
#
# A document is a pair of regular files in SOURCE_DIR named ccl-NAME.xml<suffix>
# (non-empty suffix) and rel-NAME.xml.  Either -n gives the exact number of
# documents to hold out, or RATIO (0..1) gives the fraction; -n wins when both
# are present.  A missing RATIO without -n is treated as 0, i.e. nothing is
# held out and every document is copied to REST_PART_DIR.
#
# Usage errors are reported on STDERR with exit status 1; copy failures die.

# Print a usage message to STDERR and terminate with a non-zero exit status.
sub usage_error {
    my ($message) = @_;
    print STDERR $message;
    exit 1;
}

# Scan $dir and return a hash ref NAME => [ ccl_path, rel_path ] containing
# only documents for which both files of the pair exist.
sub find_document_pairs {
    my ($dir) = @_;

    opendir(my $dh, $dir) or die "ERROR: Cannot open directory '$dir': $!\n";
    my (%ccl_for, %rel_for);
    while (defined(my $entry = readdir($dh))) {
        my $path = "$dir/$entry";

        # Sub-directories ('.' and '..' included) can never be documents.
        next if !-f $path;

        # The patterns are anchored to the entry name, so a component of the
        # directory path cannot match and a trailing suffix after "rel-*.xml"
        # is not silently cut off.
        if ($entry =~ /\Accl-(.+)\.xml.+\z/) {
            $ccl_for{$1} = $path;
        }
        elsif ($entry =~ /\Arel-(.+)\.xml\z/) {
            $rel_for{$1} = $path;
        }
    }
    closedir($dh);

    my %pair_for;
    for my $name (sort keys %ccl_for) {
        if (exists $rel_for{$name}) {
            $pair_for{$name} = [ $ccl_for{$name}, $rel_for{$name} ];
        }
        else {
            warn "WARNING: Skipping '$ccl_for{$name}': no matching rel file.\n";
        }
    }
    for my $name (sort keys %rel_for) {
        next if exists $ccl_for{$name};
        warn "WARNING: Skipping '$rel_for{$name}': no matching ccl file.\n";
    }

    return \%pair_for;
}

# Return how many of $total documents go to the held-out part: $requested when
# it is defined (dies if it exceeds $total), otherwise $total * $ratio rounded
# up to the next whole document.
sub held_out_count {
    my ($total, $requested, $ratio) = @_;

    if (defined $requested) {
        if ($requested > $total) {
            die "ERROR: You want to hold out more files than are available.\n";
        }
        return $requested + 0;
    }

    my $exact = $total * $ratio;
    my $count = int($exact);
    $count++ if $count < $exact;    # a fractional document counts as a whole one
    return $count;
}

# Copy both files of one document into $dest_dir, keeping their base names.
# Dies on the first failed copy instead of carrying on with half a pair.
sub copy_pair {
    my ($pair, $dest_dir) = @_;

    for my $from (@$pair) {
        my $to = "$dest_dir/" . basename($from);
        copy($from, $to)
            or die "ERROR: Cannot copy '$from' to '$to': $!\n";
    }
    return;
}

my $usage_line = "USAGE: ./get_held_out [-n NUMBER_OF_DOCUMENTS] SOURCE_DIR HELD_OUT_PART_DIR REST_PART_DIR [RATIO]\n";
my $ratio_help = "USAGE: Ratio denotes fraction of documents for held_out.It is the value between 0 and 1.\nYou can also use option -n to determine the exact number of documents to hold out.\n";

my %opts;
getopt('n', \%opts);

my ($source_dir, $target_dir, $rest_dir, $ratio) = @ARGV;

# An empty -n value counts as "option not given".
my $requested = $opts{n};
$requested = undef if defined $requested and $requested eq "";

if (defined $requested) {
    # With -n the ratio is irrelevant; only the count itself is validated.
    usage_error($ratio_help) if $requested !~ /\A\d+\z/;
}
else {
    $ratio = 0 if !defined $ratio;
    my $ratio_ok = looks_like_number($ratio) && $ratio >= 0 && $ratio <= 1;
    usage_error($ratio_help) if !$ratio_ok;
}

if (grep { !defined $_ or !-d $_ } ($source_dir, $target_dir, $rest_dir)) {
    usage_error($usage_line);
}

my $pair_for = find_document_pairs($source_dir);

# A shuffled list makes the selection a plain prefix, so every document is
# visited exactly once and the loop always terminates.
my @names = shuffle(keys %$pair_for);
my $count = held_out_count(scalar @names, $requested, $ratio);
my @held_out = splice(@names, 0, $count);

for my $name (@held_out) {
    copy_pair($pair_for->{$name}, $target_dir);
}
for my $name (@names) {
    copy_pair($pair_for->{$name}, $rest_dir);
}