diff --git a/corpus2tools/CMakeLists.txt b/corpus2tools/CMakeLists.txt index 3e7b198ba60e00065d0545eabb59dda56df2a145..4009ce1e506bfb73e4b42d424e3874691d582419 100644 --- a/corpus2tools/CMakeLists.txt +++ b/corpus2tools/CMakeLists.txt @@ -12,15 +12,15 @@ include_directories( ${CMAKE_SOURCE_DIR} ) add_executable( tagset-tool tagset-tool.cpp ) target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS}) -add_executable( corpus-get corpus-get.cpp ) -target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS}) - include_directories(${Boost_INCLUDE_DIR}) link_directories(${Boost_LIBRARY_DIRS}) if(UNIX) - install(TARGETS tagset-tool corpus-get + install(TARGETS tagset-tool RUNTIME DESTINATION bin) - + install(FILES corpus-get + DESTINATION bin + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) endif(UNIX) diff --git a/corpus2tools/corpus-get b/corpus2tools/corpus-get new file mode 100755 index 0000000000000000000000000000000000000000..6e62ae334e252d0305672247b0fccc2d681ba7f4 --- /dev/null +++ b/corpus2tools/corpus-get @@ -0,0 +1,134 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +import sys +from optparse import OptionParser +from collections import defaultdict as dd +from itertools import repeat, izip +import corpus2 + +descr = """%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]] +Reads a corpus file and outputs all or some tokens. +Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """ +""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """ +Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """ +""" + ' '.join(corpus2.TokenWriter.available_writer_types_help()) + +def parse_range_info(s): + """Parses a comma-separated list of numbers that + can also be dash-separated ranges""" + selection = set() + for elem in (x.strip() for x in s.split(',')): + try: + selection.add(int(elem)) + except: + split = [x.strip() for x in elem.split('-')] + try: + if len(split) == 2: + split.sort() + for x in xrange(int(split[0]), int(split[1])+1): + selection.add(x) + else: + raise + except: + print "Fail:", elem + return selection + +def sentences(rdr): + """Yields subsequent sentences from a reader. + Declared here for demonstration.""" + while True: + sent = rdr.get_next_sentence() + if not sent: + break + yield sent + + +def chunks(rdr): + """Yields subsequent sentences from a reader.""" + while True: + chunk = rdr.get_next_chunk() + if not chunk: + break + yield chunk + + +def write_selected_sentences(sents, writer, selection): + sid = 0 + for sent in sents: + if sid in selection: + if len(selection[sid]) == 0: + writer.write_sentence(sent) + else: + tid = 0 + for tok in sent.tokens(): + if tid in selection[sid]: + writer.write_token(tok) + tid += 1 + sid += 1 + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='xces', + help='set the input format; default: xces-fast') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='xces', + help='set the output format; default: xces') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='kipi', + help='set the tagset used in input; default: kipi') + parser.add_option('-C', '--chunks', action='store_true', + dest='chunks', default=False, + help='Process chunks (select chunks/sentences, not tokens)') + parser.add_option('-v', '--verbose', action='store_true', + dest='verbose', default=False, + help='verbose mode') + (options, args) = parser.parse_args() + + if len(args) < 1: + print 'You need to provide an input corpus.' + print 'See %s --help' % sys.argv[0] + sys.exit(1) + + inpath = args[0] + # load a tagset, create a reader + tagset = corpus2.get_named_tagset(options.tagset) + reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath) + writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset) + selection = {} + for arg in args[1:]: + if ':' in arg: + sp = arg.split(':') + if len(sp) == 2 and options.chunks: + selection.update(izip(parse_range_info(sp[0]), repeat(dict(izip(parse_range_info(sp[1]), repeat(())))))) + elif len(sp) == 3 and options.chunks: + selection.update(izip(parse_range_info(sp[0]), repeat(dict(izip(parse_range_info(sp[1]), repeat(parse_range_info(sp[2]))))))) + elif len(sp) == 2: + selection.update(izip(parse_range_info(sp[0]), repeat(parse_range_info(sp[1])))) + else: + print >> sys.stderr, "Invalid argument:", arg + return + else: + selection.update(izip(parse_range_info(arg), repeat(()))) + if selection == {}: + if options.chunks: + for chunk in chunks(reader): + writer.write_chunk(chunk) + else: + for sent in sentences(reader): + writer.write_sentence(sent) + else: + if options.chunks: + cid = 0 + for chunk in chunks(reader): + if cid in selection: + if len(selection[cid]) == 0: + writer.write_chunk(chunk) + else: + write_selected_sentences(chunk.sentences(), writer, selection[cid]) + cid += 1 + else: + write_selected_sentences(sentences(reader), writer, selection) + +if __name__ == '__main__': + go() diff --git a/corpus2tools/corpus-get.cpp b/corpus2tools/corpus-get.cpp deleted file mode 100644 index 9ea144f6a296670d209c069c527d471dbfe07ddc..0000000000000000000000000000000000000000 --- a/corpus2tools/corpus-get.cpp +++ /dev/null @@ -1,77 +0,0 @@ -#include <libcorpus2/tagsetmanager.h> -#include <libcorpus2/util/ioformat-options.h> -#include <boost/program_options.hpp> -#include <boost/algorithm/string.hpp> - -int main(int argc, char** argv) -{ - std::string tagset_name, filename; - std::string input_format, output_format; - int sentence, token = -1; - size_t stats = 0; - using boost::program_options::value; - boost::program_options::options_description desc("Allowed options"); - desc.add_options() - ("filename,F", value(&filename), - "filename") - ("sentence,S", value(&sentence), - "Sentence idx") - ("stats,s", value(&stats), - "Stats") - ("token,T", value(&token), - "Token idx ") - ("tagset,t", value(&tagset_name)->default_value("kipi"), - "Tagset name") - ; - Corpus2::add_input_options(desc); - Corpus2::add_output_options(desc); - boost::program_options::variables_map vm; - boost::program_options::positional_options_description p; - p.add("filename", 1); - p.add("sentence", 1); - p.add("token", 1); - - try { - boost::program_options::store( - boost::program_options::command_line_parser(argc, argv) - .options(desc).positional(p).run(), vm); - } catch (boost::program_options::error& e) { - std::cerr << e.what() << "\n"; - return 2; - } - boost::program_options::notify(vm); - if (vm.count("help")) { - std::cout << desc << "\n"; - return 1; - } - const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name); - boost::shared_ptr<Corpus2::TokenReader> reader; - reader = Corpus2::create_reader(vm, tagset, filename); - Corpus2::Sentence::Ptr s; - boost::shared_ptr<Corpus2::TokenWriter> writer; - writer = Corpus2::create_writer(vm, tagset); - std::map<int,int> lens; - for (int i = 0; i <= sentence; ++i) { - s = reader->get_next_sentence(); - if (s) { - lens[s->size()]++; - if (s->size() > stats) { - std::cerr << i << "\n"; - writer->write_sentence(*s); - } - } - } - if (s) { - if (token == -1) { - writer->write_sentence(*s); - } else if (static_cast<size_t>(token) < s->size()) { - writer->write_token(*(*s)[token]); - } - } - if (stats) { - typedef std::pair<int,int> pp; - foreach (const pp& p, lens) { - std::cerr << p.first << " " << p.second << "\n"; - } - } -}