Skip to content
Snippets Groups Projects
Commit 27e5a477 authored by ilor's avatar ilor
Browse files

rewrite corpus-get in python

parent 6cc8e28a
No related merge requests found
...@@ -12,15 +12,15 @@ include_directories( ${CMAKE_SOURCE_DIR} ) ...@@ -12,15 +12,15 @@ include_directories( ${CMAKE_SOURCE_DIR} )
add_executable( tagset-tool tagset-tool.cpp ) add_executable( tagset-tool tagset-tool.cpp )
target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS}) target_link_libraries ( tagset-tool corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
add_executable( corpus-get corpus-get.cpp )
target_link_libraries ( corpus-get corpus2 pwrutils ${Boost_LIBRARIES} ${LIBS})
include_directories(${Boost_INCLUDE_DIR}) include_directories(${Boost_INCLUDE_DIR})
link_directories(${Boost_LIBRARY_DIRS}) link_directories(${Boost_LIBRARY_DIRS})
if(UNIX) if(UNIX)
install(TARGETS tagset-tool corpus-get install(TARGETS tagset-tool
RUNTIME DESTINATION bin) RUNTIME DESTINATION bin)
install(FILES corpus-get
DESTINATION bin
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
endif(UNIX) endif(UNIX)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd
from itertools import repeat, izip
import corpus2
# Usage/description text handed to OptionParser(usage=...) in go().
# The names of the available reader and writer types, and their help strings,
# are queried from corpus2 when this module is loaded and spliced into the text.
descr = """%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]]
Reads a corpus file and outputs all or some tokens.
Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
def parse_range_info(s):
    """Parse a comma-separated list of integers and dash-separated ranges.

    Every element of ``s`` is either a single integer ("7") or an inclusive
    range written as two integers joined by a dash ("3-5"). Range bounds may
    be given in either order. Whitespace around elements and bounds is
    ignored.

    Returns a set with every selected integer. Elements that cannot be
    parsed are reported on stderr (so the message never mixes with corpus
    data written to stdout) and skipped.
    """
    selection = set()
    for elem in (x.strip() for x in s.split(',')):
        # Plain integer: the common case.
        try:
            selection.add(int(elem))
            continue
        except ValueError:
            pass
        # Otherwise it must be a "LOW-HIGH" range.
        bounds = elem.split('-')
        try:
            if len(bounds) != 2:
                raise ValueError(elem)
            # Compare the bounds as numbers, not as strings: a string sort
            # would put "10" before "9" and silently produce an empty range.
            low, high = sorted(int(b) for b in bounds)
        except ValueError:
            sys.stderr.write("Fail: %s\n" % elem)
            continue
        selection.update(range(low, high + 1))
    return selection
def sentences(rdr):
    """Generate the sentences of a reader, one after another.

    Keeps calling rdr.get_next_sentence() and yields each result until the
    reader returns a false value, which ends the iteration.
    """
    sent = rdr.get_next_sentence()
    while sent:
        yield sent
        sent = rdr.get_next_sentence()
def chunks(rdr):
    """Generate the chunks of a reader, one after another.

    Keeps calling rdr.get_next_chunk() and yields each result until the
    reader returns a false value, which ends the iteration.
    """
    chunk = rdr.get_next_chunk()
    while chunk:
        yield chunk
        chunk = rdr.get_next_chunk()
def write_selected_sentences(sents, writer, selection):
    """Write the selected sentences, or selected tokens of them, to a writer.

    sents     -- iterable of sentences; positions are counted from 0
    writer    -- object providing write_sentence() and write_token()
    selection -- mapping from sentence position to a collection of token
                 positions; an empty collection selects the whole sentence

    Sentences whose position is not a key of ``selection`` are skipped.
    """
    for sent_idx, sent in enumerate(sents):
        if sent_idx not in selection:
            continue
        wanted = selection[sent_idx]
        if len(wanted) == 0:
            # No token restriction: emit the sentence as a unit.
            writer.write_sentence(sent)
            continue
        for tok_idx, tok in enumerate(sent.tokens()):
            if tok_idx in wanted:
                writer.write_token(tok)
def go():
    """Command-line entry point.

    Parses the options and positional arguments, opens CORPUSFILE with the
    requested reader, and writes either the whole corpus or only the selected
    chunks / sentences / tokens to stdout with the requested writer.

    Positional arguments after CORPUSFILE are selection specs made of
    colon-separated range lists (see parse_range_info):
      * without -C:  SENTENCES  or  SENTENCES:TOKENS
      * with -C:     CHUNKS,  CHUNKS:SENTENCES  or  CHUNKS:SENTENCES:TOKENS

    Exits with status 1 when no input corpus is given or when a selection
    spec has an unsupported number of colon-separated parts.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: kipi')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='chunks', default=False,
        help='Process chunks (select chunks/sentences, not tokens)')
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='verbose mode')
    (options, args) = parser.parse_args()

    if len(args) < 1:
        print('You need to provide an input corpus.')
        print('See %s --help' % sys.argv[0])
        sys.exit(1)
    inpath = args[0]

    # load a tagset, create a reader and a writer
    tagset = corpus2.get_named_tagset(options.tagset)
    reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
    writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset)

    # Build the selection mapping. An empty value (the empty tuple) means
    # "take the whole unit"; otherwise the value narrows the selection one
    # level down (sentences within a chunk, tokens within a sentence).
    selection = {}
    for arg in args[1:]:
        parts = arg.split(':')
        if len(parts) == 1:
            selection.update(dict.fromkeys(parse_range_info(arg), ()))
        elif len(parts) == 2 and options.chunks:
            chunk_ids = parse_range_info(parts[0])
            sent_sel = dict.fromkeys(parse_range_info(parts[1]), ())
            selection.update(dict.fromkeys(chunk_ids, sent_sel))
        elif len(parts) == 3 and options.chunks:
            chunk_ids = parse_range_info(parts[0])
            sent_ids = parse_range_info(parts[1])
            sent_sel = dict.fromkeys(sent_ids, parse_range_info(parts[2]))
            selection.update(dict.fromkeys(chunk_ids, sent_sel))
        elif len(parts) == 2:
            sent_ids = parse_range_info(parts[0])
            selection.update(dict.fromkeys(sent_ids, parse_range_info(parts[1])))
        else:
            sys.stderr.write("Invalid argument: %s\n" % arg)
            # Signal the failure to the caller instead of exiting with 0.
            sys.exit(1)

    if not selection:
        # Nothing selected: copy the whole corpus.
        if options.chunks:
            for chunk in chunks(reader):
                writer.write_chunk(chunk)
        else:
            for sent in sentences(reader):
                writer.write_sentence(sent)
    elif options.chunks:
        for cid, chunk in enumerate(chunks(reader)):
            if cid not in selection:
                continue
            if len(selection[cid]) == 0:
                writer.write_chunk(chunk)
            else:
                write_selected_sentences(chunk.sentences(), writer, selection[cid])
    else:
        write_selected_sentences(sentences(reader), writer, selection)

if __name__ == '__main__':
    go()
#include <libcorpus2/tagsetmanager.h>
#include <libcorpus2/util/ioformat-options.h>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
int main(int argc, char** argv)
{
std::string tagset_name, filename;
std::string input_format, output_format;
int sentence, token = -1;
size_t stats = 0;
using boost::program_options::value;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("filename,F", value(&filename),
"filename")
("sentence,S", value(&sentence),
"Sentence idx")
("stats,s", value(&stats),
"Stats")
("token,T", value(&token),
"Token idx ")
("tagset,t", value(&tagset_name)->default_value("kipi"),
"Tagset name")
;
Corpus2::add_input_options(desc);
Corpus2::add_output_options(desc);
boost::program_options::variables_map vm;
boost::program_options::positional_options_description p;
p.add("filename", 1);
p.add("sentence", 1);
p.add("token", 1);
try {
boost::program_options::store(
boost::program_options::command_line_parser(argc, argv)
.options(desc).positional(p).run(), vm);
} catch (boost::program_options::error& e) {
std::cerr << e.what() << "\n";
return 2;
}
boost::program_options::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return 1;
}
const Corpus2::Tagset& tagset = Corpus2::get_named_tagset(tagset_name);
boost::shared_ptr<Corpus2::TokenReader> reader;
reader = Corpus2::create_reader(vm, tagset, filename);
Corpus2::Sentence::Ptr s;
boost::shared_ptr<Corpus2::TokenWriter> writer;
writer = Corpus2::create_writer(vm, tagset);
std::map<int,int> lens;
for (int i = 0; i <= sentence; ++i) {
s = reader->get_next_sentence();
if (s) {
lens[s->size()]++;
if (s->size() > stats) {
std::cerr << i << "\n";
writer->write_sentence(*s);
}
}
}
if (s) {
if (token == -1) {
writer->write_sentence(*s);
} else if (static_cast<size_t>(token) < s->size()) {
writer->write_token(*(*s)[token]);
}
}
if (stats) {
typedef std::pair<int,int> pp;
foreach (const pp& p, lens) {
std::cerr << p.first << " " << p.second << "\n";
}
}
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment