diff --git a/CMakeLists.txt b/CMakeLists.txt
index 302b8f652c8cf5c540b09bae4527732866c57439..a9f18ce1eee33ef2c21ccfa1f071f827c79dc4d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,15 +69,17 @@ FIND_PATH(LIBCORPUS2_SRC_DATA_DIR
 )
 MARK_AS_ADVANCED(LIBCORPUS2_SRC_DATA_DIR)
 
-add_subdirectory(libpwrutils)
-add_subdirectory(libcorpus2)
-add_subdirectory(libcorpus2_whole)
 if(CORPUS2_BUILD_POLIQARP)
     add_subdirectory(poliqarp-library)
     add_subdirectory(poliqarp)
+    add_definitions( -DWITH_POLIQARP )
 else()
     message(STATUS "Not building Poliqarp library and wrapper")
 endif(CORPUS2_BUILD_POLIQARP)
+
+add_subdirectory(libpwrutils)
+add_subdirectory(libcorpus2)
+add_subdirectory(libcorpus2_whole)
 add_subdirectory(corpus2tools)
 add_subdirectory(tests)
diff --git a/corpus2data/nkjp.tagset b/corpus2data/nkjp.tagset
index c3a2ccc9cfae4691cb8f08b106910fe7efa4aed5..b1e0be196d37fca157d7345ec409d5fd1ebc2990 100644
--- a/corpus2data/nkjp.tagset
+++ b/corpus2data/nkjp.tagset
@@ -35,8 +35,8 @@ depr nmb cas gnd
 ger nmb cas gnd asp ngt
 ppron12 nmb cas gnd per [acn]
 ppron3 nmb cas gnd per [acn] [ppr]
-num nmb cas gnd acm
-numcol nmb cas gnd acm
+num nmb cas gnd [acm]
+numcol nmb cas gnd [acm]
 adj nmb cas gnd deg
 pact nmb cas gnd asp ngt
 ppas nmb cas gnd asp ngt
diff --git a/corpus2data/sgjp.tagset b/corpus2data/sgjp.tagset
index 95a6d09c718970ef8225eef46b59a4e3230d7340..fec42f8d5ef6581825b98381f750d10dc8e418c3 100644
--- a/corpus2data/sgjp.tagset
+++ b/corpus2data/sgjp.tagset
@@ -35,8 +35,8 @@ depr nmb cas gnd
 ger nmb cas gnd asp ngt
 ppron12 nmb cas gnd per [acn]
 ppron3 nmb cas gnd per [acn] [ppr]
-num nmb cas gnd acm
-numcol nmb cas gnd acm
+num nmb cas gnd [acm]
+numcol nmb cas gnd [acm]
 adj nmb cas gnd deg
 pact nmb cas gnd asp ngt
 ppas nmb cas gnd asp ngt
diff --git a/libcorpus2/CMakeLists.txt b/libcorpus2/CMakeLists.txt
index d5ccdc18c16c25e2b67dfb00daa582e70c72f0be..ad7900137f2e7ea296900b654c6ab5bdf12300e0 100644
--- a/libcorpus2/CMakeLists.txt
+++ b/libcorpus2/CMakeLists.txt
@@ -53,6 +53,7 @@ SET(libcorpus2_STAT_SRC
     tokenmetadata.cpp
     io/cclreader.cpp
     io/cclwriter.cpp
+    io/conllwriter.cpp
     io/helpers.cpp
     io/fastxces.cpp
     io/iob-chan.cpp
diff --git a/libcorpus2/io/cclreader.cpp b/libcorpus2/io/cclreader.cpp
index 34094a83f57add41f87dd3b078ca222a830fcd80..97b066f0ea2467069d3805c6fb9712de6d343de8 100644
--- a/libcorpus2/io/cclreader.cpp
+++ b/libcorpus2/io/cclreader.cpp
@@ -309,6 +309,8 @@ void CclReader::set_option(const std::string& option)
         impl_->set_autogen_sent_id(true);
     } else if (option == "autogen_chunk_id") {
         impl_->set_autogen_chunk_id(true);
+    } else if (option == "no_warn_unexpected_xml") {
+        impl_->set_warn_on_unexpected(false);
     } else {
         BufferedChunkReader::set_option(option);
     }
@@ -320,11 +322,13 @@ std::string CclReader::get_option(const std::string& option) const
     if (option == "disamb_only") {
         return impl_->get_disamb_only() ? option : "";
     } else if (option == "no_warn_inconsistent") {
-        return impl_->get_warn_on_inconsistent() ? option : "";
+        return impl_->get_warn_on_inconsistent() ? "" : option;
     } else if (option == "autogen_sent_id") {
-        return impl_->get_autogen_sent_id() ? "autogen_sent_id" : "";
+        return impl_->get_autogen_sent_id() ? option : "";
     } else if (option == "autogen_chunk_id") {
-        return impl_->get_autogen_chunk_id() ? "autogen_chunk_id" : "";
+        return impl_->get_autogen_chunk_id() ? option : "";
+    } else if (option == "no_warn_unexpected_xml") {
+        return impl_->get_warn_on_unexpected() ? "" : option;
     }
     return BufferedChunkReader::get_option(option);
 }
diff --git a/libcorpus2/io/conllwriter.cpp b/libcorpus2/io/conllwriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8910e617c6dd89f792bb02f66b3c3fce6a43e528
--- /dev/null
+++ b/libcorpus2/io/conllwriter.cpp
@@ -0,0 +1,85 @@
+#include "conllwriter.h"
+#include <libpwrutils/foreach.h>
+#include <boost/algorithm/string.hpp>
+
+
+namespace Corpus2 {
+
+bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");
+
+ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
+        const string_range_vector& params)
+    : TokenWriter(os, tagset, params)
+{
+    myTagset=tagset;
+}
+
+ConllWriter::~ConllWriter()
+{
+    finish();
+}
+
+void ConllWriter::write_token(const Token &t)
+{
+    os()<<t.orth_utf8()<<"\t";
+    Lexeme lex = t.get_preferred_lexeme(myTagset);
+    os()<<lex.lemma_utf8()+"\t";
+    std::string tag = myTagset.tag_to_string(lex.tag());
+    std::vector<std::string> strs;
+    boost::split(strs, tag, boost::is_any_of(":"));
+    os()<<strs[0]<<"\t"<<strs[0]<<"\t";
+    if(strs.size()>1)
+    {
+        size_t i;
+        for(i=1;i<strs.size()-1;i++)
+        {
+            os()<<strs[i]<<"|";
+        }
+        os()<<strs[i]<<"\t_\t_\t_\t_";
+    }
+    else
+        os()<<"_\t_\t_\t_\t_";
+}
+
+void ConllWriter::write_sentence(const Sentence& s)
+{
+    int i=1;
+    foreach (const Token* t, s.tokens()) {
+        os()<<i<<"\t";
+        write_token(*t);
+        os()<<"\n";
+        i++;
+    }
+    os()<<"\n";
+}
+
+void ConllWriter::write_chunk(const Chunk &c)
+{
+    foreach (const Sentence::ConstPtr& s, c.sentences()) {
+        write_sentence(*s);
+    }
+}
+
+void ConllWriter::do_header()
+{
+
+}
+
+void ConllWriter::do_footer()
+{
+}
+
+std::string ConllWriter::convert_tag(std::string tag)
+{
+    if(tag.compare("adja")==0||tag.compare("adjc")==0||tag.compare("adjp")==0||tag.compare("padj")==0||tag.compare("pact")==0||tag.compare("ppas")==0)
+        return "adj";
+    if(tag.compare("padv")==0||tag.compare("pant")==0||tag.compare("pcon")==0)
+        return "adv";
+    if(tag.compare("bedzie")==0||tag.compare("fin")==0||tag.compare("imps")==0||tag.compare("impt")==0||tag.compare("inf")==0||tag.compare("praet")==0||tag.compare("pred")==0||tag.compare("winien")==0)
+        return "verb";
+    if(tag.compare("psubst")==0||tag.compare("depr")==0||tag.compare("ger")==0||tag.compare("ppron3")==0||tag.compare("ppron12")==0||tag.compare("siebie")==0)
+        return "subst";
+    return tag;
+}
+
+} /* end ns Corpus2 */
diff --git a/libcorpus2/io/conllwriter.h b/libcorpus2/io/conllwriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2132b4ca6223b1805cd4d2c0e4cedcda285c936
--- /dev/null
+++ b/libcorpus2/io/conllwriter.h
@@ -0,0 +1,36 @@
+#ifndef CONLLWRITER_H
+#define CONLLWRITER_H
+
+#include <libcorpus2/io/writer.h>
+
+namespace Corpus2 {
+
+class ConllWriter : public TokenWriter
+{
+public:
+    ConllWriter(std::ostream& os, const Tagset& tagset,
+            const string_range_vector& params);
+
+    ~ConllWriter();
+
+    void write_token(const Token &t);
+
+    void write_sentence(const Sentence &s);
+
+    void write_chunk(const Chunk &c);
+
+    static bool registered;
+
+protected:
+    void do_header();
+
+    void do_footer();
+private:
+    Tagset myTagset;
+    std::string convert_tag(std::string tag);
+
+};
+
+} /* end ns Corpus2 */
+
+#endif // CONLLWRITER_H
diff --git a/libcorpus2/io/rft.cpp b/libcorpus2/io/rft.cpp
index 8f28008dafccacd006d0656fa87a81844df67257..d7366c370986b6baea9a01eedb292c6925b85fc2 100644
--- a/libcorpus2/io/rft.cpp
+++ b/libcorpus2/io/rft.cpp
@@ -19,6 +19,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 
 #include <libpwrutils/foreach.h>
 #include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
 #include <boost/make_shared.hpp>
 #include <fstream>
@@ -143,7 +144,7 @@ Sentence::Ptr RftReader::actual_next_sentence()
     while (is().good()) {
         std::getline(is(), line);
         if (line.empty()
-                || (mbt_dialect_ && line.find_first_of("<utt>") == 0)) { // TODO: check
+                || (mbt_dialect_ && boost::starts_with(line, "<utt>"))) {
             return s;
         } else {
             size_t tab = line.find('\t');
diff --git a/libcorpus2_whole/CMakeLists.txt b/libcorpus2_whole/CMakeLists.txt
index e4f6ca987c7e19c00b239ed53ac4f8f4f859f433..8312adfcf37a6c84b92fb401ed487c45179f894f 100644
--- a/libcorpus2_whole/CMakeLists.txt
+++ b/libcorpus2_whole/CMakeLists.txt
@@ -13,11 +13,16 @@ SET(libcorpus2_whole_SRC
     io/relreader.cpp
     io/documentreader.cpp
     io/documentcorpusreader.cpp
-    io/poliqarpdocumentreader.cpp
-    io/poliqarpcorpusreader.cpp
     io/corpusreader.cpp
 )
 
+if(CORPUS2_BUILD_POLIQARP)
+    SET(libcorpus2_whole_SRC ${libcorpus2_whole_SRC}
+        io/poliqarpdocumentreader.cpp
+        io/poliqarpcorpusreader.cpp
+    )
+endif(CORPUS2_BUILD_POLIQARP)
+
 file(GLOB_RECURSE INCS "*.h")
 
 if(WIN32)
diff --git a/libcorpus2_whole/io/corpusreader.cpp b/libcorpus2_whole/io/corpusreader.cpp
index 56538ad5115a7ce386243bb1214378502f634a99..bbe0154920f040469480b842bc99c09e891b266a 100644
--- a/libcorpus2_whole/io/corpusreader.cpp
+++ b/libcorpus2_whole/io/corpusreader.cpp
@@ -1,7 +1,11 @@
 #include <libcorpus2_whole/io/corpusreader.h>
-#include <libcorpus2_whole/io/poliqarpcorpusreader.h>
 #include <libcorpus2_whole/io/documentcorpusreader.h>
 
+#ifdef WITH_POLIQARP
+#include <libcorpus2_whole/io/poliqarpcorpusreader.h>
+#endif
+
+
 namespace Corpus2 {
 namespace whole{
@@ -24,14 +28,16 @@ boost::shared_ptr<Corpus> CorpusReader::read(const std::string& corpus_file_path
 //
 boost::shared_ptr<CorpusReaderI> CorpusReader::get_corpus_reader_by_type()
 {
-    if (corpus_type_ == "poliqarp") {
-        return boost::shared_ptr<PoliqarpCorpusReader>(
-            new PoliqarpCorpusReader(tagset_));
-    } else if (corpus_type_ == "document") {
+    if (corpus_type_ == "document") {
         return boost::shared_ptr<DocumentCorpusReader>(
             new DocumentCorpusReader(tagset_));
+#ifdef WITH_POLIQARP
+    } else if (corpus_type_ == "poliqarp") {
+        return boost::shared_ptr<PoliqarpCorpusReader>(
+            new PoliqarpCorpusReader(tagset_));
+#endif
     }
-    throw Corpus2Error(corpus_type_ + " is unknown reader type!");
+    throw Corpus2Error(corpus_type_ + " is an unknown reader type!");
 }
 
 } // whole ns
diff --git a/libcorpus2_whole/io/documentcorpusreader.cpp b/libcorpus2_whole/io/documentcorpusreader.cpp
index 12d035904f1e8113c28b1fff413df38bd6264872..a85e97cc4a79fe7b3a01679d55fbd416b0a5a995 100644
--- a/libcorpus2_whole/io/documentcorpusreader.cpp
+++ b/libcorpus2_whole/io/documentcorpusreader.cpp
@@ -35,8 +35,7 @@ boost::shared_ptr<Corpus> DocumentCorpusReader::read(const std::string& corpus_f
             continue;
         } else if (splitted_line.size() == 1) {
-            // maybe exception?
-            continue;
+            throw Corpus2Error("DocumentReader requires paths to both annotations and relations");
         }
 
         ann_path = splitted_line[0];
diff --git a/libcorpus2_whole/io/documentreader.cpp b/libcorpus2_whole/io/documentreader.cpp
index 8aa1a79a421c4e08add2b75b48040fed6ad974af..c30a4fbadf8960189c9585e0284cf69546131f96 100644
--- a/libcorpus2_whole/io/documentreader.cpp
+++ b/libcorpus2_whole/io/documentreader.cpp
@@ -31,6 +31,10 @@ namespace whole {
         const std::string &annot_path, const std::string &rela_path)
 {
     ccl_reader_ = boost::make_shared<CclReader>(tagset, annot_path);
+    // prevent the underlying CCL reader from complaining about
+    // relation XML tags unknown to the reader itself
+    // (in case annot_path and rela_path point to the same file)
+    ccl_reader_->set_option("no_warn_unexpected_xml");
     rel_reader_ = boost::make_shared<RelationReader>(rela_path);
 }
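Side note, not part of the patch: the no_warn_unexpected_xml option set above is the one this diff adds to CclReader::set_option, and with the corrected get_option polarity a no_* option now reports its own name exactly when the corresponding warning is disabled. A rough sketch of toggling it directly, assuming the SWIG wrapper exposes set_option/get_option as the C++ TokenReader interface does and that the CCL reader is registered under the name 'ccl'; the path is hypothetical:

    import corpus2

    tagset = corpus2.get_named_tagset('nkjp')
    # one CCL file carrying both the annotations and the <rel> markup
    rdr = corpus2.TokenReader.create_path_reader('ccl', tagset, 'doc.xml')
    rdr.set_option('no_warn_unexpected_xml')  # silence warnings on unknown XML tags
    assert rdr.get_option('no_warn_unexpected_xml') == 'no_warn_unexpected_xml'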
+""" + + + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='xces', + help='set the input format; default: xces') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='xces', + help='set the output format; default: xces') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose') + parser.add_option('-d', '--debug', action='store_true', dest='debug_mode') + (options, args) = parser.parse_args() + + if len(args) != 3: + print 'You need to provide a TAGOUT, MORPHO and OUTPUT files.' + print 'See --help for details.' + print + sys.exit(1) + + tag_fn, mor_fn, out_fn = args + tagset = corpus2.get_named_tagset(options.tagset) + + tag_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, tag_fn) + mor_rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, mor_fn) + + writer = corpus2.TokenWriter.create_path_writer(options.output_format, out_fn, tagset) + + while True: + mor_sent = mor_rdr.get_next_sentence() + tag_sent = tag_rdr.get_next_sentence() + assert (not mor_sent) == (not tag_sent) + if not mor_sent: + break + for mor_tok, tag_tok in zip(mor_sent.tokens(), tag_sent.tokens()): + assert unicode(mor_tok.orth()) == unicode(tag_tok.orth()), unicode(tag_tok.orth()) + tag_tok.set_wa(mor_tok.wa()) + writer.write_sentence(tag_sent) + + writer.finish() + +if __name__ == '__main__': + go() diff --git a/utils/corptext.py b/utils/corptext.py new file mode 100755 index 0000000000000000000000000000000000000000..36a67d72659835f9a1f354748e312be608d0de61 --- /dev/null +++ b/utils/corptext.py @@ -0,0 +1,57 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +descr = """%prog [options] INPUT OUTPUT + +Reads input and saves as plain text. By default, paragraphs are separated with +two newlines, sentence division is not marked.""" + +from optparse import OptionParser +import sys, codecs +import corpus2 + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='xces', + help='set the input format; default: xces') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-s', '--sent-sep', type='string', action='store', + dest='sent_sep', default='', + help='set the sentence separator; default: (empty)') + parser.add_option('-p', '--par-sep', type='string', action='store', + dest='par_sep', default='\n\n', + help='set the sentence separator; default: (two newlines)') + (options, args) = parser.parse_args() + if len(args) != 2: + print 'Need to provide input and output.' + print 'See --help for details.' 
+ print + sys.exit(1) + + fn_input, fn_output = args + + with codecs.open(fn_output, 'wb', 'utf-8') as out: + tagset = corpus2.get_named_tagset(options.tagset) + rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input) + first = True + while True: + par = rdr.get_next_chunk() + if options.par_sep: + first = True # if non-empty par separator, skip pre-spaces + if not par: + break + for sent in par.sentences(): + if options.sent_sep: + first = True # if non-empty sent sep, skip pre-spaces + for tok in sent.tokens(): + if not first and tok.after_space(): + out.write(' ') + out.write(unicode(tok.orth())) + first = False + out.write(options.sent_sep) + out.write(options.par_sep) + +if __name__ == '__main__': + go() diff --git a/utils/parfolds.py b/utils/parfolds.py new file mode 100755 index 0000000000000000000000000000000000000000..d10ed210957322185d0d487ec5ce961e082d3123 --- /dev/null +++ b/utils/parfolds.py @@ -0,0 +1,63 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +descr = """%prog [options] INPUT OUTDIR + +Generates paragraph-wise folds.""" + +from optparse import OptionParser +import sys, codecs, os +import corpus2 + +def go(): + parser = OptionParser(usage=descr) + parser.add_option('-i', '--input-format', type='string', action='store', + dest='input_format', default='xces', + help='set the input format; default: xces') + parser.add_option('-o', '--output-format', type='string', action='store', + dest='output_format', default='xces', + help='set the output format; default: xces') + parser.add_option('-t', '--tagset', type='string', action='store', + dest='tagset', default='nkjp', + help='set the tagset used in input; default: nkjp') + parser.add_option('-f', '--num-folds', type='int', action='store', + dest='num_folds', default='10', + help='set the number of folds (default: 10)') + + (options, args) = parser.parse_args() + if len(args) != 2: + print 'Need to provide input and output.' + print 'See --help for details.' + print + sys.exit(1) + + fold_nums = range(options.num_folds) + fn_input, fold_dir = args + + tagset = corpus2.get_named_tagset(options.tagset) + rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, fn_input) + fold_test = [corpus2.TokenWriter.create_path_writer( + options.output_format, + os.path.join(fold_dir, 'test%02d.xml' % (num + 1)), tagset) + for num in fold_nums] + fold_train = [corpus2.TokenWriter.create_path_writer( + options.output_format, + os.path.join(fold_dir, 'train%02d.xml' % (num + 1)), tagset) + for num in fold_nums] + first = True + fold_now = 0 + while True: + par = rdr.get_next_chunk() + if not par: + break + fold_test[fold_now].write_chunk(par) + for other_num in fold_nums: + if other_num != fold_now: + fold_train[other_num].write_chunk(par) + + fold_now = (fold_now + 1) % options.num_folds + + for w in fold_test: w.finish() + for w in fold_train: w.finish() + +if __name__ == '__main__': + go()