diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd9d2a79a15fa5c98d7a7ad1029c6365793c768d..75783dd3c7e5d6dce8316c2750ff6892e2230929 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 PROJECT(Corpus2Library)
 set(corpus2_ver_major "1")
-set(corpus2_ver_minor "2")
+set(corpus2_ver_minor "3")
 set(corpus2_ver_patch "1")
 
 cmake_minimum_required(VERSION 2.8.0)
@@ -53,7 +53,7 @@ set(LIBS "")
 include_directories(${Corpus2Library_SOURCE_DIR})
 
 find_package(Boost 1.41 REQUIRED COMPONENTS program_options system filesystem regex)
-
+MARK_AS_ADVANCED(Boost_DIR)
 if(MSVC OR BORLAND)
     # Use the auto-linking feature, don't try to add libraries yourself:
     set(Boost_LIBRARIES "")
diff --git a/corpus2tools/corpus-merge b/corpus2tools/corpus-merge
index 047e9f0be7a0b8cd5bb46ceee379fa9066c99db4..d04345cbfbf39c4851d25785fbb8b534bfc02971 100755
--- a/corpus2tools/corpus-merge
+++ b/corpus2tools/corpus-merge
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-import sys
+import sys, os
 from optparse import OptionParser
 from collections import defaultdict as dd
 from itertools import repeat, izip
@@ -45,6 +45,9 @@ def go():
     parser.add_option('-C', '--chunks', action='store_true',
         dest='chunks', default=False,
         help='Process chunks (select chunks/sentences, not tokens)')
+    parser.add_option('--prefix-chunks', action='store_true',
+        dest='prefix_chunks', default=False,
+        help='Prefix chunk ids with filename (file:NAME:ORIGID)')
     parser.add_option('-v', '--verbose', action='store_true',
         dest='verbose', default=False,
         help='verbose mode')
@@ -68,11 +71,23 @@ def go():
     for arg in args:
         reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, arg)
         if options.chunks:
+            fname, _ = os.path.splitext(os.path.basename(arg))
+            chunk_no = 1
             for chunk in chunks(reader):
+                if options.prefix_chunks:
+                    if chunk.has_attribute('id'):
+                        their_id = chunk.get_attribute('id')
+                    else:
+                        # autogen
+                        their_id = ('auto%03d' % chunk_no)
+                    full_id = 'file:%s:%s' % (fname, their_id)
+                    chunk.set_attribute('id', full_id)
                 writer.write_chunk(chunk)
+                chunk_no += 1
         else:
             for sent in sentences(reader):
                 writer.write_sentence(sent)
+        del reader
 
 if __name__ == '__main__':
     go()
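Note on the --prefix-chunks option added above: ids take the form file:NAME:ORIGID, where NAME is the input filename stripped of its extension and ORIGID is the chunk's own id, or an autogenerated auto001, auto002, ... when the chunk carries none. A standalone sketch of the scheme (make_full_id is a hypothetical helper, not part of the tool):

    import os

    def make_full_id(path, chunk_id, chunk_no):
        # NAME part: basename without extension, e.g. 'corpora/sample.xml' -> 'sample'
        fname, _ = os.path.splitext(os.path.basename(path))
        # fall back to an autogenerated id when the chunk has none
        their_id = chunk_id if chunk_id is not None else 'auto%03d' % chunk_no
        return 'file:%s:%s' % (fname, their_id)

    assert make_full_id('corpora/sample.xml', 'ch1', 1) == 'file:sample:ch1'
    assert make_full_id('corpora/sample.xml', None, 2) == 'file:sample:auto002'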
diff --git a/libcorpus2/ann/annotatedsentence.cpp b/libcorpus2/ann/annotatedsentence.cpp
index 50c5ae81df19db8bcc128dbc7173fd1b09c761fa..137d0b413c98b2c0e14af561d0d1d46d49024ea8 100644
--- a/libcorpus2/ann/annotatedsentence.cpp
+++ b/libcorpus2/ann/annotatedsentence.cpp
@@ -34,7 +34,7 @@ AnnotatedSentence::~AnnotatedSentence()
 {
 }
 
 Sentence::Ptr AnnotatedSentence::clone_shared() const
 {
     boost::shared_ptr<AnnotatedSentence> copy;
-    copy = boost::make_shared<AnnotatedSentence>();
+    copy = boost::make_shared<AnnotatedSentence>(id_);
     BOOST_FOREACH(const Token* t, tokens_) {
         copy->append(t->clone());
     }
@@ -48,7 +48,7 @@ boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence(
     boost::shared_ptr<AnnotatedSentence> a;
     a = boost::dynamic_pointer_cast<AnnotatedSentence>(s);
     if (!a) {
-        a = boost::make_shared<AnnotatedSentence>();
+        a = boost::make_shared<AnnotatedSentence>(s->id());
         BOOST_FOREACH(Token* t, s->tokens()) {
             a->append(t);
         }
@@ -63,7 +63,7 @@ boost::shared_ptr<AnnotatedSentence> AnnotatedSentence::wrap_sentence_clone(
     boost::shared_ptr<AnnotatedSentence> a;
     a = boost::dynamic_pointer_cast<AnnotatedSentence>(s);
     if (!a) {
-        a = boost::make_shared<AnnotatedSentence>();
+        a = boost::make_shared<AnnotatedSentence>(s->id());
         BOOST_FOREACH(Token* t, s->tokens()) {
             a->append(t->clone());
         }
diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h
index 65b3a0353d94266b6e18edf0054497881e77a72d..1b3a3359ecc98f1f7145b970ebcf862abc90dcc0 100644
--- a/libcorpus2/ann/annotatedsentence.h
+++ b/libcorpus2/ann/annotatedsentence.h
@@ -125,6 +125,14 @@ public:
         return true;
     }
 
+    /**
+     * Removes the channel with the given name. Returns whether a channel
+     * was removed (false if no channel of the given name exists).
+     */
+    bool remove_channel(const std::string& name) {
+        return (channels_.erase(name) > 0);
+    }
+
     const chan_map_t& all_channels() const {
         return channels_;
     }
diff --git a/libcorpus2/sentence.cpp b/libcorpus2/sentence.cpp
index bb76754d3ad3bfc78e283da5babbd4c3df586baa..073d9a42ba6f1be3508380bb0cd3ff6d9d6fd554 100644
--- a/libcorpus2/sentence.cpp
+++ b/libcorpus2/sentence.cpp
@@ -33,7 +33,7 @@ Sentence::~Sentence()
 {
 }
 
 Sentence::Ptr Sentence::clone_shared() const
 {
-    Sentence::Ptr s = boost::make_shared<Sentence>();
+    Sentence::Ptr s = boost::make_shared<Sentence>(id_);
     BOOST_FOREACH(const Token* t, tokens_) {
         s->append(t->clone());
     }
diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp
index 785edf9ffe411dd9fd138df371a5171fcdb3eebb..a814970d37e45590d0154384dcb7938d43e30c0b 100644
--- a/libcorpus2/tagging.cpp
+++ b/libcorpus2/tagging.cpp
@@ -59,6 +59,12 @@
     + PwrNlp::count_bits_set(mask.get_values());
 }
 
+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask) {
+    Tag output(input);
+    output.add_values_masked(attr_value.get_values(), attr_mask.get_values());
+    return output;
+}
+
 bool select_preferred_disamb(const Tagset& tagset, Token* token)
 {
     size_t lex_idx = token->get_preferred_lexeme_index(tagset);
diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h
index 4beaba5096ac616291e822824971838aa33e2142..9039bd8c36c1f713584d2cc996670be04f298497 100644
--- a/libcorpus2/tagging.h
+++ b/libcorpus2/tagging.h
@@ -45,6 +45,13 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 /** Returns the number of set elements belonging to the mask given. */
 int mask_card(const Tag& mask);
 
+/**
+ * Returns a copy of the given input tag with the attribute referred
+ * to by attr_mask set to the given attr_value (possibly empty).
+ * NOTE: only the attribute part of the masks is considered.
+ */
+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask);
+
 /** Forces one disamb lexeme per token. The selection is based on tagset
  * definition order. Returns if any disamb found.
 */
@@ -99,6 +106,7 @@ bool disambiguate_subset(Token* token, const Tag& mask_where,
 /** Sets lexemes' disamb markers iff lexeme.tag is wanted_tag.
 */
 void set_disambs(Token *token, const Tag& wanted_tag);
+
 } /* end ns Corpus2 */
 
 #endif // LIBCORPUS2_TAGGING_H
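The with_values_masked helper replaces a single attribute's value: bits outside attr_mask pass through from the input tag, bits within attr_mask are taken from attr_value. A hedged usage sketch via the Python bindings — parse_simple_tag, parse_symbol and get_attribute_mask are assumed to be wrapped as in the C++ Tagset API, and the tag/attribute names are illustrative:

    import corpus2

    tagset = corpus2.get_named_tagset('nkjp')
    tag = tagset.parse_simple_tag('subst:sg:nom:m1')  # assumed helper: parse a full tag
    cas_mask = tagset.get_attribute_mask('cas')       # assumed helper: mask of the 'cas' attribute
    gen_value = tagset.parse_symbol('gen')            # desired value for the case attribute

    # Force case to 'gen'; all other attributes of the tag are left untouched.
    retagged = corpus2.with_values_masked(tag, gen_value, cas_mask)
    print tagset.tag_to_string(retagged)  # expected: subst:sg:gen:m1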
diff --git a/libcorpus2_whole/io/cclrelreader.cpp b/libcorpus2_whole/io/cclrelreader.cpp
index 9072f2b1f3cbd2ab7c4593f633dc370762c99ab5..ee84d97d50296ec49e252d9bf9abf797558ba62e 100644
--- a/libcorpus2_whole/io/cclrelreader.cpp
+++ b/libcorpus2_whole/io/cclrelreader.cpp
@@ -72,11 +72,7 @@ namespace whole {
     void CclRelReader::set_option(const std::string& option)
     {
-        if (option == "autogen_sent_id") {
-            ccl_reader_->set_option("autogen_sent_id");
-        } else if (option == "autogen_chunk_id") {
-            ccl_reader_->set_option("autogen_chunk_id");
-        }
+        ccl_reader_->set_option(option);
     }
 
     std::string CclRelReader::get_option(const std::string& option) const {
diff --git a/swig/annotatedsentence.i b/swig/annotatedsentence.i
index c158c4aa603da845d80a225b89233845ccd080e9..f3d2b4e407e1a41a348904550539477e05a58207 100644
--- a/swig/annotatedsentence.i
+++ b/swig/annotatedsentence.i
@@ -50,6 +50,12 @@ namespace Corpus2 {
     AnnotationChannel& get_channel(const std::string& name);
     const AnnotationChannel& get_channel(const std::string& name) const;
     bool add_channel(const std::string& name, const AnnotationChannel& chan);
+
+    /* Use with care. Note that get_channel returns a reference to the original
+       channel, hence it is strongly recommended to del the channel ref object
+       before attempting to call remove_channel. */
+    bool remove_channel(const std::string& name);
+
     const chan_map_t& all_channels() const;
 
     /* --------------------------------------------------------------------- */
diff --git a/swig/tagging.i b/swig/tagging.i
index ec90902b9fc3121809429e8807a85a44e2a95ae1..b4695cef1dd7f248d582b201f62dfe68f45fc36b 100644
--- a/swig/tagging.i
+++ b/swig/tagging.i
@@ -21,6 +21,8 @@
 Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 int mask_card(const Tag& mask);
 
+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask);
+
 bool select_preferred_disamb(const Tagset& tagset, Token* token);
 void select_preferred_lexeme(const Tagset& tagset, Token* token);
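As the comment in annotatedsentence.i warns, get_channel returns a reference into the sentence's channel map, so any such reference should be dropped before remove_channel is called. A minimal sketch from Python — the id-taking AnnotatedSentence constructor follows from this patch, while the wrapped no-argument AnnotationChannel constructor is an assumption:

    import corpus2

    sent = corpus2.AnnotatedSentence('s1')               # ctor taking an id, per the patch
    sent.add_channel('chunk_np', corpus2.AnnotationChannel())
    chan = sent.get_channel('chunk_np')  # reference into the sentence
    del chan                             # release the reference first, as advised above
    assert sent.remove_channel('chunk_np')      # True: the channel existed and was erased
    assert not sent.remove_channel('chunk_np')  # False: no such channel any more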
diff --git a/utils/get_morpho.py b/utils/get_morpho.py
new file mode 100755
index 0000000000000000000000000000000000000000..ec724b0a0a3534df65c1114fd2e9278d1290f464
--- /dev/null
+++ b/utils/get_morpho.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2013 Adam Radziszewski.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+
+descr = """%prog [options] CORPUS OUTDATAFILE
+
+Gathers morphological data from the given corpus.
+
+The extracted morphological data will be saved to OUTDATAFILE in tab-delimited
+format (compliant with Maca analysers).
+NOTE: the morphological data will not be compacted. To do so, you may use
+the tabclean.py script from the Maca repository.
+"""
+
+import sys, codecs
+from optparse import OptionParser
+import corpus2
+
+class Analyser:
+    """Morphological dictionary resulting from reading a corpus.
+    Provides a consume function that updates the data with the given token."""
+    def __init__(self, tagset, case_sensitive, sep=u'\t'):
+        self.tagset = tagset
+        self.unk_tag = self.tagset.make_ign_tag()
+        self.get_form = (lambda form: form) if case_sensitive else (lambda form: form.lower())
+        self.sep = sep
+        self.data = {}
+
+    def _add_one(self, ready_form, lex):
+        """Adds tag and lemma from the given lexeme."""
+        if ready_form not in self.data:
+            self.data[ready_form] = set()
+        self.data[ready_form].add(
+            (unicode(lex.lemma()), self.tagset.tag_to_string(lex.tag()))
+        )
+
+    def consume(self, tok):
+        ready_form = self.get_form(unicode(tok.orth()))
+        for lex in tok.lexemes():
+            if lex.tag() != self.unk_tag:
+                self._add_one(ready_form, lex)
+
+    def write(self, output):
+        for form in sorted(self.data):
+            entries = sorted(self.data[form])
+            for lemma, tag in entries:
+                output.write(self.sep.join((form, lemma, tag)) + u'\n')
+
+    def save(self, fname):
+        output = codecs.open(fname, 'wb', 'utf-8')
+        self.write(output)
+        output.close()
+
+def get_morpho(options, corpname, outfname):
+    tagset = corpus2.get_named_tagset(options.tagset)
+    anal = Analyser(tagset, options.case_sens, options.sep)
+    rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, corpname)
+    while True:
+        tok = rdr.get_next_token()
+        if not tok:
+            break
+        anal.consume(tok)
+    del rdr
+    anal.save(outfname)
+
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='xces',
+        help='set the input format; default: xces')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='nkjp',
+        help='set the tagset used in input; default: nkjp')
+    parser.add_option('-s', '--sep', type='string', action='store',
+        dest='sep', default='\t',
+        help='set the separator used in morpho file; default: tab character')
+    parser.add_option('-c', '--case-sensitive', action='store_true', default=False, dest='case_sens')
+    parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2:
+        print 'You need to provide an input corpus and output path'
+        print
+        parser.print_help()
+        sys.exit(1)
+
+    corpname = args[0]
+    outfname = args[1]
+
+    get_morpho(options, corpname, outfname)
+
+if __name__ == '__main__':
+    go()
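get_morpho.py writes one dictionary entry per line: form, lemma, tag, joined by the separator (tab unless -s is given). An illustrative run (file names and data made up; columns tab-separated, shown here with spaces):

    ./get_morpho.py -i ccl -t nkjp corpus.xml morpho.tab

yielding lines such as:

    zamek    zamek    subst:sg:nom:m3
    zamki    zamek    subst:pl:nom:m3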
diff --git a/utils/relation_eval.py b/utils/relation_eval.py
index 4067051f9c989a1b69e8a6e3c0df0d81b06e9959..0daa15a337e23cb40df2222c496c9b37fa9b1163 100755
--- a/utils/relation_eval.py
+++ b/utils/relation_eval.py
@@ -1,4 +1,5 @@
 #!/usr/bin/python
+# -*- coding: utf-8 -*-
 # Copyright (C) 2012 Paweł Orłowicz.
 # This program is free software; you can redistribute and/or modify it
@@ -31,7 +32,7 @@ from optparse import OptionParser
 import sys
 import corpus2
 
-class RelStats :
+class RelStats:
     def __init__(self):
         self.both_hits = 0
         self.head_hits = 0
@@ -39,7 +40,7 @@
         self.any_hits = 0
 
     #helper method to get annotation vector from annotated sentence
-    def get_channel_annotations(self, ann_sent, dir_point) :
+    def get_channel_annotations(self, ann_sent, dir_point):
         chann_name = dir_point.channel_name()
         annotation_number = dir_point.annotation_number() - 1
         channel = ann_sent.get_channel(chann_name)
@@ -47,7 +48,7 @@
         return ann_vec[annotation_number]
 
     #helper method to get list of tokens' indices
-    def get_indices(self, annotated_sentence, direction_point) :
+    def get_indices(self, annotated_sentence, direction_point):
         ann_chann = self.get_channel_annotations(annotated_sentence, direction_point)
         indices = ann_chann.indices
         #loop to unwrap Integer objects from ann_chann.indices
@@ -58,34 +59,38 @@
         return inds
 
     #helper to get index of the chunk's head
-    def get_head_index(self, annotated_sentence, direction_point) :
+    def get_head_index(self, annotated_sentence, direction_point):
         ann_chann = self.get_channel_annotations(annotated_sentence, direction_point)
         head_index = ann_chann.head_index
         return head_index
 
     #returns values of hits from one direction point of relation
-    def verify_relation(self, ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target) :
+    def verify_relation(self, ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target):
         both, head, chun = 0,0,0
         #if indices from ref chunk and target chunks equals (tokens are the same) then chun hits
-        if self.get_indices(ref_ann_sent, dir_point_ref) == self.get_indices(target_ann_sent, dir_point_target) :
-            chun += 1
-            #if chun hits and head indices match then head hits
-            if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target) :
-                head +=1
+        if self.get_indices(ref_ann_sent, dir_point_ref) == self.get_indices(target_ann_sent, dir_point_target):
+            chun = 1
+#        if chun hits and head indices match then head hits
+#        if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target):
+#            head =1
-        #if indices are different (chunks consists of different sets of words) but heads match then head hits
-        elif self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target) :
-            head += 1
+        #if heads match then head hits (the chunks may consist of different sets of words)
+        if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target):
+            head = 1
+        if chun == 1 and head == 1:
+            both = 1
         return both,chun,head
 
     #if there was a hit on both sides of relation (dir_from, dir_to) then update counters
-    def update_stats(self, both, chun, head) :
-        if chun == 2 :
+    def update_stats(self, both, chun, head):
+        if chun == 2:
             self.chun_hits+=1
-        if head == 2 :
+        if head == 2:
             self.head_hits += 1
-        if chun == 2 and head == 2 :
+        if chun == 2 and head == 2:
             self.both_hits += 1
-        if chun == 2 or head == 2:
+        if both > 0 and chun+head > 2:
+            self.any_hits+=1
+        if both == 0 and chun+head > 1:
             self.any_hits+=1
 
     def print_stats(self,ref_rels_count, target_rels_count, stat_mode):
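A note on the counters above: verify_relation scores a single direction point with 0/1 values, and the caller sums these over the two endpoints of a relation (rel_from and rel_to) before calling update_stats, so chun == 2 means the token spans matched on both endpoints and head == 2 means both heads matched. The accumulation itself falls outside the quoted hunks; a sketch of the presumable pattern, with r/t being the matched reference/target relations:

    both, chun, head = 0, 0, 0
    for dp_ref, dp_target in zip([r.rel_from(), r.rel_to()], [t.rel_from(), t.rel_to()]):
        b, c, h = rel_stats.verify_relation(
            ref_sents[dp_ref.sentence_id()], dp_ref,
            target_sents[dp_target.sentence_id()], dp_target)
        both += b; chun += c; head += h
    rel_stats.update_stats(both, chun, head)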
@@ -114,35 +119,35 @@
     print ('Head match:\t')
     print '%.2f\t%.2f\t%.2f' % (p, r, f)
 
-def compare(rel1, rel2) :
+def compare(rel1, rel2):
     dp1_from = rel1.rel_from()
     dp2_from = rel2.rel_from()
     dp1_to = rel1.rel_to()
     dp2_to = rel2.rel_to()
-    if cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) < 0 :
+    if cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) < 0:
         return -1
-    elif cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) > 0 :
+    elif cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) > 0:
         return 1
-    if cmp(dp1_from.channel_name(), dp2_from.channel_name()) < 0 :
+    if cmp(dp1_from.channel_name(), dp2_from.channel_name()) < 0:
         return -1
-    elif cmp(dp1_from.channel_name(), dp2_from.channel_name()) > 0 :
+    elif cmp(dp1_from.channel_name(), dp2_from.channel_name()) > 0:
         return 1
     if cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) < 0:
         return -1
-    elif cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) > 0 :
+    elif cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) > 0:
         return 1
-    if cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) < 0 :
+    if cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) < 0:
         return -1
-    elif cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) > 0 :
+    elif cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) > 0:
         return 1
-    if cmp(dp1_to.channel_name(), dp2_to.channel_name()) < 0 :
+    if cmp(dp1_to.channel_name(), dp2_to.channel_name()) < 0:
         return -1
-    elif cmp(dp1_to.channel_name(), dp2_to.channel_name()) > 0 :
+    elif cmp(dp1_to.channel_name(), dp2_to.channel_name()) > 0:
         return 1
     if cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) < 0:
         return -1
-    elif cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) > 0 :
+    elif cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) > 0:
         return 1
 
     if rel1.rel_name() < rel2.rel_name():
@@ -169,7 +174,6 @@
         sys.exit(1)
 
     batch_ref, batch_target, rel_name = args
-
     rel_stats = RelStats()
 
     corpus_type = "document"
@@ -182,7 +186,7 @@
     target_file = open(batch_target, "r")
     line_ref = ref_file.readline()
     line_target = target_file.readline()
-    while line_ref and line_target :
+    while line_ref and line_target:
         line_ref = line_ref.strip()
 
         ref_ccl_filename, ref_rel_filename = line_ref.split(";")
@@ -190,7 +194,6 @@
         line_target = line_target.strip()
         target_ccl_filename, target_rel_filename = line_target.split(";")
-
         ref_ccl_rdr = corpus2.CclRelReader(tagset, ref_ccl_filename, ref_rel_filename)
         target_ccl_rdr = corpus2.CclRelReader(tagset, target_ccl_filename, target_rel_filename)
@@ -205,15 +208,14 @@
             ref_sents = dict([ (s.id(), corpus2.AnnotatedSentence.wrap_sentence(s)) for c in ref_doc.paragraphs() for s in c.sentences()])
             target_sents = dict([ (s.id(), corpus2.AnnotatedSentence.wrap_sentence(s)) for c in target_doc.paragraphs() for s in c.sentences()])
-
-            for pattern in ref_rels :
+            for pattern in ref_rels:
                 t = filter(lambda x : (compare(x, pattern) == 0) , target_rels)
-                if len(t) > 0 :
+                if len(t) > 0:
                     t = t[0]
                     r = pattern
                     both, chun, head = 0,0,0
-                    for dir_point_ref, dir_point_target in zip([r.rel_from(), r.rel_to()], [t.rel_from(), t.rel_to()]) :
+                    for dir_point_ref, dir_point_target in zip([r.rel_from(), r.rel_to()], [t.rel_from(), t.rel_to()]):
                         ref_ann_sent = ref_sents[dir_point_ref.sentence_id()]
                         target_ann_sent = target_sents[dir_point_target.sentence_id()]
                         b,c,h = rel_stats.verify_relation(ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target)