diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83e654c4131ed8939f54b01693a02b394f270786..09c8d605e5ab9f53679022b647be198cdb6ab966 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 PROJECT(Corpus2Library)
 set(corpus2_ver_major "1")
-set(corpus2_ver_minor "2")
+set(corpus2_ver_minor "3")
 set(corpus2_ver_patch "3")

 cmake_minimum_required(VERSION 2.8.0)
@@ -53,7 +53,7 @@ set(LIBS "")
 include_directories(${Corpus2Library_SOURCE_DIR})

 find_package(Boost 1.41 REQUIRED COMPONENTS program_options system filesystem regex)
-
+MARK_AS_ADVANCED(Boost_DIR)
 if(MSVC OR BORLAND)
 # Use the auto-linking feature, don't try to add libraries yourself:
 set(Boost_LIBRARIES "")

diff --git a/corpus2data/beatca.tagset b/corpus2data/beatca.tagset
new file mode 100644
index 0000000000000000000000000000000000000000..3607a0aeee0a87f82ae14c6e7a7af7c20a7bb3b8
--- /dev/null
+++ b/corpus2data/beatca.tagset
@@ -0,0 +1,17 @@
+[ATTR]
+
+[POS]
+ADJ
+ADV
+OTHER
+VERB
+CONJ
+PUNCTUATION
+NUM
+PREP
+SUBST
+PART
+UNKNOWN
+
+[IGN]
+UNKNOWN

diff --git a/corpus2tools/corpus-merge b/corpus2tools/corpus-merge
index 047e9f0be7a0b8cd5bb46ceee379fa9066c99db4..d04345cbfbf39c4851d25785fbb8b534bfc02971 100755
--- a/corpus2tools/corpus-merge
+++ b/corpus2tools/corpus-merge
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-import sys
+import sys, os
 from optparse import OptionParser
 from collections import defaultdict as dd
 from itertools import repeat, izip
@@ -45,6 +45,9 @@ def go():
     parser.add_option('-C', '--chunks', action='store_true',
         dest='chunks', default=False,
         help='Process chunks (select chunks/sentences, not tokens)')
+    parser.add_option('--prefix-chunks', action='store_true',
+        dest='prefix_chunks', default=False,
+        help='Prefix chunk ids with filename (file:NAME:ORIGID)')
     parser.add_option('-v', '--verbose', action='store_true',
         dest='verbose', default=False,
         help='verbose mode')
@@ -68,11 +71,23 @@ def go():
     for arg in args:
         reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, arg)
         if options.chunks:
+            fname, _ = os.path.splitext(os.path.basename(arg))
+            chunk_no = 1
             for chunk in chunks(reader):
+                if options.prefix_chunks:
+                    if chunk.has_attribute('id'):
+                        their_id = chunk.get_attribute('id')
+                    else:
+                        # autogen
+                        their_id = ('auto%03d' % chunk_no)
+                    full_id = 'file:%s:%s' % (fname, their_id)
+                    chunk.set_attribute('id', full_id)
                 writer.write_chunk(chunk)
+                chunk_no += 1
         else:
             for sent in sentences(reader):
                 writer.write_sentence(sent)
+        del reader

 if __name__ == '__main__':
     go()
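The id scheme behind the new --prefix-chunks option reads as follows; a minimal standalone sketch (hypothetical helper name and input path), mirroring the logic added to corpus-merge above:

import os

def prefixed_chunk_id(path, orig_id, chunk_no):
    # file name without extension, as computed in corpus-merge
    fname, _ = os.path.splitext(os.path.basename(path))
    # fall back to an autogenerated id when the chunk carries none
    their_id = orig_id if orig_id else ('auto%03d' % chunk_no)
    return 'file:%s:%s' % (fname, their_id)

# e.g. prefixed_chunk_id('corpora/news01.xml', None, 7) -> 'file:news01:auto007'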
diff --git a/libcorpus2/ann/annotatedsentence.h b/libcorpus2/ann/annotatedsentence.h
index 65b3a0353d94266b6e18edf0054497881e77a72d..1b3a3359ecc98f1f7145b970ebcf862abc90dcc0 100644
--- a/libcorpus2/ann/annotatedsentence.h
+++ b/libcorpus2/ann/annotatedsentence.h
@@ -125,6 +125,14 @@ public:
         return true;
     }

+    /**
+     * Removes the channel with the given name. Returns whether a channel
+     * was removed (false if no channel of the given name exists).
+     */
+    bool remove_channel(const std::string& name) {
+        return (channels_.erase(name) > 0);
+    }
+
     const chan_map_t& all_channels() const {
         return channels_;
     }

diff --git a/libcorpus2/io/premorphwriter.cpp b/libcorpus2/io/premorphwriter.cpp
index c47e84df8090e3b1ad6c5610fbc9fd9106fe72c6..e3307948b8d9fe6d36a671540c9f28922be09876 100644
--- a/libcorpus2/io/premorphwriter.cpp
+++ b/libcorpus2/io/premorphwriter.cpp
@@ -17,6 +17,9 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libcorpus2/io/premorphwriter.h>
 #include <boost/foreach.hpp>

+// for entity encoding
+#include <libcorpus2/io/xcescommon.h>
+
 namespace Corpus2 {

 bool PremorphWriter::registered = TokenWriter::register_writer<PremorphWriter>(
@@ -42,14 +45,15 @@ PremorphWriter::~PremorphWriter()

 void PremorphWriter::write_token(const Token &t)
 {
-    os() << PwrNlp::Whitespace::to_whitespace(t.wa()) << t.orth_utf8();
+    os() << PwrNlp::Whitespace::to_whitespace(t.wa());
+    encode_xml_entities_into(os(), t.orth_utf8());
 }

 void PremorphWriter::write_sentence(const Sentence &s)
 {
     os() << "<chunk type=\"s\">";
     if (!s.tokens().empty()) {
-        os() << s[0]->orth_utf8();
+        encode_xml_entities_into(os(), s[0]->orth_utf8());
     }
     for (size_t i = 1; i < s.tokens().size(); ++i) {
         write_token(*s[i]);

diff --git a/libcorpus2/io/xmlreader.cpp b/libcorpus2/io/xmlreader.cpp
index 0816839e69873f82ec81219d13819b36c30b193f..bde08712fb9d61efcc950c985efb4ed0ae6092cf 100644
--- a/libcorpus2/io/xmlreader.cpp
+++ b/libcorpus2/io/xmlreader.cpp
@@ -19,6 +19,8 @@ or FITNESS FOR A PARTICULAR PURPOSE.
 #include <libxml++/libxml++.h>
 #include <libxml2/libxml/parser.h>
 #include <boost/make_shared.hpp>
+#include <boost/algorithm/string.hpp>
+
 #include <fstream>

 namespace Corpus2 {
@@ -228,15 +230,21 @@ void XmlReader::on_end_element(const Glib::ustring &name)
 {
     //std::cerr << "/" << name << state_ << "\n";
     if (state_ == STATE_ORTH && name == "orth") {
-        tok_->set_orth(UnicodeString::fromUTF8(get_buf()));
+        std::string tmp_buf = get_buf();
+        boost::trim(tmp_buf);
+        tok_->set_orth(UnicodeString::fromUTF8(tmp_buf));
         grab_characters_ = false;
         state_ = STATE_TOK;
     } else if (state_ == STATE_LEMMA && name == "base") {
-        tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(get_buf()));
+        std::string tmp_buf = get_buf();
+        boost::trim(tmp_buf);
+        tok_->lexemes().back().set_lemma(UnicodeString::fromUTF8(tmp_buf));
         grab_characters_ = false;
         state_ = STATE_LEX;
     } else if (state_ == STATE_TAG && name == "ctag") {
-        Tag tag = base_reader_.parse_tag(get_buf());
+        std::string tmp_buf = get_buf();
+        boost::trim(tmp_buf);
+        Tag tag = base_reader_.parse_tag(tmp_buf);
         tok_->lexemes().back().set_tag(tag);
         grab_characters_ = false;
         state_ = STATE_LEX;
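Both I/O changes above are robustness fixes: PremorphWriter now entity-encodes orths on output, and XmlReader trims whitespace that pretty-printed input leaves inside orth, base and ctag elements. A minimal sketch of the escaping concern, using Python's standard xml.sax.saxutils.escape as a stand-in for encode_xml_entities_into:

from xml.sax.saxutils import escape

orth = 'AT&T'
# an unescaped '&' would make the premorph output ill-formed XML
print '<chunk type="s">%s</chunk>' % escape(orth)
# prints: <chunk type="s">AT&amp;T</chunk>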
diff --git a/libcorpus2/tagging.cpp b/libcorpus2/tagging.cpp
index 785edf9ffe411dd9fd138df371a5171fcdb3eebb..a814970d37e45590d0154384dcb7938d43e30c0b 100644
--- a/libcorpus2/tagging.cpp
+++ b/libcorpus2/tagging.cpp
@@ -59,6 +59,12 @@
         + PwrNlp::count_bits_set(mask.get_values());
 }

+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask) {
+    Tag output(input);
+    output.add_values_masked(attr_value.get_values(), attr_mask.get_values());
+    return output;
+}
+
 bool select_preferred_disamb(const Tagset& tagset, Token* token)
 {
     size_t lex_idx = token->get_preferred_lexeme_index(tagset);

diff --git a/libcorpus2/tagging.h b/libcorpus2/tagging.h
index 4beaba5096ac616291e822824971838aa33e2142..9039bd8c36c1f713584d2cc996670be04f298497 100644
--- a/libcorpus2/tagging.h
+++ b/libcorpus2/tagging.h
@@ -45,6 +45,13 @@ Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 /** Returns the number of set elements belonging to the mask given. */
 int mask_card(const Tag& mask);

+/**
+ * Returns a copy of the given input tag with the attribute referred
+ * to by attr_mask set to attr_value (which may be empty).
+ * NOTE: only the attribute parts of the masks are considered.
+ */
+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask);
+
 /** Forces one disamb lexeme per token. The selection is based on tagset
  * definition order. Returns if any disamb found.
  */
@@ -99,6 +106,7 @@ bool disambiguate_subset(Token* token, const Tag& mask_where,

 /** Sets lexemes' disamb markers iff lexeme.tag is wanted_tag. */
 void set_disambs(Token *token, const Tag& wanted_tag);
+
 } /* end ns Corpus2 */

 #endif // LIBCORPUS2_TAGGING_H

diff --git a/libcorpus2_whole/io/cclrelreader.cpp b/libcorpus2_whole/io/cclrelreader.cpp
index 9072f2b1f3cbd2ab7c4593f633dc370762c99ab5..ee84d97d50296ec49e252d9bf9abf797558ba62e 100644
--- a/libcorpus2_whole/io/cclrelreader.cpp
+++ b/libcorpus2_whole/io/cclrelreader.cpp
@@ -72,11 +72,7 @@ namespace whole {

     void CclRelReader::set_option(const std::string& option)
     {
-        if (option == "autogen_sent_id") {
-            ccl_reader_->set_option("autogen_sent_id");
-        } else if (option == "autogen_chunk_id") {
-            ccl_reader_->set_option("autogen_chunk_id");
-        }
+        ccl_reader_->set_option(option);
     }

     std::string CclRelReader::get_option(const std::string& option) const {

diff --git a/swig/annotatedsentence.i b/swig/annotatedsentence.i
index c158c4aa603da845d80a225b89233845ccd080e9..f3d2b4e407e1a41a348904550539477e05a58207 100644
--- a/swig/annotatedsentence.i
+++ b/swig/annotatedsentence.i
@@ -50,6 +50,12 @@ namespace Corpus2 {
     AnnotationChannel& get_channel(const std::string& name);
     const AnnotationChannel& get_channel(const std::string& name) const;
     bool add_channel(const std::string& name, const AnnotationChannel& chan);
+
+    /* Use with care. Note that get_channel returns a reference to the original
+       channel, hence it is strongly recommended to del the channel ref object
+       before attempting to call remove_channel. */
+    bool remove_channel(const std::string& name);
+
     const chan_map_t& all_channels() const;
     /* --------------------------------------------------------------------- */

diff --git a/swig/tagging.i b/swig/tagging.i
index ec90902b9fc3121809429e8807a85a44e2a95ae1..b4695cef1dd7f248d582b201f62dfe68f45fc36b 100644
--- a/swig/tagging.i
+++ b/swig/tagging.i
@@ -21,6 +21,8 @@
 Tag mask_token(const Token& token, const Tag& mask, bool disamb_only);
 int mask_card(const Tag& mask);

+Tag with_values_masked(Tag input, Tag attr_value, Tag attr_mask);
+
 bool select_preferred_disamb(const Tagset& tagset, Token* token);

 void select_preferred_lexeme(const Tagset& tagset, Token* token);

diff --git a/swig/token.i b/swig/token.i
index 0290fd404b1c0c5f27b49bb350ae0930a9f62b1c..7f6c73c42e5c306f0d9ec138fef67c9b08cf78b0 100644
--- a/swig/token.i
+++ b/swig/token.i
@@ -31,7 +31,7 @@ namespace Corpus2 {
     Token(const UnicodeString& orth, PwrNlp::Whitespace::Enum wa);

     Token* clone() const;
-    Token* create_utf8(const std::string& orth_utf8, PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space);
+    static Token* create_utf8(const std::string& orth_utf8, PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space);

     const UnicodeString& orth() const;
     std::string orth_utf8() const;
@@ -67,6 +67,8 @@ namespace Corpus2 {
     boost::shared_ptr<TokenMetaData> get_metadata() const;
     void set_metadata(TokenMetaData& md);
     void set_metadata_ptr(boost::shared_ptr<TokenMetaData> md);
+
+    void create_metadata();
 };

 %extend Token {
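These additions surface in Python through the SWIG wrappers above. A hedged usage sketch follows; wrap_sentence and get_channel are used the same way in relation_eval.py further down, while parse_symbol and get_attribute_mask are assumed parts of the existing corpus2 tagging API (they do not appear in this diff):

import corpus2

tagset = corpus2.get_named_tagset('nkjp')

# with_values_masked: overwrite a single attribute of a tag
tag = tagset.parse_symbol('subst')                    # assumed existing API
cas_mask = corpus2.get_attribute_mask(tagset, 'cas')  # assumed existing API
nom = tagset.parse_symbol('nom')
fixed = corpus2.with_values_masked(tag, nom, cas_mask)

# remove_channel: drop any channel reference first, as the comment advises
def drop_channel(ann_sent, name):
    chan = ann_sent.get_channel(name)  # reference to the original channel
    del chan                           # release it before removal
    return ann_sent.remove_channel(name)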
diff --git a/utils/get_morpho.py b/utils/get_morpho.py
new file mode 100755
index 0000000000000000000000000000000000000000..ec724b0a0a3534df65c1114fd2e9278d1290f464
--- /dev/null
+++ b/utils/get_morpho.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2013 Adam Radziszewski.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+
+descr = """%prog [options] CORPUS OUTDATAFILE
+
+Gathers morphological data from the given corpus.
+
+The extracted morphological data will be saved to OUTDATAFILE in tab-delimited
+format (compliant with Maca analysers).
+NOTE: the morphological data will not be compacted. To do so, you may use
+tabclean.py script from Maca repository.
+"""
+
+import sys, codecs
+from optparse import OptionParser
+import corpus2
+
+class Analyser:
+    """Morphological dictionary resulting from reading a corpus.
+    Provides consume function that updates the data with the given token."""
+    def __init__(self, tagset, case_sensitive):
+        self.tagset = tagset
+        self.unk_tag = self.tagset.make_ign_tag()
+        self.get_form = (lambda form: form) if case_sensitive else (lambda form: form.lower())
+        self.data = {}
+
+    def _add_one(self, ready_form, lex):
+        """Adds tag and lemma from the given lexeme."""
+        if ready_form not in self.data:
+            self.data[ready_form] = set()
+        self.data[ready_form].add(
+            (unicode(lex.lemma()), self.tagset.tag_to_string(lex.tag()))
+        )
+
+    def consume(self, tok):
+        ready_form = self.get_form(unicode(tok.orth()))
+        for lex in tok.lexemes():
+            if lex.tag() != self.unk_tag:
+                self._add_one(ready_form, lex)
+
+    def write(self, output):
+        for form in sorted(self.data):
+            entries = sorted(self.data[form])
+            for lemma, tag in entries:
+                output.write(u'%s\t%s\t%s\n' % (form, lemma, tag))
+
+    def save(self, fname):
+        output = codecs.open(fname, 'wb', 'utf-8')
+        self.write(output)
+        output.close()
+
+def get_morpho(options, corpname, outfname):
+    tagset = corpus2.get_named_tagset(options.tagset)
+    anal = Analyser(tagset, options.case_sens)
+    rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, corpname)
+    while True:
+        tok = rdr.get_next_token()
+        if not tok:
+            break
+        anal.consume(tok)
+    del rdr
+    anal.save(outfname)
+
+def go():
+    parser = OptionParser(usage=descr)
+    parser.add_option('-i', '--input-format', type='string', action='store',
+        dest='input_format', default='xces',
+        help='set the input format; default: xces')
+    parser.add_option('-t', '--tagset', type='string', action='store',
+        dest='tagset', default='nkjp',
+        help='set the tagset used in input; default: nkjp')
+    parser.add_option('-s', '--sep', type='string', action='store',
+        dest='sep', default='\t',
+        help='set the separator used in morpho file; default: tab character')
+    parser.add_option('-c', '--case-sensitive', action='store_true', default=False, dest='case_sens')
+    parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
+    (options, args) = parser.parse_args()
+
+    if len(args) != 2:
+        print 'You need to provide an input corpus and output path'
+        print
+        parser.print_help()
+        sys.exit(1)
+
+    corpname = args[0]
+    outfname = args[1]
+
+    get_morpho(options, corpname, outfname)
+
+if __name__ == '__main__':
+    go()
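The script's output is the plain tab-delimited morphological data Maca consumes: one form<TAB>lemma<TAB>tag reading per line. A hypothetical invocation (file names are placeholders) and a minimal sketch that loads such a file back into a dictionary of readings:

import codecs

# e.g.: ./utils/get_morpho.py -i ccl -t nkjp corpus.xml morpho.tab
def load_morpho(fname):
    data = {}
    for line in codecs.open(fname, 'rb', 'utf-8'):
        form, lemma, tag = line.rstrip(u'\n').split(u'\t')
        data.setdefault(form, set()).add((lemma, tag))
    return data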
diff --git a/utils/relation_eval.py b/utils/relation_eval.py
index 4067051f9c989a1b69e8a6e3c0df0d81b06e9959..0daa15a337e23cb40df2222c496c9b37fa9b1163 100755
--- a/utils/relation_eval.py
+++ b/utils/relation_eval.py
@@ -1,4 +1,5 @@
 #!/usr/bin/python
+# -*- coding: utf-8 -*-

 # Copyright (C) 2012 Paweł Orłowicz.
 # This program is free software; you can redistribute and/or modify it
@@ -31,7 +32,7 @@ from optparse import OptionParser
 import sys
 import corpus2

-class RelStats :
+class RelStats:
     def __init__(self):
         self.both_hits = 0
         self.head_hits = 0
@@ -39,7 +40,7 @@
         self.any_hits = 0

     #helper method to get annotation vector from annotated sentence
-    def get_channel_annotations(self, ann_sent, dir_point) :
+    def get_channel_annotations(self, ann_sent, dir_point):
         chann_name = dir_point.channel_name()
         annotation_number = dir_point.annotation_number() - 1
         channel = ann_sent.get_channel(chann_name)
@@ -47,7 +48,7 @@
         return ann_vec[annotation_number]

     #helper method to get list of tokens' indices
-    def get_indices(self, annotated_sentence, direction_point) :
+    def get_indices(self, annotated_sentence, direction_point):
         ann_chann = self.get_channel_annotations(annotated_sentence, direction_point)
         indices = ann_chann.indices
         #loop to unwrap Integer objects from ann_chann.indices
@@ -58,34 +59,38 @@
         return inds

     #helper to get index of the chunk's head
-    def get_head_index(self, annotated_sentence, direction_point) :
+    def get_head_index(self, annotated_sentence, direction_point):
         ann_chann = self.get_channel_annotations(annotated_sentence, direction_point)
         head_index = ann_chann.head_index
         return head_index

     #returns values of hits from one direction point of relation
-    def verify_relation(self, ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target) :
+    def verify_relation(self, ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target):
         both, head, chun = 0,0,0
         #if indices from ref chunk and target chunks equals (tokens are the same) then chun hits
-        if self.get_indices(ref_ann_sent, dir_point_ref) == self.get_indices(target_ann_sent, dir_point_target) :
-            chun += 1
-            #if chun hits and head indices match then head hits
-            if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target) :
-                head +=1
+        if self.get_indices(ref_ann_sent, dir_point_ref) == self.get_indices(target_ann_sent, dir_point_target):
+            chun = 1
+#        if chun hits and head indices match then head hits
+#        if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target):
+#            head =1
         #if indices are different (chunks consists of different sets of words) but heads match then head hits
-        elif self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target) :
-            head += 1
+        if self.get_head_index(ref_ann_sent, dir_point_ref) == self.get_head_index(target_ann_sent, dir_point_target):
+            head = 1
+        if chun == 1 and head == 1:
+            both = 1
         return both,chun,head

     #if there was a hit on both sides of relation (dir_from, dir_to) then update counters
-    def update_stats(self, both, chun, head) :
-        if chun == 2 :
+    def update_stats(self, both, chun, head):
+        if chun == 2:
             self.chun_hits+=1
-        if head == 2 :
+        if head == 2:
             self.head_hits += 1
-        if chun == 2 and head == 2 :
+        if chun == 2 and head == 2:
             self.both_hits += 1
-        if chun == 2 or head == 2:
+        if both > 0 and chun+head > 2:
+            self.any_hits+=1
+        if both == 0 and chun+head > 1:
             self.any_hits+=1

     def print_stats(self,ref_rels_count, target_rels_count, stat_mode):
@@ -114,35 +119,35 @@ class RelStats :
         print ('Head match:\t')
         print '%.2f\t%.2f\t%.2f' % (p, r, f)

-def compare(rel1, rel2) :
+def compare(rel1, rel2):
     dp1_from = rel1.rel_from()
     dp2_from = rel2.rel_from()
     dp1_to = rel1.rel_to()
     dp2_to = rel2.rel_to()

-    if cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) < 0 :
+    if cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) < 0:
         return -1
-    elif cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) > 0 :
+    elif cmp(dp1_from.sentence_id(), dp2_from.sentence_id()) > 0:
         return 1
-    if cmp(dp1_from.channel_name(), dp2_from.channel_name()) < 0 :
+    if cmp(dp1_from.channel_name(), dp2_from.channel_name()) < 0:
         return -1
-    elif cmp(dp1_from.channel_name(), dp2_from.channel_name()) > 0 :
+    elif cmp(dp1_from.channel_name(), dp2_from.channel_name()) > 0:
         return 1
     if cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) < 0:
         return -1
-    elif cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) > 0 :
+    elif cmp(dp1_from.annotation_number(), dp2_from.annotation_number()) > 0:
         return 1
-    if cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) < 0 :
+    if cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) < 0:
         return -1
-    elif cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) > 0 :
+    elif cmp(dp1_to.sentence_id(), dp2_to.sentence_id()) > 0:
         return 1
-    if cmp(dp1_to.channel_name(), dp2_to.channel_name()) < 0 :
+    if cmp(dp1_to.channel_name(), dp2_to.channel_name()) < 0:
         return -1
-    elif cmp(dp1_to.channel_name(), dp2_to.channel_name()) > 0 :
+    elif cmp(dp1_to.channel_name(), dp2_to.channel_name()) > 0:
         return 1
     if cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) < 0:
         return -1
-    elif cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) > 0 :
+    elif cmp(dp1_to.annotation_number(), dp2_to.annotation_number()) > 0:
         return 1

     if rel1.rel_name() < rel2.rel_name():
@@ -169,7 +174,6 @@ def go():
         sys.exit(1)

     batch_ref, batch_target, rel_name = args
-
     rel_stats = RelStats()

     corpus_type = "document"
@@ -182,7 +186,7 @@ def go():
     target_file = open(batch_target, "r")
     line_ref = ref_file.readline()
     line_target = target_file.readline()
-    while line_ref and line_target :
+    while line_ref and line_target:
         line_ref = line_ref.strip()
         ref_ccl_filename, ref_rel_filename = line_ref.split(";")

@@ -190,7 +194,6 @@ def go():
         line_target = line_target.strip()
         target_ccl_filename, target_rel_filename = line_target.split(";")
-
         ref_ccl_rdr = corpus2.CclRelReader(tagset, ref_ccl_filename, ref_rel_filename)
         target_ccl_rdr = corpus2.CclRelReader(tagset, target_ccl_filename, target_rel_filename)

@@ -205,15 +208,14 @@ def go():
         ref_sents = dict([ (s.id(), corpus2.AnnotatedSentence.wrap_sentence(s)) for c in ref_doc.paragraphs() for s in c.sentences()])
         target_sents = dict([ (s.id(), corpus2.AnnotatedSentence.wrap_sentence(s)) for c in target_doc.paragraphs() for s in c.sentences()])
-
-        for pattern in ref_rels :
+        for pattern in ref_rels:
             t = filter(lambda x : (compare(x, pattern) == 0) , target_rels)
-            if len(t) > 0 :
+            if len(t) > 0:
                 t = t[0]
                 r = pattern
                 both, chun, head = 0,0,0
-                for dir_point_ref, dir_point_target in zip([r.rel_from(), r.rel_to()], [t.rel_from(), t.rel_to()]) :
+                for dir_point_ref, dir_point_target in zip([r.rel_from(), r.rel_to()], [t.rel_from(), t.rel_to()]):
                     ref_ann_sent = ref_sents[dir_point_ref.sentence_id()]
                     target_ann_sent = target_sents[dir_point_target.sentence_id()]
                     b,c,h = rel_stats.verify_relation(ref_ann_sent, dir_point_ref, target_ann_sent, dir_point_target)
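On the reworked scoring: verify_relation now returns 0/1 flags for a single direction point, and the driver in go() evaluates both endpoints of a relation (rel_from and rel_to), so update_stats receives per-relation sums in the 0..2 range and requires 2 for a full match. A small sketch of that reading (the summation itself is implied by the == 2 tests rather than shown in the hunks above):

# per relation: sum the endpoint flags returned by verify_relation
both = chun = head = 0
for b, c, h in [(1, 1, 1), (0, 0, 1)]:  # e.g. rel_from fully hit, rel_to head-only
    both += b
    chun += c
    head += h
# update_stats(both, chun, head): chun == 1 means no full chunk match,
# but head == 2 counts a head match on both endpoints, and the relation
# also satisfies the any_hits test (both > 0 and chun+head > 2).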
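The cmp() cascade in compare() above implements a lexicographic order over the two direction points and the relation name. An equivalent, easier-to-scan formulation as a tuple key (a sketch in the script's Python 2 idiom, using the same accessors):

def rel_key(rel):
    f, t = rel.rel_from(), rel.rel_to()
    return (f.sentence_id(), f.channel_name(), f.annotation_number(),
            t.sentence_id(), t.channel_name(), t.annotation_number(),
            rel.rel_name())

# cmp(rel_key(rel1), rel_key(rel2)) agrees with compare(rel1, rel2)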
diff --git a/utils/tagger-eval.py b/utils/tagger-eval.py
index 6a2c406c9621254d6675b5b5d37db88353759f6f..86a70df180f1a08102fa185e4c236e32d9f6a9c8 100755
--- a/utils/tagger-eval.py
+++ b/utils/tagger-eval.py
@@ -148,6 +148,7 @@ class Metric:
     POS_WC = ([Feat.WEAK_POS_HIT], None)
     POS_SC = ([Feat.STRONG_POS_HIT], None)
     POS_WC_LOWER = ([Feat.WEAK_POS_HIT, Feat.SEG_NOCHANGE], None) # lower bound for POS WC
+    POS_SC_LOWER = ([Feat.STRONG_POS_HIT, Feat.SEG_NOCHANGE], None) # lower bound for POS SC
     # separate stats for known and unknown forms
     KN_WC = ([Feat.WEAK_TAG_HIT, Feat.KNOWN], [Feat.KNOWN])
     UNK_WC = ([Feat.WEAK_TAG_HIT, Feat.UNKNOWN], [Feat.UNKNOWN])
@@ -160,6 +161,9 @@ class Metric:
     UNK_WC_LOWER = ([Feat.WEAK_TAG_HIT, Feat.SEG_NOCHANGE, Feat.UNKNOWN], [Feat.UNKNOWN])
     KN_SEG_CHANGE = ([Feat.SEG_CHANGE, Feat.KNOWN], [Feat.KNOWN])
     UNK_SEG_CHANGE = ([Feat.SEG_CHANGE, Feat.UNKNOWN], [Feat.UNKNOWN])
+
+    KN_POS_SC_LOWER = ([Feat.STRONG_POS_HIT, Feat.SEG_NOCHANGE, Feat.KNOWN], [Feat.KNOWN])
+    UNK_POS_SC_LOWER = ([Feat.STRONG_POS_HIT, Feat.SEG_NOCHANGE, Feat.UNKNOWN], [Feat.UNKNOWN])
     # heur recover
     PUNCHIT_PUNCONLY = ([Feat.ALLPUNC_HIT], None)
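Each Metric above is a pair: the features a token must carry to count as a hit, and an optional filter restricting the token population (None means all tokens). The new *_LOWER variants add Feat.SEG_NOCHANGE, so tokens whose segmentation changed can never score, which makes the figure a lower bound on strong POS accuracy. A sketch of that reading, assuming each token is described by a set of Feat values (the actual aggregation code in tagger-eval.py is not shown in this hunk):

def metric_value(token_feat_sets, match_feats, filter_feats=None):
    # restrict the population when a filter is given (e.g. known forms only)
    if filter_feats:
        token_feat_sets = [fs for fs in token_feat_sets
                           if all(f in fs for f in filter_feats)]
    hits = [fs for fs in token_feat_sets
            if all(f in fs for f in match_feats)]
    return float(len(hits)) / len(token_feat_sets) if token_feat_sets else 0.0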