Skip to content
Snippets Groups Projects
Commit 7e52f5e3 authored by Pawel Orlowicz
Browse files

Merge branch 'master' of nlp.pwr.wroc.pl:corpus2

parents 89e3a78c ff0a41e5
Branches
No related merge requests found
......@@ -7,12 +7,52 @@
namespace Corpus2 {
// Registers ConllWriter through TokenWriter::register_writer under the
// name "conll" during static initialisation; the value returned by the
// registration call is stored in this flag.
bool ConllWriter::registered = TokenWriter::register_writer<ConllWriter>("conll");
// Name of the tagset attribute ("superpos") that the ConllWriter
// constructor looks up and validates.
const std::string ConllWriter::SUPERPOS_ATTR("superpos");
/**
 * Constructs a CONLL writer and validates that the given tagset can be
 * used with it.
 *
 * The tagset must define an attribute named SUPERPOS_ATTR and, for every
 * grammatical class, that attribute must be marked as required and be the
 * first attribute of the class.
 *
 * @param os      output stream, forwarded to TokenWriter
 * @param tagset  tagset to write with; copied into myTagset
 * @param params  writer parameters, forwarded to TokenWriter
 * @throws Corpus2Error if any of the tagset requirements above is not met
 */
ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
        const string_range_vector& params)
    : TokenWriter(os, tagset, params)
{
    myTagset = tagset;

    // check if the tagset contains 'superpos' attribute
    const idx_t superpos_attr = myTagset.get_attribute_index(SUPERPOS_ATTR);
    if (superpos_attr == -1) {
        throw Corpus2Error("Tagset " + myTagset.name() +
                " contains no 'superpos' attribute"
                " (required by CONLL format)");
    }

    // ensure that the 'superpos' attribute is obligatory and first
    // for each of the gram. classes defined
    for (idx_t pos = 0; pos < myTagset.pos_count(); ++pos) {
        // bound by reference so that no copy is made per class
        const std::vector<bool>& req_attrs =
                myTagset.get_pos_required_attributes(pos);

        // superpos_attr is the index of the 'superpos' attribute;
        // it must be within the range of the required-attribute flags
        // for this class...
        if (static_cast<idx_t>(req_attrs.size()) <= superpos_attr) {
            throw Corpus2Error("Tagset " + myTagset.name() +
                    " should define 'superpos' attribute for each"
                    " grammatical class (req. by CONLL writer)");
        }
        // ...and the attribute must be marked as required
        if (!req_attrs[superpos_attr]) {
            throw Corpus2Error("Tagset " + myTagset.name() +
                    " should define 'superpos' attribute"
                    " as REQUIRED for each class"
                    " (req. by CONLL writer)");
        }
        // ensure that no attribute comes before superpos; an empty
        // attribute list is rejected too instead of being indexed
        if (myTagset.get_pos_attributes(pos).empty() ||
                myTagset.get_pos_attributes(pos)[0] != superpos_attr) {
            throw Corpus2Error("Tagset " + myTagset.name() +
                    " should define 'superpos' attribute"
                    " as the FIRST one for each class"
                    " (req. by CONLL writer)");
        }
    }
}
ConllWriter::~ConllWriter()
......@@ -22,25 +62,36 @@ ConllWriter::~ConllWriter()
void ConllWriter::write_token(const Token &t)
{
os()<<t.orth_utf8()<<"\t";
Lexeme lex = t.get_preferred_lexeme(myTagset);
os()<<lex.lemma_utf8()+"\t";
std::string tag = myTagset.tag_to_string(lex.tag());
std::vector<std::string> strs;
std::transform(tag.begin(), tag.end(), tag.begin(), ::tolower);
boost::split(strs, tag, boost::is_any_of(":"));
os()<<strs[1]<<"\t"<<strs[0]<<"\t";
if(strs.size()>2)
const Lexeme &lex = t.get_preferred_lexeme(myTagset);
os() << t.orth_utf8() << "\t" << lex.lemma_utf8() << "\t";
// get lower-case tag representation
std::string tagstr = myTagset.tag_to_string(lex.tag());
std::transform(tagstr.begin(), tagstr.end(), tagstr.begin(), ::tolower);
// ugly, but should work: split the lower tag repr on colons
std::vector<std::string> segs;
boost::split(segs, tagstr, boost::is_any_of(":"));
// now write each part of the split string and pad the non-existent
// attributes with _
// (ctr has asserted that after the obligatory gram. class comes
// 'superpos' attribute, so it is safe to assume there are always
// at least 2 segments)
os() << segs[1] << "\t" << segs[0] << "\t";
if(segs.size() > 2)
{
size_t i;
for(i=2;i<strs.size()-1;i++)
for(i = 2; i < segs.size() - 1; i++)
{
os()<<strs[i]<<"|";
os() << segs[i] <<"|";
}
os()<<strs[i]<<"\t_\t_\t_\t_";
os() << segs[i] << "\t_\t_\t_\t_";
}
else
os()<<"_\t_\t_\t_\t_";
{
os()<< "_\t_\t_\t_\t_";
}
}
void ConllWriter::write_sentence(const Sentence& s)
......
......@@ -5,6 +5,14 @@
namespace Corpus2 {
/**
* Writer in the CONLL format (as required by MALT parser). The writer
* assumes that the tagset used employs an attribute named 'superpos'
* (this naming is obligatory) and the attribute is defined as first
* and required for each grammatical class. This attribute is used to
* designate a more general POS category for each token (e.g. all verb
* classes could be marked as VERB there).
*/
class ConllWriter : public TokenWriter
{
public:
......@@ -19,6 +27,7 @@ public:
void write_chunk(const Chunk &c);
const static std::string SUPERPOS_ATTR;
static bool registered;
protected:
......
......@@ -135,6 +135,7 @@ class Metric:
# as above but metric for POS hits
POS_WC = ([Feat.WEAK_POS_HIT], None)
POS_SC = ([Feat.STRONG_POS_HIT], None)
POS_WC_LOWER = ([Feat.WEAK_POS_HIT, Feat.SEG_NOCHANGE], None) # lower bound for POS WC
# separate stats for known and unknown forms
KN_WC = ([Feat.WEAK_TAG_HIT, Feat.KNOWN], [Feat.KNOWN])
UNK_WC = ([Feat.WEAK_TAG_HIT, Feat.UNKNOWN], [Feat.UNKNOWN])
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment