Skip to content
Snippets Groups Projects
Commit dd9a3136 authored by ilor's avatar ilor
Browse files

IOB-CHAN~~~~

parent cbf8a71a
No related merge requests found
...@@ -59,6 +59,7 @@ SET(libcorpus2_STAT_SRC ...@@ -59,6 +59,7 @@ SET(libcorpus2_STAT_SRC
io/cclwriter.cpp io/cclwriter.cpp
io/helpers.cpp io/helpers.cpp
io/fastxces.cpp io/fastxces.cpp
io/iob-chan.cpp
io/nonewriter.cpp io/nonewriter.cpp
io/orthwriter.cpp io/orthwriter.cpp
io/pathwriter.cpp io/pathwriter.cpp
......
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include <libcorpus2/io/iob-chan.h>
#include <libpwrutils/foreach.h>
#include <libcorpus2/ann/annotatedsentence.h>
#include <boost/algorithm/string.hpp>
#include <boost/make_shared.hpp>
#include <fstream>
#include <boost/algorithm/string/split.hpp>
namespace Corpus2 {
// Static registration of this writer with TokenWriter under the class
// name "iob-chan". The second argument is presumably the list of accepted
// parameters (the constructor handles "nowarn" and "noforce") -- confirm
// against TokenWriter::register_writer.
bool IobChanWriter::registered =
	TokenWriter::register_writer<IobChanWriter>("iob-chan", "nowarn,noforce");
/**
 * Create a writer bound to the given output stream and tagset.
 *
 * Both flags start out enabled; the parameter list can switch them off:
 *  - "nowarn"  disables the warning printed for tokens without lexemes,
 *  - "noforce" disables the make_iob_from_segments() pass in write_sentence().
 * Any other parameter is ignored by this constructor (all parameters are
 * also handed to the TokenWriter base).
 */
IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset,
		const string_range_vector& params)
	: TokenWriter(os, tagset, params)
	, warn_on_no_lexemes_(true)
	, force_(true)
{
	foreach (const string_range& raw_param, params) {
		const std::string name = boost::copy_range<std::string>(raw_param);
		if (name == "noforce") {
			force_ = false;
		} else if (name == "nowarn") {
			warn_on_no_lexemes_ = false;
		}
	}
}
void IobChanWriter::write_token(const Token& t)
{
os() << t.orth_utf8();
if (t.lexemes().empty()) {
if (warn_on_no_lexemes_) {
std::cerr << "No lexemes for token!";
}
} else {
const Lexeme& pref = t.get_preferred_lexeme(tagset());
os() << "\t";
write_tag(pref.tag());
}
os() << "\n";
}
/**
 * Write one sentence, one token per line, followed by a blank line.
 *
 * A token line consists of the orth, a tab, the lemma of the preferred
 * lexeme, a tab, its tag, a tab, and then -- when the sentence is an
 * AnnotatedSentence -- a comma-separated list of "<channel>-<IOB>" items,
 * one per channel. A token without lexemes gets only its orth (directly
 * followed by the annotation list, if any) and triggers a warning on stderr
 * unless warnings were disabled.
 *
 * When force_ is set and the sentence is annotated,
 * make_iob_from_segments() is called on every channel before writing.
 */
void IobChanWriter::write_sentence(const Sentence& s)
{
	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
	// The cast yields a null pointer for a sentence that is not an
	// AnnotatedSentence; the forced refresh must then be skipped instead of
	// dereferencing null.
	if (force_ && ann) {
		// The refresh mutates the channels, hence the const_cast on a
		// sentence that was passed in as const.
		AnnotatedSentence* hax = const_cast<AnnotatedSentence*>(ann);
		foreach (const AnnotatedSentence::chan_map_t::value_type& v, hax->all_channels()) {
			hax->get_channel(v.first).make_iob_from_segments();
		}
	}
	for (size_t idx = 0; idx < s.size(); ++idx) {
		const Token* t = s.tokens()[idx];
		os() << t->orth_utf8();
		if (t->lexemes().empty()) {
			if (warn_on_no_lexemes_) {
				std::cerr << "No lexemes for token!";
			}
		} else {
			const Lexeme& pref = t->get_preferred_lexeme(tagset());
			os() << "\t";
			os() << pref.lemma_utf8();
			os() << "\t";
			write_tag(pref.tag());
			os() << "\t";
		}
		if (ann) {
			// Comma-separated "<channel>-<IOB>" items, no trailing comma.
			bool first = true;
			foreach (const AnnotatedSentence::chan_map_t::value_type& v, ann->all_channels()) {
				if (!first) {
					os() << ",";
				}
				os() << v.first << "-";
				os() << Corpus2::IOB::to_string(v.second.get_iob_at(idx));
				first = false;
			}
		}
		os() << "\n";
	}
	// Blank line terminates the sentence.
	os() << "\n";
}
/**
 * Write a chunk by writing each of its sentences in order via
 * write_sentence(). Nothing chunk-specific is emitted.
 */
void IobChanWriter::write_chunk(const Chunk& c)
{
	foreach (const Sentence::ConstPtr& sentence_ptr, c.sentences()) {
		const Sentence& sentence = *sentence_ptr;
		write_sentence(sentence);
	}
}
void IobChanWriter::write_tag(const Tag& tag)
{
os() << tagset().tag_to_string(tag);
}
// Static registration of this reader with TokenReader under the class
// name "iob-chan". The second argument is presumably the list of supported
// options; only "no_set_disamb" is handled in this class, the rest are
// forwarded to BufferedSentenceReader -- confirm against
// TokenReader::register_reader.
bool IobChanReader::registered =
	TokenReader::register_reader<IobChanReader>(
		"iob-chan", "ign,loose,strict,no_set_disamb");
/**
 * Create a reader over an existing stream. The stream is only referenced,
 * not owned (is_owned_ stays empty), so it must outlive the reader.
 * Lexemes read are marked as disamb by default.
 */
IobChanReader::IobChanReader(const Tagset& tagset, std::istream& is)
	: BufferedSentenceReader(tagset)
	, is_(&is)
	, disamb_(true)
{
}
/**
 * Create a reader over a file, which is opened here and owned by the
 * reader. Lexemes read are marked as disamb by default.
 *
 * @throws Corpus2Error when the freshly opened file stream is not good().
 */
IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename)
	: BufferedSentenceReader(tagset), is_(), disamb_(true)
{
	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
	if (!is_owned_->good()) {
		throw Corpus2Error("File not found!");
	}
	// Only point is_ at the stream once it is known to be usable.
	is_ = is_owned_.get();
}
/**
 * Read the next sentence from the input stream.
 *
 * Lines are consumed until an empty line is read or the stream stops being
 * good(). Every line must have exactly four tab-separated fields: orth,
 * lemma, tag and a comma-separated list of "<channel>-<IOB>" annotations.
 * Lines with a different field count are reported on stderr and skipped.
 * The annotation field may be empty (a sentence without channels); any
 * other malformed annotation item or unknown IOB tag is reported on stderr
 * and skipped.
 *
 * Each token gets a single lexeme, marked as disamb when disamb_ is set.
 * The first token of a sentence gets Newline preceding whitespace, the
 * others Space. After the last line, make_segments_from_iob() is called on
 * every channel.
 *
 * @return the sentence read, or a null pointer when no valid token line
 *         was read before the terminating empty line / end of input.
 */
Sentence::Ptr IobChanReader::actual_next_sentence()
{
	std::string line;
	boost::shared_ptr<AnnotatedSentence> s;
	while (is().good()) {
		std::getline(is(), line);
		if (line.empty()) {
			break;
		}
		std::vector<std::string> spl;
		boost::algorithm::split(spl, line, boost::is_any_of("\t"));
		if (spl.size() != 4) {
			std::cerr << "Invalid line: " << line << "(" << spl.size() << ")\n";
			continue;
		}
		const std::string& orth = spl[0];
		const std::string& lemma = spl[1];
		const std::string& tag_string = spl[2];
		const std::string& anns = spl[3];
		Tag tag = parse_tag(tag_string);
		Token* t = new Token();
		t->set_orth(UnicodeString::fromUTF8(orth));
		t->set_wa(PwrNlp::Whitespace::Space);
		t->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
		if (disamb_) {
			t->lexemes().back().set_disamb(true);
		}
		if (!s) {
			// First token of the sentence: create the sentence lazily and
			// mark the token as starting on a new line.
			s = boost::make_shared<AnnotatedSentence>();
			t->set_wa(PwrNlp::Whitespace::Newline);
		}
		s->append(t);
		if (anns.empty()) {
			// The writer emits an empty annotation column for sentences
			// without channels; splitting it would yield one empty item
			// and a spurious "Invalid annotation" report per token.
			continue;
		}
		std::vector<std::string> annsplit;
		boost::algorithm::split(annsplit, anns, boost::is_any_of(","));
		foreach (const std::string& a, annsplit) {
			std::vector<std::string> one_ann_split;
			boost::algorithm::split(one_ann_split, a, boost::is_any_of("-"));
			if (one_ann_split.size() != 2) {
				std::cerr << "Invalid annotation:" << a << "\n";
			} else {
				const std::string& aname = one_ann_split[0];
				const std::string& aiob = one_ann_split[1];
				Corpus2::IOB::Enum iob = Corpus2::IOB::from_string(aiob);
				// PostLast is what from_string() yields for a string that
				// is not a valid IOB tag (see the message below).
				if (iob == Corpus2::IOB::PostLast) {
					std::cerr << "Invalid IOB tag: " << aiob << "\n";
				} else {
					if (!s->has_channel(aname)) {
						s->create_channel(aname);
					}
					// The token was just appended, so it is the last one.
					s->get_channel(aname).set_iob_at(s->size() - 1, iob);
				}
			}
		}
	}
	if (s) {
		foreach (const AnnotatedSentence::chan_map_t::value_type& v, s->all_channels()) {
			s->get_channel(v.first).make_segments_from_iob();
		}
	}
	return s;
}
/**
 * Set a reader option. "no_set_disamb" stops lexemes from being marked as
 * disamb on read; every other option is passed on to
 * BufferedSentenceReader::set_option().
 */
void IobChanReader::set_option(const std::string &option)
{
	if (option != "no_set_disamb") {
		BufferedSentenceReader::set_option(option);
		return;
	}
	disamb_ = false;
}
/**
 * Query a reader option. For "no_set_disamb" the option name itself is
 * returned when the option is active (disamb marking is off) and an empty
 * string otherwise; every other option is looked up through
 * BufferedSentenceReader::get_option().
 */
std::string IobChanReader::get_option(const std::string &option) const
{
	if (option != "no_set_disamb") {
		return BufferedSentenceReader::get_option(option);
	}
	if (disamb_) {
		return "";
	}
	return option;
}
} /* end ns Corpus2 */
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
// Include guard: the macro tested and the macro defined must be the same
// name, otherwise a second inclusion redefines everything in this header.
#ifndef LIBCORPUS2_IO_IOB_CHAN_H
#define LIBCORPUS2_IO_IOB_CHAN_H
#include <libcorpus2/io/reader.h>
#include <libcorpus2/io/writer.h>
#include <boost/scoped_ptr.hpp>
namespace Corpus2 {
/**
* Writer that emits tokens in a tab-separated, one-token-per-line form
* with per-channel IOB annotations.
*
* write_sentence() prints, for every token, the orth, then the lemma and
* the tag of the preferred lexeme, and then a comma-separated list of
* "<channel>-<IOB>" items when the sentence is an AnnotatedSentence. Fields
* are separated by tabs and each sentence is followed by a blank line.
* write_token() prints only the orth and the tag of the preferred lexeme.
*
* No-lexeme tokens trigger a warning on stderr unless "nowarn" is passed.
* Unless "noforce" is passed, write_sentence() calls
* make_iob_from_segments() on every channel before writing.
*/
class IobChanWriter : public TokenWriter
{
public:
/// Create a writer; params may contain "nowarn" and/or "noforce".
IobChanWriter(std::ostream& os, const Tagset& tagset,
const string_range_vector& params);
/// Write one token line: orth and, if available, the preferred tag.
void write_token(const Token& t);
/// Write all token lines of a sentence followed by a blank line.
void write_sentence(const Sentence& s);
/// Write every sentence of the chunk via write_sentence().
void write_chunk(const Chunk &p);
/// Write the tagset's string form of the tag.
void write_tag(const Tag& tag);
/// Result of the static registration with TokenWriter.
static bool registered;
private:
/// Whether to warn on stderr about tokens that have no lexemes
bool warn_on_no_lexemes_;
/// Whether to call make_iob_from_segments() on channels before writing
bool force_;
};
/**
* Reader for the format produced by IobChanWriter::write_sentence():
* one token per line with four tab-separated fields (orth, lemma, tag,
* comma-separated "<channel>-<IOB>" annotations), sentences separated by
* an empty line. Sentences are returned as AnnotatedSentence objects.
*
* The "no_set_disamb" option turns off marking the read lexemes as disamb.
*/
class IobChanReader : public BufferedSentenceReader
{
public:
/// Read from an existing stream; the stream is referenced, not owned.
IobChanReader(const Tagset& tagset, std::istream& is);
/// Open and read from the named file; throws Corpus2Error if the
/// opened stream is not good().
IobChanReader(const Tagset& tagset, const std::string& filename);
/// The input stream in use (either the external or the owned one).
std::istream& is() {
return *is_;
}
/// Handle "no_set_disamb"; forward anything else to the base class.
void set_option(const std::string& option);
/// Report "no_set_disamb" state; forward anything else to the base class.
std::string get_option(const std::string& option) const;
/// Result of the static registration with TokenReader.
static bool registered;
protected:
/// BufferedSentenceReader override
Sentence::Ptr actual_next_sentence();
/// Stream actually read from; points at is_owned_ when a file was opened
std::istream* is_;
/// Owned file stream, set only by the filename constructor
boost::scoped_ptr<std::istream> is_owned_;
/// Whether to mark all incoming tags as disambiguated
bool disamb_;
};
} /* end ns Corpus2 */
#endif // LIBCORPUS2_IO_IOB_CHAN_H
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment