Skip to content
Snippets Groups Projects
Commit 6968244d authored by ilor
Browse files

Merge branch 'master' into annotations

parents 381668c5 b6c9b5fa
Branches
No related tags found
No related merge requests found
......@@ -18,6 +18,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include <libcorpus2/io/sax.h>
#include <libpwrutils/foreach.h>
#include <libxml++/libxml++.h>
#include <libxml2/libxml/parser.h>
#include <boost/make_shared.hpp>
#include <fstream>
......@@ -37,6 +38,8 @@ protected:
const AttributeList& attributes);
void on_end_element(const Glib::ustring & name);
void finish_sentence();
const Tagset& tagset_;
enum state_t { XS_NONE, XS_CHUNK, XS_SENTENCE, XS_TOK, XS_ORTH, XS_LEX,
......@@ -45,6 +48,8 @@ protected:
bool chunkless_;
bool out_of_chunk_;
PwrNlp::Whitespace::Enum wa_;
Glib::ustring sbuf_;
......@@ -105,7 +110,7 @@ XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std::deque< boost::shared_ptr<Chunk> >& obuf,
bool disamb_only, bool disamb_sh)
: BasicSaxParser()
, tagset_(tagset), state_(XS_NONE), chunkless_(false)
, tagset_(tagset), state_(XS_NONE), chunkless_(false), out_of_chunk_(false)
, wa_(PwrNlp::Whitespace::Newline)
, sbuf_(), tok_(NULL), sent_(), chunk_(), obuf_(obuf)
, disamb_only_(disamb_only), disamb_sh_(disamb_sh)
......@@ -127,6 +132,10 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
type = a.value;
}
}
if (out_of_chunk_) {
finish_sentence();
out_of_chunk_ = false;
}
if (state_ == XS_NONE) {
if (type == "s") {
//throw XcesError("Top level <chunk> is type=\"s\"");
......@@ -191,6 +200,31 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
clear_buf();
} else if (name == "ns") {
wa_ = PwrNlp::Whitespace::None;
} else if (name == "tok" && state_ == XS_NONE) {
std::cerr << "Warning: out-of-chunk token, assuming sentence start on line ";
std::cerr << this->context_->input->line << "\n";
chunkless_ = true;
out_of_chunk_ = true;
chunk_ = boost::make_shared<Chunk>();
sent_ = boost::make_shared<Sentence>();
state_ = XS_TOK;
tok_ = new Token();
tok_->set_wa(wa_);
wa_ = PwrNlp::Whitespace::Space;
}
}
/**
 * Close the sentence currently being built.
 *
 * The sentence held in sent_ is appended to chunk_ and sent_ is released.
 * What happens next depends on chunkless_:
 *  - not set: the parser state becomes XS_CHUNK and chunk_ is kept;
 *  - set: chunk_ is pushed onto the output buffer obuf_ and released,
 *    chunkless_ is cleared and the parser state becomes XS_NONE.
 */
void XcesReaderImpl::finish_sentence()
{
	// Hand the finished sentence over to its chunk and drop our reference.
	chunk_->append(sent_);
	sent_.reset();

	if (!chunkless_) {
		// Regular case: stay inside the chunk that contains the sentence.
		state_ = XS_CHUNK;
		return;
	}

	// Chunkless case: the chunk is complete as soon as its sentence is,
	// so emit it right away and go back to the top-level state.
	obuf_.push_back(chunk_);
	chunk_.reset();
	chunkless_ = false;
	state_ = XS_NONE;
}
......@@ -216,16 +250,7 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
tok_ = NULL;
state_ = XS_SENTENCE;
} else if (state_ == XS_SENTENCE && name == "chunk") {
chunk_->append(sent_);
sent_.reset();
if (chunkless_) {
obuf_.push_back(chunk_);
chunk_.reset();
state_ = XS_NONE;
chunkless_ = false;
} else {
state_ = XS_CHUNK;
}
finish_sentence();
} else if (state_ == XS_CHUNK && name == "chunk") {
obuf_.push_back(chunk_);
chunk_.reset();
......
......@@ -141,6 +141,7 @@ namespace {
std::vector< mask_t > & current,
const std::vector<mask_t> & to_add, mask_t to_add_attr)
{
if (to_add.empty()) return;
size_t current_size = current.size();
for (size_t ai = 1; ai < to_add.size(); ++ai) {
for (size_t oi = 0; oi < current_size; ++oi) {
......@@ -176,6 +177,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra,
std::vector<mask_t> values;
mask_t amask;
foreach (string_range& dot, dots) {
if (dot.empty()) continue;
mask_t v = get_value_mask(boost::copy_range<std::string>(dot));
mask_t curr = get_attribute_mask(get_value_attribute(v));
......@@ -281,6 +283,16 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const
// << " of " << pos_required_attributes_idx_[pos_idx].size() << "\n";
size_t has_req = PwrNlp::count_bits_set(required_values & values);
if (has_req != pos_required_attributes_idx_[pos_idx].size()) {
foreach (idx_t a, get_pos_attributes(pos_idx)) {
if (pos_requires_attribute(pos_idx, a)) {
mask_t amask = get_attribute_mask(a);
if ((values & amask).none()) {
throw TagParseError("Required attribute missing",
tag_to_string(Tag(get_pos_mask(pos_idx), values)),
get_attribute_name(a), id_string());
}
}
}
throw TagParseError("Required attribute missing",
tag_to_string(Tag(get_pos_mask(pos_idx), values)),
get_pos_name(pos_idx), id_string());
......
......@@ -96,7 +96,7 @@ void tagset_info(const Corpus2::Tagset& tagset)
for (Corpus2::idx_t a = 0; a < tagset.attribute_count(); ++a) {
std::cerr << tagset.get_attribute_values(a).size() << " ";
}
std::cerr << "\n";
std::cerr << "]\n";
std::cerr << "Size is " << tagset.size()
<< " (extra size is " << tagset.size_extra() << ")\n";
std::cerr << "POSes: ";
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment