diff --git a/libcorpus2/io/xcescommon.cpp b/libcorpus2/io/xcescommon.cpp index 7bcb4c5fae7f76f3d8e2c846823c0b894c2d3bc8..52edd595542fadf7243f86655589e4921c66b772 100644 --- a/libcorpus2/io/xcescommon.cpp +++ b/libcorpus2/io/xcescommon.cpp @@ -46,9 +46,8 @@ namespace { } } -void token_as_xces_xml(std::ostream& os, const Tagset& tagset, - const Token& t, int indent, bool output_disamb /* = false */, - bool sort /* = false */, bool whitespace_info /* false */) +void token_as_xces_xml_head(std::ostream& os, + const Token& t, int indent, bool whitespace_info /* false */) { if (t.wa() == PwrNlp::Whitespace::None) { osi(os, indent) << "<ns/>\n"; @@ -59,7 +58,12 @@ void token_as_xces_xml(std::ostream& os, const Tagset& tagset, osi(os, indent) << "<tok ws=\"" << PwrNlp::Whitespace::to_string(t.wa()) << "\">\n"; } - ++indent; +} + +void token_as_xces_xml_body(std::ostream& os, const Tagset& tagset, + const Token& t, int indent, bool output_disamb /* = false */, + bool sort /* = false */) +{ osi(os, indent) << "<orth>"; encode_xml_entities_into(os, t.orth_utf8()); os << "</orth>\n"; @@ -80,6 +84,15 @@ void token_as_xces_xml(std::ostream& os, const Tagset& tagset, os << s; } } +} + +void token_as_xces_xml(std::ostream& os, const Tagset& tagset, + const Token& t, int indent, bool output_disamb /* = false */, + bool sort /* = false */, bool whitespace_info /* false */) +{ + token_as_xces_xml_head(os, t, indent, whitespace_info); + ++indent; + token_as_xces_xml_body(os, tagset, t, indent, output_disamb, sort); --indent; osi(os, indent) << "</tok>\n"; } diff --git a/libcorpus2/io/xcescommon.h b/libcorpus2/io/xcescommon.h index da1e8087f3f8f90b36aa585eb6b2684af31285dd..204e27280b9a0eb53d0ea4eb8544113e3b634d17 100644 --- a/libcorpus2/io/xcescommon.h +++ b/libcorpus2/io/xcescommon.h @@ -29,6 +29,12 @@ void token_as_xces_xml(std::ostream& os, const Tagset& tagset, const Token& t, int indent, bool output_disamb = false, bool sort = false, bool whitespace_info = false); +void 
token_as_xces_xml_head(std::ostream& os, + const Token& t, int indent, bool whitespace_info /* false */); + +void token_as_xces_xml_body(std::ostream& os, const Tagset& tagset, + const Token& t, int indent, bool output_disamb /* = false */, + bool sort /* = false */); /** * Output a xml-encoded version of the given string into the given ostream. * The default XML entity substitutions are made: less than, greater than, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 874b60a5a4bb419e23bc7a65fbee11e8ceeb530d..2d7d8bbee77ddf04ab6910c9d18af7d240e93c7a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,6 +8,7 @@ add_executable( tests main.cpp ann_basic.cpp basic.cpp + io.cpp tag_split.cpp tagset_parse.cpp ) diff --git a/tests/io.cpp b/tests/io.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c4c7541f9a423e841057aefcc1e45d1f01aa66f2 --- /dev/null +++ b/tests/io.cpp @@ -0,0 +1,74 @@ +/* + Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski + Part of the libcorpus2 project + + This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + + This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. + + See the LICENSE and COPYING files for more details. 
+*/
+
+#include <boost/test/unit_test.hpp>
+#include <set>
+#include <sstream>
+#include <libpwrutils/foreach.h>
+#include <libpwrutils/bitset.h>
+#include <libcorpus2/tagsetmanager.h>
+#include <libcorpus2/io/xcesreader.h>
+#include <libcorpus2/io/writer.h>
+
+namespace { // file-local fixture data, read-only
+const char swiatopoglad[] = // XCES sample: chunk "ch51" holding one "s" chunk with four tokens
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
+"<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
+"<chunkList>\n"
+"<chunk id=\"ch51\" type=\"tok\">\n"
+"<chunk type=\"s\">\n"
+"<tok>\n"
+"<orth>Uważam</orth>\n"
+"<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
+"</tok>\n"
+"<ns/>\n"
+"<tok>\n"
+"<orth>,</orth>\n"
+"<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>że</orth>\n"
+"<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
+"</tok>\n"
+"<tok>\n"
+"<orth>światopogląd</orth>\n"
+"<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
+"<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
+"</tok>\n"
+"</chunk>\n"
+"</chunk>\n"
+"</chunkList>\n"
+"</cesAna>\n";
+}
+
+BOOST_AUTO_TEST_SUITE( io )
+// Round trip: read the fixture with XcesReader, write it back as "xces,flat", expect identical text.
+BOOST_AUTO_TEST_CASE( iobase )
+{
+	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
+	std::stringstream ssin(swiatopoglad); // stream pre-filled with the fixture text
+	Corpus2::XcesReader xr(tagset, ssin);
+	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
+	BOOST_REQUIRE(chunk); // abort this case instead of dereferencing an empty pointer below
+	std::stringstream ss;
+	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create("xces,flat", ss, tagset));
+	w->write_chunk(*chunk);
+	w->finish(); // called before the comparison so the writer has emitted everything it is going to
+	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
+}
+
+BOOST_AUTO_TEST_SUITE_END();