Skip to content
Snippets Groups Projects
Select Git revision
  • 1a40b138794e271a5113a9895791a91560e7da79
  • master default protected
  • fix-words-ann
  • wccl-rules-migration
  • develop
5 results

listoperator.h

Blame
  • io.cpp 6.28 KiB
    /*
        Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
        Part of the libcorpus2 project
    
        This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.
    
        This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.
    
        See the LICENSE.CORPUS2, POLIQARP, COPYING.LESSER and COPYING files for more details.
    */
    
    #include <boost/test/unit_test.hpp>
    #include <set>
    #include <boost/foreach.hpp>
    #include <libpwrutils/bitset.h>
    #include <libcorpus2/tagsetmanager.h>
    #include <libcorpus2/io/xcesreader.h>
    #include <libcorpus2/io/fastxces.h>
    #include <libcorpus2/io/writer.h>
    
    // File-local XCES fixtures shared by the test cases in this file.
    namespace {
    // Reference XCES document: one outer chunk (id "ch51", type "tok") wrapping
    // a sentence chunk with four tokens; the last token carries two lexemes,
    // only one of which is marked disamb. Used as reader input and as the
    // expected writer output in the tests of this file.
    static char swiatopoglad[] =
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
    "<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
    "<chunkList>\n"
    "<chunk id=\"ch51\" type=\"tok\">\n"
    "<chunk type=\"s\">\n"
    "<tok>\n"
    "<orth>Uważam</orth>\n"
    "<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
    "</tok>\n"
    "<ns/>\n"
    "<tok>\n"
    "<orth>,</orth>\n"
    "<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>że</orth>\n"
    "<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>światopogląd</orth>\n"
    "<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
    "<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
    "</tok>\n"
    "</chunk>\n"
    "</chunk>\n"
    "</chunkList>\n"
    "</cesAna>\n"
    ;
    
    // Same document as swiatopoglad, except that the outer <chunk> element has
    // no id/type attributes. This is the output expected by the "fast" test.
    static char swiatopoglad_noid[] =
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
    "<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
    "<chunkList>\n"
    "<chunk>\n"
    "<chunk type=\"s\">\n"
    "<tok>\n"
    "<orth>Uważam</orth>\n"
    "<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
    "</tok>\n"
    "<ns/>\n"
    "<tok>\n"
    "<orth>,</orth>\n"
    "<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>że</orth>\n"
    "<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>światopogląd</orth>\n"
    "<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
    "<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
    "</tok>\n"
    "</chunk>\n"
    "</chunk>\n"
    "</chunkList>\n"
    "</cesAna>\n"
    ;
    
    // Inconsistently structured input for the "io_oo" test: the same first
    // chunk as swiatopoglad, then a <tok> placed directly under <chunkList>
    // (outside any chunk), then one more chunk holding a single-token sentence.
    static char swiatopoglad_broken[] =
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    "<!DOCTYPE cesAna SYSTEM \"xcesAnaIPI.dtd\">\n"
    "<cesAna xmlns:xlink=\"http://www.w3.org/1999/xlink\" version=\"1.0\" type=\"lex disamb\">\n"
    "<chunkList>\n"
    "<chunk id=\"ch51\" type=\"tok\">\n"
    "<chunk type=\"s\">\n"
    "<tok>\n"
    "<orth>Uważam</orth>\n"
    "<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
    "</tok>\n"
    "<ns/>\n"
    "<tok>\n"
    "<orth>,</orth>\n"
    "<lex disamb=\"1\"><base>,</base><ctag>interp</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>że</orth>\n"
    "<lex disamb=\"1\"><base>że</base><ctag>conj</ctag></lex>\n"
    "</tok>\n"
    "<tok>\n"
    "<orth>światopogląd</orth>\n"
    "<lex><base>światopogląd</base><ctag>subst:sg:acc:m3</ctag></lex>\n"
    "<lex disamb=\"1\"><base>światopogląd</base><ctag>subst:sg:nom:m3</ctag></lex>\n"
    "</tok>\n"
    "</chunk>\n"
    "</chunk>\n"
    "<tok>\n"
    "<orth>Uważam</orth>\n"
    "<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
    "</tok>\n"
    "<chunk id=\"ch51\" type=\"tok\">\n"
    "<chunk type=\"s\">\n"
    "<tok>\n"
    "<orth>Uważam</orth>\n"
    "<lex disamb=\"1\"><base>uważać</base><ctag>fin:sg:pri:imperf</ctag></lex>\n"
    "</tok>\n"
    "</chunk>\n"
    "</chunk>\n"
    "</chunkList>\n"
    "</cesAna>\n"
    ;
    }
    // Opens the "io" test suite; the test cases below belong to it.
    BOOST_AUTO_TEST_SUITE( io )
    
    // Round trip through XcesReader: read the first chunk of the reference
    // document, write it back with the "xces,flat" stream writer and expect the
    // output to be identical to the input text.
    BOOST_AUTO_TEST_CASE( iobase )
    {
    	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
    	std::stringstream ssin;
    	ssin << swiatopoglad;
    	Corpus2::XcesReader xr(tagset, ssin);
    	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
    	// Stop this case with a regular test failure if no chunk was read,
    	// instead of dereferencing an empty pointer below.
    	BOOST_REQUIRE(chunk);
    	std::stringstream ss;
    	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create_stream_writer("xces,flat", ss, tagset));
    	w->write_chunk(*chunk);
    	// finish() is called before the comparison so the writer can emit
    	// whatever it still has pending.
    	w->finish();
    	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
    }
    
    
    // Same round trip as "iobase", but reading with FastXcesReader. The
    // expected output is the variant of the reference document whose outer
    // chunk carries no id/type attributes.
    BOOST_AUTO_TEST_CASE( fast )
    {
    	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
    	std::stringstream ssin;
    	ssin << swiatopoglad;
    	Corpus2::FastXcesReader xr(tagset, ssin);
    	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
    	// Stop this case with a regular test failure if no chunk was read,
    	// instead of dereferencing an empty pointer below.
    	BOOST_REQUIRE(chunk);
    	std::stringstream ss;
    	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create_stream_writer("xces,flat", ss, tagset));
    	w->write_chunk(*chunk);
    	w->finish();
    	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad_noid);
    }
    
    // Reads the inconsistently structured document with the
    // "no_warn_inconsistent" option set. The first chunk must serialise to the
    // reference document, and two further get_next_chunk() calls must each
    // return a non-empty pointer.
    BOOST_AUTO_TEST_CASE( io_oo )
    {
    	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
    	std::stringstream ssin;
    	ssin << swiatopoglad_broken;
    	Corpus2::XcesReader xr(tagset, ssin);
    	xr.set_option("no_warn_inconsistent");
    	boost::shared_ptr<Corpus2::Chunk> chunk = xr.get_next_chunk();
    	// The first chunk is dereferenced below, so its absence has to abort
    	// the case rather than merely be reported.
    	BOOST_REQUIRE(chunk);
    	std::stringstream ss;
    	boost::shared_ptr<Corpus2::TokenWriter> w(Corpus2::TokenWriter::create_stream_writer("xces,flat", ss, tagset));
    	w->write_chunk(*chunk);
    	w->finish();
    	BOOST_CHECK_EQUAL(ss.str(), swiatopoglad);
    	// The remaining chunks are only checked for presence, not content.
    	chunk = xr.get_next_chunk();
    	BOOST_CHECK(chunk);
    	chunk = xr.get_next_chunk();
    	BOOST_CHECK(chunk);
    }
    
    // Builds readers through TokenReader::create_stream_reader from
    // comma-separated class/option strings and verifies, via get_option(),
    // which options each resulting XcesReader reports.
    BOOST_AUTO_TEST_CASE( create_reader )
    {
    	const Corpus2::Tagset& tagset = Corpus2::get_named_tagset("kipi");
    	boost::shared_ptr<Corpus2::TokenReader> reader;
    	std::stringstream input;
    	input << swiatopoglad;

    	// First reader: "disamb_only" and "sh" requested.
    	reader = Corpus2::TokenReader::create_stream_reader(
    		"xces,disamb_only,sh", tagset, input);
    	boost::shared_ptr<Corpus2::XcesReader> xces =
    		boost::dynamic_pointer_cast<Corpus2::XcesReader>(reader);
    	BOOST_REQUIRE(xces);
    	BOOST_CHECK_EQUAL(xces->get_option("disamb_only"), "disamb_only");
    	BOOST_CHECK_EQUAL(xces->get_option("sh"), "sh");

    	// Second reader: "strict" replaces "sh", so "sh" must read back empty.
    	reader = Corpus2::TokenReader::create_stream_reader(
    		"xces,disamb_only,strict", tagset, input);
    	xces = boost::dynamic_pointer_cast<Corpus2::XcesReader>(reader);
    	BOOST_REQUIRE(xces);
    	BOOST_CHECK_EQUAL(xces->get_option("disamb_only"), "disamb_only");
    	BOOST_CHECK_EQUAL(xces->get_option("sh"), "");
    	BOOST_CHECK_EQUAL(xces->get_option("strict"), "strict");
    }
    
    
    
    // Closes the "io" test suite opened above.
    BOOST_AUTO_TEST_SUITE_END();