diff --git a/libcorpus2/lexeme.cpp b/libcorpus2/lexeme.cpp index be0f8aeb7c82a1343c7b6366ff3f96affbbcc718..d5370ed662ee7974505ec9732c904cf6377e0fc6 100644 --- a/libcorpus2/lexeme.cpp +++ b/libcorpus2/lexeme.cpp @@ -33,6 +33,12 @@ Lexeme Lexeme::create(const UnicodeString& lemma, const Tag& tag) return Lexeme(lemma, tag); } +Lexeme Lexeme::create_utf8(const std::string& lemma_utf8, const Tag& tag) +{ + return Lexeme(UnicodeString::fromUTF8(lemma_utf8), tag); +} + + bool Lexeme::is_null() const { return lemma().length() == 0 || tag().is_null(); diff --git a/libcorpus2/lexeme.h b/libcorpus2/lexeme.h index fbf82a0a8957e06ea7dab83fb61f27c6a351ca2e..4c0119ab58d91c9b932be2eba46cf4373272040b 100644 --- a/libcorpus2/lexeme.h +++ b/libcorpus2/lexeme.h @@ -53,6 +53,9 @@ public: /// Helper creation function static Lexeme create(const UnicodeString& lemma, const Tag& tag); + /// Helper creation function taking a UTF-8 lemma (converted with UnicodeString::fromUTF8) + static Lexeme create_utf8(const std::string& lemma_utf8, const Tag& tag); + /// Lemma accessor const UnicodeString& lemma() const { return lemma_; @@ -63,6 +66,11 @@ public: lemma_ = l; } + /// Lemma setter taking a UTF-8 string (converted with UnicodeString::fromUTF8) + void set_lemma_utf8(const std::string& lemma_utf8) { + lemma_ = UnicodeString::fromUTF8(lemma_utf8); + } + /// UTF-8 lemma convenience accessor const std::string lemma_utf8() const { return PwrNlp::to_utf8(lemma_); diff --git a/libcorpus2/token.cpp b/libcorpus2/token.cpp index dba032254ca1c2e122be2429cd41d4c5ae776a86..38f3bdb2c4f92ed92691fe33662ac9bfc979dce2 100644 --- a/libcorpus2/token.cpp +++ b/libcorpus2/token.cpp @@ -32,6 +32,12 @@ Token::Token(const UnicodeString &orth, PwrNlp::Whitespace::Enum wa) { } +Token* Token::create_utf8(const std::string& orth_utf8, + PwrNlp::Whitespace::Enum wa /*= PwrNlp::Whitespace::Space*/) +{ + return new Token(UnicodeString::fromUTF8(orth_utf8), wa); +} + Token* Token::clone() const { Token* t = new Token(); diff --git a/libcorpus2/token.h b/libcorpus2/token.h index 
13c623fb8895955913de658f8e87225a336cf190..ab0992271655081f2c4d0f8442df0ec4f4c3bd07 100644 --- a/libcorpus2/token.h +++ b/libcorpus2/token.h @@ -54,6 +54,10 @@ public: /// Create a Token with the given orth and whitespace amount Token(const UnicodeString& orth, PwrNlp::Whitespace::Enum wa); + /// Create a Token from a UTF-8 orth; the returned Token is allocated with new + static Token* create_utf8(const std::string& orth_utf8, + PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space); + /// Create a duplicate Token Token* clone() const; @@ -72,6 +76,11 @@ public: orth_ = orth; } + /// Orth setter taking a UTF-8 string (converted with UnicodeString::fromUTF8) + void set_orth_utf8(const std::string& orth_utf8) { + orth_ = UnicodeString::fromUTF8(orth_utf8); + } + /// WA getter const PwrNlp::Whitespace::Enum& wa() const { return wa_; diff --git a/swig/libcorpuslexeme.i b/swig/libcorpuslexeme.i index 058e8b7d449a9375f0f288543ec67670554413ee..e962845b00b107b7b538c3325819f2bfde3d95c4 100644 --- a/swig/libcorpuslexeme.i +++ b/swig/libcorpuslexeme.i @@ -24,13 +24,13 @@ namespace Corpus2 { Lexeme(const UnicodeString& lemma, const Tag& tag); static Lexeme create(const UnicodeString& lemma, const Tag& tag); - // static Lexeme create_utf8(const std::string& lemma, const Tag& tag); + static Lexeme create_utf8(const std::string& lemma, const Tag& tag); const UnicodeString& lemma() const; const std::string lemma_utf8() const; void set_lemma(const UnicodeString& l); - // void set_lemma_utf8(const std::string& l); + void set_lemma_utf8(const std::string& l); const Tag& tag() const; void set_tag(const Tag& tag); diff --git a/swig/libcorpustoken.i b/swig/libcorpustoken.i index 079063b5d88d7c26eccbf75e83195f3f6833aff7..5d81bba2580a9fe4a3236f505c70345e51ad73f1 100644 --- a/swig/libcorpustoken.i +++ b/swig/libcorpustoken.i @@ -26,12 +26,14 @@ namespace Corpus2 { Token(); Token(const UnicodeString& orth, PwrNlp::Whitespace::Enum wa); Token* clone() const; + + static Token* create_utf8(const std::string& orth_utf8, PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::Space); const UnicodeString& orth() 
const; std::string orth_utf8() const; void set_orth(const UnicodeString& orth); - // void set_orth_utf8(const std::string& orth); + void set_orth_utf8(const std::string& orth); const PwrNlp::Whitespace::Enum& wa() const; void set_wa(const PwrNlp::Whitespace::Enum& wa);