Select Git revision
util.h 2.51 KiB
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef PWRNLP_UTIL_H
#define PWRNLP_UTIL_H
#include <unicode/uniset.h>
#include <unicode/unistr.h>
#include <iostream>
#include <string>
#include <climits>
namespace PwrNlp {
/**
* Helper function to 'unescape' an UTF8 string, which is done by
* converting to a UnicodeString, ICU-unescaping and converting back to
* UTF8.
*/
std::string unescape_utf8(const std::string& str);
/**
* Helper function to convert an UnicodeString to a UTF-8 std::string
*/
std::string to_utf8(const UnicodeString& ustr);
/**
* Helper function to put all characters from a std::string and put them
* in a (set-like) container. The string is converted to a UnicodeString,
* unescaping is performed and UChars are fed into the container. As a
* special case, if the string starts with a '[' character, ends with ']'
* and is more than two characters long it is treated like a ICU-style
* UnicodeSet e.g. [a-zA-Z_] and parsed as such using ICU facilities. This
* allows sepcyfying Unicode properties of the characters and more, see
* http://userguide.icu-project.org/strings/unicodeset for details.
*/
template <typename TContainer>
void utf8_string_to_uchar_container(const std::string& s,
TContainer& container)
{
UnicodeString pres = UnicodeString::fromUTF8(s);
if (pres.length() > 2 && pres.startsWith("[") && pres.endsWith("]")) {
UErrorCode status = U_ZERO_ERROR;
UnicodeSet uset(pres, status);
//std::cerr << "-----UNICODE SET FOR " << s << "\n";
if (!U_SUCCESS(status)) {
std::cerr << "Unicode character set invalid: " << s << " \n";
} else {
for (int i = 0; i < uset.size(); ++i) {
UChar32 c = uset.charAt(i);
if (U_IS_BMP(c)) {
container.insert(c);
//std::cerr <<
//to_utf8(UnicodeString((UChar)uset.charAt((i))));
}
}
}
//std::cerr << "\n----END UNICODE SET FOR " << s << "\n";
} else {
pres = pres.unescape();
for (int i = 0; i < pres.length(); ++i) {
container.insert(pres.charAt(i));
}
}
}
} /* end ns PwrNlp */
#endif // PWRNLP_UTIL_H