Skip to content
Snippets Groups Projects
Select Git revision
  • cd1e0c1124efb8a94cf2f264bf47d6b8ba54e00d
  • master default protected
  • develop protected
  • feat_remove_attr
  • python2.7
  • python3.8
6 results

util.h

Blame
  • util.h 2.51 KiB
    /*
        Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
    
        This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option)
    any later version.
    
        This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. 
    
        See the LICENSE and COPYING files for more details.
    */
    
    #ifndef PWRNLP_UTIL_H
    #define PWRNLP_UTIL_H
    
    #include <unicode/uniset.h>
    #include <unicode/unistr.h>
    
    #include <iostream>
    #include <string>
    #include <climits>
    
    namespace PwrNlp {
    
    /**
     * Helper function to 'unescape' a UTF-8 string, which is done by
     * converting it to a UnicodeString, ICU-unescaping it and converting the
     * result back to UTF-8.
     *
     * Only the declaration lives in this header; the definition is elsewhere.
     *
     * @param str the UTF-8 encoded string to unescape
     * @return the unescaped string, encoded as UTF-8
     */
    std::string unescape_utf8(const std::string& str);
    
    /**
     * Helper function to convert a UnicodeString to a UTF-8 std::string.
     *
     * Only the declaration lives in this header; the definition is elsewhere.
     *
     * @param ustr the ICU string to convert
     * @return the contents of ustr as a UTF-8 encoded std::string
     */
    std::string to_utf8(const UnicodeString& ustr);
    
    /**
     * Helper function to put all characters from a std::string and put them
     * in a (set-like) container. The string is converted to a UnicodeString,
     * unescaping is performed and UChars are fed into the container. As a
     * special case, if the string starts with a '[' character, ends with ']'
     * and is more than two characters long it is treated like a ICU-style
     * UnicodeSet e.g. [a-zA-Z_] and parsed as such using ICU facilities. This
     * allows sepcyfying Unicode properties of the characters and more, see
     * http://userguide.icu-project.org/strings/unicodeset for details.
     */
    template <typename TContainer>
    void utf8_string_to_uchar_container(const std::string& s,
    		TContainer& container)
    {
    	UnicodeString pres = UnicodeString::fromUTF8(s);
    	if (pres.length() > 2 && pres.startsWith("[") && pres.endsWith("]")) {
    		UErrorCode status = U_ZERO_ERROR;
    		UnicodeSet uset(pres, status);
    		//std::cerr << "-----UNICODE SET FOR  " << s << "\n";
    		if (!U_SUCCESS(status)) {
    			std::cerr << "Unicode character set invalid: " << s << " \n";
    		} else {
    			for (int i = 0; i < uset.size(); ++i) {
    				UChar32 c = uset.charAt(i);
    				if (U_IS_BMP(c)) {
    					container.insert(c);
    					//std::cerr <<
    					//to_utf8(UnicodeString((UChar)uset.charAt((i))));
    				}
    			}
    		}
    		//std::cerr << "\n----END UNICODE SET FOR  " << s << "\n";
    	} else {
    		pres = pres.unescape();
    		for (int i = 0; i < pres.length(); ++i) {
    			container.insert(pres.charAt(i));
    		}
    	}
    }
    
    
    
    } /* end ns PwrNlp */
    
    #endif // PWRNLP_UTIL_H