diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee9c9ea6c9e194698f84b288fea9995fe01e3429..b0c9c21299f1ee757466814497ab303eb47f2688 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,4 +67,6 @@ add_subdirectory(libcorpus2)
 add_subdirectory(corpus2tools)
 add_subdirectory(tests)
 add_subdirectory(swig)
+add_subdirectory(poliqarp-library)
+add_subdirectory(poliqarp)
 
diff --git a/poliqarp-library/CMakeLists.txt b/poliqarp-library/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f07bb24637d95fff68081c4a2749fe63238a57e
--- /dev/null
+++ b/poliqarp-library/CMakeLists.txt
@@ -0,0 +1,160 @@
+cmake_minimum_required(VERSION 2.8.0)
+project (PoliqarpLibrary)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts)
+
+INCLUDE (CheckIncludeFiles)
+INCLUDE (CheckLibraryExists)
+INCLUDE (CheckFunctionExists)
+INCLUDE (CheckTypeSize)
+INCLUDE (AC_HEADER_STDC)
+
+# Feature probes; the results are substituted into poliqarp-config.h below.
+CHECK_FUNCTION_EXISTS(ngettext HAVE_GETTEXT)
+CHECK_FUNCTION_EXISTS(nftw HAVE_NFTW)
+CHECK_FUNCTION_EXISTS(snprintf HAVE_SNPRINTF)
+CHECK_FUNCTION_EXISTS(_snprintf HAVE__SNPRINTF)
+
+CHECK_INCLUDE_FILES("inttypes.h" HAVE_INTTYPES_H)
+CHECK_INCLUDE_FILES("locale.h" HAVE_LOCALE_H)
+CHECK_INCLUDE_FILES("memory.h" HAVE_MEMORY_H)
+CHECK_INCLUDE_FILES("malloc.h" HAVE_MALLOC_H)
+CHECK_INCLUDE_FILES("netinet/in.h" HAVE_NETINET_IN_H)
+CHECK_INCLUDE_FILES("pthread.h" HAVE_PTHREAD)
+CHECK_INCLUDE_FILES("stdbool.h" HAVE_STDBOOL_H)
+CHECK_INCLUDE_FILES("stdint.h" HAVE_STDINT_H)
+CHECK_INCLUDE_FILES("stdlib.h" HAVE_STDLIB_H)
+CHECK_INCLUDE_FILES("strings.h" HAVE_STRINGS_H)
+CHECK_INCLUDE_FILES("string.h" HAVE_STRING_H)
+CHECK_INCLUDE_FILES("sys/socket.h" HAVE_SYS_SOCKET_H)
+CHECK_INCLUDE_FILES("sys/stat.h" HAVE_SYS_STAT_H)
+CHECK_INCLUDE_FILES("sys/types.h" HAVE_SYS_TYPES_H)
+CHECK_INCLUDE_FILES("unistd.h" HAVE_UNISTD_H)
+
+CHECK_LIBRARY_EXISTS(expat XML_ParserCreate "" HAVE_LIBEXPAT)
+
+SET(CMAKE_EXTRA_INCLUDE_FILES unistd.h sys/socket.h netinet/in.h)
+CHECK_TYPE_SIZE("socklen_t" SOCKLEN_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES unistd.h)
+CHECK_TYPE_SIZE("ssize_t" SSIZE_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES stdbool.h)
+CHECK_TYPE_SIZE("_Bool" _BOOL)
+SET(CMAKE_EXTRA_INCLUDE_FILES)
+
+CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/cmake-config.h.in ${CMAKE_CURRENT_BINARY_DIR}/poliqarp-config.h)
+CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/commondef.h ${CMAKE_CURRENT_BINARY_DIR}/sakura/commondef.h)
+
+include_directories(${PoliqarpLibrary_SOURCE_DIR} ${PoliqarpLibrary_BINARY_DIR})
+# ${PoliqarpLibrary_SOURCE_DIR}/sakura/commons ${PoliqarpLibrary_SOURCE_DIR}/foostring ${PoliqarpLibrary_SOURCE_DIR}/progress ${PoliqarpLibrary_SOURCE_DIR}/sakura ${PoliqarpLibrary_SOURCE_DIR}/unibits)
+find_package(Parsers REQUIRED)
+find_package(Threads REQUIRED)
+
+# These sources are C, so extra flags come from CFLAGS rather than CXXFLAGS.
+set(CMAKE_C_FLAGS "$ENV{CFLAGS}")
+set(CMAKE_C_FLAGS_DEBUG "-O0 -DDEBUG -ggdb3 $ENV{CFLAGS}")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -fno-omit-frame-pointer")
+
+GENERATE_BISON_FLEX_SOURCES(
+    "${CMAKE_CURRENT_SOURCE_DIR}/sakura/parser.y" ""
+    "${CMAKE_CURRENT_SOURCE_DIR}/sakura/lexer.y" ""
+)
+add_custom_command(
+    OUTPUT ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -E copy ${PoliqarpLibrary_BINARY_DIR}/parser.h ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h
+    COMMENT "copy parser.h"
+    DEPENDS ${PoliqarpLibrary_BINARY_DIR}/parser.h
+)
+
+set(foostring_SRC foostring/foostring.c foostring/strnlen.c)
+set(progress_SRC progress/progress.c)
+set(unibits_SRC unibits/tclUtils.c unibits/tclUtf.c unibits/strcoll.c)
+
+set(common_SRC
+    sakura/common/args.c
+    sakura/common/bitstream.c
+    sakura/common/bs.c
+    sakura/common/bs-file-backend.c
+    sakura/common/entity.c
+    sakura/common/file-map.c
+    sakura/common/file-reader.c
+    sakura/common/getline.c
+    sakura/common/graph.c
+    sakura/common/hash-table.c
+    sakura/common/memory-arena.c
+    sakura/common/newdict.c
+    sakura/common/set.c
+    sakura/common/system-error.c
+    sakura/common/tinydb.c
+)
+set(sakura_SRC
+    sakura/backend-base.c
+    sakura/backend-config.c
+    sakura/backend-corpus.c
+    sakura/backend-document.c
+    sakura/backend-index.c
+    sakura/backend-interp.c
+    sakura/backend-meta.c
+    sakura/backend-orth.c
+    sakura/backend-subdocument.c
+    sakura/backend-syntax.c
+    sakura/backend-tag.c
+    sakura/cdf.c
+    sakura/config.c
+    sakura/corpus.c
+    sakura/dict.c
+    sakura/exception.c
+    sakura/expression.c
+    sakura/meta-value.c
+    sakura/poliqarp.c
+    sakura/query.c
+    sakura/query-rewrite.c
+    sakura/random.c
+    sakura/regexp.c
+    sakura/value-attr.c
+    sakura/value-base.c
+    sakura/value.c
+    sakura/value-interp.c
+    sakura/value-orth.c
+    sakura/value-pattern.c
+    sakura/value-pos.c
+    sakura/value-space.c
+    sakura/value-tag.c
+    sakura/value-type.c
+)
+set(poliqarpd_SRC
+    poliqarpd/async.c
+    poliqarpd/configuration.c
+    poliqarpd/log.c
+    poliqarpd/msgqueue.c
+    poliqarpd/od_unix.c
+#   poliqarpd/od_win32.c
+    poliqarpd/poliqarpd.c
+    poliqarpd/protocol.c
+    poliqarpd/server.c
+    poliqarpd/session.c
+    poliqarpd/sessopt.c
+    poliqarpd/sockets.c
+    poliqarpd/sockstream.c
+    poliqarpd/utils.c
+)
+
+add_library(libpoliqarp SHARED ${foostring_SRC} ${progress_SRC} ${unibits_SRC} ${common_SRC} ${sakura_SRC} ${BF_SOURCES} ${PoliqarpLibrary_BINARY_DIR}/sakura/parser.h)
+# sakura/parser.h is listed as a library source above, so its copy rule already runs
+# before compilation; add_dependencies() accepts target names only, not file paths.
+add_executable(poliqarpc-shared utils/poliqarpc.c)
+target_link_libraries(poliqarpc-shared libpoliqarp ${CMAKE_THREAD_LIBS_INIT})
+add_executable(poliqarpd-shared ${poliqarpd_SRC})
+target_link_libraries(poliqarpd-shared libpoliqarp ${CMAKE_THREAD_LIBS_INIT})
+if(UNIX)
+    install(TARGETS libpoliqarp LIBRARY DESTINATION lib)
+    install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sakura DESTINATION include
+        FILES_MATCHING PATTERN "*.h")
+    install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/progress DESTINATION include/sakura
+        FILES_MATCHING PATTERN "*.h")
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ DESTINATION include/sakura
+        FILES_MATCHING PATTERN "poliqarp-config.h")
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/sakura DESTINATION include
+        FILES_MATCHING PATTERN "commondef.h")
+    install(TARGETS poliqarpc-shared poliqarpd-shared RUNTIME DESTINATION bin)
+endif(UNIX)
+
diff --git a/poliqarp-library/CMakeScripts/AC_HEADER_STDC.cmake b/poliqarp-library/CMakeScripts/AC_HEADER_STDC.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..630ea5bf6ab8117964edd6c23e8eb86801971a63
--- /dev/null
+++ b/poliqarp-library/CMakeScripts/AC_HEADER_STDC.cmake
@@ -0,0 +1,85 @@
+#This file has been taken from libgd. Its license goes:
+
+#Portions copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+#2002 by Cold Spring Harbor Laboratory. Funded under Grant
+#P41-RR02188 by the National Institutes of Health.
+#
+#Portions copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 by
+#Boutell.Com, Inc.
+#
+#Portions relating to GD2 format copyright 1999, 2000, 2001, 2002
+#Philip Warner.
+#
+#Portions relating to PNG copyright 1999, 2000, 2001, 2002 Greg
+#Roelofs.
+#
+#Portions relating to gdttf.c copyright 1999, 2000, 2001, 2002 John
+#Ellson (ellson@lucent.com).
+#
+#Portions relating to gdft.c copyright 2001, 2002 John Ellson
+#(ellson@lucent.com).
+#
+#Portions copyright 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
+#Pierre-Alain Joye (pierre@libgd.org).
+#
+#Portions relating to JPEG and to color quantization copyright 2000,
+#2001, 2002, Doug Becker and copyright (C) 1994, 1995, 1996, 1997,
+#1998, 1999, 2000, 2001, 2002, Thomas G. Lane. This software is
+#based in part on the work of the Independent JPEG Group. See the
+#file README-JPEG.TXT for more information.
+#
+#Portions relating to WBMP copyright 2000, 2001, 2002 Maurice
+#Szmurlo and Johan Van den Brande.
+#
+#Permission has been granted to copy, distribute and modify gd in
+#any context without fee, including a commercial application,
+#provided that this notice is present in user-accessible supporting
+#documentation.
+#
+#This does not affect your ownership of the derived work itself, and
+#the intent is to assure proper credit for the authors of gd, not to
+#interfere with your productive use of gd. If you have questions,
+#ask. "Derived works" includes all programs that utilize the
+#library. Credit must be given in user-accessible documentation.
+#
+#This software is provided "AS IS." The copyright holders disclaim
+#all warranties, either express or implied, including but not
+#limited to implied warranties of merchantability and fitness for a
+#particular purpose, with respect to this code and accompanying
+#documentation.
+#
+#Although their code does not appear in gd, the authors wish to thank
+#David Koblas, David Rowley, and Hutchison Avenue Software Corporation
+#for their prior contributions.
+
+message(STATUS "Checking whether system has ANSI C header files")
+INCLUDE (CheckPrototypeExists)
+INCLUDE (CheckIncludeFiles)
+
+check_include_files("dlfcn.h;stdint.h;stddef.h;inttypes.h;stdlib.h;strings.h;string.h;float.h" StandardHeadersExist)
+if(StandardHeadersExist)
+    check_prototype_exists(memchr string.h memchrExists)
+    if(memchrExists)
+
+        check_prototype_exists(free stdlib.h freeExists)
+        if(freeExists)
+            message(STATUS "ANSI C header files - found")
+            set(STDC_HEADERS 1 CACHE INTERNAL "System has ANSI C header files")
+            set(HAVE_STRINGS_H 1)
+            set(HAVE_STRING_H 1)
+            set(HAVE_FLOAT_H 1)
+            set(HAVE_STDLIB_H 1)
+            set(HAVE_STDDEF_H 1)
+            set(HAVE_STDINT_H 1)
+            set(HAVE_INTTYPES_H 1)
+            set(HAVE_DLFCN_H 1)
+        endif(freeExists)
+    endif(memchrExists)
+endif(StandardHeadersExist)
+
+if(NOT STDC_HEADERS)
+    message(STATUS "ANSI C header files - not found")
+    set(STDC_HEADERS 0 CACHE INTERNAL "System has ANSI C header files")
+endif(NOT STDC_HEADERS)
+
+
diff --git a/poliqarp-library/CMakeScripts/CheckPrototypeExists.cmake b/poliqarp-library/CMakeScripts/CheckPrototypeExists.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..678c658f93ef67fefe335c41e71d027f298c25f8
--- /dev/null
+++ b/poliqarp-library/CMakeScripts/CheckPrototypeExists.cmake
@@ -0,0 +1,34 @@
+# - Check if the prototype for a function exists.
+# CHECK_PROTOTYPE_EXISTS (FUNCTION HEADER VARIABLE)
+#
+# FUNCTION - the name of the function you are looking for
+# HEADER - the header(s) where the prototype should be declared
+# VARIABLE - variable to store the result
+#
+# The following variables may be set before calling this macro to
+# modify the way the check is run:
+#
+# CMAKE_REQUIRED_FLAGS = string of compile command line flags
+# CMAKE_REQUIRED_DEFINITIONS = list of macros to define (-DFOO=bar)
+# CMAKE_REQUIRED_INCLUDES = list of include directories
+
+INCLUDE(CheckCXXSourceCompiles)
+
+MACRO (CHECK_PROTOTYPE_EXISTS _SYMBOL _HEADER _RESULT)
+  SET(_INCLUDE_FILES)
+  FOREACH (it ${_HEADER})
+    SET(_INCLUDE_FILES "${_INCLUDE_FILES}#include <${it}>\n")
+  ENDFOREACH (it)
+
+  SET(_CHECK_PROTO_EXISTS_SOURCE_CODE "
+${_INCLUDE_FILES}
+int main()
+{
+#ifndef ${_SYMBOL}
+  int i = sizeof(&${_SYMBOL});
+#endif
+  return 0;
+}
+")
+  CHECK_CXX_SOURCE_COMPILES("${_CHECK_PROTO_EXISTS_SOURCE_CODE}" ${_RESULT})
+ENDMACRO (CHECK_PROTOTYPE_EXISTS _SYMBOL _HEADER _RESULT)
diff --git a/poliqarp-library/CMakeScripts/FindParsers.cmake b/poliqarp-library/CMakeScripts/FindParsers.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..91494e4b4d3f586fd31dc65eda83c9ddc148f101
--- /dev/null
+++ b/poliqarp-library/CMakeScripts/FindParsers.cmake
@@ -0,0 +1,134 @@
+ # /*
+ # For more information, please see: http://software.sci.utah.edu
+ # The MIT License
+ # Copyright (c) 2005-2006
+ # Scientific Computing and Imaging Institute, University of Utah
+ # License for the specific language governing rights and limitations under
+ # Permission is hereby granted, free of charge, to any person obtaining a
+ # copy of this software and associated documentation files (the "Software"),
+ # to deal in the Software without restriction, including without limitation
+ # the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ # and/or sell copies of the Software, and to permit persons to whom the
+ # Software is furnished to do so, subject to the following conditions:
+ # The above copyright notice and this permission notice shall be included
+ # in all copies or substantial portions of the Software.
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ # DEALINGS IN THE SOFTWARE.
+ # */
+
+ SET(PARSERS_FOUND FOOBAR)
+
+ # These variables need to be specified in order to get CMake not to
+ # barf on the IF(EXISTS ${BISON_EXECUTABLE} ..) expression even though
+ # the code shouldn't get called, so both default to <NAME>-NOTFOUND.
+
+ SET(BISON_EXECUTABLE "BISON_EXECUTABLE-NOTFOUND" CACHE FILEPATH "bison executable")
+ SET(FLEX_EXECUTABLE "FLEX_EXECUTABLE-NOTFOUND" CACHE FILEPATH "flex executable")
+ # Mark these variables as advanced options
+ MARK_AS_ADVANCED(FORCE BISON_EXECUTABLE)
+ MARK_AS_ADVANCED(FORCE FLEX_EXECUTABLE)
+
+ # You need at least version 2.4 for this to work (compared as a version, not a number).
+ IF("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" VERSION_LESS 2.4)
+   MESSAGE("You need at least version 2.4 for generating flex and bison parsers. Go get it from http://www.cmake.org/HTML/Download.html")
+   SET(PARSERS_FOUND 0)
+
+ ELSE()
+
+   FIND_PROGRAM(BISON_EXECUTABLE
+     NAMES bison
+     PATHS ${BISON_DIR} )
+
+   FIND_PROGRAM(FLEX_EXECUTABLE
+     NAMES flex
+     PATHS ${FLEX_DIR} )
+
+   IF(EXISTS ${BISON_EXECUTABLE} AND EXISTS ${FLEX_EXECUTABLE})
+     SET(PARSERS_FOUND 1)
+
+   ELSE(EXISTS ${BISON_EXECUTABLE} AND EXISTS ${FLEX_EXECUTABLE})
+     SET(PARSERS_FOUND 0)
+     # Print some error messages to the user
+     IF (NOT EXISTS ${BISON_EXECUTABLE})
+       MESSAGE("Couldn't find bison executable. Please check value in BISON_EXECUTABLE in advanced settings.")
+     ENDIF (NOT EXISTS ${BISON_EXECUTABLE})
+     IF (NOT EXISTS ${FLEX_EXECUTABLE})
+       MESSAGE("Couldn't find flex executable. Please check value in FLEX_EXECUTABLE in advanced settings.")
+     ENDIF (NOT EXISTS ${FLEX_EXECUTABLE})
+
+   ENDIF(EXISTS ${BISON_EXECUTABLE} AND EXISTS ${FLEX_EXECUTABLE})
+
+ ENDIF()
+
+ # These are helper functions for parsers.
+
+ # parser is the parser file name like parser.y
+ # lexer is like lexer.l
+
+ # The names of the output files will be based on the input names.
+ # For parser.y, BF_SOURCES will be parser.c, parser.h and lex.yy.c.
+
+ MACRO(GENERATE_BISON_FLEX_SOURCES parser parser_args
+                                   lexer lexer_args)
+   GET_FILENAME_COMPONENT(parser_base "${parser}" NAME_WE)
+
+   SET(BISON_TAB_C "${CMAKE_CURRENT_BINARY_DIR}/${parser_base}.tab.c")
+   SET(BISON_TAB_H "${CMAKE_CURRENT_BINARY_DIR}/${parser_base}.tab.h")
+   SET(BISON_CC "${CMAKE_CURRENT_BINARY_DIR}/${parser_base}.c")
+   SET(BISON_H "${CMAKE_CURRENT_BINARY_DIR}/${parser_base}.h")
+
+   ADD_CUSTOM_COMMAND(
+     OUTPUT ${BISON_TAB_C} ${BISON_TAB_H}
+     COMMAND ${BISON_EXECUTABLE}
+     ARGS "${parser}" ${parser_args} "--defines"
+     DEPENDS "${parser}"
+     COMMENT "1 Generating ${BISON_TAB_C} ${BISON_TAB_H} from ${parser}"
+   )
+
+   ADD_CUSTOM_COMMAND(
+     OUTPUT ${BISON_CC}
+     COMMAND ${CMAKE_COMMAND}
+     ARGS -E copy ${BISON_TAB_C} ${BISON_CC}
+     DEPENDS ${BISON_TAB_C}
+     COMMENT "2 Copying ${BISON_TAB_C} to ${BISON_CC}"
+   )
+
+   ADD_CUSTOM_COMMAND(
+     OUTPUT ${BISON_H}
+     COMMAND ${CMAKE_COMMAND}
+     ARGS -E copy ${BISON_TAB_H} ${BISON_H}
+     DEPENDS ${BISON_TAB_H}
+     COMMENT "3 Copying ${BISON_TAB_H} to ${BISON_H}"
+   )
+
+   GET_FILENAME_COMPONENT(lexer_base "${lexer}" NAME_WE)
+   SET(FLEX_C "${CMAKE_CURRENT_BINARY_DIR}/lex.yy.c")
+
+   SET(FLEX_CC "${CMAKE_CURRENT_BINARY_DIR}/${lexer_base}.cc")
+
+   ADD_CUSTOM_COMMAND(
+     OUTPUT ${FLEX_C}
+     COMMAND ${FLEX_EXECUTABLE}
+     ARGS "${lexer}" ${lexer_args}
+     DEPENDS "${lexer}" ${BISON_H}
+     COMMENT "4 Generating ${FLEX_C} from ${lexer}"
+   )
+
+   ADD_CUSTOM_COMMAND(
+     OUTPUT ${FLEX_CC}
+     COMMAND ${CMAKE_COMMAND}
+     ARGS -E copy ${FLEX_C} ${FLEX_CC}
+     DEPENDS ${FLEX_C}
+     COMMENT "5 Copying ${FLEX_C} to ${FLEX_CC}"
+   )
+
+   SET(BF_SOURCES ${BISON_CC} ${BISON_H} ${FLEX_C})
+
+ ENDMACRO(GENERATE_BISON_FLEX_SOURCES)
+
+
diff --git a/poliqarp-library/cmake-config.h.in b/poliqarp-library/cmake-config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..48b3c67fabe0678c01f969f3a991f7d9fb1dc2c6
--- /dev/null
+++ b/poliqarp-library/cmake-config.h.in
@@ -0,0 +1,167 @@
+#ifndef POLIQARP_CONFIG_H_INCLUDED
+#define POLIQARP_CONFIG_H_INCLUDED
+/* config.h.in. Generated from configure.ac by autoheader. */
+
+#cmakedefine HAVE_MALLOC_H
+
+/* Define to 1 if your compiler supports `extern inline'. */
+#cmakedefine HAVE_EXTERN_INLINE 1
+
+/* Define if you have the gettext and ngettext functions in <libintl.h>. */
+#cmakedefine HAVE_GETTEXT
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#cmakedefine HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `expat' library (-lexpat). */
+#cmakedefine HAVE_LIBEXPAT 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#cmakedefine HAVE_LOCALE_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#cmakedefine HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <netinet/in.h> header file. */
+#cmakedefine HAVE_NETINET_IN_H 1
+
+/* Define to 1 if you have the `nftw' function. */
+#cmakedefine HAVE_NFTW 1
+
+/* Define if you have POSIX threads libraries and header files. */
+#cmakedefine HAVE_PTHREAD
+
+/* Define to 1 if you have the `snprintf' function. */
+#cmakedefine HAVE_SNPRINTF 1
+
+/* Define to 1 if the system has the type `socklen_t'. */
+#cmakedefine HAVE_SOCKLEN_T
+
+/* Define to 1 if the system has the type `ssize_t'. */
+#cmakedefine HAVE_SSIZE_T 1
+
+/* Define to 1 if stdbool.h conforms to C99. */
+#cmakedefine HAVE_STDBOOL_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#cmakedefine HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#cmakedefine HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#cmakedefine HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#cmakedefine HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/socket.h> header file. */
+#cmakedefine HAVE_SYS_SOCKET_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#cmakedefine HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#cmakedefine HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#cmakedefine HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the type `_Bool'. */
+#cmakedefine HAVE__BOOL 1
+
+/* Define to 1 if you have the `_snprintf' function. */
+#cmakedefine HAVE__SNPRINTF 1
+
+/* Define to the address where bug reports for this package should be sent. */
+#cmakedefine PACKAGE_BUGREPORT
+
+/* Define to the full name of this package. */
+#cmakedefine PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#cmakedefine PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#cmakedefine PACKAGE_TARNAME
+
+/* Define to the version of this package. */
+#cmakedefine PACKAGE_VERSION
+
+/* Define to necessary symbol if this constant uses a non-standard name on
+   your system. */
+#cmakedefine PTHREAD_CREATE_JOINABLE
+
+/* Define to 1 if you have the ANSI C header files. */
+#cmakedefine STDC_HEADERS 1
+
+/* Define to 1 if you want/need to use the included TCL regular expressions
+   library. */
+#undef USE_TCL_REGEX
+
+/* Define to 1 if your processor stores words with the most significant byte
+   first (like Motorola and SPARC, unlike Intel and VAX). configure_file()
+   leaves plain #undef lines untouched, so ask the compiler instead. */
+#if !defined(WORDS_BIGENDIAN) && defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#endif
+
+/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a
+   `char[]'. */
+#cmakedefine YYTEXT_POINTER 1
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#cmakedefine const
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name. */
+#ifndef __cplusplus
+#undef inline
+#endif
+
+typedef char * PCHAR;
+#ifdef _MSC_VER
+  #define WIN32_LEAN_AND_MEAN /* Exclude rarely-used stuff from Windows headers */
+  #include <windows.h>
+  #include <direct.h>
+  #include <io.h>
+  #include <tchar.h>
+
+  /* Library signature: */
+  #ifdef POLI_EXPORTS
+    #define POLI_API __declspec(dllexport)
+  #else
+    #ifdef POLI_STATIC
+      #define POLI_API
+    #else
+      #define POLI_API __declspec(dllimport)
+    #endif
+  #endif
+
+  #pragma warning( disable: 4290 ) /* warning C4290: C++ exception specification ignored except to indicate a function is not __declspec(nothrow) */
+  #pragma warning( disable: 4996 ) /* c code "unsafe" :/ */
+
+  #pragma warning(disable:4251) /* "need to have dll interface" for stl // TODO */
+
+
+
+#else
+  #define POLI_API
+#endif
+
+#define PRIuSIZE "zu"
+#define off64_t int64_t
+#define LOCALEDIR "@CMAKE_INSTALL_PREFIX@/share/locale"
+
+#define PACKAGE_BUGREPORT "http://poliqarp.sf.net/bugs/"
+#define PACKAGE_NAME "Poliqarp"
+#define PACKAGE_STRING "Poliqarp 1.3.11"
+#define PACKAGE_TARNAME "poliqarp"
+#define PACKAGE_URL ""
+#define PACKAGE_VERSION "1.3.11"
+
+#endif /* POLIQARP_CONFIG_H_INCLUDED */
+
+#include <sakura/commondef.h>
diff --git a/poliqarp-library/commondef.h b/poliqarp-library/commondef.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f2a4ecbc7f3675a6e92d84ffbe81d9a71a8f006
--- /dev/null
+++ b/poliqarp-library/commondef.h
@@ -0,0 +1,102 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#ifndef POLIQARP_COMMONDEF_H
+#define POLIQARP_COMMONDEF_H
+
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+
+#ifdef HAVE_GETTEXT
+# include <libintl.h>
+# define _(x) gettext(x)
+#else
+# define _(x) (x)
+# define gettext(x) (x)
+# define ngettext(x,y,n) (((n) == 1) ? (x) : (y))
+#endif
+#define __(x,y,n) ngettext(x,y,n)
+#define _M(x) (x) /* xgettext --keyword=_M ... */
+
+#ifdef HAVE_STDBOOL_H
+# include <stdbool.h>
+#else
+# undef bool
+# define bool unsigned char
+# define true 1
+# define false 0
+#endif
+
+#ifndef __GNUC__
+# define __attribute__(x)
+#endif
+
+#include <limits.h>
+#ifndef PATH_MAX
+# define PATH_MAX 4096
+#endif
+
+#if !HAVE_FSEEKO && HAVE_FSEEKO64
+# undef fseeko
+# define fseeko fseeko64
+# undef ftello
+# define ftello ftello64
+#endif
+
+#ifdef WORDS_BIGENDIAN
+
+static inline uint16_t en2(uint16_t x)
+{
+   return ((x & 0xFF) << 8) + (x >> 8);
+}
+
+static inline uint32_t en4(uint32_t x)
+{
+   return ((x & 0xFF) << 24) +
+          (((x >> 8) & 0xFF) << 16) +
+          (((x >> 16) & 0xFF) << 8) +
+          ((x >> 24) & 0xFF);
+}
+
+static inline uint64_t en8(uint64_t x)
+{
+   return ((x & 0xFF) << 56) +
+          (((x >> 8) & 0xFF) << 48) +
+          (((x >> 16) & 0xFF) << 40) +
+          (((x >> 24) & 0xFF) << 32) +
+          (((x >> 32) & 0xFF) << 24) +
+          (((x >> 40) & 0xFF) << 16) +
+          (((x >> 48) & 0xFF) << 8) +
+          ((x >> 56) & 0xFF);
+}
+
+#else
+
+static inline uint16_t en2(uint16_t x) { return x; }
+static inline uint32_t en4(uint32_t x) { return x; }
+static inline uint64_t en8(uint64_t x) { return x; }
+
+#endif
+
+#endif /* POLIQARP_COMMONDEF_H */
diff --git a/poliqarp-library/foostring/foostring.c b/poliqarp-library/foostring/foostring.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1194d877062ee54b5c063bdb716cd2bb6ca0053
--- /dev/null
+++ b/poliqarp-library/foostring/foostring.c
@@ -0,0 +1,618 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <poliqarp-config.h>
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+
+#include <foostring/foostring.h>
+
+size_t strnlen(const char *str, size_t maxlen);
+
+static void default_oom_callback(string_t s)
+{
+   return;
+}
+
+void (*string_oom_callback)(string_t) = default_oom_callback;
+
+/**
+ * A helper function determining the buffer size.
+ */
+static inline size_t string_capacity_advice(size_t request)
+{
+   size_t capacity;
+   if (request > SIZE_MAX / 2)
+      return SIZE_MAX;
+   for (capacity = 16; capacity < request; capacity *= 2)
+      continue;
+   return capacity;
+}
+
+/**
+ * Grows the character buffer of s so that it is new_size bytes long.
+ *
+ * @return 0 on success, -1 on error
+ */
+static inline int string_enlarge(string_t s, size_t new_size)
+{
+   char *grown;
+   assert(new_size >= s->size);
+   assert(new_size != 0);
+   if (new_size <= s->size)
+      return 0;
+   grown = realloc(s->str, new_size);
+   if (grown == NULL) {
+      string_oom_callback(s);
+      return -1;
+   }
+   s->str = grown;
+   s->size = new_size;
+   return 0;
+}
+
+/**
+ * Makes sure the buffer has room for at least count more bytes
+ * on top of the current contents.
+ *
+ * @return 0 on success, -1 on error
+ */
+static inline int string_ensure_extra_capacity(string_t s, size_t count)
+{
+   size_t needed;
+   if (count >= SIZE_MAX - s->len)
+      return -1;
+   needed = s->len + count + 1;
+   return (s->size <= needed) ? string_enlarge(s, string_capacity_advice(needed)) : 0;
+}
+
+/**
+ * Makes sure the buffer has room for at least count bytes in total
+ * (the terminating null byte is not included in count).
+ *
+ * @return 0 on success, -1 on error
+ */
+static inline int string_ensure_capacity(string_t s, size_t count)
+{
+   if (count == SIZE_MAX)
+      return -1;
+   if (s->size > count + 1)
+      return 0;
+   return string_enlarge(s, string_capacity_advice(count + 1));
+}
+
+
+/**
+ * A non-zero value makes the library keep counters describing how
+ * strings are used by the program, which can help when
+ * profiling.
+ *
+ * Unfortunately, turning this setting on makes foostring routines
+ * non-reentrant.
+ */
+#define STRING_STATS 0
+
+/**
+ * Setting this to a non-zero value will cause the library to store
+ * at most STRING_CACHE_SIZE string_t structures in the allocator's
+ * cache. This will limit the number of allocations and theoretically
+ * improve the program's efficiency.
+ *
+ * Unfortunately, turning this setting on makes foostring routines
+ * non-reentrant, so it's turned off by default.
+ **/
+#define STRING_CACHE_SIZE 0
+
+#if defined(STRING_CACHE_SIZE) && STRING_CACHE_SIZE > 0
+
+/**
+ * Head of the list of cached string structures. Each element's
+ * character-buffer pointer doubles as the link to the next one.
+ **/
+static string_t free_string_list = NULL;
+
+/**
+ * Length of the list above.
+ **/
+static size_t free_string_count = 0;
+
+#endif
+
+#if defined(STRING_STATS) && STRING_STATS > 0
+
+/**
+ * Highest number of simultaneously allocated strings seen so far.
+ **/
+static size_t peek_string_count = 0;
+
+/**
+ * Number of strings allocated right now.
+ **/
+static size_t curr_string_count = 0;
+
+#endif
+
+/**
+ * Obtains a string_t structure, preferring an entry from the string
+ * cache (when that is compiled in) over a fresh allocation.
+ **/
+static string_t string_alloc()
+{
+   string_t fresh = NULL;
+#if defined (STRING_CACHE_SIZE) && STRING_CACHE_SIZE > 0
+   if (free_string_list != NULL) {
+      fresh = free_string_list;
+      free_string_list = (void *)fresh->str;
+      free_string_count--;
+   }
+#endif
+   /* nothing cached (or no cache at all): allocate */
+   if (fresh == NULL)
+      fresh = malloc(sizeof *fresh);
+   if (fresh == NULL) {
+      string_oom_callback(NULL);
+      return NULL;
+   }
+#if defined (STRING_STATS) && STRING_STATS > 0
+   curr_string_count += 1;
+   if (peek_string_count < curr_string_count)
+      peek_string_count = curr_string_count;
+#endif
+   return fresh;
+}
+
+/**
+ * Frees a string. If the program is compiled with string cache
+ * enabled, tries to store the string on the list of free elements.
+ **/ +static void string_free_internal(string_t this) +{ +#if defined (STRING_CACHE_SIZE) && STRING_CACHE_SIZE > 0 + if (free_string_count < STRING_CACHE_SIZE) { + this->str = (void *)free_string_list; + free_string_list = this; + free_string_count += 1; + } else + free (this); +#else + free (this); +#endif +#if defined (STRING_STATS) && STRING_STATS > 0 + --curr_string_count; +#endif +} + +string_t string_create() +{ + string_t this = string_alloc(); + if (this == NULL) + return NULL; + this->str = malloc(1); + if (this->str == NULL) { + free(this); + string_oom_callback(NULL); + return NULL; + } + this->str[0] = 0; + this->len = 0; + this->size = 1; + return this; +} + +string_t string_init(const char *c_string) +{ + string_t this; + size_t length; + + assert(c_string != NULL); + + this = string_alloc(); + if (this == NULL) + return NULL; + length = strlen(c_string); + this->str = malloc(length + 1); + if (this->str == NULL) { + free(this); + string_oom_callback(NULL); + return NULL; + } + this->len = length; + this->size = length + 1; + memcpy(this->str, c_string, length); + this->str[this->len] = 0; + return this; +} + +string_t string_init_n(const char *c_string, size_t n) +{ + string_t this; + size_t length; + + assert(c_string != NULL); + + this = string_alloc(); + if (this == NULL) + return NULL; + length = strnlen(c_string, n); + this->str = malloc(length + 1); + if (this->str == NULL) { + free(this); + string_oom_callback(NULL); + return NULL; + } + this->len = length; + this->size = length + 1; + memcpy(this->str, c_string, length); + this->str[this->len] = 0; + return this; +} + +void string_free(string_t s) +{ + assert(s != NULL); + free(s->str); + s->len = -1; + s->size = 0; + string_free_internal(s); +} + +char *string_free_and_get_buffer(string_t s) +{ + char *buffer; + assert (s != NULL); + + buffer = s->str; + string_free_internal(s); + return buffer; +} + +int string_append(string_t s, const string_t s2) +{ + assert(s != NULL); + assert(s2 != 
NULL); + + return string_append_buf(s, s2->str, s2->len); +} + +int string_append_buf(string_t s, const char *buffer, size_t count) +{ + assert(s != NULL); + assert(buffer != NULL); + + if (string_ensure_extra_capacity(s, count) != 0) + return -1; + s->str[s->len + count] = 0; + memcpy(s->str + s->len, buffer, count); + s->len += count; + return 0; +} + +int string_append_str(string_t s, const char *c_string) +{ + assert(s != NULL); + assert(c_string != NULL); + + return string_append_buf(s, c_string, strlen(c_string)); +} + +int string_append_strn(string_t s, const char *c_string, size_t n) +{ + assert(s != NULL); + assert(c_string != NULL); + + return string_append_buf(s, c_string, strnlen(c_string, n)); +} + +int string_append_char(string_t s, char c) +{ + assert (s != NULL); + + if (string_ensure_extra_capacity(s, 1) != 0) + return -1; + s->str[s->len + 1] = 0; + s->str[s->len++] = c; + return 0; +} + +int string_append_long(string_t s, long value) +{ + char buffer[CHAR_BIT * sizeof (long) / 3 + 2]; + sprintf(buffer, "%ld", value); + return string_append_str(s, buffer); +} + +int string_append_unsigned_long(string_t s, unsigned long value) +{ + char buffer[CHAR_BIT * sizeof (unsigned long) / 3 + 2]; + sprintf(buffer, "%lu", value); + return string_append_str(s, buffer); +} + +int string_insert_strn(string_t s, size_t index, const char *c_string, + size_t count) +{ + assert(s != NULL); + assert(c_string != NULL); + + count = strnlen(c_string, count); + if (string_ensure_extra_capacity(s, count) != 0) + return -1; + memmove(s->str + index + count, s->str + index, s->len + 1 - index); + memmove(s->str + index, c_string, count); + s->len += count; + return 0; +} + +int string_insert_str(string_t s, size_t index, const char *c_string) +{ + assert(s != NULL); + assert(c_string != NULL); + + return string_insert_strn(s, index, c_string, -1); +} + +void string_clear(string_t s) +{ + assert(s != NULL); + + if (s->size > string_capacity_advice(0)) { + char *new_cstring; 
+ int my_errno = errno; + s->size = string_capacity_advice(0); + new_cstring = realloc(s->str, s->size); + if (new_cstring != NULL) + s->str = new_cstring; + else { + errno = my_errno; + /* Swallow possible errors, as nothing terrible happened. */ + } + } + s->str[0] = 0; + s->len = 0; +} + +int string_format(string_t s, const char *format, ...) +{ + int rc; + va_list ap; + + va_start(ap, format); + string_clear(s); + rc = string_vformat_append(s, format, ap); + va_end(ap); + return rc; +} + +int string_format_append(string_t s, const char *format, ...) +{ + int rc; + va_list ap; + + va_start(ap, format); + rc = string_vformat_append(s, format, ap); + va_end(ap); + return rc; +} + +int string_vformat(string_t s, const char *format, va_list ap) +{ + string_clear(s); + return string_vformat_append(s, format, ap); +} + +int string_vformat_append(string_t s, const char *format, va_list ap) +{ + int rc; + const char *f = format; + +again: + /* copy as many characters as we can */ + while (*f != 0 && *f != '%') { + rc = string_append_char(s, *f++); + if (rc != 0) + return -1; + } + + /* if we have to take care of formatting, go ahead :-) */ + if (*f++ == '%') { + char c; + struct { + bool f_long; + } flags; + + flags.f_long = false; +format_again: + rc = 0; + switch (c = *f++) { + case 0: + break; + case '%': + rc = string_append_char(s, '%'); + break; + case 'c': + { + char ch = (char)va_arg(ap, int); + rc = string_append_char(s, ch); + } + break; + case 's': + { + const char *text = va_arg(ap, const char *); + rc = string_append_str(s, text); + } + break; + case 'S': + { + string_t text = va_arg(ap, string_t); + rc = string_append(s, text); + } + break; + case 'd': + if (flags.f_long) { + long d = va_arg(ap, long); + rc = string_append_long(s, d); + } else { + int d = va_arg(ap, int); + rc = string_append_long(s, d); + } + break; + case 'u': + if (flags.f_long) { + unsigned long d = va_arg(ap, unsigned long); + rc = string_append_unsigned_long(s, d); + } else { + 
unsigned int d = va_arg(ap, unsigned int); + rc = string_append_unsigned_long(s, d); + } + break; + case 'l': + flags.f_long = true; + goto format_again; + default: + rc = string_append_char(s, c); + break; + } + if (rc != 0) + return -1; + /* formatting complete, let's see whether we have some more + characters to handle */ + goto again; + } /* if (*f == '%') */ + return 0; +} + +char *string_avformat(const char *format, va_list ap) +{ + int rc; + string_t s = string_create(); + char *cstring; + + rc = string_vformat_append(s, format, ap); + cstring = string_free_and_get_buffer(s); + if (rc == 0) + return cstring; + else { + free(cstring); + return NULL; + } +} + +char *string_aformat(const char *format, ...) +{ + int rc; + va_list ap; + string_t s = string_create(); + char *cstring; + + va_start(ap, format); + rc = string_vformat_append(s, format, ap); + va_end(ap); + cstring = string_free_and_get_buffer(s); + if (rc == 0) + return cstring; + else { + free(cstring); + return NULL; + } +} + +string_t string_fgets(FILE *f) +{ + char buf[BUFSIZ]; + string_t res = string_create(); + if (res == NULL) + return NULL; + + while (fgets(buf, sizeof buf, f)) { + int rc = string_append_str(res, buf); + if (rc != 0) + return NULL; + if (strchr(buf, '\n')) + break; + } + if (string_len(res) == 0) { + string_free(res); + return NULL; + } + return res; +} + +int string_cmp(const string_t s1, const string_t s2) +{ + return strcmp(string_str(s1), string_str(s2)); +} + +int string_ccmp(const string_t s1, const char *s2) +{ + return strcmp(string_str(s1), s2); +} + +int string_printf(const char *format, ...) +{ + int result; + va_list ap; + va_start(ap, format); + result = string_vfprintf(stdout, format, ap); + va_end(ap); + return result; +} + +int string_fprintf(FILE *stream, const char *format, ...) 
+{ + int result; + va_list ap; + va_start(ap, format); + result = string_vfprintf(stream, format, ap); + va_end(ap); + return result; +} + +int string_vprintf(const char *format, va_list ap) +{ + return string_vfprintf(stdout, format, ap); +} + +int string_vfprintf(FILE *stream, const char *format, va_list ap) +{ + string_t text = string_create(); + int rc; + + if (text == NULL) /* allocation failed: nothing to format into */ + return -1; + rc = string_vformat(text, format, ap); + if (rc == 0 && fputs(text->str, stream) == EOF) /* fputs() signals failure with EOF; any non-negative result is success */ + rc = -1; + string_free(text); /* released on every path, including a failed format */ + return rc; +} + +void string_purge(string_t str) +{ + str->str[0] = '\0'; + str->len = 0; +} diff --git a/poliqarp-library/foostring/foostring.h b/poliqarp-library/foostring/foostring.h new file mode 100644 index 0000000000000000000000000000000000000000..64df12634baa5fe1ad570af4e609c11da6c4ff5a --- /dev/null +++ b/poliqarp-library/foostring/foostring.h @@ -0,0 +1,385 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef FOOSTRING_H +#define FOOSTRING_H + +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> + +/** + * A simple textual type enabling the user to create arbitrarily long + * strings without having to care for the length of the buffer. The + * string is available via the field `str'. One should not access + * this field directly. Instead, the macro string_str() should be + * used. + * + * @par Example code: + * @code + * string_t s; + * + * s = string_init("ala"); + * string_append_char(s, ' '); + * string_append_str(s, "ma kota"); + * printf("%s\n", string_str(s)); + * string_free(s); + * @endcode + */ +struct string_t { + char *str; /**< Buffer of characters */ + size_t len; /**< Number of characters used */ + size_t size; /**< Size of the buffer in bytes */ +}; + +/** + * A typedef to the string_t structure. + * @note For the programmer's convenience, this type is an alias for a + * pointer to struct string_t. + */ +typedef struct string_t *string_t; + +/** + * This function will be called in an out-of-memory situation. By default it + * does nothing. + */ +extern void (*string_oom_callback)(string_t); + +/** + * Creates and returns a new, empty string. + * + * @return a newly created string, or NULL in case of error + */ +string_t string_create(void); + +/** + * Creates and returns a string containing a copy of the given text. + * + * @note The initial text should be non-null. Use string_create() to + * create empty strings. + * + * @return a newly created string, or NULL in case of error + */ +string_t string_init(const char *c_string); + +/** + * Creates and returns a string containing a copy of at most n initial + * characters of the given text. + * + * @note The initial text should be non-null. Use string_create() to + * create empty strings. + * + * @return a newly created string, or NULL in case of error + */ +string_t string_init_n(const char *c_string, size_t n); + +/** + * Frees a string and deallocates its buffer. 
+ */ +void string_free(string_t s); + +/** + * Frees a string structure, but returns its text buffer which should + * be freed separately via free(). + */ +char *string_free_and_get_buffer(string_t s); + +/** + * Affixes s2 at the end of s. + * + * @return 0 on success, -1 on error + */ +int string_append(string_t s, const string_t s2); + +/** + * Affixes the given buffer at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_buf(string_t s, const char *buffer, size_t count); + +/** + * Affixes the given C string at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_str(string_t s, const char *c_string); + +/** + * Appends at most n bytes of a given C string at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_strn(string_t s, const char *c_string, size_t n); + +/** + * Appends a given character at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_char(string_t s, char c); + +/** + * Appends a decimal value at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_long(string_t s, long value); + +/** + * Appends an unsigned decimal value at the end of a string. + * + * @return 0 on success, -1 on error + */ +int string_append_unsigned_long(string_t s, unsigned long value); + +/** + * Reads a line from the file. + * + * @return a newly created string, or NULL in case of error or if there was no + * data to read + */ +string_t string_fgets(FILE *f); + +/** + * Compares two strings (with semantics as in strcmp()). + */ +int string_cmp(const string_t s1, const string_t s2); + +/** + * Compares a string with a null-terminated string (with semantics as in strcmp()). + */ +int string_ccmp(const string_t s1, const char *s2); + +/* -------------------------------------------------- */ + +/** + * Inserts at most count characters of a given C string at the given + * place in the string buffer. 
+ * + * @note When count == (size_t) -1, the entire C string is inserted. + * + * @return 0 on success, -1 on error + */ +int string_insert_strn(string_t s, size_t index, const char *c_string, + size_t count); + +/** + * Insert a given C string at the given place in the string buffer. + * + * @return 0 on success, -1 on error + */ +int string_insert_str(string_t s, size_t index, const char *c_string); + +/** + * Clears the text buffer without freeing the structure's memory. + */ +void string_clear(string_t s); + + +/** + * Gets the string buffer. + */ +static inline char* string_str(string_t s) +{ + return s->str; +} + +/** + * Gets the string length. + */ +static inline size_t string_len(string_t s) +{ + return s->len; +} + +/** + * Formats a text buffer according to the given pattern. The format + * is similar to printf() and friends, but it handles only the + * following specifiers: + * <dl> + * <dt>%c</dt> + * <dd>Interprets the argument as a single character</dd> + * <dt>%s</dt> + * <dd>Interprets the argument as a C string (char *)</dd> + * <dt>%S</dt> + * <dd>Interprets the argument as a string_t object</dd> + * <dt>%d</dt> + * <dd>Interprets the argument as an <code>int</code></dd> + * <dt>%u</dt> + * <dd>Interprets the argument as an <code>unsigned int</code></dd> + * <dt>%ld</dt> + * <dd>Interprets the argument as an <code>long</code></dd> + * <dt>%lu</dt> + * <dd>Interprets the argument as an <code>unsigned long</code></dd> + * <dt>%%</dt> + * <dd>Inserts a literal percent character + * </dl> + * + * @return 0 on success, -1 on error + */ +int string_format(string_t s, const char *format, ...); + +/** + * Formats text and appends it at the end of the given string. + * + * @return 0 on success, -1 on error + */ +int string_format_append(string_t s, const char *format, ...); + +/** + * Similar to string_format_append(), but the argument list is passed + * as a va_list. 
+ * + * @return 0 on success, -1 on error + */ +int string_vformat_append(string_t s, const char *format, va_list ap); + +/** + * Similar to string_format(), but the argument list is passed as a va_list. + * + * @return 0 on success, -1 on error + */ +int string_vformat(string_t s, const char *format, va_list ap); + +/* -------------------------------------------------- */ + +/** + * Uses string_format() to create a C string that should be freed + * using free(). + * + * @return the newly allocated C string, or NULL on error + */ +char *string_aformat(const char *format, ...); + +/** + * Similar to string_aformat(), but the argument list is passed as a + * va_list. + * + * @return the newly allocated C string, or NULL on error + */ +char *string_avformat(const char *format, va_list ap); + +/* -------------------------------------------------- */ + +/** + * Works like printf(), but interprets its arguments in the same way + * string_format() does. + * + * @return 0 on success, -1 on error + */ +int string_printf(const char *format, ...); + +/** + * Works like fprintf(), but interprets its arguments in the same way + * string_format() does. + * + * @return 0 on success, -1 on error + */ +int string_fprintf(FILE *stream, const char *format, ...); + +/** + * Works like vprintf(), but interprets its arguments in the same way + * string_format() does. + * + * @return 0 on success, -1 on error + */ +int string_vprintf(const char *format, va_list ap); + +/** + * Works like vfprintf(), but interprets its arguments in the same way + * string_format() does. + * + * @return 0 on success, -1 on error + */ +int string_vfprintf(FILE *stream, const char *format, va_list ap); + +/** + * Clears the string without shrinking the string buffer. + */ +void string_purge(string_t str); + + +/** + * An ASCII variant of isspace(). + */ +static inline int ascii_isspace(char c) +{ + return + c == ' ' || c == '\f' || c == '\n' || + c == '\r' || c == '\t' || c == '\v'; +} + +/** + * An ASCII variant of isalpha(). 
+ */ +static inline int ascii_isalpha(char c) +{ + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +/** + * An ASCII variant of isdigit(). + */ +static inline int ascii_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +/** + * An ASCII variant of isalnum(). + */ +static inline int ascii_isalnum(char c) +{ + return ascii_isalpha(c) || ascii_isdigit(c); +} + +/** + * An ASCII variant of isgraph(). + */ +static inline int ascii_isgraph(char c) +{ + return c > ' ' && c < '\x7f'; +} + +/** + * An ASCII variant of toupper() + */ +static inline char ascii_toupper(char c) +{ + if (c >= 'a' && c <= 'z') + c += 'A' - 'a'; + return c; +} + +/** + * An ASCII variant of tolower() + */ +static inline char ascii_tolower(char c) +{ + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + return c; +} + +#endif /* FOOSTRING_H */ diff --git a/poliqarp-library/foostring/strnlen.c b/poliqarp-library/foostring/strnlen.c new file mode 100644 index 0000000000000000000000000000000000000000..d5c73f0c46261064fe611cbebb1b65470bb6dd70 --- /dev/null +++ b/poliqarp-library/foostring/strnlen.c @@ -0,0 +1,31 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <string.h> + +size_t strnlen (const char *str, size_t maxlen) +{ + size_t res = 0; + while (maxlen-- && *str++) res++; + return res; +} diff --git a/poliqarp-library/poliqarp-config.h.in b/poliqarp-library/poliqarp-config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..d69c1ba3f8ecc1a7aa704f02e4bf5fa22d39481b --- /dev/null +++ b/poliqarp-library/poliqarp-config.h.in @@ -0,0 +1,214 @@ +/* poliqarp-config.h.in. Generated from configure.ac by autoheader. */ + +/* Define if building universal (internal helper macro) */ +#undef AC_APPLE_UNIVERSAL_BUILD + +/* Define to 1 if you have the declaration of `strerror_r', and to 0 if you + don't. */ +#undef HAVE_DECL_STRERROR_R + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +#undef HAVE_FSEEKO + +/* Define to 1 if you have the `fseeko64' function. */ +#undef HAVE_FSEEKO64 + +/* Define if you have the gettext and ngettext functions in <libintl.h>. */ +#undef HAVE_GETTEXT + +/* Define to 1 if the system has the type `intptr_t'. */ +#undef HAVE_INTPTR_T + +/* Define to 1 if you have the <inttypes.h> header file. */ +#undef HAVE_INTTYPES_H + +/* Define to 1 if the system has the type `locale_t'. */ +#undef HAVE_LOCALE_T + +/* Define to 1 if you have the <memory.h> header file. */ +#undef HAVE_MEMORY_H + +/* Define to 1 if you have the <netinet/in.h> header file. */ +#undef HAVE_NETINET_IN_H + +/* Define if OpenMP is enabled */ +#undef HAVE_OPENMP + +/* Define if you have POSIX threads libraries and header files. */ +#undef HAVE_PTHREAD + +/* Define to 1 if stdbool.h conforms to C99. */ +#undef HAVE_STDBOOL_H + +/* Define to 1 if you have the <stdint.h> header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the <stdlib.h> header file. 
*/ +#undef HAVE_STDLIB_H + +/* Define to 1 if you have the `strerror_r' function. */ +#undef HAVE_STRERROR_R + +/* Define to 1 if you have the <strings.h> header file. */ +#undef HAVE_STRINGS_H + +/* Define to 1 if you have the <string.h> header file. */ +#undef HAVE_STRING_H + +/* Define to 1 if you have the `strndup' function. */ +#undef HAVE_STRNDUP + +/* Define to 1 if you have the <sys/socket.h> header file. */ +#undef HAVE_SYS_SOCKET_H + +/* Define to 1 if you have the <sys/stat.h> header file. */ +#undef HAVE_SYS_STAT_H + +/* Define to 1 if you have the <sys/types.h> header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if the system has the type `uint_fast32_t'. */ +#undef HAVE_UINT_FAST32_T + +/* Define to 1 if you have the <unistd.h> header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if the system has the type `_Bool'. */ +#undef HAVE__BOOL + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the home page for this package. */ +#undef PACKAGE_URL + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* Define if you want a single-threaded Poliqarp library */ +#undef POLIQARP_SINGLE_THREADED + +/* Define to a printf format code for size_t */ +#undef PRIuSIZE + +/* Define to necessary symbol if this constant uses a non-standard name on + your system. */ +#undef PTHREAD_CREATE_JOINABLE + +/* Define to 1 if you have the ANSI C header files. */ +#undef STDC_HEADERS + +/* Define to 1 if strerror_r returns char *. */ +#undef STRERROR_R_CHAR_P + +/* Define to 1 if you want to use memory mapped files extensively. 
*/ +#undef USE_EXTENSIVE_MMAP + +/* Define to 1 if you want/need to use the included TCL regular expressions + library. */ +#undef USE_TCL_REGEX + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +# undef WORDS_BIGENDIAN +# endif +#endif + +/* Number of bits in a file offset, on hosts where this is settable. */ +#undef _FILE_OFFSET_BITS + +/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +#undef _LARGEFILE_SOURCE + +/* Define for large files, on AIX-style hosts. */ +#undef _LARGE_FILES + +/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>, + <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +#undef _UINT32_T + +/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>, + <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +#undef _UINT64_T + +/* Define for Solaris 2.5.1 so the uint8_t typedef from <sys/synch.h>, + <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the + #define below would cause a syntax error. */ +#undef _UINT8_T + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif + +/* Define to the type of a signed integer type of width exactly 16 bits if + such a type exists and the standard includes do not define it. */ +#undef int16_t + +/* Define to the type of a signed integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. 
*/ +#undef int32_t + +/* Define to the type of a signed integer type of width exactly 64 bits if + such a type exists and the standard includes do not define it. */ +#undef int64_t + +/* Define to the type of a signed integer type of width exactly 8 bits if such + a type exists and the standard includes do not define it. */ +#undef int8_t + +/* Define to the type of a signed integer type wide enough to hold a pointer, + if such a type exists, and if the system does not define it. */ +#undef intptr_t + +/* Define to `int64_t' if <sys/types.h> does not define. */ +#undef off64_t + +/* Define to `unsigned int' if <sys/types.h> does not define. */ +#undef size_t + +/* Define to `int' if <sys/socket.h> does not define. */ +#undef socklen_t + +/* Define to `int' if <sys/types.h> does not define. */ +#undef ssize_t + +/* Define to the type of an unsigned integer type of width exactly 16 bits if + such a type exists and the standard includes do not define it. */ +#undef uint16_t + +/* Define to the type of an unsigned integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +#undef uint32_t + +/* Define to the type of an unsigned integer type of width exactly 64 bits if + such a type exists and the standard includes do not define it. */ +#undef uint64_t + +/* Define to the type of an unsigned integer type of width exactly 8 bits if + such a type exists and the standard includes do not define it. */ +#undef uint8_t + +/* Define to the fastest at least 32-bit unsigned integer type if <stdint.h> + and <inttypes.h> do not define. */ +#undef uint_fast32_t + +#include "commondef.h" diff --git a/poliqarp-library/poliqarpd/async.c b/poliqarp-library/poliqarpd/async.c new file mode 100644 index 0000000000000000000000000000000000000000..f969c73c6185b9f16e7642a84308884863ba0a07 --- /dev/null +++ b/poliqarp-library/poliqarpd/async.c @@ -0,0 +1,30 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include "async.h" +#include "session.h" + +void async_notify_new_results(void *session) +{ + session_csend((session_t *)session, "M NEW-RESULTS\n"); +} diff --git a/poliqarp-library/poliqarpd/async.h b/poliqarp-library/poliqarpd/async.h new file mode 100644 index 0000000000000000000000000000000000000000..bfd713b7ac82cc34c892f8fe781c20e50e318021 --- /dev/null +++ b/poliqarp-library/poliqarpd/async.h @@ -0,0 +1,33 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARPD_ASYNC_H +#define POLIQARPD_ASYNC_H + +/** + * Sends an asynchronous message notifying the client about the new bunch of + * results available. + */ +void async_notify_new_results(void *session); + +#endif /* POLIQARPD_ASYNC_H */ diff --git a/poliqarp-library/poliqarpd/configuration.c b/poliqarp-library/poliqarpd/configuration.c new file mode 100644 index 0000000000000000000000000000000000000000..e1f07009d0d46de165d1685baacca1bf3010878b --- /dev/null +++ b/poliqarp-library/poliqarpd/configuration.c @@ -0,0 +1,385 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include <foostring/foostring.h> + +#include <sakura/common/hash-table.h> + +#include "configuration.h" + +#define DFL_HOSTNAME "127.0.0.1" + +#define MAX_PORT ((1 << 16) - 1) +#define DFL_PORT 4567 + +#define DFL_LOGFILE "poliqarpd.log" + +#define MIN_BUFFER_SIZE 1 +#define DFL_BUFFER_SIZE 1000 +#define MAX_BUFFER_SIZE 50000 + +#define MIN_IDLE_TIME 1 +#define DFL_IDLE_TIME 1200 +#define MAX_IDLE_TIME 86400 + +#define MIN_MATCH_LENGTH 10 +#define DFL_MATCH_LENGTH 1000 +#define MAX_MATCH_LENGTH 1000 + +struct configuration cfg = { + .hostname = NULL, + .port = DFL_PORT, + .logging_on = 0, + .logfile = NULL, + .match_buffer_size = DFL_BUFFER_SIZE, + .max_session_idle = DFL_IDLE_TIME, + .max_match_length = DFL_MATCH_LENGTH, + .corpora = NULL, + .allow_any_corpus = false, + .gui_mode = false, + .detach = false, + .notify_thread_id = 0, + .log = NULL, + .logmutex = PTHREAD_MUTEX_INITIALIZER, +}; + +void init_default_cfg() +{ + cfg.hostname = string_init(DFL_HOSTNAME); + cfg.logfile = string_init(DFL_LOGFILE); +} + +void done_cfg() +{ + if (cfg.hostname) { + string_free(cfg.hostname); + cfg.hostname = NULL; + } + if (cfg.logfile) { + string_free(cfg.logfile); + cfg.logfile = NULL; + } + if (cfg.corpora) + { + destroy_hash_table(cfg.corpora, free); + free(cfg.corpora); + cfg.corpora = NULL; + } +} + +struct config_hook { + char *option; + int (*proc)(const char *, char **); +}; + +#define STR_RANGE(x, y) STR_RANGE_(x, y) +#define STR_RANGE_(x, y) "" # x " and " # y + +static char err_nan[] = _M("not a number"); +static char err_invalid_hostname[] = _M("unspecified host name"); +static char err_invalid_port[] = _M("port number out of range"); +static char err_invalid_logging[] = _M("must be \"on\" or \"off\""); +static char err_invalid_logfile[] = _M("unspecified log file name"); +static char err_invalid_buffer_size[] = _M("must be between " STR_RANGE(MIN_BUFFER_SIZE, 
MAX_BUFFER_SIZE)); +static char err_invalid_idle_time[] = _M("must be between " STR_RANGE(MIN_IDLE_TIME, MAX_IDLE_TIME)); +static char err_invalid_key_value[] = _M("invalid key:value pair"); +static char err_invalid_match_length[] = _M("must be between " STR_RANGE(MIN_MATCH_LENGTH, MAX_MATCH_LENGTH)); + +static int hostname_hook(const char *val, char **err) +{ + if (strlen(val) == 0) { + *err = err_invalid_hostname; + return -1; + } + if (cfg.hostname) { + string_free(cfg.hostname); + cfg.hostname = NULL; + } + cfg.hostname = string_init(val); + return 0; +} + +static int port_hook(const char *val, char **err) +{ + char *serr; + long port; + + errno = 0; + port = strtol(val, &serr, 10); + if (*val == '\0' || *serr != '\0') { + *err = err_nan; + return -1; + } + if (port < 0 || port > MAX_PORT) { + *err = err_invalid_port; + return -1; + } + cfg.port = (uint16_t)port; + return 0; +} + +static int logging_hook(const char *val, char **err) +{ + if (!strcmp(val, "on")) { + cfg.logging_on = 1; + return 0; + } + if (!strcmp(val, "off")) { + cfg.logging_on = 0; + return 0; + } + *err = err_invalid_logging; + return -1; +} + +static int logfile_hook(const char *val, char **err) +{ + if (strlen(val) == 0) { + *err = err_invalid_logfile; + return -1; + } + if (cfg.logfile) { + string_free(cfg.logfile); + cfg.logfile = NULL; + } + cfg.logfile = string_init(val); + return 0; +} + +static int match_buffer_size_hook(const char *val, char **err) +{ + char *serr; + unsigned long size; + + size = strtoul(val, &serr, 10); + if (*val == '\0' || *serr != '\0') { + *err = err_nan; + return -1; + } + if (size < MIN_BUFFER_SIZE) + { + cfg.match_buffer_size = MIN_BUFFER_SIZE; + *err = err_invalid_buffer_size; + return -1; + } + if (size > MAX_BUFFER_SIZE) + { + cfg.match_buffer_size = MAX_BUFFER_SIZE; + *err = err_invalid_buffer_size; + return -1; + } + cfg.match_buffer_size = size; + return 0; +} + +static int max_match_length_hook(const char *val, char **err) +{ + char *serr; + unsigned 
long size; + size = strtoul(val, &serr, 10); + if (*val == '\0' || *serr != '\0') { + *err = err_nan; + return -1; + } + if (size < MIN_MATCH_LENGTH) + { + cfg.max_match_length = MIN_MATCH_LENGTH; + *err = err_invalid_match_length; + return -1; + } + if (size > MAX_MATCH_LENGTH) + { + cfg.max_match_length = MAX_MATCH_LENGTH; + *err = err_invalid_match_length; + return -1; + } + cfg.max_match_length = size; + return 0; +} + +static int max_session_idle_hook(const char *val, char **err) +{ + char *serr; + unsigned long msi; + + msi = strtoul(val, &serr, 10); + if (*val == '\0' || *serr != '\0') { + *err = err_nan; + return -1; + } + if (msi < MIN_IDLE_TIME) + { + cfg.max_session_idle = MIN_IDLE_TIME; + *err = err_invalid_idle_time; + return -1; + } + if (msi > MAX_IDLE_TIME) + { + cfg.max_session_idle = MAX_IDLE_TIME; + *err = err_invalid_idle_time; + return -1; + } + cfg.max_session_idle = msi; + return 0; +} + +static int split(char *buf, int len, char ch, char **start1, char **start2); + +static int corpus_hook(const char *val, char **err) +{ + char *corpus_key, *corpus_value, *corpus_item; + if (strcmp(val, "any") == 0) + { + cfg.allow_any_corpus = true; + return 0; + } + corpus_item = strdup(val); + if (corpus_item == NULL) + abort(); + if (split(corpus_item, strlen(corpus_item), ':', &corpus_key, &corpus_value) != 0) + { + *err = err_invalid_key_value; + free(corpus_item); + return -1; + } + corpus_value = strdup(corpus_value); + if (corpus_value == NULL) + abort(); + if (cfg.corpora == NULL) + { + cfg.corpora = malloc(sizeof(*cfg.corpora)); + if (cfg.corpora == NULL) + abort(); + if (create_hash_table(cfg.corpora, 16, HASHTABLE_DUPLICATE_KEYS, NULL) != 0) + abort(); + } + if (hash_table_set(cfg.corpora, corpus_key, corpus_value) != 0) + abort(); + free(corpus_item); + return 0; +} + +static const struct config_hook hooks[] = { + { "hostname", hostname_hook }, + { "port", port_hook }, + { "logging", logging_hook }, + { "logfile", logfile_hook }, + { "log-file", 
logfile_hook }, + { "match-buffer-size", match_buffer_size_hook }, + { "max-match-length", max_match_length_hook }, + { "max-session-idle", max_session_idle_hook }, + { "corpus", corpus_hook }, +}; + +/** Searches for a character in the text buffer and splits the text into + * two strings (before and after the character), trimming whitespaces from + * the beginning and end of each of the two strings. + * + * @param buf the buffer to search in + * @param len the length of the buffer + * @param ch the character to search for + * @param start1 pointer to a variable that will store the beginning of first + * string + * @param start2 ditto, second string + * @return 0 if the split was successful + * @return -1 if the character was not found + */ +static int split(char *buf, int len, char ch, char **start1, char **start2) +{ + char *end1, *end2, *charpos; + charpos = strchr(buf, ch); + if (!charpos) + return -1; + *start1 = buf; + end1 = charpos - 1; + *start2 = charpos + 1; + end2 = buf + len - 1; + while (end1 > *start1 && ascii_isspace(*end1)) + end1--; + while (end2 > *start2 && ascii_isspace(*end2)) + end2--; + while (*start1 < end1 && ascii_isspace(**start1)) + (*start1)++; + while (*start2 < end2 && ascii_isspace(**start2)) + (*start2)++; + end1[1] = 0; + end2[1] = 0; + return 0; +} + +static void scan_hooks(const char *name, const char *value, + const char *filename, int lineno) +{ + size_t i; + bool found = false; + char *err; + + for (i = 0; i < sizeof(hooks) / sizeof(struct config_hook); i++) { + if (!strcmp(name, hooks[i].option)) { + found = true; + if (hooks[i].proc(value, &err) == -1) { + fprintf(stderr, "%s(%d): %s: %s\n", filename, lineno, name, _(err)); + fflush(stderr); + } + } + } + + if (!found) { + fprintf(stderr, "%s(%d): %s \"%s\"\n", filename, lineno, + _("unknown configuration option"), name); + fflush(stderr); + } +} + +int read_cfg(const char *filename) +{ + FILE *f; + string_t str; + int lineno = 0; + + if ((f = fopen(filename, "rt")) == 
NULL) + return -1; + while ((str = string_fgets(f)) != NULL) { + int len = string_len(str); + char *name, *value; + char *buf = string_free_and_get_buffer(str); + + lineno++; + if (buf[0] == '#' || split(buf, len, '=', &name, &value) == -1) { + free(buf); + continue; + } + scan_hooks(name, value, filename, lineno); + free(buf); + } + if (fclose(f) == EOF) + return -1; + if (cfg.corpora == NULL) + cfg.allow_any_corpus = true; + return 0; +} diff --git a/poliqarp-library/poliqarpd/configuration.h b/poliqarp-library/poliqarpd/configuration.h new file mode 100644 index 0000000000000000000000000000000000000000..daba7a0157e2b4b25ce07ab114fc4547ad96fb4b --- /dev/null +++ b/poliqarp-library/poliqarpd/configuration.h @@ -0,0 +1,84 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * Declaration of a structure holding poliqarpd's configuration and + * functions used to handle it. 
+ */
+
+#ifndef POLIQARPD_CONFIGURATION_H
+#define POLIQARPD_CONFIGURATION_H
+
+#include <poliqarp-config.h>
+
+#include <foostring/foostring.h>
+#include <pthread.h>
+#include <stdio.h>
+
+
+struct configuration {
+   /* Static configuration (set from the configuration file) */
+
+   string_t hostname; /**< the host that we're listening on */
+   uint16_t port; /**< the port that we're listening on */
+   int logging_on; /**< logging enabled? */
+   string_t logfile; /**< name of the file that we're logging to */
+   size_t match_buffer_size; /**< initial size of match buffer */
+   time_t max_session_idle; /**< how long a session can be idle */
+   size_t max_match_length; /**< maximum match length, in segments */
+   struct hash_table* corpora; /**< collection of predefined corpora; NULL
+      when no "corpus" option was given */
+   bool allow_any_corpus; /**< allow opening any corpus in the file system */
+
+   /* Dynamic configuration */
+
+   bool gui_mode; /**< GUI mode: whether to terminate the server
+      when the number of active sessions drops to
+      zero */
+   bool detach; /**< daemon mode */
+   unsigned long notify_thread_id;
+   /**< thread to be notified when the server is
+      ready to accept connections (Win32 only) */
+   FILE *log; /**< physical counterpart of logfile */
+   pthread_mutex_t logmutex; /**< mutex that synchronizes log calls to
+      localtime() */
+};
+
+/** The global configuration object. */
+extern struct configuration cfg;
+
+/** Initializes the configuration object. */
+void init_default_cfg();
+
+/** Frees all resources associated with a configuration. */
+void done_cfg();
+
+/** Reads the configuration from a given file and stores it in the global
+ * cfg object. Outputs any warning messages (about wrong options, etc.)
+ * to stderr. If the file defines no corpora, opening any corpus is allowed.
+ * @return 0 if the configuration was read successfully (possibly with
+ * warnings).
+ * @return -1 if an I/O error occurred.
+ */ +int read_cfg(const char *filename); + +#endif /* POLIQARPD_CONFIGURATION_H */ diff --git a/poliqarp-library/poliqarpd/errors.h b/poliqarp-library/poliqarpd/errors.h new file mode 100644 index 0000000000000000000000000000000000000000..224c0fa5d0327fb54bfa2c2c1408f098d0c03288 --- /dev/null +++ b/poliqarp-library/poliqarpd/errors.h @@ -0,0 +1,64 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * Definitions of errors that can be returned by various functions. 
+ */ + +#ifndef POLIQARPD_ERRORS_H +#define POLIQARPD_ERRORS_H + +/* PE stands for Poliqarp Error */ + +#define PE_GENERIC -1 /* Generic error */ +#define PE_NOMEM -2 /* Not enough memory */ +#define PE_LIMITEXC -3 /* Hard limit exceeded */ +#define PE_INVSID -4 /* Invalid session ID */ +#define PE_SIDUSED -5 /* Session ID already used */ +#define PE_INVUID -6 /* Invalid user ID */ +#define PE_INVOPT -7 /* Incorrect session option name */ +#define PE_INVVAL -8 /* Invalid session option value */ + +/* PPE stands for Poliqarp Protocol Error */ + +#define PPE_ARGCOUNT 1 /* Incorrect number of arguments */ +#define PPE_NOSESSION 3 /* No session opened */ +#define PPE_SESSBOUND 4 /* Cannot create a session for a connection that + is already bound */ +#define PPE_NOMEM 5 /* Not enough memory */ +#define PPE_INVSID 6 /* Invalid session ID */ +#define PPE_SIDUSED 7 /* Session with this ID is already bound */ +#define PPE_INVUID 8 /* Session user ID does not match the argument + of RECONNECT */ +#define PPE_CORPUSALR 10 /* Session already has an open corpus */ +#define PPE_SYSOPEN 12 /* System error while opening the corpus */ +#define PPE_NOCORPUS 13 /* No corpus opened */ +#define PPE_INVJID 14 /* Invalid job ID */ +#define PPE_BUSY 15 /* A job is already in progress */ +#define PPE_INVQUERY 16 /* Incorrect query */ +#define PPE_INVRANGE 17 /* Invalid result range */ +#define PPE_INVOPT 18 /* Incorrect session option */ +#define PPE_INVVAL 19 /* Invalid session option value */ +#define PPE_INVCRIT 20 /* Invalid sorting criteria */ + +#endif /* POLIQARPD_ERRORS_H */ diff --git a/poliqarp-library/poliqarpd/getpid.h b/poliqarp-library/poliqarpd/getpid.h new file mode 100644 index 0000000000000000000000000000000000000000..340f372a63484b187f49900daa6b961cdca9e395 --- /dev/null +++ b/poliqarp-library/poliqarpd/getpid.h @@ -0,0 +1,37 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * A cross-platform way of including getpid() and friends. + */ + +#ifndef POLIQARPD_GETPID_H +#define POLIQARPD_GETPID_H + +#ifdef _WIN32 +# include <process.h> +#else +# include <unistd.h> +#endif /* _WIN32 */ + +#endif /* POLIQARPD_GETPID_H */ diff --git a/poliqarp-library/poliqarpd/log.c b/poliqarp-library/poliqarpd/log.c new file mode 100644 index 0000000000000000000000000000000000000000..e91cdc9b07e5415f6a56653edfe4f8ebead9016c --- /dev/null +++ b/poliqarp-library/poliqarpd/log.c @@ -0,0 +1,75 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. 
(See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <stdio.h> +#include <time.h> + +#include "getpid.h" +#include "log.h" +#include "utils.h" + +int init_log() +{ + if (cfg.logging_on) { + cfg.log = fopen(string_str(cfg.logfile), "a"); + if (cfg.log == NULL) + return -1; + } + return 0; +} + +void done_log() +{ + if (cfg.logging_on) { + if (cfg.log) + fclose(cfg.log); + } +} + +void log_entry(const char *fmt, ...) +{ + va_list l; + string_t s; + struct tm *tm; + time_t t; + char timebuf[30]; + + if (!cfg.logging_on) + return; + s = string_create(); + time(&t); + lock(&cfg.logmutex); + tm = localtime(&t); + unlock(&cfg.logmutex); + strftime(timebuf, sizeof timebuf, "%Y-%m-%d %H:%M:%S%z", tm); + string_append_char(s, '['); + string_append_str(s, timebuf); + string_format_append(s, "] poliqarpd[%d]: ", getpid()); + va_start(l, fmt); + string_vformat_append(s, fmt, l); + va_end(l); + string_append_char(s, '\n'); + fputs(string_str(s), cfg.log); + fflush(cfg.log); + string_free(s); +} diff --git a/poliqarp-library/poliqarpd/log.h b/poliqarp-library/poliqarpd/log.h new file mode 100644 index 0000000000000000000000000000000000000000..14e5e6009c111c5e76b2ebaf09fa59d968e99c60 --- /dev/null +++ b/poliqarp-library/poliqarpd/log.h @@ -0,0 +1,53 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * Public interface of the logging subsystem. + */ + +#ifndef POLIQARPD_LOG_H +#define POLIQARPD_LOG_H + +#include <foostring/foostring.h> +#include <stdarg.h> + +#include "configuration.h" + +/** + * Initializes the logging subsystem. + * @return 0 upon successful initialization. + * @return -1 if an error occurred. + */ +int init_log(); + +/** + * Deallocates all resources associated with the logging subsystem. + */ +void done_log(); + +/** + * Writes a given message to the log. + */ +void log_entry(const char *fmt, ...); + +#endif /* POLIQARPD_LOG_H */ diff --git a/poliqarp-library/poliqarpd/msgqueue.c b/poliqarp-library/poliqarpd/msgqueue.c new file mode 100644 index 0000000000000000000000000000000000000000..587343024373acfd4dd55454130e5ec7f2536eec --- /dev/null +++ b/poliqarp-library/poliqarpd/msgqueue.c @@ -0,0 +1,59 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <stdlib.h> + +#include "msgqueue.h" + +void msgqueue_init(struct msgqueue *queue) +{ + queue->first = queue->last = NULL; +} + +void msgqueue_append(struct msgqueue *queue, string_t str) +{ + struct msgqueue_elem *el = malloc(sizeof(*el)); + el->msg = str; + el->next = NULL; + if (queue->first) { + queue->last->next = el; + queue->last = el; + } else { + queue->first = queue->last = el; + } +} + +string_t msgqueue_get(struct msgqueue *queue) +{ + if (queue->first) { + struct msgqueue_elem *el = queue->first; + string_t msg = el->msg; + + queue->first = el->next; + if (el->next == NULL) + queue->last = NULL; + free(el); + return msg; + } else + return NULL; +} diff --git a/poliqarp-library/poliqarpd/msgqueue.h b/poliqarp-library/poliqarpd/msgqueue.h new file mode 100644 index 0000000000000000000000000000000000000000..129c2f0792359b7974d83493d496f8c47f9487aa --- /dev/null +++ b/poliqarp-library/poliqarpd/msgqueue.h @@ -0,0 +1,64 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * FIFO queues of strings. + */ + +#ifndef POLIQARPD_MSGQUEUE_H +#define POLIQARPD_MSGQUEUE_H + +#include <foostring/foostring.h> + +/** + * A FIFO queue of strings (implemented as a singly-linked list). + */ +struct msgqueue { + struct msgqueue_elem *first, *last; +}; + +/** + * An element of the queue. + */ +struct msgqueue_elem { + string_t msg; + struct msgqueue_elem *next; +}; + +/** + * Initializes the queue. + */ +void msgqueue_init(struct msgqueue *queue); + +/** + * Inserts a string onto the end of the queue. + */ +void msgqueue_append(struct msgqueue *queue, string_t str); + +/** + * Returns the first element in the queue, removing it. Returns NULL + * if the queue was empty. + */ +string_t msgqueue_get(struct msgqueue *queue); + +#endif /* POLIQARPD_MSGQUEUE_H */ diff --git a/poliqarp-library/poliqarpd/od_unix.c b/poliqarp-library/poliqarpd/od_unix.c new file mode 100644 index 0000000000000000000000000000000000000000..e374da9bf363c8fd2f522070410be9eb7a8b632d --- /dev/null +++ b/poliqarp-library/poliqarpd/od_unix.c @@ -0,0 +1,87 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> + +#include <foostring/foostring.h> + +#include "osdep.h" + +static char *config_dir = NULL; + +const char *get_config_dir() +{ + if (config_dir == NULL) + { + char* home = getenv("HOME"); + if (home == NULL) + return ""; + config_dir = string_aformat("%s/.poliqarp/", home); + } + return config_dir; +} + +void free_config_dir() { + free(config_dir); +} + +void poliqarp_sleep(int seconds) +{ + sleep(seconds); +} + +void handle_signals() +{ + signal(SIGPIPE, SIG_IGN); + signal(SIGINT, SIG_IGN); +} + +int console_detach() +{ + switch (fork()) { + case -1: + return -1; + case 0: + break; + default: + _exit(0); + } + if (setsid() == -1) + return -1; + return 0; +} + +void notify_readiness(unsigned long thread_id) +{ + /* Deliberately not implemented. 
*/ +} + +void print_socket_error(const char *s) +{ + perror(s); +} diff --git a/poliqarp-library/poliqarpd/od_win32.c b/poliqarp-library/poliqarpd/od_win32.c new file mode 100644 index 0000000000000000000000000000000000000000..fd8e6854e4b2536a04884a9dee2eec2475fa5e82 --- /dev/null +++ b/poliqarp-library/poliqarpd/od_win32.c @@ -0,0 +1,106 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <windows.h> +#include <stdio.h> + +#include "osdep.h" + +const char *get_config_dir() +{ + return ""; +} + +void free_config_dir() { + /* Nothing to do. 
*/ +} + +void poliqarp_sleep(int seconds) +{ + Sleep(seconds * 1000); +} + +void handle_signals() +{ +} + +int console_detach() +{ + return !FreeConsole(); +} + +void notify_readiness(unsigned long thread_id) +{ + PostThreadMessage(thread_id, WM_APP, 0, 0); /* deliberately ignore errors */ +} + +void herror(const char *s) +{ + print_socket_error(s); +} + +void print_socket_error(const char *s) +{ + char *message; + if (s != NULL) + fprintf(stderr, "%s: ", s); + switch (WSAGetLastError()) + { +#define MAP(x, y) case WSA##x: message = y; break; + case 0: + message = "Success"; + break; + MAP(EACCES, "Permission denied") + MAP(EADDRINUSE, "Address already in use") + MAP(EADDRNOTAVAIL, "Address not available") + MAP(EAFNOSUPPORT, "Address family not supported") + MAP(ECONNABORTED, "Connection aborted") + MAP(ECONNRESET, "Connection reset by peer") + MAP(EFAULT, "Bad address") + MAP(EINPROGRESS, "Operation now in progress") + MAP(EINTR, "Interrupted system call") + MAP(EINVAL, "Invalid argument") + MAP(EISCONN, "Socket is already connected") + MAP(EMFILE, "Too many open files") + MAP(EMSGSIZE, "Message too long") + MAP(ENETDOWN, "Network is down") + MAP(ENETRESET, "Connection aborted by network") + MAP(ENOBUFS, "No buffer space is available") + MAP(ENOTSOCK, "Not a socket") + MAP(EOPNOTSUPP, "Operation not supported on socket") + MAP(EPROTONOSUPPORT, "Protocol not supported") + MAP(EPROTOTYPE, "Protocol wrong type for socket") + MAP(ESHUTDOWN, "Cannot send after transport endpoint shutdown") + MAP(ESOCKTNOSUPPORT, "Socket type not supported") + MAP(EWOULDBLOCK, "Operation would block") + MAP(HOST_NOT_FOUND, "Host not found") + MAP(NOTINITIALISED, "Successful WSASTARTUP not yet performed") + MAP(NO_DATA, "Valid name, no data record of requested type") + MAP(NO_RECOVERY, "A nonrecoverable error occurred") + MAP(TRY_AGAIN, "Nonauthoritative host not found") +#undef MAP + default: + message = "Unknown error"; + } + fprintf(stderr, "%s\n", message); +} diff --git 
a/poliqarp-library/poliqarpd/osdep.h b/poliqarp-library/poliqarpd/osdep.h new file mode 100644 index 0000000000000000000000000000000000000000..d1e9cb7f76523b7fe19dbb13ed26f293bb0db411 --- /dev/null +++ b/poliqarp-library/poliqarpd/osdep.h @@ -0,0 +1,86 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/* + * Various OS-dependent functions. + * Implemented in a file named od_<OSNAME>.c, eg. od_unix.c. + */ + +#ifndef POLIQARPD_OSDEP_H +#define POLIQARPD_OSDEP_H + +/** + * Return the directory (terminated by slash or backslash) containing the + * server's configuration file. On some systems, it may return an empty string, + * indicating that the program's working directory should be used for this + * purpose. + * + * @return a pointer to a null-terminated string containing the directory + * name. + */ +const char *get_config_dir(); + +/** + * Frees memory allocated by a get_config_dir() call (if any). + */ +void free_config_dir(); + +/** + * Sleep for a given number of seconds. 
+ */ +void poliqarp_sleep(int seconds); + +/** + * Intercept all signals that might be necessary, if the platform + * supports it. + */ +void handle_signals(); + + +/** + * Notify a process/thread that poliqarpd is ready to accept connections. + * + * Win32: send a WM_APP message to the thread. + * UNIX: do nothing. + */ +void notify_readiness(unsigned long); + + +/** + * Detach from the console. + */ +int console_detach(); + +/** + * Produce a message on the standard error output, describing the last error + * encountered during resolving host names. + */ +void herror(const char *); + +/** + * Produce a message on the standard error output, describing the last socket + * error. + */ +void print_socket_error(const char *); + +#endif /* POLIQARPD_OSDEP_H */ diff --git a/poliqarp-library/poliqarpd/poliqarpd.c b/poliqarp-library/poliqarpd/poliqarpd.c new file mode 100644 index 0000000000000000000000000000000000000000..95e1036e9ab3de9e04316fe1704bc2e63ef1ce61 --- /dev/null +++ b/poliqarp-library/poliqarpd/poliqarpd.c @@ -0,0 +1,146 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <getopt.h> +#include <sakura/poliqarp.h> +#include <stdio.h> +#include <string.h> + +#include "configuration.h" +#include "log.h" +#include "osdep.h" +#include "server.h" + +void display_help(const char *progname) +{ + printf(_("Usage: %s OPTIONS\n\n" + " -h, --help display this help and exit\n" + " -v, --version display version information and exit\n" + " -g, --gui GUI mode: cause the server to terminate when\n" + " the session counter drops down to zero for\n" + " the first time\n" + " -c, --config-file=FILE use the given FILE as a configuration file\n" + " instead of $HOME/.poliqarp/poliqarpd.conf\n" + " -d, --detach detach from the controlling terminal and\n" + " run in the background\n" +#if 0 + /* -n is intended for internal use only; keep it undocumented */ + " -n, --notify=TID notify thread TID that poliqarpd is ready\n" + " to accept connections\n" +#endif + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n"), + progname); +} + +void display_version() +{ + printf(_("poliqarpd version %s (compiled with %s library, " + "version %d.%d.%d)\n"), POLIQARPD_VERSION, poliqarp_library_name, + poliqarp_major_version, poliqarp_minor_version, + poliqarp_revision_number); +} + +void init_subsystems() +{ + struct poliqarp_error error = poliqarp_error_none; + if (poliqarp_create("" /* use the system locale */, &error) != 0) { + fprintf(stderr, "%s.\n", poliqarp_error_message_get(&error)); + poliqarp_error_message_set(&error, NULL); + exit(1); + } + if (init_log() == -1) + fprintf(stderr, _("Warning: could not initialize log")); + handle_signals(); +#ifdef HAVE_GETTEXT + bindtextdomain("poliqarp", LOCALEDIR); + textdomain("poliqarp"); +#endif +} + +void done_subsystems() +{ + done_log(); + poliqarp_destroy(); +} + +int main(int argc, char *argv[]) +{ + struct 
option longopts[] = { + { "help", 0, NULL, 'h' }, + { "version", 0, NULL, 'v' }, + { "gui", 0, NULL, 'g' }, + { "config-file", 1, NULL, 'c' }, + { "detach", 0, NULL, 'd' }, + { "notify", 1, NULL, 'n' }, + { NULL, 0, NULL, 0 } + }; + int opt; + string_t config_file = string_create(); + + init_default_cfg(); + string_format(config_file, "%spoliqarpd.conf", get_config_dir()); + while ((opt = getopt_long(argc, argv, "hvgc:dn:", longopts, NULL)) != -1) { + switch (opt) { + case 'h': + display_help(*argv); + return 0; + case 'v': + display_version(); + return 0; + case 'g': + cfg.gui_mode = true; + break; + case 'c': + string_free(config_file); + config_file = string_init(optarg); + break; + case 'd': + cfg.detach = true; + break; + case 'n': + cfg.notify_thread_id = strtoul(optarg, NULL, 10); + /* deliberately ignore errors */ + break; + default: + exit(EXIT_FAILURE); + } + } + + if (read_cfg(string_str(config_file)) == -1) + fprintf(stderr, _("Warning: could not access configuration file\n")); + string_free(config_file); + + init_subsystems(); + + log_entry(_("server starting up")); + server_loop(); + log_entry(_("server shutting down")); + + done_subsystems(); + done_cfg(); + free_config_dir(); + return 0; +} diff --git a/poliqarp-library/poliqarpd/protocol.c b/poliqarp-library/poliqarpd/protocol.c new file mode 100644 index 0000000000000000000000000000000000000000..bf569b1424fb17a4a06c76f7e1752006ccdd7fd7 --- /dev/null +++ b/poliqarp-library/poliqarpd/protocol.c @@ -0,0 +1,1562 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include "poliqarp-config.h" + +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <locale.h> + +#include "errors.h" +#include "log.h" +#include "protocol.h" +#include "sockets.h" +#include "sockstream.h" +#include "utils.h" + +typedef void (*handler_t)(char *, connection_t *); + +typedef struct { + char *command; + handler_t handler; +} hook_t; + +/** + * Returns the first word in the given C string as a string_t and advances + * the string to the beginning of the next word. + * @param str pointer to the string. + */ +static string_t get_word(char **str) +{ + char *start = *str, *end = start; + string_t res; + + while (*end != '\0' && *end != ' ') end++; + res = string_init_n(start, (size_t)(end - start)); + while (*end == ' ') end++; + *str = end; + return res; +} + +/** + * Sends an error message to the connection's socket. + * @param conn the connection to send the message to. + * @param err the error number as specified in the protocol specification. 
+ */ +static void send_error(connection_t *conn, int err) +{ + char *buf = string_aformat("R ERR %d\n", err); + lock(&conn->mutex); + socket_write_cstring(conn->stream->fd, buf); + unlock(&conn->mutex); + free(buf); +} + +/** + * Sends a success message ("R OK") along with additional information + * to the connection's socket. + * @param conn the connection to send the message to. + * @param fmt format of the additional information in the style accepted + * by string_aformat. If this is NULL, the message will be just "R OK". + * Note that the format string need not specify the starting space or + * trailing newline character. + */ +static void send_ok(connection_t *conn, const char *fmt, ...) +{ + va_list l; + + lock(&conn->mutex); + if (fmt == NULL) { + socket_write_cstring(conn->stream->fd, "R OK\n"); + } else { + string_t buf = string_init("R OK "); + va_start(l, fmt); + string_vformat_append(buf, fmt, l); + va_end(l); + string_append_char(buf, '\n'); + socket_write_string(conn->stream->fd, buf); + string_free(buf); + } + unlock(&conn->mutex); +} + +/** + * Retrieves the first word from a C string. If there are no more words + * after this one, sends an error message and frees the newly retrieved + * word. + * @param str pointer to a C string to retrieve the word from. + * @param s pointer to the variable where the retrieved word should be stored. + * @return 0 if the assertion succeeded. + * @return -1 if the assertion failed. + */ +static int expect_more(char **str, string_t *s, connection_t *conn) +{ + if (**str == '\0') { + send_error(conn, PPE_ARGCOUNT); + return -1; + } + *s = get_word(str); + if (**str == '\0') { + send_error(conn, PPE_ARGCOUNT); + string_free(*s); + return -1; + } + return 0; +} + +/** + * Checks for end of string. If the given pointer doesn't point + * to string "\0", returns a nonzero value and sends error message + * to the given connection, else returns 0. 
+ */
+static int expect_end(char *str, connection_t *conn)
+{
+   if (*str == '\0')
+      return 0;
+   send_error(conn, PPE_ARGCOUNT);
+   return -1;
+}
+
+/**
+ * Same as expect_more(), but this time the assertion is that there should
+ * be no more words after this one.
+ * @param str pointer to a C string to retrieve the word from.
+ * @param s pointer to the variable where the retrieved word is stored.
+ * @param conn the connection that receives the PPE_ARGCOUNT error.
+ * @return 0 on success (the caller owns *s), -1 otherwise.
+ */
+static int expect_no_more(char **str, string_t *s, connection_t *conn)
+{
+   if (**str == '\0') {
+      send_error(conn, PPE_ARGCOUNT);
+      return -1;
+   }
+   *s = get_word(str);
+   if (**str == '\0')
+      return 0;
+   send_error(conn, PPE_ARGCOUNT);
+   string_free(*s);
+   return -1;
+}
+
+/**
+ * Calls expect_more() and tries to convert the retrieved word into a decimal
+ * integer.
+ * @param str pointer to a C string to retrieve the word from.
+ * @param val receives the converted value.
+ * @param conn the connection that receives error messages.
+ * @param err the protocol error to send when the word is not a number.
+ * @return 0 if the word is a valid decimal integer.
+ * @return -1 if either the conversion or expect_more() failed.
+ */
+static int expect_more_int(char **str, unsigned long *val, connection_t *conn, int err)
+{
+   string_t word;
+   const char *text;
+   char *tail;
+   int status = 0;
+
+   if (expect_more(str, &word, conn))
+      return -1;
+   text = string_str(word);
+   errno = 0;
+   *val = strtoul(text, &tail, 10);
+   /* strtoul() alone is too lenient: it turns an empty word into 0
+    * (tail == text) and silently wraps a negative number around, so both
+    * cases are rejected explicitly. */
+   if (errno == ERANGE || tail == text || *tail != '\0' || *text == '-') {
+      send_error(conn, err);
+      status = -1;
+   }
+   string_free(word);
+   return status;
+}
+
+/**
+ * Same as expect_more_int(), but calls expect_no_more().
+ */
+static int expect_no_more_int(char **str, unsigned long *val, connection_t *conn,
+   int err)
+{
+   string_t word;
+   const char *text;
+   char *tail;
+   int status = 0;
+
+   if (expect_no_more(str, &word, conn))
+      return -1;
+   text = string_str(word);
+   errno = 0;
+   *val = strtoul(text, &tail, 10);
+   /* strtoul() alone is too lenient: it turns an empty word into 0
+    * (tail == text) and silently wraps a negative number around, so both
+    * cases are rejected explicitly. */
+   if (errno == ERANGE || tail == text || *tail != '\0' || *text == '-') {
+      send_error(conn, err);
+      status = -1;
+   }
+   string_free(word);
+   return status;
+}
+
+/* Replies "R UNIMPLEMENTED"; the command text is ignored. */
+static void h_unimplemented(char *cmd, connection_t *conn)
+{
+   lock(&conn->mutex);
+   socket_write_cstring(conn->stream->fd, "R UNIMPLEMENTED\n");
+   unlock(&conn->mutex);
+}
+
+/*
+ * Creates a session for the connection. Takes exactly one word, which is
+ * handed to session_insert(). Fails with PPE_SESSBOUND when the connection
+ * already has a session. On success the session number is stored in the
+ * connection and returned to the client.
+ */
+static void h_make_session(char *cmd, connection_t *conn)
+{
+   int res;
+   string_t client;
+
+   if (conn->session >= 0) {
+      send_error(conn, PPE_SESSBOUND);
+      return;
+   }
+
+   if (expect_no_more(&cmd, &client, conn)) return;
+
+   res = session_insert(conn->sessions, string_str(client), conn);
+   if (res == PE_LIMITEXC || res == PE_NOMEM) {
+      /* Both failures are reported to the client as PPE_NOMEM. */
+      send_error(conn, PPE_NOMEM);
+   } else {
+      conn->session = res;
+      send_ok(conn, "%d", res);
+      log_entry(_("%S created session %d"), client, res);
+   }
+   string_free(client);
+}
+
+/* Removes the connection's session; takes no arguments. */
+static void h_close_session(char *cmd, connection_t *conn)
+{
+   if (expect_end(cmd, conn)) return;
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session_remove(conn->sessions, conn->session);
+   conn->session = -1;
+   send_ok(conn, NULL);
+}
+
+/*
+ * Rebinds an existing session to this connection. Arguments: the session
+ * number followed by one more word, both passed to session_rebind().
+ * Fails with PPE_SESSBOUND when the connection already has a session.
+ */
+static void h_resume_session(char *cmd, connection_t *conn)
+{
+   int res;
+   string_t uid;
+   unsigned long sid;
+
+   if (conn->session >= 0) {
+      send_error(conn, PPE_SESSBOUND);
+      return;
+   }
+
+   if (expect_more_int(&cmd, &sid, conn, PPE_INVSID)) return;
+   if (expect_no_more(&cmd, &uid, conn)) return;
+
+   res = session_rebind(conn->sessions, sid, uid, conn);
+   if (res == PE_INVSID)
+      send_error(conn, PPE_INVSID);
+   else if (res == PE_SIDUSED)
+      send_error(conn, PPE_SIDUSED);
+   else if (res == PE_INVUID)
+      send_error(conn, PPE_INVUID);
+   else
+      send_ok(conn, NULL);
+
+   string_free(uid);
+}
+
+static void h_suspend_session(char
*cmd, connection_t *conn)
+{
+   /* Detaches the session from the connection via session_idle(); takes
+    * no arguments. */
+   if (expect_end(cmd, conn)) return;
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session_idle(conn->sessions, conn->session);
+   conn->session = -1;
+   send_ok(conn, NULL);
+}
+
+/* Argument block handed to opener_thread(). */
+struct opener_thread_data {
+   session_t *session;
+   string_t filename;
+#ifdef HAVE_LOCALE_T
+   locale_t locale;
+#endif
+   void *private_data;
+};
+
+/* Cleanup handler of opener_thread(): releases the argument block together
+ * with the locale copy and file name it owns. */
+static void opener_cleanup(void *data)
+{
+   struct opener_thread_data *odata = (struct opener_thread_data *)data;
+#ifdef HAVE_LOCALE_T
+   if (odata->locale)
+      freelocale(odata->locale);
+#endif
+   string_free(odata->filename);
+   free(odata);
+}
+
+/*
+ * Job body that opens a corpus for a session. A corpus found in the
+ * connection's corpora table is used as is; otherwise a new one is opened
+ * from the given name, unless the table exists and cfg.allow_any_corpus is
+ * off. The outcome is stored in the session and announced to the client.
+ */
+static void *opener_thread(void *data)
+{
+   struct opener_thread_data *odata = (struct opener_thread_data *)data;
+   struct poliqarp_error open_error = poliqarp_error_none;
+   string_t ress;
+   struct poliqarp_corpus *corpus = NULL;
+   bool closable_corpus;
+   int res = 0;
+   int cancel_state;
+
+   /* Disable cancellation as early as possible: */
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
+#ifdef HAVE_LOCALE_T
+   lock(&odata->session->mutex);
+   if (odata->session->locale)
+      odata->locale = duplocale(odata->session->locale);
+   if (odata->locale)
+      uselocale(odata->locale);
+   unlock(&odata->session->mutex);
+#endif
+   ress = string_create();
+   pthread_cleanup_push(opener_cleanup, data);
+   if (odata->session->connection->corpora) {
+      closable_corpus = false;
+      corpus = hash_table_get(odata->session->connection->corpora, string_str(odata->filename));
+      if (corpus == NULL && !cfg.allow_any_corpus) {
+         poliqarp_error_message_set(&open_error, _("Unknown corpus"));
+         res = 1;
+      }
+      else
+         res = 0;
+   }
+   if (res == 0 && corpus == NULL) {
+      closable_corpus = true;
+      corpus = malloc(sizeof(struct poliqarp_corpus));
+      if (corpus == NULL) {
+         poliqarp_error_from_system(&open_error, NULL);
+         /* Mark the failure; with res left at 0 the client would be told
+          * the corpus was opened although none exists. */
+         res = 1;
+      } else {
+         pthread_setcancelstate(cancel_state, NULL);
+         /* The only place an actual job cancellation can occur: */
+         res = poliqarp_open_corpus(corpus,
string_str(odata->filename),
+            &odata->session->progress, &open_error);
+         pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+      }
+   }
+   /* Publish the outcome in the session, then notify the client. */
+   lock(&odata->session->mutex);
+   odata->session->closable_corpus = closable_corpus;
+   if (res == 0) {
+      odata->session->corpus = corpus;
+      string_format_append(ress, "M OPENED\n");
+   } else {
+      if (closable_corpus)
+         free(corpus);
+      odata->session->corpus = NULL;
+      string_format_append(ress, "M OPENFAIL %d\n", PPE_SYSOPEN);
+   }
+   poliqarp_error_set(&odata->session->error, &open_error);
+   unlock(&odata->session->mutex);
+   session_job_cleanup(data);
+   session_send(odata->session, ress);
+   pthread_cleanup_pop(1);
+   return NULL;
+}
+
+/*
+ * Starts opener_thread() as the session's job. The file name is copied,
+ * so the caller keeps ownership of `filename`.
+ * NOTE(review): the result of malloc() is not checked.
+ */
+static void launch_opener_thread(session_t *session, string_t filename)
+{
+   struct opener_thread_data *job = malloc(sizeof(struct opener_thread_data));
+
+   job->session = session;
+#ifdef HAVE_LOCALE_T
+   job->locale = 0;
+#endif
+   job->filename = string_init(string_str(filename));
+   session_job_launch(session, opener_thread, job);
+}
+
+/*
+ * Starts opening a corpus; the whole argument text is the corpus name.
+ * Replies OK once the job is launched, PPE_CORPUSALR if the session
+ * already has a corpus.
+ */
+static void h_open_corpus(char *cmd, connection_t *conn)
+{
+   string_t name;
+   session_t *session;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   name = string_init(cmd);
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL) {
+      launch_opener_thread(session, name);
+      send_ok(conn, NULL);
+   } else {
+      send_error(conn, PPE_CORPUSALR);
+   }
+   unlock(&session->mutex);
+   string_free(name);
+}
+
+static void h_get_corpus_stats(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   struct poliqarp_corpus_info info;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_end(cmd, conn)) return;
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL)
+      send_error(conn, PPE_NOCORPUS);
+   else if (session->job)
+      send_error(conn, PPE_BUSY);
+   else {
+
poliqarp_get_corpus_info(session->corpus, &info);
+      send_ok(conn, "%lu %lu %lu %lu",
+         (unsigned long) info.num_segments,
+         (unsigned long) info.num_types,
+         (unsigned long) info.num_lemmata,
+         (unsigned long) info.num_tags
+      );
+   }
+   unlock(&session->mutex);
+}
+
+/* Replies "R UNKNOWN". */
+static void h_unknown(connection_t *conn)
+{
+   lock(&conn->mutex);
+   socket_write_cstring(conn->stream->fd, "R UNKNOWN\n");
+   unlock(&conn->mutex);
+}
+
+/* Replies "R PONG"; takes no arguments. */
+static void h_ping(char *cmd, connection_t *conn)
+{
+   if (expect_end(cmd, conn)) return;
+   lock(&conn->mutex);
+   socket_write_cstring(conn->stream->fd, "R PONG\n");
+   unlock(&conn->mutex);
+}
+
+/* Replies with the daemon version followed by the library name and its
+ * version triple; takes no arguments. */
+static void h_get_version(char *cmd, connection_t *conn)
+{
+   char *banner;
+
+   if (expect_end(cmd, conn)) return;
+   banner = string_aformat("R %s (%s %d.%d.%d)\n", POLIQARPD_VERSION,
+      poliqarp_library_name, poliqarp_major_version, poliqarp_minor_version,
+      poliqarp_revision_number);
+   lock(&conn->mutex);
+   socket_write_cstring(conn->stream->fd, banner);
+   unlock(&conn->mutex);
+   free(banner);
+}
+
+/* Reports the progress of the session's job, or PPE_INVJID when the
+ * session has no job; takes no arguments. */
+static void h_get_job_status(char *cmd, connection_t *conn)
+{
+   session_t *session;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_end(cmd, conn)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->job != NULL)
+      send_ok(conn, "%d", progress_get(&session->progress));
+   else
+      send_error(conn, PPE_INVJID);
+   unlock(&session->mutex);
+}
+
+/* Cancels the session's job. A nonzero result of session_job_cancel() is
+ * sent back to the client as the error number. */
+static void h_cancel_job(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   int status;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_end(cmd, conn)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   status = session_job_cancel(session);
+   if (status != 0)
+      send_error(conn, status);
+   else
+      send_ok(conn, NULL);
+}
+
+static void h_close_corpus(char *cmd, connection_t *conn)
+{
+   session_t *session;
+
+   if (expect_end(cmd, conn)) return;
+   if (conn->session < 0) {
+      send_error(conn,
PPE_NOSESSION);
+      return;
+   }
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL)
+      send_error(conn, PPE_NOCORPUS);
+   else if (session->job)
+      send_error(conn, PPE_BUSY);
+   else {
+      /* Only a corpus opened for this session is closed and freed. */
+      if (session->closable_corpus) {
+         poliqarp_close_corpus(session->corpus);
+         free(session->corpus);
+      }
+      session->corpus = NULL;
+      send_ok(conn, NULL);
+   }
+   unlock(&session->mutex);
+}
+
+/* Reports the num_results field of the session's match buffer info;
+ * takes no arguments. */
+static void h_get_num_results(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   struct poliqarp_match_buffer_info info;
+
+   if (expect_end(cmd, conn)) return;
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session = conn->sessions->sessions[conn->session];
+   poliqarp_get_match_buffer_info(session->match_buffer, &info);
+   /* Cast to match the "%lu" conversion, as the other handlers do. */
+   send_ok(conn, "%lu", (unsigned long) info.num_results);
+}
+
+/* Reports the capacity and the used size of the session's match buffer;
+ * takes no arguments. */
+static void h_get_buffer_state(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   struct poliqarp_match_buffer_info info;
+
+   if (expect_end(cmd, conn)) return;
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session = conn->sessions->sessions[conn->session];
+   poliqarp_get_match_buffer_info(session->match_buffer, &info);
+   send_ok(conn, "%lu %lu",
+      (unsigned long) info.capacity,
+      (unsigned long) info.used
+   );
+}
+
+/* Resizes the session's match buffer to the size given as the only
+ * argument. Refused with PPE_BUSY while a job is running. */
+static void h_resize_buffer(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   unsigned long newsize;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session = conn->sessions->sessions[conn->session];
+   if (expect_no_more_int(&cmd, &newsize, conn, PPE_INVRANGE))
+      return;
+   lock(&session->mutex);
+   if (session->job) {
+      send_error(conn, PPE_BUSY);
+   } else if (poliqarp_resize_match_buffer(session->match_buffer, newsize) == -1) {
+      send_error(conn, PPE_INVRANGE);
+   } else
+      send_ok(conn, NULL);
+   unlock(&session->mutex);
+}
+
+/* Tells whether a string consists solely of spaces and tabs (an empty
+ * string counts as empty too). */
+static bool strempty(const char *cmd)
+{
+   while (*cmd) {
+      /* Must be "&&": with "||" every character fails the test, so no
+       * non-empty string was ever recognised as blank. */
+      if (*cmd != ' ' && *cmd != '\t')
+         return false;
+      cmd++;
+   }
+   return
true;
+}
+
+/*
+ * Compiles the argument text into the session's query, replacing any
+ * previous one, and forgets the matches collected so far. Refused while
+ * a job is running, when no corpus is open, or when the text is blank.
+ */
+static void h_make_query(char *cmd, connection_t *conn)
+{
+   session_t *session;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->job)
+      send_error(conn, PPE_BUSY);
+   else if (session->corpus == NULL)
+      send_error(conn, PPE_NOCORPUS);
+   else if (strempty(cmd))
+      send_error(conn, PPE_INVQUERY);
+   else {
+      /* Drop the previous query before building the new one. */
+      if (session->query) {
+         poliqarp_destroy_query(session->query);
+         free(session->query);
+      }
+      session->query = malloc(sizeof(struct poliqarp_query));
+      if (session->query == NULL) {
+         send_error(conn, PPE_NOMEM);
+      } else {
+         assert(session->corpus != NULL);
+         if (poliqarp_create_query(session->query, cmd, session->corpus,
+            session->options.qflags, string_str(session->options.rewrite),
+            session->options.random_sample ? &session->random_state : NULL,
+            &session->error))
+         {
+            free(session->query);
+            session->query = NULL;
+            send_error(conn, PPE_INVQUERY);
+            log_entry(_("%S/%d makes invalid query: <%s>"), session->client_id, session->index, cmd);
+         } else {
+            session->qeflags = session->query->eflags;
+            poliqarp_forget(session->match_buffer);
+            send_ok(conn, NULL);
+            log_entry(_("%S/%d makes query: <%s>, %lu segments to inspect"),
+               session->client_id, session->index, cmd, (unsigned long) session->query->area.num_segments);
+         }
+      }
+   }
+   unlock(&session->mutex);
+}
+
+/* Argument block handed to query_thread(). */
+struct query_thread_data {
+   session_t *session;
+   size_t num;
+#ifdef HAVE_LOCALE_T
+   locale_t locale;
+#endif
+   void *private_data;
+};
+
+/* Cleanup handler of query_thread(): releases the argument block and the
+ * locale copy it owns. */
+static void query_cleanup(void *data)
+{
+   struct query_thread_data *qdata = (struct query_thread_data *)data;
+#ifdef HAVE_LOCALE_T
+   if (qdata->locale)
+      freelocale(qdata->locale);
+#endif
+   free(qdata);
+}
+
+static void *query_thread(void *data)
+{
+   struct query_thread_data *qdata = (struct query_thread_data *)data;
+   int res;
+   int cancel_state;
+
+   /* Disable cancellation as early as possible: */
+
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
+   pthread_cleanup_push(query_cleanup, data);
+#ifdef HAVE_LOCALE_T
+   /* Work on a private copy of the session locale. */
+   lock(&qdata->session->mutex);
+   if (qdata->session->locale)
+      qdata->locale = duplocale(qdata->session->locale);
+   if (qdata->locale)
+      uselocale(qdata->locale);
+   unlock(&qdata->session->mutex);
+#endif
+   pthread_setcancelstate(cancel_state, NULL);
+   /* The only place an actual job cancellation can occur: */
+   res = poliqarp_produce(qdata->session->match_buffer, qdata->num,
+      qdata->session->query, &qdata->session->progress, qdata->session,
+      qdata->session->options.interval, cfg.max_match_length);
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+   if (res != 0) {
+      session_job_cleanup(data);
+   } else {
+      /* Success: tell the client how many matches the buffer holds. */
+      string_t ress = string_create();
+      struct poliqarp_match_buffer_info info;
+      poliqarp_get_match_buffer_info(qdata->session->match_buffer, &info);
+      string_format_append(ress, "M QUERY-DONE %lu\n", (unsigned long) info.used);
+      log_entry(_("%S/%d query is done"), qdata->session->client_id,
+         qdata->session->index);
+      session_job_cleanup(data);
+      session_send(qdata->session, ress);
+   }
+   pthread_cleanup_pop(1);
+   return NULL;
+}
+
+/*
+ * Starts query_thread() as the session's job, asking for `num` matches.
+ * NOTE(review): the result of malloc() is not checked.
+ */
+static void launch_query_thread(session_t *session, size_t num)
+{
+   struct query_thread_data *job = malloc(sizeof(struct query_thread_data));
+
+   job->session = session;
+   job->num = num;
+#ifdef HAVE_LOCALE_T
+   job->locale = 0;
+#endif
+   session_job_launch(session, query_thread, job);
+}
+
+static void h_run_query(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   unsigned long num;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_no_more_int(&cmd, &num, conn, PPE_INVRANGE)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL)
+      send_error(conn, PPE_NOCORPUS);
+   else if (session->query == NULL)
+      send_error(conn, PPE_INVQUERY);
+   else if (session->job)
+      send_error(conn,
PPE_BUSY);
+   else {
+      launch_query_thread(session, num);
+      send_ok(conn, NULL);
+   }
+   unlock(&session->mutex);
+}
+
+/*
+ * Checks a sort specification: one to four characters, each a letter in
+ * the range A-H or a-h.
+ * @return 0 if the specification is well formed, -1 otherwise.
+ */
+static int validate_criteria(const char *criteria)
+{
+   size_t len = strlen(criteria);
+   size_t i;
+
+   if (len < 1 || len > 4)
+      return -1;
+   for (i = 0; i < len; i++) {
+      char c = criteria[i];
+      bool upper = (c >= 'A' && c <= 'H');
+      bool lower = (c >= 'a' && c <= 'h');
+      if (!upper && !lower)
+         return -1;
+   }
+   return 0;
+}
+
+/* Argument block handed to sort_thread(). */
+struct sort_thread_data {
+   session_t *session;
+   string_t criteria;
+   void *private_data;
+};
+
+/*
+ * Sorts the match buffer by one criterion letter. Upper-case letters sort
+ * ascending, lower-case descending; E-H select the same columns as A-D
+ * with the a-tergo flag set.
+ */
+static int do_sort(struct poliqarp_match_buffer *match_buffer, char how,
+   session_t *session)
+{
+   struct poliqarp_sort_info info;
+   info.ascending = (how < 'a');
+   info.atergo = (ascii_toupper(how) >= 'E');
+   if (info.atergo)
+      how -= ('e' - 'a');   /* fold E-H onto A-D */
+   switch (ascii_toupper(how)) {
+      case 'A': info.column = POLIQARP_COLUMN_LEFT_CONTEXT;
+         info.context = session->options.left_context_width.width;
+         break;
+      case 'B': info.column = POLIQARP_COLUMN_LEFT_MATCH; break;
+      case 'C': info.column = POLIQARP_COLUMN_RIGHT_MATCH; break;
+      case 'D': info.column = POLIQARP_COLUMN_RIGHT_CONTEXT;
+         info.context = session->options.right_context_width.width;
+         break;
+   }
+   return poliqarp_sort_match_buffer(match_buffer, &info, &session->progress);
+}
+
+/* Cleanup handler of sort_thread(): releases the argument block together
+ * with the criteria string it owns. */
+static void sort_cleanup(void *data)
+{
+   struct sort_thread_data *qdata = (struct sort_thread_data *)data;
+   /* The criteria string is a private copy made when the job was
+    * launched; without this call it is leaked on every sort. */
+   string_free(qdata->criteria);
+   free(qdata);
+}
+
+static void *sort_thread(void *data)
+{
+   struct sort_thread_data *qdata = (struct sort_thread_data *)data;
+   int res = 0, i;
+   int cancel_state;
+   char *crit;
+
+   /* Disable cancellation as early as possible: */
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
+   pthread_cleanup_push(sort_cleanup, data);
+
+#ifdef HAVE_LOCALE_T
+   lock(&qdata->session->mutex);
+   if (qdata->session->locale)
+      uselocale(qdata->session->locale);
+   unlock(&qdata->session->mutex);
+#endif
+
+   crit = string_str(qdata->criteria);
+
+   pthread_setcancelstate(cancel_state, NULL);
+   for (i = strlen(crit) - 1; i >= 0; i--)
+      res =
do_sort(qdata->session->match_buffer, crit[i], qdata->session);
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+   if (res == 0) {
+      string_t ress = string_create();
+      string_format_append(ress, "M SORTED\n");
+      session_job_cleanup(data);
+      session_send(qdata->session, ress);
+   }
+   else
+      session_job_cleanup(data);
+   pthread_cleanup_pop(1);
+   return NULL;
+}
+
+/*
+ * Starts sort_thread() as the session's job. The criteria text is copied
+ * into the job's argument block.
+ * NOTE(review): the result of malloc() is not checked.
+ */
+static void launch_sort_thread(session_t *session, const char *criteria)
+{
+   /* Size the allocation from the object itself; the previous code used
+    * sizeof(struct query_thread_data), i.e. the wrong structure. */
+   struct sort_thread_data *data = malloc(sizeof *data);
+   data->session = session;
+   data->criteria = string_init(criteria);
+   session_job_launch(session, sort_thread, data);
+}
+
+/*
+ * Starts sorting the results; the whole argument text is the criteria
+ * specification checked by validate_criteria().
+ */
+static void h_sort_results(char *cmd, connection_t *conn)
+{
+   session_t *session;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (validate_criteria(cmd)) {
+      send_error(conn, PPE_INVCRIT);
+      return;
+   }
+
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL)
+      send_error(conn, PPE_NOCORPUS);
+   else if (session->query == NULL)
+      send_error(conn, PPE_INVQUERY);
+   else if (session->job)
+      send_error(conn, PPE_BUSY);
+   else {
+      launch_sort_thread(session, cmd);
+      send_ok(conn, NULL);
+   }
+   unlock(&session->mutex);
+}
+
+/* Bit flags selecting what is printed for each segment. */
+#define DISPLAY_BASE 0x01
+#define DISPLAY_TAG 0x02
+#define DISPLAY_IDS 0x04
+
+/* Writes the base form and/or the tag of one interpretation, each on its
+ * own "R " line, as selected by the DISPLAY_BASE and DISPLAY_TAG bits. */
+static void output_interpretation(sockstream_t *stream,
+   struct poliqarp_interpretation *interp, int what)
+{
+   struct poliqarp_interpretation_info info;
+
+   poliqarp_get_interpretation_info(interp, &info);
+   if (what & DISPLAY_BASE) {
+      sockstream_write_cstring(stream, "R ");
+      sockstream_write_cstring(stream, info.base);
+      sockstream_write_cstring(stream, "\n");
+   }
+   if (what & DISPLAY_TAG) {
+      sockstream_write_cstring(stream, "R ");
+      sockstream_write_cstring(stream, info.tag);
+      sockstream_write_cstring(stream, "\n");
+   }
+}
+
+static void output_segment(sockstream_t *stream, bool disamb,
+   size_t nth, struct poliqarp_segment *segment, int
what, int seq)
+{
+   char *buf;
+   struct poliqarp_segment_info info;
+   struct poliqarp_interpretation_set set;
+   struct poliqarp_interpretation_set_info sinfo;
+   struct poliqarp_interpretation interp;
+   size_t i, num;
+
+   poliqarp_get_segment_info(segment, &info);
+   if (disamb)
+      poliqarp_get_disambiguated_interpretations(segment, &set);
+   else
+      poliqarp_get_ambiguous_interpretations(segment, &set);
+   poliqarp_get_interpretation_set_info(&set, &sinfo);
+   /* In sequence mode the caller supplies the "R " prefix and the
+    * newline; only the segment text is written here. */
+   if (!seq) {
+      sockstream_write_cstring(stream, "R ");
+   }
+   if (info.space_before)
+      sockstream_write_cstring(stream, " ");
+   sockstream_write_cstring(stream, info.text);
+   if (!seq) {
+      sockstream_write_cstring(stream, "\n");
+   }
+   if (what == 0)
+      return;
+   /* Interpretation count, then the interpretations themselves. */
+   num = sinfo.size;
+   buf = string_aformat("R %lu\n", (unsigned long) num);
+   sockstream_write_cstring(stream, buf);
+   free(buf);
+   for (i = 0; i < num; i++) {
+      poliqarp_get_interpretation(&set, &interp, i);
+      output_interpretation(stream, &interp, what);
+   }
+   if (what & DISPLAY_IDS) {
+      buf = string_aformat("R %lu\n", (unsigned long) nth);
+      sockstream_write_cstring(stream, buf);
+      free(buf);
+   }
+}
+
+/*
+ * Writes the segments start..end-1 of a corpus. In sequence mode (seq
+ * nonzero) the texts are written on a single "R " line with no
+ * annotations; otherwise the segment count is written first and each
+ * segment is printed with the requested lemmata, tags and ids.
+ */
+static void output_range(struct poliqarp_corpus *corpus, bool disamb,
+   sockstream_t *stream, size_t start, size_t end, int lemmata, int tags,
+   int ids, int seq)
+{
+   char *header;
+   size_t pos;
+   int what;
+
+   assert(start <= end);
+   if (seq) {
+      what = 0;
+      sockstream_write_cstring(stream, "R ");
+   } else {
+      what = (lemmata ? DISPLAY_BASE : 0)
+         | (tags ? DISPLAY_TAG : 0)
+         | (ids ? DISPLAY_IDS : 0);
+      header = string_aformat("R %lu\n", (unsigned long) (end - start));
+      sockstream_write_cstring(stream, header);
+      free(header);
+   }
+   for (pos = start; pos < end; pos++) {
+      struct poliqarp_segment segment;
+      poliqarp_get_segment(&segment, corpus, pos);
+      output_segment(stream, disamb, pos, &segment, what, seq);
+   }
+   if (seq) {
+      sockstream_write_cstring(stream, "\n");
+   }
+}
+
+static void output_match(sockstream_t *stream, session_t *session, size_t
ofs,
+   int seq)
+{
+   struct poliqarp_match match;
+   struct poliqarp_match document_range;
+   size_t lcsize, rcsize;
+   size_t lc, rc;
+   int retrieve = session->options.retrieve;
+   struct poliqarp_corpus *corpus = session->corpus;
+   bool disamb = session->options.disamb;
+
+   /* Sequence mode uses the wide context width on both sides. */
+   if (seq) {
+      lcsize = rcsize = session->options.wide_context_width.width;
+   } else {
+      lcsize = session->options.left_context_width.width;
+      rcsize = session->options.right_context_width.width;
+   }
+   poliqarp_get_match(session->match_buffer, &match, ofs);
+   poliqarp_get_match_for_document(corpus, match.document, &document_range);
+
+   /* Clamp both contexts to the range of the match's document. */
+   if (match.start < document_range.start + lcsize)
+      lc = document_range.start;
+   else
+      lc = match.start - lcsize;
+   if (match.end + rcsize > document_range.end)
+      rc = document_range.end;
+   else
+      rc = match.end + rcsize;
+
+   /* Left context. */
+   output_range(corpus, disamb, stream, lc, match.start,
+      retrieve & SHOW_LEMMATA_LC, retrieve & SHOW_TAGS_LC,
+      retrieve & SHOW_IDS_LC, seq);
+   if (session->qeflags & POLIQARP_QEFLAG_HAS_ALIGN) {
+      /* The match is split at its focus into a left and a right part. */
+      output_range(corpus, disamb, stream, match.start, match.focus,
+         retrieve & SHOW_LEMMATA_LM, retrieve & SHOW_TAGS_LM,
+         retrieve & SHOW_IDS_LM, seq);
+      output_range(corpus, disamb, stream, match.focus, match.end,
+         retrieve & SHOW_LEMMATA_RM, retrieve & SHOW_TAGS_RM,
+         retrieve & SHOW_IDS_RM, seq);
+   } else {
+      if (seq)
+         sockstream_write_cstring(stream, "R \n");
+      output_range(corpus, disamb, stream, match.start, match.end,
+         retrieve & SHOW_LEMMATA_LM, retrieve & SHOW_TAGS_LM,
+         retrieve & SHOW_IDS_LM, seq);
+   }
+   /* Right context. */
+   output_range(corpus, disamb, stream, match.end, rc,
+      retrieve & SHOW_LEMMATA_RC, retrieve & SHOW_TAGS_RC,
+      retrieve & SHOW_IDS_RC, seq);
+}
+
+static void h_get_results(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   unsigned long i, from, to;
+   struct poliqarp_match_buffer_info info;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+
+   if (expect_more_int(&cmd, &from, conn, PPE_INVRANGE)) return;
+   if (expect_no_more_int(&cmd, &to,
conn, PPE_INVRANGE)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   if (from > to) {
+      send_error(conn, PPE_INVRANGE);
+      return;
+   }
+
+   poliqarp_get_match_buffer_info(session->match_buffer, &info);
+   if (to >= info.used) {
+      send_error(conn, PPE_INVRANGE);
+      return;
+   }
+
+   /* The range is inclusive on both ends. */
+   lock(&conn->mutex);
+   sockstream_write_cstring(conn->stream, "R OK\n");
+   for (i = from; i <= to; i++) {
+      output_match(conn->stream, session, i, 0);
+   }
+   sockstream_output_flush(conn->stream);
+   unlock(&conn->mutex);
+}
+
+/* Replies "R ERROR <message>" with the message stored in the session's
+ * error, or "R NOERROR" when there is none. */
+static void h_get_last_error(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   const char *message;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   lock(&conn->mutex);
+   message = poliqarp_error_message_get(&session->error);
+   if (message == NULL) {
+      sockstream_write_cstring(conn->stream, "R NOERROR\n");
+   } else {
+      sockstream_write_cstring(conn->stream, "R ERROR ");
+      sockstream_write_cstring(conn->stream, message);
+      sockstream_write_cstring(conn->stream, "\n");
+   }
+   sockstream_output_flush(conn->stream);
+   unlock(&conn->mutex);
+   unlock(&session->mutex);
+}
+
+static void h_set_locale(char *cmd, connection_t *conn)
+{
+   string_t locale_name;
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_no_more(&cmd, &locale_name, conn)) return;
+#ifdef HAVE_LOCALE_T
+   session_t *session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->locale)
+      freelocale(session->locale);
+   session->locale = newlocale(LC_ALL_MASK, "", NULL);
+   if (session->locale)
+   {
+      locale_t locale = newlocale(LC_MESSAGES_MASK, string_str(locale_name), session->locale);
+      if (locale)
+         session->locale = locale;
+      else
+      {
+         freelocale(session->locale);
+         session->locale = 0;
+      }
+   }
+   uselocale(session->locale ?
session->locale : LC_GLOBAL_LOCALE);
+   unlock(&session->mutex);
+#endif
+   /* The locale name was allocated while parsing the command and is no
+    * longer needed, with or without locale support. */
+   string_free(locale_name);
+   send_ok(conn, NULL);
+}
+
+/* Writes the match at the given offset in sequence mode (see
+ * output_match()). The only argument is the offset. */
+static void h_get_context(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   unsigned long ofs;
+   struct poliqarp_match_buffer_info info;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_no_more_int(&cmd, &ofs, conn, PPE_INVRANGE)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   poliqarp_get_match_buffer_info(session->match_buffer, &info);
+   if (ofs >= info.used) {
+      send_error(conn, PPE_INVRANGE);
+      return;
+   }
+
+   lock(&conn->mutex);
+   sockstream_write_cstring(conn->stream, "R OK\n");
+   output_match(conn->stream, session, ofs, 1);
+   sockstream_output_flush(conn->stream);
+   unlock(&conn->mutex);
+}
+
+static void h_get_metadata(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   unsigned long ofs;
+   struct poliqarp_match_buffer_info info;
+   struct poliqarp_metadata_set set;
+   struct poliqarp_metadata meta;
+   struct poliqarp_metadata_info minfo;
+   struct poliqarp_match match;
+   size_t i, count;
+   char *tmp = NULL;
+   bool error = false;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_no_more_int(&cmd, &ofs, conn, PPE_INVRANGE)) return;
+
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL) {
+      send_error(conn, PPE_NOCORPUS);
+      error = true;
+   }
+   unlock(&session->mutex);
+   if (error)
+      return;
+   poliqarp_get_match_buffer_info(session->match_buffer, &info);
+   if (ofs >= info.used) {
+      send_error(conn, PPE_INVRANGE);
+      return;
+   }
+
+   poliqarp_get_match(session->match_buffer, &match, ofs);
+   poliqarp_get_metadata_set(session->corpus, match.document, &set);
+   count = poliqarp_metadata_count(&set);
+   send_ok(conn, "%lu", (unsigned long) count);
+   lock(&conn->mutex);
+   for (i = 0; i < count; i++) {
+      poliqarp_get_metadata(&set, i, &meta);
+      poliqarp_get_metadata_info(&meta, &minfo);
+
sockstream_write_cstring(conn->stream, "R ");
+      sockstream_write_cstring(conn->stream, minfo.key);
+      sockstream_write_cstring(conn->stream, "\n");
+      /* Start every iteration from a clean pointer: tmp still holds the
+       * address freed in the previous round, which would be written and
+       * freed again if no case below assigned it. */
+      tmp = NULL;
+      switch (minfo.type) {
+         case POLIQARP_META_TEXT:
+            tmp = string_aformat("R T %s\n", minfo.value.text);
+            break;
+         case POLIQARP_META_DATE:
+            tmp = string_aformat("R D %d %d %d\n", minfo.value.date.year,
+               minfo.value.date.month, minfo.value.date.day);
+            break;
+         case POLIQARP_META_UNDEFINED:
+         default:
+            /* Any other type is reported as undefined, so the client
+             * still gets the value line that follows each key. */
+            tmp = string_aformat("R U\n");
+            break;
+      }
+      if (tmp != NULL) {
+         sockstream_write_cstring(conn->stream, tmp);
+         free(tmp);
+      }
+   }
+   sockstream_output_flush(conn->stream);
+   unlock(&conn->mutex);
+}
+
+static void h_get_metadata_types(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   struct poliqarp_metadata_types types;
+   size_t i;
+   bool error = false;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_end(cmd, conn)) return;
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   if (session->corpus == NULL) {
+      send_error(conn, PPE_NOCORPUS);
+      error = true;
+   }
+   unlock(&session->mutex);
+   if (error)
+      return;
+   poliqarp_get_metadata_types(&types, session->corpus);
+   send_ok(conn, "%lu", (unsigned long) types.num_types);
+   lock(&conn->mutex);
+   for (i = 0; i < types.num_types; i++) {
+      sockstream_write_cstring(conn->stream, "R ");
+      sockstream_write_cstring(conn->stream, types.types[i].type == POLIQARP_META_TEXT ?
+ "T " : "D "); + sockstream_write_cstring(conn->stream, types.types[i].key); + sockstream_write_cstring(conn->stream, "\n"); + } + sockstream_output_flush(conn->stream); + unlock(&conn->mutex); + poliqarp_free_metadata_types(&types); +} + +static void h_set_option(char *cmd, connection_t *conn) +{ + session_t *session; + string_t optname; + + if (conn->session < 0) { + send_error(conn, PPE_NOSESSION); + return; + } + if (expect_more(&cmd, &optname, conn)) return; + + session = conn->sessions->sessions[conn->session]; + switch (sessopt_set(&session->options, string_str(optname), cmd)) { + case PE_INVOPT: + send_error(conn, PPE_INVOPT); + break; + case PE_INVVAL: + send_error(conn, PPE_INVVAL); + break; + default: /* 0 */ + send_ok(conn, NULL); + break; + } + + string_free(optname); +} + +static void h_create_alias(char *cmd, connection_t *conn) +{ + session_t *session; + string_t alias; + bool error = false; + + if (conn->session < 0) { + send_error(conn, PPE_NOSESSION); + return; + } + if (expect_more(&cmd, &alias, conn)) return; + + session = conn->sessions->sessions[conn->session]; + lock(&session->mutex); + if (session->corpus == NULL) { + send_error(conn, PPE_NOCORPUS); + error = true; + } + unlock(&session->mutex); + if (error) + return; + if (poliqarp_define_alias(session->corpus, string_str(alias), cmd)) { + string_free(alias); + send_error(conn, 25); + return; + } + + send_ok(conn, NULL); + string_free(alias); +} + +static void h_delete_alias(char *cmd, connection_t *conn) +{ + session_t *session; + string_t alias; + bool error = false; + + if (conn->session < 0) { + send_error(conn, PPE_NOSESSION); + return; + } + if (expect_no_more(&cmd, &alias, conn)) return; + + session = conn->sessions->sessions[conn->session]; + lock(&session->mutex); + if (session->corpus == NULL) { + send_error(conn, PPE_NOCORPUS); + error = true; + } + unlock(&session->mutex); + if (error) + return; + if (poliqarp_delete_alias(session->corpus, string_str(alias))) { + 
string_free(alias);
+      send_error(conn, 25);
+      return;
+   }
+
+   send_ok(conn, NULL);
+   string_free(alias);
+}
+
+/* Lists the aliases of the session's corpus: the count on the OK line,
+ * then a name line and a value line for each alias. */
+static void h_get_aliases(char *cmd, connection_t *conn)
+{
+   session_t *session;
+   struct poliqarp_alias_list aliases;
+   size_t n;
+   bool no_corpus;
+
+   if (conn->session < 0) {
+      send_error(conn, PPE_NOSESSION);
+      return;
+   }
+   if (expect_end(cmd, conn)) return;
+   session = conn->sessions->sessions[conn->session];
+   lock(&session->mutex);
+   no_corpus = (session->corpus == NULL);
+   if (no_corpus)
+      send_error(conn, PPE_NOCORPUS);
+   unlock(&session->mutex);
+   if (no_corpus)
+      return;
+   poliqarp_get_aliases(session->corpus, &aliases);
+
+   send_ok(conn, "%lu", (unsigned long) aliases.num_aliases);
+   lock(&conn->mutex);
+   for (n = 0; n < aliases.num_aliases; n++) {
+      sockstream_write_cstring(conn->stream, "R ");
+      sockstream_write_cstring(conn->stream, aliases.aliases[n].name);
+      sockstream_write_cstring(conn->stream, "\n");
+      sockstream_write_cstring(conn->stream, "R ");
+      sockstream_write_cstring(conn->stream, aliases.aliases[n].value);
+      sockstream_write_cstring(conn->stream, "\n");
+   }
+   sockstream_output_flush(conn->stream);
+   unlock(&conn->mutex);
+   poliqarp_free_aliases(&aliases);
+}
+
+/* Shuts the server down, but only when conn->ip equals 0x7F000001
+ * (127.0.0.1); any other address gets error 26 and a log entry. */
+static void h_halt(char *cmd, connection_t *conn)
+{
+   if (expect_end(cmd, conn)) return;
+   if (conn->ip == 0x7F000001LU) {
+      server_shutdown(); /* bye bye! */
+      return;
+   }
+   send_error(conn, 26);
+   log_entry(_("%d.%d.%d.%d tried to shutdown the server"),
+      (conn->ip >> 24) & 0xFF,
+      (conn->ip >> 16) & 0xFF,
+      (conn->ip >> 8) & 0xFF,
+      conn->ip & 0xFF
+   );
+}
+
+/*
+ * Format of the reply:
+ *
+ * R OK [number-of-categories] [number-of-grammatical-classes]
+ * R [description-of-category-1]
+ * ...
+ * R [description-of-category-N]
+ * R [description-of-class-1]
+ * R [description-of-class-2]
+ * ...
+ * R [description-of-class-M]
+ *
+ * where [description-of-category] is of the form
+ *
+ *   category-name attr1 attr2 ...
attrN + * + * and [description-of-class] is of the form + * + * class-name category-1 ... category-N + * + * where category-X are names of the appropriate categories, in square brackets + * if the category is optional. + * + * This is an optional command: an implementation might return R UNSUPPORTED. + */ +static void h_get_tagset(char *cmd, connection_t *conn) +{ + session_t *session; + struct poliqarp_tagset_info info; + size_t i; + bool error = false; + + if (conn->session < 0) { + send_error(conn, PPE_NOSESSION); + return; + } + if (expect_end(cmd, conn)) + return; + session = conn->sessions->sessions[conn->session]; + lock(&session->mutex); + if (session->corpus == NULL) { + send_error(conn, PPE_NOCORPUS); + error = true; + } + unlock(&session->mutex); + if (error) + return; + poliqarp_get_tagset_info(session->corpus, &info); + send_ok(conn, "%lu %lu", + (unsigned long) info.num_categories, + (unsigned long) info.num_classes + ); + lock(&conn->mutex); + for (i = 0; i < info.num_categories; i++) { + sockstream_write_cstring(conn->stream, "R "); + sockstream_write_cstring(conn->stream, info.categories[i]); + sockstream_write_cstring(conn->stream, "\n"); + } + for (i = 0; i < info.num_classes; i++) { + sockstream_write_cstring(conn->stream, "R "); + sockstream_write_cstring(conn->stream, info.classes[i]); + sockstream_write_cstring(conn->stream, "\n"); + } + sockstream_output_flush(conn->stream); + unlock(&conn->mutex); + poliqarp_free_tagset_info(&info); +} + +static void append_column_types(string_t str, bool match, + const session_t *session) +{ + string_append_str(str, "<"); + if (session->options.retrieve & (match ? SHOW_LEMMATA_LM : SHOW_LEMMATA_LC)) + string_append_str(str, "l"); + if (session->options.retrieve & (match ? SHOW_TAGS_LM : SHOW_TAGS_LC)) + string_append_str(str, "t"); + if (session->options.retrieve & (match ? 
/*
 * Command table used by dispatch(): each entry pairs a protocol command
 * name with the handler that implements it. dispatch() scans the table from
 * the top and stops at the first matching name; the { NULL, NULL } entry
 * terminates the scan. Several legacy command names are kept as aliases
 * that map to the same handler as their current counterpart.
 */
static hook_t hooks[] = {
   { "PING", h_ping },
   { "GET-VERSION", h_get_version },
   { "VERSION", h_get_version }, /* deprecated alias for GET-VERSION */
   { "MAKE-SESSION", h_make_session },
   { "RESUME-SESSION", h_resume_session },
   { "RECONNECT", h_resume_session }, /* deprecated alias for RESUME-SESSION */
   { "CLOSE-SESSION", h_close_session },
   { "SUSPEND-SESSION", h_suspend_session },
   { "OPEN-CORPUS", h_open_corpus },
   { "OPEN", h_open_corpus }, /* deprecated alias for OPEN-CORPUS */
   { "CLOSE-CORPUS", h_close_corpus },
   { "CLOSE", h_close_corpus }, /* deprecated alias for CLOSE-CORPUS */
   { "GET-CORPUS-STATS", h_get_corpus_stats },
   { "CORPUS-STATS", h_get_corpus_stats }, /* deprecated alias for GET-CORPUS-STATS */
   { "GET-JOB-STATUS", h_get_job_status },
   { "STATUS", h_get_job_status }, /* deprecated alias for GET-JOB-STATUS */
   { "CANCEL-JOB", h_cancel_job },
   { "CANCEL", h_cancel_job }, /* deprecated alias for CANCEL-JOB */
   { "MAKE-QUERY", h_make_query },
   { "RUN-QUERY", h_run_query },
   { "GET-RESULTS", h_get_results },
   { "GET-CONTEXT", h_get_context },
   { "SORT-RESULTS", h_sort_results },
   { "SORT", h_sort_results }, /* deprecated alias for SORT-RESULTS */
   { "GET-BUFFER-STATE", h_get_buffer_state },
   { "GET-NUM-RESULTS", h_get_num_results },
   { "BUFFER-STATE", h_get_buffer_state }, /* deprecated alias for GET-BUFFER-STATE */
   { "BUFFER-SHIFT", h_unimplemented }, /* deprecated; no longer implemented */
   { "RESIZE-BUFFER", h_resize_buffer },
   { "BUFFER-RESIZE", h_resize_buffer }, /* deprecated alias for RESIZE-BUFFER */
   { "SET-OPTION", h_set_option },
   { "SET", h_set_option }, /* deprecated alias for SET-OPTION */
   { "CREATE-ALIAS", h_create_alias },
   { "DELETE-ALIAS", h_delete_alias },
   { "GET-ALIASES", h_get_aliases },
   { "HALT", h_halt },
   { "GET-METADATA", h_get_metadata },
   { "GET-METADATA-TYPES", h_get_metadata_types },
   { "METADATA", h_get_metadata }, /* deprecated alias for GET-METADATA */
   { "METADATA-TYPES", h_get_metadata_types }, /* deprecated alias for GET-METADATA-TYPES */
   { "GET-TAGSET", h_get_tagset },
   { "GET-COLUMN-TYPES", h_get_column_types },
   { "GET-LAST-ERROR", h_get_last_error },
   { "SET-LOCALE", h_set_locale },
   { NULL, NULL } /* sentinel: end of table */
};
a/poliqarp-library/poliqarpd/protocol.h b/poliqarp-library/poliqarpd/protocol.h new file mode 100644 index 0000000000000000000000000000000000000000..6ac37c5126f68867b18135493e53c89ccd941d4f --- /dev/null +++ b/poliqarp-library/poliqarpd/protocol.h @@ -0,0 +1,40 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARPD_PROTOCOL_H +#define POLIQARPD_PROTOCOL_H + +#include <foostring/foostring.h> +#include <stdlib.h> + +#include "configuration.h" +#include "session.h" +#include "sockstream.h" + +/** + * Recognizes and executes a protocol command with respect to the given + * connection. + */ +void dispatch(string_t cmd, connection_t *conn); + +#endif /* POLIQARPD_PROTOCOL_H */ diff --git a/poliqarp-library/poliqarpd/server.c b/poliqarp-library/poliqarpd/server.c new file mode 100644 index 0000000000000000000000000000000000000000..803985807bef102f472cc256ae3ed7521a2d2ea0 --- /dev/null +++ b/poliqarp-library/poliqarpd/server.c @@ -0,0 +1,382 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <errno.h> +#include <pthread.h> +#include <stdio.h> +#include <string.h> + +#include <foostring/foostring.h> + +#include "sockets.h" +#include "configuration.h" +#include "log.h" +#include "protocol.h" +#include "sockstream.h" +#include "osdep.h" +#include "server.h" +#include "session.h" +#include "utils.h" + +#define BACKLOG_SIZE 5 + +/* When set to 1, the server is in its shutdown phase and should not accept + * any incoming connections. */ +static int server_end = 0; + +/* A mutex protecting the above variable. 
*/ +static pthread_mutex_t end_mutex = PTHREAD_MUTEX_INITIALIZER; + +void server_shutdown() +{ + lock(&end_mutex); + server_end = 1; + unlock(&end_mutex); +} + +struct grim_reaper_data { + struct session_list *list; /**< the session collection */ + void *private_data; /**< thread's private data */ +}; + +void connection_list_init(struct connection_list *l) +{ + int i; + pthread_mutex_init(&l->mutex, NULL); + for (i = 0; i < MAX_CONNECTIONS; i++) + l->array[i] = NULL; +} + +int connection_list_pick_index(struct connection_list *l) +{ + int i, j; + + pthread_mutex_lock(&l->mutex); + j = -1; + for (i = 0; i < MAX_CONNECTIONS; i++) { + if (l->array[i] == NULL) { + j = i; + break; + } + } + return j; +} + +void connection_list_insert_at(struct connection_list *l, int offset, + connection_t *connection) +{ + if (offset >= 0) + l->array[offset] = connection; + pthread_mutex_unlock(&l->mutex); +} + +int connection_list_delete_unsafe(struct connection_list *l, int offset) +{ + if (l->array[offset]) { + free(l->array[offset]); + l->array[offset] = NULL; + return 0; + } else { + return -1; + } +} + +int connection_list_delete(struct connection_list *l, int offset) +{ + int retval = 0; + + lock(&l->mutex); + pthread_detach(l->array[offset]->thread); + retval = connection_list_delete_unsafe(l, offset); + unlock(&l->mutex); + + return retval; +} + +void connection_cleanup(struct connection_list *l) +{ + int i; + + lock(&l->mutex); + for (i = 0; i < MAX_CONNECTIONS; i++) { + connection_t *conn = l->array[i]; + if (conn) { + pthread_cancel(conn->thread); + pthread_join(conn->thread, NULL); + connection_list_delete_unsafe(l, i); + } + } + unlock(&l->mutex); + /* There should no other threads alive at this point: */ + pthread_mutex_destroy(&l->mutex); +} + +void connection_remove(void *data) +{ + connection_t *conn = (connection_t *)data; + sockstream_free(conn->stream); + pthread_mutex_destroy(&conn->mutex); +} + +void grim_reaper_cleanup(void *data) +{ + return; +} + +void 
/*
 * Thread body serving one client connection.
 *
 * data - the connection_t created for this client by server_loop().
 *
 * Reads strings from the connection's socket stream and passes each one to
 * dispatch() until an empty string is read. Always returns NULL.
 */
void *connection_thread(void *data)
{
   connection_t *conn = (connection_t *)data;
   string_t s;
#if 0
   uint32_t ip = conn->ip;
#endif

   /* connection_remove() releases the socket stream and the connection
    * mutex; registering it here makes that happen on cancellation too. */
   pthread_cleanup_push(connection_remove, data);
#if 0
   log_entry(_("connection with %d.%d.%d.%d established"),
      ip & 0xFF, (ip >> 8) & 0xFF, (ip >> 16) & 0xFF, (ip >> 24) & 0xFF);
#endif
   for (;;) {
      s = sockstream_read_string(conn->stream);
      /* An empty string ends the loop; it is the only string freed here. */
      if (string_len(s) == 0) {
         string_free(s);
         break;
      }
      /* dispatch() takes ownership of the string and frees it. */
      dispatch(s, conn);
   }
#if 0
   log_entry(_("connection with %d.%d.%d.%d closed"), ip & 0xFF,
      (ip >> 8) & 0xFF, (ip >> 16) & 0xFF, (ip >> 24) & 0xFF);
#endif
   /* Leave a bound session idle rather than deleting it. */
   if (conn->session != -1)
      session_idle(conn->sessions, conn->session);
   /* Runs connection_remove() now. The connection_t itself stays allocated,
    * so conn->connections and conn->id below are still readable. */
   pthread_cleanup_pop(1);
   /* While the server is running, the thread removes its own list entry
    * (which also detaches it). During shutdown it leaves the entry alone:
    * connection_cleanup() joins the thread and deletes the entry. */
   lock(&end_mutex);
   if (!server_end)
      connection_list_delete(conn->connections, conn->id);
   unlock(&end_mutex);
   return NULL;
}
hash_table_num_items(file_names) * 2, 0, NULL) != 0) + abort(); + hash_table_iterate(file_names, corpora, corpora_open_iterator); + return corpora; + } + else + return NULL; +} + +static void corpora_close_iterator(void *corpus) +{ + poliqarp_close_corpus(corpus); + free(corpus); +} + +static void corpora_close(struct hash_table *corpora) +{ + if (corpora == NULL) + return; + destroy_hash_table(corpora, corpora_close_iterator); + free(corpora); +} + +void server_loop() +{ + int sock; + struct sockaddr_in server_name; + struct hostent *host; + struct connection_list connections; + struct hash_table *corpora; + struct session_list sessions; + struct grim_reaper_data reaper_data; + pthread_t reaper; + + init_sockets(); + if ((sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) + socket_error(_("socket() failed")); + +/* This compiles on Windows, but has the side-effect that the server will + happily continue running even if another one is listening on the same port... + Weird. */ +#ifndef _WIN32 + { + int sockopt = 1; + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &sockopt, sizeof(sockopt)) == -1) + { + socket_error(_("setsockopt() failed")); + } + } +#endif /* _WIN32 */ + + { + int d1, d2, d3, d4; + char ch; + if (sscanf(string_str(cfg.hostname), "%d.%d.%d.%d%c", &d1, &d2, &d3, &d4, &ch) == 4 && + 0 <= d1 && d1 <= 255 && 0 <= d2 && d2 <= 255 && + 0 <= d3 && d3 <= 255 && 0 <= d4 && d4 <= 255) { + server_name.sin_addr.s_addr = htonl( + ((uint32_t) d1 << 24) | ((uint32_t) d2 << 16) | + ((uint32_t) d3 << 8) | (uint32_t) d4); + server_name.sin_family = AF_INET; + } + else + { + if ((host = gethostbyname(string_str(cfg.hostname))) == NULL) + resolve_error(_("gethostbyname() failed")); + memset(&server_name, 0, sizeof(server_name)); + memcpy(&server_name.sin_addr, host->h_addr, host->h_length); + } + } + server_name.sin_port = htons(cfg.port); + + if (bind(sock, (struct sockaddr *)&server_name, sizeof(server_name)) == -1) + socket_error(_("bind() failed")); + if 
(listen(sock, BACKLOG_SIZE) == -1) + socket_error(_("listen() failed")); + + corpora = corpora_open(cfg.corpora); + + connection_list_init(&connections); + session_list_init(&sessions); + srand(time(NULL)); + + reaper_data.list = &sessions; + if (pthread_create(&reaper, NULL, grim_reaper, &reaper_data)) + system_error(_("pthread_create() failed")); + + if (cfg.notify_thread_id) { + notify_readiness(cfg.notify_thread_id); + } + if (cfg.detach) { + if (console_detach() != 0) + system_error(_("console_detach() failed")); + } + + bool server_running = true; + while (server_running) { + struct timeval tv; + struct sockaddr_in client_name; + connection_t *conn; + int slave_sock, have_conn; + socklen_t client_length = sizeof (client_name); + fd_set rdfs; + + memset(&client_name, 0, client_length); + FD_ZERO(&rdfs); + FD_SET(sock, &rdfs); + tv.tv_sec = 1; + tv.tv_usec = 0; + + have_conn = select(sock + 1, &rdfs, NULL, NULL, &tv); + if (have_conn == -1) + socket_error(_("select() failed")); + if (have_conn) { + int index; + slave_sock = accept(sock, (struct sockaddr *)&client_name, + &client_length); + if (slave_sock == -1) { + if (errno == EAGAIN) + continue; + else + socket_error(_("accept() failed")); + } + index = connection_list_pick_index(&connections); + if (index == -1) { + socket_write_cstring(slave_sock, + "Too many connections, try again later.\n"); + closesocket(slave_sock); + pthread_mutex_unlock(&connections.mutex); + continue; + } + + conn = malloc(sizeof(connection_t)); + conn->ip = ntohl(client_name.sin_addr.s_addr); + conn->stream = sockstream_create(slave_sock); + conn->id = index; + conn->connections = &connections; + conn->corpora = corpora; + conn->sessions = &sessions; + conn->session = -1; + pthread_mutex_init(&conn->mutex, NULL); + + if (pthread_create(&conn->thread, NULL, connection_thread, + (void *)conn)) + { + system_error(_("pthread_create() failed")); + } + connection_list_insert_at(&connections, index, conn); + } + lock(&end_mutex); + 
server_running = !server_end; + unlock(&end_mutex); + } + + pthread_cancel(reaper); + pthread_join(reaper, NULL); + connection_cleanup(&connections); + session_cleanup(&sessions); + corpora_close(corpora); +} diff --git a/poliqarp-library/poliqarpd/server.h b/poliqarp-library/poliqarpd/server.h new file mode 100644 index 0000000000000000000000000000000000000000..de41823dfe16a6679e6898eaf680048b65d8f258 --- /dev/null +++ b/poliqarp-library/poliqarpd/server.h @@ -0,0 +1,80 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARPD_SERVER_H +#define POLIQARPD_SERVER_H + +#include "sockstream.h" + +#include <sakura/common/hash-table.h> + +#define POLIQARPD_VERSION PACKAGE_VERSION +#define MAX_CONNECTIONS 128 + +/* Declared later in this file. */ +struct connection_list; + +/** + * A connection, one of the two most important entities in the server. + * It holds all information needed for a connection thread to handle + * incoming requests. 
/**
 * A collection of connections. The name can be a bit confusing: this is
 * not a linked list, but an array with holes protected by a mutex.
 *
 * A NULL entry marks a free slot. Note the two-step insertion protocol
 * implemented in server.c: connection_list_pick_index() returns with the
 * mutex still locked, and connection_list_insert_at() unlocks it; a caller
 * that picks an index but does not insert must unlock the mutex itself.
 */
struct connection_list {
   pthread_mutex_t mutex;                /**< mutex protecting the collection */
   connection_t *array[MAX_CONNECTIONS]; /**< the array proper */
};
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include "poliqarp-config.h" + +#include "errors.h" +#include "log.h" +#include "session.h" +#include "sockstream.h" +#include "utils.h" + +void session_list_init(struct session_list *list) +{ + int i; + + pthread_mutex_init(&list->mutex, NULL); + list->count = 0; + for (i = 0; i < MAX_SESSIONS; i++) + list->sessions[i] = NULL; +} + +int session_insert(struct session_list *list, const char *client_id, + connection_t *conn) +{ + int i, index = -1; + session_t *session; + + lock(&list->mutex); + for (i = 0; i < MAX_SESSIONS; i++) + if (list->sessions[i] == NULL) { + index = i; + break; + } + if (index == -1) { + pthread_mutex_unlock(&list->mutex); + return PE_LIMITEXC; + } + + session = malloc(sizeof(session_t)); + + session->match_buffer = malloc(sizeof(struct poliqarp_match_buffer)); + if (poliqarp_create_match_buffer(session->match_buffer, + cfg.match_buffer_size) != 0) + { + free(session->match_buffer); + free(session); + pthread_mutex_unlock(&list->mutex); + return PE_NOMEM; + } + session->index = index; + session->client_id = string_init(client_id); + progress_init(&session->progress); + sessopt_init(&session->options); + session->job = NULL; + pthread_mutex_init(&session->mutex, NULL); + session->cancel = 0; + 
session->cleanup = 0; + session->qeflags = 0; + pthread_mutex_init(&session->clnmutex, NULL); + pthread_cond_init(&session->jobnull, NULL); + session->corpus = NULL; + session->query = NULL; + msgqueue_init(&session->messages); + time(&session->last_used); + session->connection = conn; + session->error = poliqarp_error_none; + poliqarp_srandom_time(&session->random_state); +#ifdef HAVE_LOCALE_T + session->locale = 0; +#endif + + list->sessions[index] = session; + list->count++; + unlock(&list->mutex); + return index; +} + +void session_delete(session_t *session) +{ + session_job_cancel(session); + /* Cancelling a job might have been not possible: */ + lock(&session->mutex); + while (session->job != NULL) + pthread_cond_wait(&session->jobnull, &session->mutex); + unlock(&session->mutex); + sessopt_free(&session->options); + progress_destroy(&session->progress); + poliqarp_destroy_match_buffer(session->match_buffer); + free(session->match_buffer); + if (session->query) { + poliqarp_destroy_query(session->query); + free(session->query); + session->query = NULL; + } + if (session->corpus) { + if (session->closable_corpus) { + poliqarp_close_corpus(session->corpus); + free(session->corpus); + } + session->corpus = NULL; + } + string_free(session->client_id); +#ifdef HAVE_LOCALE_T + if (session->locale) + freelocale(session->locale); +#endif + poliqarp_error_message_set(&session->error, NULL); + pthread_mutex_destroy(&session->mutex); + pthread_mutex_destroy(&session->clnmutex); + pthread_cond_destroy(&session->jobnull); + free(session); +} + +void session_remove(struct session_list *list, int session) +{ + lock(&list->mutex); + if (list->sessions[session]) { + session_delete(list->sessions[session]); + list->sessions[session] = NULL; + list->count--; + if (list->count == 0 && cfg.gui_mode) + server_shutdown(); + } + unlock(&list->mutex); +} + +void session_idle(struct session_list *list, int session) +{ + time_t now = time(NULL); + + lock(&list->mutex); + if 
(list->sessions[session]) { + lock(&list->sessions[session]->mutex); + list->sessions[session]->connection = NULL; + list->sessions[session]->last_used = now; + unlock(&list->sessions[session]->mutex); + } + unlock(&list->mutex); +} + +void session_remove_idle(struct session_list *list) +{ + int i; + time_t now = time(NULL); + + lock(&list->mutex); + for (i = 0; i < MAX_SESSIONS; i++) { + if (list->sessions[i] && list->sessions[i]->connection == NULL && + now - list->sessions[i]->last_used >= cfg.max_session_idle) + { + session_delete(list->sessions[i]); + log_entry(_("grim reaper: removing session %d"), i); + list->sessions[i] = NULL; + } + } + unlock(&list->mutex); +} + +void session_cleanup(struct session_list *list) +{ + int i; + + lock(&list->mutex); + for (i = 0; i < MAX_SESSIONS; i++) { + if (list->sessions[i]) { + session_delete(list->sessions[i]); + list->sessions[i] = NULL; + } + } + unlock(&list->mutex); + /* There should no other threads alive at this point: */ + pthread_mutex_destroy(&list->mutex); +} + +static void flush_pending_messages(session_t *session) +{ + string_t str; + + lock(&session->mutex); + lock(&session->connection->mutex); + str = msgqueue_get(&session->messages); + while (str) { + socket_write_string(session->connection->stream->fd, str); + string_free(str); + str = msgqueue_get(&session->messages); + } + unlock(&session->connection->mutex); + unlock(&session->mutex); +} + +int session_rebind(struct session_list *list, int session, string_t uid, + connection_t *conn) +{ + int res = 0; + + if (session < 0 || session >= MAX_SESSIONS) + return PE_INVSID; + + lock(&list->mutex); + if (list->sessions[session] == NULL) + res = PE_INVSID; + else if (list->sessions[session]->connection) + res = PE_SIDUSED; + else if (string_cmp(uid, list->sessions[session]->client_id)) + res = PE_INVUID; + else { + list->sessions[session]->connection = conn; + conn->session = session; +#if HAVE_LOCALE_T + uselocale(list->sessions[session]->locale + ? 
list->sessions[session]->locale + : LC_GLOBAL_LOCALE); +#endif + flush_pending_messages(list->sessions[session]); + } + unlock(&list->mutex); + + return res; +} + +void session_send(session_t *session, string_t msg) +{ + lock(&session->mutex); + if (session->connection) { + lock(&session->connection->mutex); + socket_write_string(session->connection->stream->fd, msg); + unlock(&session->connection->mutex); + string_free(msg); + } else { + msgqueue_append(&session->messages, msg); + } + unlock(&session->mutex); +} + +void session_csend(session_t *session, const char *msg) +{ + session_send(session, string_init(msg)); +} + +void session_job_launch(session_t *session, void *(*routine)(void *), + void *data) +{ + lock(&session->clnmutex); + session->cleanup = session->cancel = 0; + unlock(&session->clnmutex); + session->job = malloc(sizeof(pthread_t)); + pthread_create(session->job, NULL, routine, data); +} + +int session_job_cancel(session_t *session) +{ + int res; + + lock(&session->mutex); + if (session->job == NULL) + res = PPE_INVJID; + else { + pthread_cancel(*(session->job)); + pthread_join(*(session->job), NULL); + free(session->job); + session->job = NULL; + pthread_cond_broadcast(&session->jobnull); + res = 0; + } + unlock(&session->mutex); + return res; +} + +void session_job_cleanup(void *sess) +{ + session_t *session = *((session_t **)sess); + int cleanup; + lock(&session->clnmutex); + cleanup = session->cleanup; + session->cleanup = 1; + unlock(&session->clnmutex); + if (cleanup == 1) + return; + lock(&session->mutex); + pthread_detach(*(session->job)); + free(session->job); + session->job = NULL; + pthread_cond_broadcast(&session->jobnull); + pthread_detach(pthread_self()); + unlock(&session->mutex); +} diff --git a/poliqarp-library/poliqarpd/session.h b/poliqarp-library/poliqarpd/session.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1a8f153d1e2d52eabcf1c230d6282fe5088192 --- /dev/null +++ b/poliqarp-library/poliqarpd/session.h 
@@ -0,0 +1,179 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARPD_SESSION_H +#define POLIQARPD_SESSION_H + +#include "poliqarp-config.h" + +#ifdef HAVE_LOCALE_T +#include <locale.h> +#endif + +#include <foostring/foostring.h> +#include <sakura/poliqarp.h> +#include <sakura/random.h> +#include <pthread.h> +#include <time.h> + +#include "configuration.h" +#include "msgqueue.h" +#include "server.h" +#include "sessopt.h" + +#define MAX_SESSIONS 256 /**< maximum concurrent sessions */ + +/** + * A session is a series of operations performed by the same user. + * They may all occur during a single connection, but generally + * a session can span multiple connections: it could be unbound from a + * connection and then rebound later. + */ +typedef struct session { + int index; /**< Sequence number. */ + string_t client_id; /**< The 'mnemonic' identifier of the creator + of this session. 
*/ + connection_t *connection; /**< Pointer to a connection that this session + is bound to, or else NULL if it is + currently idle. */ + struct msgqueue messages; /**< Queue of pending messages for an idle + session. */ + progress_t progress; /**< Progress of time-consuming operations. */ + pthread_mutex_t mutex; /**< Mutex for synchronizing job and connection + threads. */ + pthread_t *job; /**< Pointer to a variable containing of the + ID of the thread performing the currently + requested job. */ + int cancel; /**< When set to 1, the job is being cancelled. */ + int cleanup; /**< When set to 1, there is a cleanup in + progress. */ + pthread_mutex_t clnmutex; /**< Mutex for synchronizing cancel variable. */ + pthread_cond_t jobnull; /**< Condition variable to wait for a job to finish. */ + time_t last_used; /**< When did this session last become idle? */ + struct sessopt options; /**< Settable options for this session. */ + int qeflags; /**< Extended flags for last successful query. */ + struct poliqarp_match_buffer *match_buffer; /**< Match buffer. */ + struct poliqarp_corpus *corpus; /**< Corpus. */ + bool closable_corpus; /**< Allow to close corpus? */ + struct poliqarp_query *query; /**< Query. */ + struct poliqarp_error error; /**< Last error. */ + struct poliqarp_random_state random_state; /**< State of the pseudo-random number generator. */ +#ifdef HAVE_LOCALE_T + locale_t locale; /**< Locale */ +#endif +} session_t; + +/** + * This is an array of sessions, with holes and protected from accessing + * by more than one thread at once. + */ +struct session_list { + pthread_mutex_t mutex; /**< mutex for synchronization */ + session_t *sessions[MAX_SESSIONS]; /**< the actual array */ + int count; /**< number of sessions */ +}; + +/** + * Initializes the session list. + */ +void session_list_init(struct session_list *list); + +/** + * Creates a new session (bound to a connection) with the given name + * in a list of sessions. 
Returns the index of a newly created session
+ * or an error value if the creation failed (see errors.h).
+ */
+int session_insert(struct session_list *list, const char *client_id,
+   connection_t *conn);
+
+/**
+ * Deallocates all resources of the session.
+ */
+void session_delete(session_t *session);
+
+/**
+ * Deallocates all resources of an entry of the session list and removes
+ * it from the list.
+ */
+void session_remove(struct session_list *list, int session);
+
+/**
+ * Removes all sessions which have stayed idle (that is, not bound to any
+ * connection) for more than max-session-idle seconds.
+ */
+void session_remove_idle(struct session_list *list);
+
+/**
+ * Marks an entry of a session list idle: that is, note that no connection
+ * is currently using this session.
+ */
+void session_idle(struct session_list *list, int session);
+
+/**
+ * Removes all existing sessions from the list.
+ */
+void session_cleanup(struct session_list *list);
+
+/**
+ * Re-binds a connection to the session.
+ */
+int session_rebind(struct session_list *list, int session, string_t uid,
+   connection_t *conn);
+
+/**
+ * Sends a message to a session. Works regardless of which state (idle
+ * or bound) the session is currently in. If the session is idle,
+ * messages will be delivered as soon as somebody reconnects to it.
+ * The message is freed as soon as it's delivered.
+ */
+void session_send(session_t *session, string_t msg);
+
+/**
+ * Works exactly like session_send(), except that it sends a const char *
+ * instead of string_t.
+ */
+void session_csend(session_t *session, const char *msg);
+
+/**
+ * Launches a job for the session.
+ */
+void session_job_launch(session_t *session, void *(*routine)(void *),
+   void *data);
+
+/**
+ * Tries to cancel a running job.
+ */
+int session_job_cancel(session_t *session);
+
+/**
+ * Performs job cleanup after a completed or interrupted job, setting the
+ * session's job pointer to NULL.
Must be called prior to outputting + * any asynchronous return messages from this session. + * @note session is really a pointer to session_t *. It is declared as + * void * to enable this function to be passed as an argument to + * pthread_cleanup_push(). Note also that any structure having a session + * as its first member is also valid as an argument here. + */ +void session_job_cleanup(void *session); + +#endif /* POLIQARPD_SESSION_H */ diff --git a/poliqarp-library/poliqarpd/sessopt.c b/poliqarp-library/poliqarpd/sessopt.c new file mode 100644 index 0000000000000000000000000000000000000000..ff8292b29ef8593bddb9413c233cd4089e23b519 --- /dev/null +++ b/poliqarp-library/poliqarpd/sessopt.c @@ -0,0 +1,290 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <sakura/poliqarp.h>
+
+#include "sessopt.h"
+
+#define MAX_CONTEXT_CHARS 500
+#define MAX_CONTEXT_SEGMENTS 20
+#define MAX_WCONTEXT_CHARS 5000
+#define MAX_WCONTEXT_SEGMENTS 200
+
+/* Handler that validates `value' and stores it in the options structure. */
+typedef int (*optproc_t)(struct sessopt *, const char *);
+
+/* Associates an option name with the handler that sets it. */
+struct option_hook {
+   char *name;
+   optproc_t proc;
+};
+
+/*
+ * Parses a context width of the form "<digits>" (segments) or "<digits>c" /
+ * "<digits>C" (characters) into *width.  Returns 0 on success and -1 when
+ * no digits were converted, the number overflows, or anything else follows.
+ * On failure *width is left untouched.
+ */
+static int process_width(const char *value, struct context_width *width)
+{
+   char *tail;
+   unsigned long parsed;
+   bool in_chars;
+
+   errno = 0;
+   parsed = strtoul(value, &tail, 10);
+   if (tail == value)
+      return -1; /* no digits converted */
+   if (errno == ERANGE || (size_t)parsed != parsed)
+      return -1; /* integer overflow */
+
+   if (tail[0] == '\0')
+      in_chars = false;
+   else if ((tail[0] == 'c' || tail[0] == 'C') && tail[1] == '\0')
+      in_chars = true;
+   else
+      return -1;
+
+   width->charspec = in_chars;
+   width->width = parsed;
+   return 0;
+}
+
+/*
+ * Sets (`1') or clears (`0') `flag' in *what.  Any other character,
+ * including the terminating NUL, yields -1 and leaves *what unchanged.
+ */
+static int process_flag(char value, int *what, int flag)
+{
+   switch (value) {
+      case '1':
+         *what |= flag;
+         return 0;
+      case '0':
+         *what &= (~flag);
+         return 0;
+      default:
+         return -1;
+   }
+}
+
+/*
+ * Applies a four-character 0/1 string to the SHOW_LEMMATA_* bits, in the
+ * order LC, LM, RM, RC.  Evaluation stops at the first bad character, so a
+ * shorter string is never read past its terminator.
+ */
+static int process_lemmata(const char *value, int *what)
+{
+   if (process_flag(value[0], what, SHOW_LEMMATA_LC))
+      return -1;
+   if (process_flag(value[1], what, SHOW_LEMMATA_LM))
+      return -1;
+   if (process_flag(value[2], what, SHOW_LEMMATA_RM))
+      return -1;
+   if (process_flag(value[3], what, SHOW_LEMMATA_RC))
+      return -1;
+   return value[4] != '\0' ? -1 : 0;
+}
+
+/* Same as process_lemmata(), for the SHOW_TAGS_* bits. */
+static int process_tags(const char *value, int *what)
+{
+   if (process_flag(value[0], what, SHOW_TAGS_LC))
+      return -1;
+   if (process_flag(value[1], what, SHOW_TAGS_LM))
+      return -1;
+   if (process_flag(value[2], what, SHOW_TAGS_RM))
+      return -1;
+   if (process_flag(value[3], what, SHOW_TAGS_RC))
+      return -1;
+   return value[4] != '\0' ? -1 : 0;
+}
+
+static int process_ids(const char *value, int *what)
+{
+   if (process_flag(value[0], what, SHOW_IDS_LC) ||
+       process_flag(value[1], what, SHOW_IDS_LM) ||
+       process_flag(value[2], what, SHOW_IDS_RM) ||
+       process_flag(value[3], what,
SHOW_IDS_RC) ||
+       value[4] != '\0')
+      return -1;
+   else
+      return 0;
+}
+
+/*
+ * Accepts a regular context width that is non-zero and at most
+ * MAX_CONTEXT_CHARS (character widths) or MAX_CONTEXT_SEGMENTS (segment
+ * widths).  Returns 0 when acceptable, -1 otherwise.
+ */
+static inline int check_width(const struct context_width *width)
+{
+   const size_t limit = width->charspec ? MAX_CONTEXT_CHARS
+                                        : MAX_CONTEXT_SEGMENTS;
+   if (width->width == 0 || width->width > limit)
+      return -1;
+   return 0;
+}
+
+/* Same check as check_width(), against the wide-context limits. */
+static inline int wide_check_width(const struct context_width *width)
+{
+   const size_t limit = width->charspec ? MAX_WCONTEXT_CHARS
+                                        : MAX_WCONTEXT_SEGMENTS;
+   if (width->width == 0 || width->width > limit)
+      return -1;
+   return 0;
+}
+
+/* "left-context-width": parse, range-check, then store. */
+static int left_context_hook(struct sessopt *options, const char *value)
+{
+   struct context_width parsed;
+
+   if (process_width(value, &parsed) != 0)
+      return PE_INVVAL;
+   if (check_width(&parsed) != 0)
+      return PE_INVVAL;
+   options->left_context_width = parsed;
+   return 0;
+}
+
+/* "right-context-width": parse, range-check, then store. */
+static int right_context_hook(struct sessopt *options, const char *value)
+{
+   struct context_width parsed;
+
+   if (process_width(value, &parsed) != 0)
+      return PE_INVVAL;
+   if (check_width(&parsed) != 0)
+      return PE_INVVAL;
+   options->right_context_width = parsed;
+   return 0;
+}
+
+/* "wide-context-width": as above, but with the wide limits. */
+static int wide_context_hook(struct sessopt *options, const char *value)
+{
+   struct context_width parsed;
+
+   if (process_width(value, &parsed) != 0)
+      return PE_INVVAL;
+   if (wide_check_width(&parsed) != 0)
+      return PE_INVVAL;
+   options->wide_context_width = parsed;
+   return 0;
+}
+
+/*
+ * The three retrieve-* hooks work on a copy of options->retrieve, so a
+ * rejected value leaves the stored flags untouched.
+ */
+static int lemmata_hook(struct sessopt *options, const char *value)
+{
+   int flags = options->retrieve;
+
+   if (process_lemmata(value, &flags) != 0)
+      return PE_INVVAL;
+   options->retrieve = flags;
+   return 0;
+}
+
+static int tags_hook(struct sessopt *options, const char *value)
+{
+   int flags = options->retrieve;
+
+   if (process_tags(value, &flags) != 0)
+      return PE_INVVAL;
+   options->retrieve = flags;
+   return 0;
+}
+
+static int ids_hook(struct sessopt *options, const char *value)
+{
+   int flags = options->retrieve;
+
+   if (process_ids(value, &flags) != 0)
+      return PE_INVVAL;
+   options->retrieve = flags;
+   return 0;
+}
+
+static int
notification_interval_hook(struct sessopt *options, + const char *value) +{ + char *endptr; + unsigned long res = strtoul(value, &endptr, 10); + if (*value == '\0' || *endptr != '\0' || res > 1000) + return PE_INVVAL; + options->interval = res; + return 0; +} + +static int disamb_hook(struct sessopt *options, const char *value) +{ + bool val = true; + if (strcmp(value, "0") == 0) + val = false; + else if (strcmp(value, "1") != 0) + return PE_INVVAL; + options->disamb = val; + return 0; +} + +static int query_flags_hook(struct sessopt *options, const char *value) +{ + int opt = 0; + if (strlen(value) != 4) + return PE_INVVAL; + if (value[0] == '1') opt |= POLIQARP_QFLAG_QUERY_I; + if (value[1] == '1') opt |= POLIQARP_QFLAG_QUERY_X; + if (value[2] == '1') opt |= POLIQARP_QFLAG_META_I; + if (value[3] == '1') opt |= POLIQARP_QFLAG_META_X; + options->qflags = opt; + return 0; +} + +static int rewrite_hook(struct sessopt *options, const char *value) +{ + if (!*value) + return PE_INVVAL; + string_clear(options->rewrite); + string_append_str(options->rewrite, value); + return 0; +} + +static int random_sample_hook(struct sessopt *options, const char *value) +{ + if (strcmp(value, "0") == 0) + options->random_sample = false; + else if (strcmp(value, "1") == 0) + options->random_sample = true; + else + return PE_INVVAL; + return 0; +} + +static const struct option_hook hooks[] = { + { "left-context-width", left_context_hook }, + { "right-context-width", right_context_hook }, + { "wide-context-width", wide_context_hook }, + { "retrieve-lemmata", lemmata_hook }, + { "retrieve-tags", tags_hook }, + { "retrieve-ids", ids_hook }, + { "notification-interval", notification_interval_hook }, + { "query-flags", query_flags_hook }, + { "disamb", disamb_hook }, + { "rewrite", rewrite_hook }, + { "random-sample", random_sample_hook }, +}; + +static const struct sessopt default_options = { + { 5, false }, /* left context */ + { 5, false }, /* right context */ + { 50, false }, /* wide 
context */ + SHOW_LEMMATA_LM | SHOW_LEMMATA_RM | SHOW_TAGS_LM | SHOW_TAGS_RM, + 0, + true, + POLIQARP_QFLAG_META_I | POLIQARP_QFLAG_META_X, + NULL +}; + +void sessopt_init(struct sessopt *options) +{ + *options = default_options; + options->rewrite = string_init("default"); +} + +void sessopt_free(struct sessopt *options) +{ + string_free(options->rewrite); +} + +int sessopt_set(struct sessopt *options, const char *name, const char *value) +{ + size_t i; + for (i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++) + if (!strcmp(hooks[i].name, name)) + return hooks[i].proc(options, value); + return PE_INVOPT; +} diff --git a/poliqarp-library/poliqarpd/sessopt.h b/poliqarp-library/poliqarpd/sessopt.h new file mode 100644 index 0000000000000000000000000000000000000000..d1db1b38cf7b86b275850f9d725b157077dd46d5 --- /dev/null +++ b/poliqarp-library/poliqarpd/sessopt.h @@ -0,0 +1,109 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARPD_SESSOPT_H +#define POLIQARPD_SESSOPT_H + +#include <poliqarp-config.h> + +#include <foostring/foostring.h> +#include "errors.h" + +/** + * Structure representing width of a context (which can be given either in + * segments or characters). + * FIXME: shouldn't this be moved to libpoliqarp.h? + */ +struct context_width { + size_t width; /**< how wide this segment is. */ + bool charspec; /**< if this is true, the `width' field specifies the width + in characters, else in segments. */ +}; + +/** + * Structure representing options that can be set for a session. + */ +struct sessopt { + struct context_width left_context_width; + struct context_width right_context_width; + struct context_width wide_context_width; + + /** This is an integer that specifies what should be displayed in various + * parts of the query result. It consists of several bitfields + * that can be set or not. */ + int retrieve; + + /** This value specifies the intervals between result portions. */ + size_t interval; + + /** Specifies whether to retrieve disambiguated interpretations. */ + bool disamb; + + /** Specifies the default flags for various parts of the query. This is a + * bitwise OR of QFLAG_*. */ + int qflags; + + /** Specifies the name for query rewriting rules. + */ + string_t rewrite; + + /** Specifies whether a random sample is requested. + */ + bool random_sample; +}; + +/** + * Bitmasks for use with `retrieve' field of struct sessopt. + */ +#define SHOW_LEMMATA_LC 0x0001 +#define SHOW_LEMMATA_LM 0x0002 +#define SHOW_LEMMATA_RM 0x0004 +#define SHOW_LEMMATA_RC 0x0008 +#define SHOW_TAGS_LC 0x0010 +#define SHOW_TAGS_LM 0x0020 +#define SHOW_TAGS_RM 0x0040 +#define SHOW_TAGS_RC 0x0080 +#define SHOW_IDS_LC 0x0100 +#define SHOW_IDS_LM 0x0200 +#define SHOW_IDS_RM 0x0400 +#define SHOW_IDS_RC 0x0800 + +/** + * Initializes the given options structure with default values. + */ +void sessopt_init(struct sessopt *options); + +/** + * Frees the given options structure. 
+ */
+void sessopt_free(struct sessopt *options);
+
+/**
+ * Set the value of an option.
+ * @return 0 upon successful setting.
+ * @return PE_INVOPT if the option name was incorrect.
+ * @return PE_INVVAL if the option value was incorrect.
+ */
+int sessopt_set(struct sessopt *options, const char *name, const char *value);
+
+#endif /* POLIQARPD_SESSOPT_H */
diff --git a/poliqarp-library/poliqarpd/sockets.c b/poliqarp-library/poliqarpd/sockets.c
new file mode 100644
index 0000000000000000000000000000000000000000..9d2e133176cf8a43411814b27f2c7189d3647da5
--- /dev/null
+++ b/poliqarp-library/poliqarpd/sockets.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */ + +#include <errno.h> + +#include "sockets.h" + +int init_sockets() +{ +#ifdef _WIN32 + static WSADATA wsa_data; + + if (WSAStartup(MAKEWORD(2, 0), &wsa_data) != 0) + return -1; +#endif /* _WIN32 */ + + return 0; +} + +int done_sockets() +{ +#ifdef _WIN32 + return WSACleanup(); +#else + return 0; +#endif /* _WIN32 */ +} + +bool peer_disconnected() +{ +#ifdef _WIN32 + int err = WSAGetLastError(); + return err == WSAECONNRESET || err == WSAECONNABORTED; +#else + return errno == ECONNRESET || errno == EPIPE; +#endif /* _WIN32 */ +} diff --git a/poliqarp-library/poliqarpd/sockets.h b/poliqarp-library/poliqarpd/sockets.h new file mode 100644 index 0000000000000000000000000000000000000000..fac7873deca8163ecfd8252d2db0c67f10537e94 --- /dev/null +++ b/poliqarp-library/poliqarpd/sockets.h @@ -0,0 +1,62 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/* + * A cross-platform way of including TCP-related functions. 
+ */ + +#ifndef POLIQARPD_SOCKETS_H +#define POLIQARPD_SOCKETS_H + +#include <poliqarp-config.h> + +#ifdef _WIN32 +# include <winsock2.h> +#else +# include <netdb.h> +# include <unistd.h> +# include <sys/socket.h> +# include <netinet/in.h> +# include <arpa/inet.h> +# define closesocket(x) close(x) +#endif /* _WIN32 */ + +/** + * Initializes the socket functions. Must be called before any of the functions + * is used. + * @return 0 on success, -1 on error. + */ +int init_sockets(); + +/** + * Deallocates any resources allocated by the socket functions. + * @return 0 on success, -1 on error. + */ +int done_sockets(); + +/** + * Call after a failing send() to check if peer has disconnected. + */ +bool peer_disconnected(); + +#endif /* POLIQARPD_SOCKETS_H */ diff --git a/poliqarp-library/poliqarpd/sockstream.c b/poliqarp-library/poliqarpd/sockstream.c new file mode 100644 index 0000000000000000000000000000000000000000..b63a5e61cc2f4e8cdb7d75b440648f9603e52464 --- /dev/null +++ b/poliqarp-library/poliqarpd/sockstream.c @@ -0,0 +1,150 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <string.h>
+
+#include "sockstream.h"
+#include "sockets.h"
+#include "utils.h"
+
+#define BUF_SIZE 4096
+#define OUTPUT_BUF_SIZE 32768
+
+/*
+ * Allocates a stream around the socket `fd' with empty input and output
+ * buffers.  Running out of memory is fatal (system_error() terminates the
+ * process) instead of a NULL dereference here or on first use of a buffer.
+ */
+sockstream_t *sockstream_create(int fd)
+{
+   sockstream_t *res = malloc(sizeof(sockstream_t));
+   if (res == NULL)
+      system_error(_("malloc() failed"));
+   res->fd = fd;
+   res->buf = malloc(BUF_SIZE);
+   res->start = res->used = 0;
+   res->obuf = malloc(OUTPUT_BUF_SIZE);
+   res->oused = 0;
+   if (res->buf == NULL || res->obuf == NULL)
+      system_error(_("malloc() failed"));
+   return res;
+}
+
+/* Closes the socket and releases both buffers and the stream itself. */
+void sockstream_free(sockstream_t *s)
+{
+   closesocket(s->fd);
+   free(s->obuf);
+   free(s->buf);
+   free(s);
+}
+
+/*
+ * Returns the next line from the stream, without its '\n'.
+ *
+ * A line already complete in the input buffer is returned without touching
+ * the socket.  Otherwise the buffered bytes start the result and recv() is
+ * called until a newline arrives; bytes after that newline stay buffered
+ * for the next call.  If recv() returns 0, or fails because the peer
+ * disconnected, whatever was collected so far (possibly nothing) is
+ * returned.  Any other recv() failure is fatal via socket_error().
+ */
+string_t sockstream_read_string(sockstream_t *s)
+{
+   string_t res = string_create();
+   char *eol = memchr(s->buf + s->start, '\n', s->used);
+   int goon = 1;
+   if (eol != NULL) {
+      /* numbytes counts the newline, which is consumed but not returned */
+      size_t numbytes = eol - s->buf - s->start + 1;
+      string_append_strn(res, s->buf + s->start, numbytes - 1);
+      s->start += numbytes;
+      s->used -= numbytes;
+      return res;
+   }
+   string_append_strn(res, s->buf + s->start, s->used);
+   while (goon) {
+      size_t numbytes; /* bytes copied into the result */
+      size_t consumed; /* bytes removed from the buffer */
+      ssize_t recvd;
+      s->start = 0;
+      recvd = recv(s->fd, s->buf, BUF_SIZE, 0);
+      if (recvd < 0)
+      {
+         if (!peer_disconnected())
+            socket_error(_("recv() failed"));
+         else
+            recvd = 0;
+      }
+      s->used = recvd;
+      if (s->used == 0)
+         break;
+      eol = memchr(s->buf, '\n', s->used);
+      if (eol != NULL) {
+         numbytes = eol - s->buf;
+         consumed = numbytes + 1; /* skip the newline as well */
+         goon = 0;
+      } else {
+         /* Whole buffer consumed.  Subtracting numbytes + 1 here, as the
+          * code used to, wrapped the unsigned s->used below zero. */
+         numbytes = s->used;
+         consumed = numbytes;
+      }
+      string_append_strn(res, s->buf, numbytes);
+      s->start += consumed;
+      s->used -= consumed;
+   }
+   return res;
+}
+
+/*
+ * Sends exactly `length' bytes, repeating send() after short writes.  Gives
+ * up silently when the peer has disconnected; any other send() failure is
+ * fatal via socket_error().
+ */
+static void socket_write_cstring_len(int fd, const char *str, size_t length)
+{
+   while (length) {
+      ssize_t written = send(fd, str, length, 0);
+      if (written < 0)
+      {
+         if (!peer_disconnected())
+            socket_error(_("send() failed"));
+         return;
+      }
+      str += written;
+      length -= written;
+   }
+}
+
+void socket_write_string(int sock, const string_t str)
+{
+
socket_write_cstring_len(sock, string_str(str), string_len(str)); +} + +void socket_write_cstring(int sock, const char *str) +{ + socket_write_cstring_len(sock, str, strlen(str)); +} + +void socket_writeln(int sock) +{ + char newline = '\n'; + int written = send(sock, &newline, 1, 0); + if (written != 1 && !peer_disconnected()) + socket_error(_("send() failed")); +} + +void socket_writeln_string(int sock, const string_t str) +{ + socket_write_string(sock, str); + socket_writeln(sock); +} + +void sockstream_output_flush(sockstream_t *s) +{ + socket_write_cstring_len(s->fd, s->obuf, s->oused); + s->oused = 0; +} + +void sockstream_write_cstring(sockstream_t *s, const char *str) +{ + size_t len = strlen(str), ofs = 0; + while (ofs < len) { + size_t towrite = MIN(OUTPUT_BUF_SIZE - s->oused, len - ofs); + memcpy(s->obuf + s->oused, str + ofs, towrite); + s->oused += towrite; + ofs += towrite; + if (s->oused == OUTPUT_BUF_SIZE) + sockstream_output_flush(s); + } +} diff --git a/poliqarp-library/poliqarpd/sockstream.h b/poliqarp-library/poliqarpd/sockstream.h new file mode 100644 index 0000000000000000000000000000000000000000..6cff789c29d45f2d8de9d5d3ee43c3c30e66ecd4 --- /dev/null +++ b/poliqarp-library/poliqarpd/sockstream.h @@ -0,0 +1,90 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARPD_SOCKSTREAM_H +#define POLIQARPD_SOCKSTREAM_H + +#include <foostring/foostring.h> +#include <string.h> + +/** + * This structure is a streamed view of a connected TCP socket, from which one + * can retrieve strings and to which one can send strings. The strings are not + * actually sent until a flush function is called. + */ +typedef struct sockstream { + int fd; /**< descriptor of the socket */ + char *buf; /**< the internal input buffer */ + size_t start; /**< start of the next portion that will be read */ + size_t used; /**< number of used bytes in input buffer */ + char *obuf; /**< the internal output buffer */ + size_t oused; /**< number of used bytes in the output buffer */ +} sockstream_t; + +/** + * Creates a sockstream from a given socket. + */ +sockstream_t *sockstream_create(int fd); + +/** + * Closes the socket connected with the given stream and deallocates all its + * resources. + */ +void sockstream_free(sockstream_t *sock); + +/** + * Reads characters from the socket until encountering a newline. Returns + * the string consisting of the characters read, without the terminating + * newline character. Can return an empty string in case the underlying + * recv() call returns 0. + */ +string_t sockstream_read_string(sockstream_t *sock); + +/** + * Writes a string to the given socket. + */ +void socket_write_string(int sock, const string_t str); + +/** + * Writes a null-terminated string to the given socket. + */ +void socket_write_cstring(int sock, const char *str); + +/** + * Writes a string to the given socket followed by a newline. 
+ */ +void socket_writeln_string(int sock, const string_t str); + +/** + * Writes a string to the socket stream's output buffer. The string is not + * sent to the underlying socket immediately, but stored in the buffer and + * sent only when explicitly flushed, or when the buffer overflows. + */ +void sockstream_write_cstring(sockstream_t *sock, const char *str); + +/** + * Flushes the socket stream's output buffer. + */ +void sockstream_output_flush(sockstream_t *sock); + +#endif /* POLIQARPD_SOCKSTREAM_H */ diff --git a/poliqarp-library/poliqarpd/utils.c b/poliqarp-library/poliqarpd/utils.c new file mode 100644 index 0000000000000000000000000000000000000000..205ffa510131a6b0ed90fbe8df146115e8ec8f10 --- /dev/null +++ b/poliqarp-library/poliqarpd/utils.c @@ -0,0 +1,45 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> + +#include "osdep.h" + +void system_error(const char *msg) +{ + perror(msg); + exit(1); +} + +void socket_error(const char *msg) +{ + print_socket_error(msg); + exit(1); +} + +void resolve_error(const char *msg) +{ + herror(msg); + exit(1); +} diff --git a/poliqarp-library/poliqarpd/utils.h b/poliqarp-library/poliqarpd/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f7db4958235f6197a4da33eb3445310a64736d94 --- /dev/null +++ b/poliqarp-library/poliqarpd/utils.h @@ -0,0 +1,67 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** Useful functions and definitions. */ + +#ifndef POLIQARPD_UTILS_H +#define POLIQARPD_UTILS_H + +#ifdef MIN +# undef MIN +#endif +#define MIN(a, b) (((a) > (b)) ? (b) : (a)) + +#ifdef MAX +# undef MAX +#endif +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) + +/** These two macros are defined for easy locking and unlocking of mutexes. 
*/
+/*
+ * lock(m) first registers pthread_mutex_unlock(m) as a cancellation
+ * cleanup handler and then locks the mutex; a failed lock terminates the
+ * process through system_error().  unlock(m) ignores its argument: it pops
+ * the most recently pushed cleanup handler and executes it (argument 1),
+ * which unlocks the mutex that was given to the matching lock().
+ *
+ * Because pthread_cleanup_push() and pthread_cleanup_pop() must appear as
+ * a pair within the same lexical scope, every lock() needs exactly one
+ * matching unlock() in the same block.  The includer must also provide
+ * <pthread.h>; this header does not include it.
+ *
+ * NOTE(review): the handler is pthread_mutex_unlock cast to
+ * void (*)(void *); confirm that calling it through that pointer type is
+ * acceptable on all target platforms.
+ */
+#define lock(m) \
+   pthread_cleanup_push((void (*)(void *))pthread_mutex_unlock, (void *)(m)); \
+   do { \
+      if (pthread_mutex_lock(m)) \
+         system_error("mutex lock failed"); \
+   } while (0)
+
+#define unlock(m) pthread_cleanup_pop(1)
+
+/**
+ * Prints out the given error message (along with the system error message)
+ * on stderr and terminates the process.
+ */
+void system_error(const char *msg);
+
+/**
+ * Prints out the given error message (along with the socket error message)
+ * on stderr and terminates the process.
+ */
+void socket_error(const char *msg);
+
+/**
+ * Prints out the given error message (along with the resolver error message)
+ * on stderr and terminates the process.
+ */
+void resolve_error(const char *msg);
+
+#endif /* POLIQARPD_UTILS_H */
diff --git a/poliqarp-library/progress/progress.c b/poliqarp-library/progress/progress.c
new file mode 100644
index 0000000000000000000000000000000000000000..2aa21f5c63acd5147ac6ed5a4149d1a2acdac7ea
--- /dev/null
+++ b/poliqarp-library/progress/progress.c
@@ -0,0 +1,65 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2010 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include "progress.h"
+
+/* Clamps a percentage to the 0-100 range used by progress_t. */
+static int progress_clamp(int value)
+{
+   if (value > 100)
+      return 100;
+   if (value < 0)
+      return 0;
+   return value;
+}
+
+/* Initializes the mutex and sets the progress to 0. */
+void progress_init(progress_t *progress)
+{
+   pthread_mutex_init(&progress->mutex, NULL);
+   progress->progress = 0;
+}
+
+/* Destroys the mutex; the stored value is left untouched. */
+void progress_destroy(progress_t *progress)
+{
+   pthread_mutex_destroy(&progress->mutex);
+}
+
+/* Stores `value`, clamped to 0-100, while holding the mutex. */
+void progress_set(progress_t *progress, int value)
+{
+   pthread_mutex_lock(&progress->mutex);
+   progress->progress = progress_clamp(value);
+   pthread_mutex_unlock(&progress->mutex);
+}
+
+/* Sets the progress back to 0. */
+void progress_reset(progress_t *progress)
+{
+   progress_set(progress, 0);
+}
+
+/*
+ * Adds `amount` (which may be negative) to the progress while holding the
+ * mutex.  The result is clamped to 0-100 at both ends, matching
+ * progress_set().  The bounds are compared before adding, so a very large
+ * or very small `amount` cannot overflow the stored int.
+ */
+void progress_advance(progress_t *progress, int amount)
+{
+   pthread_mutex_lock(&progress->mutex);
+   if (amount >= 100 - progress->progress)
+      progress->progress = 100;
+   else if (amount <= -progress->progress)
+      progress->progress = 0;
+   else
+      progress->progress += amount;
+   pthread_mutex_unlock(&progress->mutex);
+}
+
+/* Returns a snapshot of the current progress, read under the mutex. */
+int progress_get(progress_t *progress)
+{
+   int value;
+   pthread_mutex_lock(&progress->mutex);
+   value = progress->progress;
+   pthread_mutex_unlock(&progress->mutex);
+   return value;
+}
diff --git a/poliqarp-library/progress/progress.h b/poliqarp-library/progress/progress.h
new file mode 100644
index 0000000000000000000000000000000000000000..99816796e852607d21384bd4452cb8fda21e2c26
--- /dev/null
+++ b/poliqarp-library/progress/progress.h
@@ -0,0 +1,65 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2010 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+/**
+ * @file progress.h
+ * @brief This is libprogress, the public library which is used by
+ * implementations of libpoliqarp to signal the progress of time-consuming
+ * calls. This library is reentrant and thread-safe.
+ */
+
+#ifndef PROGRESS_H
+#define PROGRESS_H
+
+#include <pthread.h>
+
+/**
+ * A progress indicator: an integer percentage paired with the mutex that
+ * guards it. The percentage is meant to stay in the range 0-100 and
+ * describes how much of a task has been completed (0 means not started
+ * yet, 100 means complete). Access it only through the functions below,
+ * which take the mutex around every read and write.
+ */
+typedef struct {
+   int progress;          /* completed percentage */
+   pthread_mutex_t mutex; /* protects `progress` */
+} progress_t;
+
+/** Initializes the progress. Must be called before any other function. */
+void progress_init(progress_t *progress);
+
+/** Frees resources associated with the progress.
+ * No other function can be called after this one. */
+void progress_destroy(progress_t *progress);
+
+/** Resets the progress to zero. */
+void progress_reset(progress_t *progress);
+
+/** Sets the progress to the given value, clamped to the range 0-100. */
+void progress_set(progress_t *progress, int value);
+
+/** Increases the progress by a given amount; the result is capped at 100. */
+void progress_advance(progress_t *progress, int amount);
+
+/** Returns the value of the progress as an integer. */
+int progress_get(progress_t *progress);
+
+#endif /* PROGRESS_H */
diff --git a/poliqarp-library/sakura/abi.h b/poliqarp-library/sakura/abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd80a130add789c58dc969b04d21e728dbae155c
--- /dev/null
+++ b/poliqarp-library/sakura/abi.h
@@ -0,0 +1,198 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+/**
+ * @file abi.h
+ * This file describes binary corpus format.
+ */
+
+#ifndef POLIQARP_ABI_H
+#define POLIQARP_ABI_H
+
+#include <poliqarp-config.h>
+#include <stddef.h>
+
+#define POLIQARP_ABI_VERSION 2
+
+/**
+ * Interpretation item ABI.
+ */
+struct poliqarp_binary_interp {
+   uint32_t base_id; /**< Identifier of string associated with this
+                          base form. */
+   uint32_t tag_id;  /**< Identifier of string associated with this tag. */
+} __attribute__((packed)); /* -64- */
+
+/**
+ * Legacy interpretation item: the same two identifiers packed into a
+ * single 32-bit word.
+ */
+union poliqarp_legacy_binary_interp {
+   struct {
+      unsigned base_id:20; /**< Identifier of string associated with this
+                                base form. */
+      unsigned tag_id:12;  /**< Identifier of string associated with this tag. */
+   } __attribute__((packed)) attributes; /* -32- */
+   uint32_t data;
+};
+
+/**
+ * Expands a legacy interpretation item into the current layout.
+ *
+ * The raw 32-bit word is passed through en4() (defined elsewhere;
+ * presumably a byte-order conversion to host order) and then split:
+ * bits 0-19 become base_id and bits 20-31 become tag_id.
+ */
+static inline struct poliqarp_binary_interp
+poliqarp_convert_legacy_binary_interp(
+   const union poliqarp_legacy_binary_interp legacy_interpretation)
+{
+   struct poliqarp_binary_interp res;
+   uint32_t data = en4(legacy_interpretation.data);
+   res.base_id = data & 0xfffff;
+   res.tag_id = data >> 20;
+   return res;
+}
+
+/**
+ * Segment item ABI.
+ */
+struct poliqarp_binary_segment {
+   uint32_t orth_space_id;    /**< Identifier of string associated with
+                                   this segment; lowest bit set to 1
+                                   if a space occurs before this segment. */
+   uint32_t interp_disamb_id; /**< Identifier of the interpretation set
+                                   (disambiguated). */
+   uint32_t interp_amb_id;    /**< Identifier of the interpretation set
+                                   (ambiguous). */
+} __attribute__((packed)); /* -96- */
+
+/**
+ * Legacy segment item: all fields packed into a single 64-bit word.
+ */
+union poliqarp_legacy_binary_segment {
+   struct {
+      unsigned space:1;             /**< Set to one if a space occurs before
+                                         this segment. */
+      unsigned orth_id:21;          /**< Identifier of string associated with
+                                         this segment. */
+      unsigned interp_disamb_id:21; /**< Identifier of the interpretation set
+                                         (disambiguated). */
+      unsigned interp_amb_id:21;    /**< Identifier of the interpretation set
+                                         (ambiguous). */
+   } __attribute__((packed)) attributes;
+   uint64_t data;
+} __attribute__((packed)); /* -64- */
+
+/**
+ * Expands a legacy segment item into the current layout.
+ *
+ * The raw 64-bit word is passed through en8() (defined elsewhere;
+ * presumably a byte-order conversion to host order) and then split:
+ * bits 0-21 (space flag plus orth_id) become orth_space_id, bits 22-42
+ * become interp_disamb_id and bits 43-63 become interp_amb_id.
+ */
+static inline struct poliqarp_binary_segment
+poliqarp_convert_legacy_binary_segment(
+   const union poliqarp_legacy_binary_segment legacy_segment)
+{
+   struct poliqarp_binary_segment res;
+   uint64_t data = en8(legacy_segment.data);
+   res.orth_space_id = data & 0x3fffff;
+   res.interp_disamb_id = (data >> 22) & 0x1fffff;
+   res.interp_amb_id = (data >> 43) & 0x1fffff;
+   return res;
+}
+
+/*
+ * In-place field conversion macros.  On big-endian hosts every 32-bit
+ * field is passed through en4(); elsewhere they expand to nothing.
+ */
+#ifdef WORDS_BIGENDIAN
+
+#define POLIQARP_INTERP_LE_TO_HE(x) do { \
+   (x).base_id = en4((x).base_id); \
+   (x).tag_id = en4((x).tag_id); \
+} while (0)
+
+#define POLIQARP_SEGMENT_LE_TO_HE(x) do { \
+   (x).orth_space_id = en4((x).orth_space_id); \
+   (x).interp_disamb_id = en4((x).interp_disamb_id); \
+   (x).interp_amb_id = en4((x).interp_amb_id); \
+} while (0)
+
+#else
+
+#define POLIQARP_INTERP_LE_TO_HE(x) do { } while (0)
+#define POLIQARP_SEGMENT_LE_TO_HE(x) do { } while (0)
+
+#endif
+
+struct poliqarp_interpretation {
+   const struct poliqarp_corpus *corpus;
+   struct poliqarp_binary_interp interp;
+   bool disamb;
+};
+
+struct poliqarp_segment {
+   const struct poliqarp_corpus *corpus;   /**< Corpus that this segment
+                                                belongs to. */
+   struct poliqarp_binary_segment segment; /**< Information about the
+                                                segment. */
+};
+
+struct poliqarp_interpretation_set {
+   const struct poliqarp_corpus *corpus;
+                 /**< Corpus that this set belongs to. */
+   uint32_t set; /**< Identifier of the set. */
+   bool disamb;  /**< True iff it's a disambiguated set. */
+};
+
+/** Date metadata, all fields are self-explanatory. */
+struct poliqarp_meta_date {
+   uint16_t year; /**< Year. */
+   uint8_t month; /**< Month. */
+   uint8_t day;   /**< Day. */
+};
+
+enum poliqarp_binary_metadata_type {
+   POLIQARP_METADATA_SINGLE,   /**< Item has a single value. */
+   POLIQARP_METADATA_MULTI,    /**< Item has multiple values. */
+   POLIQARP_METADATA_DATE,     /**< Item has a single date value. */
+   POLIQARP_METADATA_UNDEFINED /**< This value is undefined for the item. */
+};
+
+/** Type + value union. */
+struct poliqarp_binary_metadata {
+   uint32_t type; /**< Type of meta item. */
+   uint32_t key;  /**< Key to key dictionary in backend meta. */
+   union {
+      uint32_t text;                  /**< Key to value dictionary. */
+      struct poliqarp_meta_date date; /**< Packed date. */
+   } value_as;    /**< Union of values. */
+};
+
+struct poliqarp_metadata {
+   const struct poliqarp_corpus *corpus;
+                                         /**< Corpus that this metadata belongs to. */
+   struct poliqarp_binary_metadata meta; /**< The metadata proper. */
+};
+
+struct poliqarp_metadata_set {
+   const struct poliqarp_corpus *corpus;
+                /**< Corpus that this set belongs to. */
+   size_t low;  /**< Lower range of metadata. */
+   size_t high; /**< Higher range of metadata. */
+};
+
+/** Document information. */
+struct poliqarp_document {
+   uint32_t corpus_low;  /**< Lower bound of corpus. */
+   uint32_t corpus_high; /**< Higher bound of corpus. */
+   uint32_t meta_low;    /**< Lower bound of meta data. */
+   uint32_t meta_high;   /**< Higher bound of meta data. */
+};
+
+/** Subdocument information. */
+struct poliqarp_subdocument {
+   uint32_t corpus_low;  /**< Lower bound of corpus. */
+   uint32_t corpus_high; /**< Higher bound of corpus. */
+};
+
+#endif /* POLIQARP_ABI_H */
diff --git a/poliqarp-library/sakura/backend-base.c b/poliqarp-library/sakura/backend-base.c
new file mode 100644
index 0000000000000000000000000000000000000000..c07787063035e9db15a2112b8fa5c955c5aa63af
--- /dev/null
+++ b/poliqarp-library/sakura/backend-base.c
@@ -0,0 +1,53 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <sakura/backend-base.h>
+#include <sakura/dict.h>
+#include <sakura/exception.h>
+
+#include <foostring/foostring.h>
+
+/*
+ * Opens the two base-form dictionaries of a corpus: first the base-1
+ * (disambiguated) one, then the base-2 (ambiguous) one.  Returns 0 when
+ * both are open; otherwise returns the non-zero code of the call that
+ * failed.  If only the second open fails, the first dictionary is closed
+ * again so that nothing stays open on failure.
+ */
+int poliqarp_backend_base_open(struct poliqarp_backend_base *this,
+   const char *base_name, struct poliqarp_error *error)
+{
+   int rc = poliqarp_newdict_open(&this->dict_disamb, base_name, true,
+      POLIQARP_BASE1_IMAGE_FORMAT, POLIQARP_BASE1_OFFSET_FORMAT,
+      _("Unable to open base-1 dictionary"), error);
+
+   if (rc == 0) {
+      rc = poliqarp_newdict_open(&this->dict_amb, base_name, true,
+         POLIQARP_BASE2_IMAGE_FORMAT, POLIQARP_BASE2_OFFSET_FORMAT,
+         _("Unable to open base-2 dictionary"), error);
+      if (rc != 0) {
+         /* roll back the dictionary that did open */
+         newdict_close(&this->dict_disamb);
+      }
+   }
+   return rc;
+}
+
+/* Closes both dictionaries, disambiguated first. */
+void poliqarp_backend_base_close(struct poliqarp_backend_base *this)
+{
+   newdict_close(&this->dict_disamb);
+   newdict_close(&this->dict_amb);
+}
diff --git a/poliqarp-library/sakura/backend-base.h b/poliqarp-library/sakura/backend-base.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cd81f8165a758f58bae4e4ff3a64a261e9ed1d0
--- /dev/null
+++ b/poliqarp-library/sakura/backend-base.h
@@ -0,0 +1,97 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#ifndef POLIQARP_BACKEND_BASE_H
+#define POLIQARP_BACKEND_BASE_H
+
+#include <poliqarp-config.h>
+
+#include <assert.h> /* the inline accessors below use assert() */
+
+#include <sakura/exception.h>
+#include <sakura/common/newdict.h>
+
+/*
+ * File name formats handed to poliqarp_newdict_open() together with the
+ * corpus base name.  "base1" is used for the disambiguated dictionary and
+ * "base2" for the ambiguous one.
+ */
+
+/** Image file name format of the base-1 (disambiguated) dictionary. */
+#define POLIQARP_BASE1_IMAGE_FORMAT "%s.poliqarp.base1.image"
+/** Offset file name format of the base-1 (disambiguated) dictionary. */
+#define POLIQARP_BASE1_OFFSET_FORMAT "%s.poliqarp.base1.offset"
+
+/** Image file name format of the base-2 (ambiguous) dictionary. */
+#define POLIQARP_BASE2_IMAGE_FORMAT "%s.poliqarp.base2.image"
+/** Offset file name format of the base-2 (ambiguous) dictionary. */
+#define POLIQARP_BASE2_OFFSET_FORMAT "%s.poliqarp.base2.offset"
+
+
+/** Base form backend: the pair of base-form dictionaries of a corpus. */
+struct poliqarp_backend_base {
+   struct newdict dict_disamb; /**< Dictionary with disambiguated base
+                                    strings. */
+   struct newdict dict_amb;    /**< Dictionary with ambiguous base strings. */
+};
+
+/**
+ * Opens both dictionaries for the corpus named base_name.
+ * @return 0 on success, a non-zero value on failure.
+ */
+int poliqarp_backend_base_open(struct poliqarp_backend_base *this,
+   const char *base_name, struct poliqarp_error *error);
+
+/** Closes both dictionaries. */
+void poliqarp_backend_base_close(struct poliqarp_backend_base *this);
+
+/** Number of items in the disambiguated dictionary. */
+static inline size_t poliqarp_backend_base_num_items__disamb(
+   const struct poliqarp_backend_base *this)
+{
+   return GET_NUM_ITEMS(&this->dict_disamb);
+}
+
+/**
+ * Number of items in the ambiguous dictionary.
+ * NOTE(review): unlike its siblings this name has a single underscore
+ * before "amb"; it is kept as-is because renaming would break callers.
+ */
+static inline size_t poliqarp_backend_base_num_items_amb(
+   const struct poliqarp_backend_base *this)
+{
+   return GET_NUM_ITEMS(&this->dict_amb);
+}
+
+/** Item stored under `key` in the disambiguated dictionary;
+ *  `key` must be below the dictionary's item count (asserted). */
+static inline const char *poliqarp_backend_base_fetch__disamb(
+   const struct poliqarp_backend_base *this, size_t key)
+{
+   assert(key < this->dict_disamb.num_items);
+   return GET_ITEM(&this->dict_disamb, key);
+}
+
+/** Item stored under `key` in the ambiguous dictionary;
+ *  `key` must be below the dictionary's item count (asserted). */
+static inline const char *poliqarp_backend_base_fetch__amb(
+   const struct poliqarp_backend_base *this, size_t key)
+{
+   assert(key < this->dict_amb.num_items);
+   return GET_ITEM(&this->dict_amb, key);
+}
+
+/** Length of the item stored under `key` in the disambiguated
+ *  dictionary; `key` must be below the item count (asserted). */
+static inline size_t poliqarp_backend_base_length__disamb(
+   const struct poliqarp_backend_base *this, size_t key)
+{
+   assert(key < this->dict_disamb.num_items);
+   return GET_LENGTH(&this->dict_disamb, key);
+}
+
+/** Length of the item stored under `key` in the ambiguous dictionary;
+ *  `key` must be below the item count (asserted). */
+static inline size_t poliqarp_backend_base_length__amb(
+   const struct poliqarp_backend_base *this, size_t key)
+{
+   assert(key < this->dict_amb.num_items);
+   return GET_LENGTH(&this->dict_amb, key);
+}
+
+#endif /* POLIQARP_BACKEND_BASE_H */
diff --git a/poliqarp-library/sakura/backend-config.c b/poliqarp-library/sakura/backend-config.c
new file mode 100644
index 0000000000000000000000000000000000000000..bfaf3ef00bbae1b1910ee9be8aa38abccd4208cf
--- /dev/null
+++ b/poliqarp-library/sakura/backend-config.c
@@ -0,0 +1,528 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <foostring/foostring.h>
+
+#include <sakura/backend-config.h>
+#include <sakura/backend-index.h>
+#include <sakura/abi.h>
+#include <sakura/cdf.h>
+#include <sakura/query-rewrite.h>
+#include <sakura/exception.h>
+#include <sakura/config.h>
+
+/* Sections of the corpus configuration file recognised by the handler. */
+enum poliqarp_section {
+   SECTION_NONE = -1,
+   SECTION_STARTUP,
+   SECTION_ALIASES,
+   SECTION_ATTR,
+   SECTION_POS,
+   SECTION_NAMED_ENTITY,
+   SECTION_QUERY_REWRITE_RULES,
+};
+
+/* attribute construction */
+
+/*
+ * Registers `name` in the named-item bag and allocates the attribute
+ * record from the backend arena.  The new attribute has no values yet and
+ * receives the next free attribute id.  Returns NULL when either the
+ * entity or the record cannot be created (callers in this file inspect
+ * errno, treating EEXIST as "name already defined").
+ */
+struct poliqarp_attr *poliqarp_new_attr(struct poliqarp_backend_config *this,
+   const char *name)
+{
+   static enum poliqarp_entity_type tag = POLIQARP_ENTITY_ATTR;
+   struct entity *self;
+   struct poliqarp_attr *result;
+
+   /* create entity */
+   self = new_entity(&this->named_items, name, &tag, NULL);
+
+   if (self == NULL)
+      return NULL;
+
+   /* create attr */
+   result = marena_alloc(&this->arena, sizeof *result);
+   if (result == NULL)
+      return NULL;
+   result->self = self;
+   result->num_values = 0;
+   result->first_value = NULL;
+   result->id = this->num_attr++;
+
+   /* update pointer in entity */
+   self->data = result;
+
+   return result;
+}
+
+/*
+ * Registers `name` in the attribute-value bag and prepends the new value
+ * to attr's value list.  The value's id is its creation index within
+ * `attr`.  Returns NULL on failure, like poliqarp_new_attr().
+ */
+struct poliqarp_attr_value *poliqarp_new_attr_value(
+   struct poliqarp_backend_config *this, struct poliqarp_attr *attr,
+   const char *name)
+{
+   static enum poliqarp_entity_type tag = POLIQARP_ENTITY_ATTR_VALUE;
+   struct entity *self;
+   struct poliqarp_attr_value *result;
+
+   /* create entity NOTE: it uses different entity bag */
+   self = new_entity(&this->attr_values, name, &tag, NULL);
+
+   if (self == NULL)
+      return NULL;
+
+   /* create attr value */
+   result = marena_alloc(&this->arena, sizeof *result);
+   if (result == NULL)
+      return NULL;
+   result->self = self;
+   result->next_value = attr->first_value;
+   result->attr = attr;
+   result->id = attr->num_values++;
+
+   /* update pointer in entity */
+   self->data = result;
+
+   /* update attr's value list */
+   attr->first_value = result;
+
+   return result;
+}
+
+/* part of speech construction */
+
+/*
+ * Registers `name` in the named-item bag and allocates the part-of-speech
+ * record.  The new record has an empty instance list and receives the
+ * next free part-of-speech id.  Returns NULL on failure.
+ */
+struct poliqarp_part_of_speech *poliqarp_new_part_of_speech(
+   struct poliqarp_backend_config *this, const char *name)
+{
+   static enum poliqarp_entity_type tag = POLIQARP_ENTITY_POS;
+   struct entity *self;
+   struct poliqarp_part_of_speech *result;
+
+   /* create entity */
+   self = new_entity(&this->named_items, name, &tag, NULL);
+
+   if (self == NULL)
+      return NULL;
+
+   /* create part of speech */
+   result = marena_alloc(&this->arena, sizeof *result);
+   if (result == NULL)
+      return NULL;
+   result->self = self;
+   result->num_instances = 0;
+   result->first_instance = NULL;
+   result->id = this->num_pos++;
+
+   /* update pointer in entity */
+   self->data = result;
+
+   return result;
+}
+
+/* attribute instance construction */
+
+/*
+ * Allocates an attribute instance and prepends it to pos's instance list.
+ * Returns NULL when the arena allocation fails.
+ *
+ * NOTE(review): pos->num_instances is not updated here, unlike
+ * attr->num_values in poliqarp_new_attr_value(); confirm whether anything
+ * relies on that counter.
+ */
+struct poliqarp_attr_instance *poliqarp_new_attr_instance(
+   struct poliqarp_backend_config *this, struct poliqarp_part_of_speech *pos,
+   struct poliqarp_attr *attr, bool is_optional)
+{
+   struct poliqarp_attr_instance *result = marena_alloc(&this->arena,
+      sizeof *result);
+   if (result == NULL)
+      return NULL;
+
+   /* create instance */
+   result->next_instance = pos->first_instance;
+   result->attr = attr;
+   result->pos = pos;
+   result->is_optional = is_optional;
+
+   /* update attr instance list in part of speech */
+   pos->first_instance = result;
+
+   return result;
+}
+
+/** Configuration parser handler. */
+static int poliqarp_backend_config_handler(void *extra, int section,
+   char *text, struct poliqarp_error *error);
+
+/*
+ * Initialises the configuration backend for the corpus `base_name`:
+ * creates the arena, entity bags, alias table and query-rewrite table,
+ * registers the built-in entities, parses "<base_name>.cfg" and reads and
+ * validates the CDF data.
+ *
+ * Returns 0 on success.  On failure returns -1 with `error` filled in and
+ * everything created so far destroyed again.
+ */
+int poliqarp_backend_config_open(struct poliqarp_backend_config *this,
+   const char *base_name, struct poliqarp_error *error)
+{
+   int rc;
+   bool have_marena = false, have_named_items = false,
+      have_attr_values = false, have_aliases = false,
+      have_query_rewrite_table = false;
+   static enum poliqarp_entity_type tag_pos = POLIQARP_ENTITY_ITEM_POS;
+   static enum poliqarp_entity_type tag_orth = POLIQARP_ENTITY_ITEM_ORTH;
+   static enum poliqarp_entity_type tag_base = POLIQARP_ENTITY_ITEM_BASE;
+   static enum poliqarp_entity_type tag_tag = POLIQARP_ENTITY_ITEM_TAG;
+   static enum poliqarp_entity_type tag_space = POLIQARP_ENTITY_ITEM_SPACE;
+   static enum poliqarp_entity_type tag_type = POLIQARP_ENTITY_ITEM_TYPE;
+
+   /* create arena */
+   marena_create(&this->arena);
+   have_marena = true;
+
+   /* create two entity bags */
+   create_entity_bag(&this->named_items, &this->arena);
+   have_named_items = true;
+   create_entity_bag(&this->attr_values, &this->arena);
+   have_attr_values = true;
+
+   /* create alias manager */
+   create_hash_table(&this->aliases, 25, HASHTABLE_DUPLICATE_KEYS, &this->arena);
+   have_aliases = true;
+
+   /* create query rewrite table */
+   poliqarp_create_query_rewrite_table(&this->query_rewrite_table);
+   have_query_rewrite_table = true;
+
+   /* reset counters */
+   this->num_attr = 0;
+   this->num_pos = 0;
+
+   /* create default entities */
+   new_entity(&this->named_items, "entity-pos", &tag_pos, NULL);
+   new_entity(&this->named_items, "entity-orth", &tag_orth, NULL);
+   new_entity(&this->named_items, "entity-base", &tag_base, NULL);
+   new_entity(&this->named_items, "entity-tag", &tag_tag, NULL);
+   new_entity(&this->named_items, "space", &tag_space, NULL);
+   new_entity(&this->named_items, "type", &tag_type, NULL);
+
+   static const struct poliqarp_config_section sections[] = {
+      { "STARTUP", SECTION_STARTUP }, /* obsolete */
+      { "ALIASES", SECTION_ALIASES },
+      { "ATTR", SECTION_ATTR },
+      { "POS", SECTION_POS },
+      { "NAMED-ENTITY", SECTION_NAMED_ENTITY },
+      { "QUERY-REWRITE-RULES", SECTION_QUERY_REWRITE_RULES },
+      { NULL, 0 }
+   };
+
+   /* parse the config file */
+   char *conf_path = string_aformat("%s.cfg", base_name);
+   if (conf_path == NULL) {
+      poliqarp_error_from_system(error, _("Unable to read corpus configuration file"));
+      goto error;
+   }
+   rc = poliqarp_parse_config_file(this, conf_path,
+      sections, poliqarp_backend_config_handler, error);
+   /* the path is no longer needed whether or not parsing succeeded */
+   free(conf_path);
+   if (rc != 0)
+      goto error;
+
+   /* read CDF; a missing CDF file (ENOENT) is not treated as an error */
+   rc = poliqarp_cdf_read(base_name, &this->cdf);
+   if (rc != 0 && errno != ENOENT) {
+      poliqarp_error_from_system(error, _("Unable to read CDF file"));
+      goto error;
+   }
+
+   /* validate corpus format */
+   if (this->cdf.version != POLIQARP_ABI_VERSION) {
+      poliqarp_error_message_set(error,
+         _("Version %d of binary format is not supported"), this->cdf.version);
+      goto error;
+   }
+   else if (this->cdf.endianness != ENDIAN_LE) {
+      poliqarp_error_message_set(error,
+         _("Big-endian binary format is not supported"));
+      goto error;
+   }
+   return 0;
+error:
+   if (have_named_items)
+      destroy_entity_bag(&this->named_items);
+   if (have_attr_values)
+      destroy_entity_bag(&this->attr_values);
+   if (have_aliases)
+      destroy_hash_table(&this->aliases, free);
+   if (have_query_rewrite_table)
+      poliqarp_destroy_query_rewrite_table(&this->query_rewrite_table);
+   if (have_marena)
+      marena_destroy(&this->arena);
+   return -1;
+}
+
+/* Releases everything poliqarp_backend_config_open() created. */
+void poliqarp_backend_config_close(struct poliqarp_backend_config *this)
+{
+   /* get rid of everything */
+   destroy_entity_bag(&this->named_items);
+   destroy_entity_bag(&this->attr_values);
+   destroy_hash_table(&this->aliases, free);
+   poliqarp_destroy_query_rewrite_table(&this->query_rewrite_table);
+
+   /* free everything we have, quick and painless */
+   marena_destroy(&this->arena);
+}
+
+/* tag parsers */
+
+/**
+
* Parse the tag on the fly, the tag is destroyed.
+ *
+ * The tag has the form "pos:value:value...".  The first field must name a
+ * part of speech; every following field must name an attribute value,
+ * which is stored in result->attr_value[] at the index of the attribute
+ * it belongs to.  Slots of attributes not mentioned in the tag are NULL.
+ *
+ * Returns 0 on success and -1 on failure.  errno is set to EINVAL here
+ * when the tag is empty or a field names an entity of the wrong kind; for
+ * an unknown name the lookup function's errno is left in place.
+ *
+ * NOTE: the tag is split with strtok(), which keeps hidden static state,
+ * so this function must not run concurrently with other strtok() users.
+ */
+int poliqarp_backend_config_parse(const struct poliqarp_backend_config *this,
+   struct poliqarp_parsed_tag *result, char *tag)
+{
+   const struct entity *entity;
+   char *item;
+   size_t i;
+
+   /* reset attributes */
+   for (i = 0; i < this->num_attr; ++i)
+      result->attr_value[i] = NULL;
+
+   /* parse part of speech */
+   item = strtok(tag, ":");
+   if (item == NULL) {
+      /* empty tag (or separators only): there is nothing to look up */
+      errno = EINVAL;
+      return -1;
+   }
+   entity = lookup_const_entity(&this->named_items, item);
+   if (entity == NULL)
+      return -1;
+   if (entity->tag == NULL ||
+      *(enum poliqarp_entity_type *)entity->tag != POLIQARP_ENTITY_POS)
+   {
+      errno = EINVAL;
+      return -1;
+   }
+   result->pos = entity->data;
+
+   /* parse attributes */
+   while ((item = strtok(NULL, ":"))) {
+      entity = lookup_const_entity(&this->attr_values, item);
+      if (entity == NULL)
+         return -1;
+      if (entity->tag == NULL ||
+         *(enum poliqarp_entity_type *)entity->tag != POLIQARP_ENTITY_ATTR_VALUE)
+      {
+         errno = EINVAL;
+         return -1;
+      }
+      result->attr_value[((struct poliqarp_attr_value *) entity->data)->attr->id] =
+         entity->data;
+   }
+   return 0;
+}
+
+/**
+ * Non-destructive version of poliqarp_backend_config_parse().
*/
+int poliqarp_backend_config_parse_copy(const struct poliqarp_backend_config *this,
+   struct poliqarp_parsed_tag *result, const char *tag)
+{
+   /* parse a private copy so that the caller's string stays intact */
+   char *copy = strdup(tag);
+   if (copy == NULL)
+      return -1;
+   int rc = poliqarp_backend_config_parse(this, result, copy);
+   free(copy);
+   return rc;
+}
+
+/*
+ * Stores a heap copy of `value` under `name` in the alias table.
+ * Returns the result of hash_table_set(), or -1 when the copy cannot be
+ * allocated (errno is then the one set by strdup()).
+ *
+ * NOTE(review): if hash_table_set() itself fails the copy is not freed,
+ * because the table's ownership rules on failure are not visible here.
+ */
+static int poliqarp_backend_config_set_alias(
+   struct poliqarp_backend_config *this, const char *name, const char *value)
+{
+   char *copy = strdup(value);
+   if (copy == NULL)
+      return -1;
+   return hash_table_set(&this->aliases, name, copy);
+}
+
+/*
+ * Handles one line of the corpus configuration file.
+ *
+ * `extra` is the struct poliqarp_backend_config being filled, `section`
+ * is one of enum poliqarp_section and `text` is the line itself.  The
+ * line is first split into a name/value pair (and, for most sections, the
+ * value into arguments); the second switch then applies it.
+ *
+ * Returns 0 on success and -1 on failure with `error` filled in.
+ */
+static int poliqarp_backend_config_handler(void *extra, int section,
+   char *text, struct poliqarp_error *error)
+{
+   struct poliqarp_backend_config *this = extra;
+   /* Both flags start out false: `variable` and `args` may only be
+    * released once the corresponding parse call has succeeded. */
+   bool have_args = false, have_variable = false;
+   struct poliqarp_variable variable;
+   struct text_args args;
+
+   int rc;
+   bool is_optional = false;
+   size_t i;
+
+   struct entity *entity;
+   struct entity *result;
+   struct poliqarp_part_of_speech *pos;
+   struct poliqarp_attr *attr;
+
+   args_init(&args);
+
+   switch (section) {
+      case SECTION_ATTR:
+      case SECTION_POS:
+      case SECTION_NAMED_ENTITY:
+      case SECTION_QUERY_REWRITE_RULES:
+         rc = poliqarp_parse_variable(&variable, text);
+         if (rc != 0) {
+            if (errno == EINVAL) {
+               poliqarp_error_message_set(error, _("a line is not a key=value pair"));
+               goto error;
+            } else
+               goto system_error;
+         }
+         have_variable = true;
+         rc = args_parse(&args, variable.value);
+         if (rc != 0)
+            goto system_error;
+         have_args = true;
+         break;
+      case SECTION_STARTUP: /* support for old syntax: /alias foo = bar baz */
+         rc = poliqarp_parse_variable(&variable, text);
+         if (rc != 0)
+            goto system_error;
+         have_variable = true;
+         if (strncmp(variable.name, "/alias ", 7) == 0) {
+            /* old syntax separates alternatives with spaces */
+            char *tmp = variable.value;
+            for (; *tmp; tmp++)
+               if (*tmp == ' ')
+                  *tmp = '|';
+            rc = poliqarp_backend_config_set_alias(this, variable.name + 7,
+               variable.value);
+            if (rc != 0)
+               goto system_error;
+         }
+         poliqarp_free_variable(&variable);
+         have_variable = false;
+         break;
+      case SECTION_ALIASES:
+         rc = poliqarp_parse_variable(&variable, text);
+         if (rc != 0)
+            goto system_error;
+         have_variable = true;
+         rc = poliqarp_backend_config_set_alias(this, variable.name,
+            variable.value);
+         if (rc != 0)
+            goto system_error;
+         poliqarp_free_variable(&variable);
+         have_variable = false;
+         break;
+      default:
+         break;
+   }
+   switch (section) {
+      case SECTION_POS:
+         /* create new part of speech */
+         pos = poliqarp_new_part_of_speech(this, variable.name);
+         if (pos == NULL) {
+            if (errno == EEXIST) {
+               poliqarp_error_message_set(error,
+                  _("unable to redefine part-of-speech '%s'"), variable.name);
+               goto error;
+            }
+            else
+               goto system_error;
+         }
+         for (i = 0; i < args.num_items; ++i) {
+            char *item = args.item[i].value;
+            /* detect optional markers */
+            if (strcmp(item, "[") == 0) {
+               is_optional = true;
+               continue;
+            }
+            /* lookup name */
+            entity = lookup_entity(&this->named_items, item);
+            if (entity == NULL) {
+               if (errno == ENOENT) {
+                  poliqarp_error_message_set(error, _("entity '%s' is undefined"), item);
+                  goto error;
+               } else
+                  goto system_error;
+            }
+            if (entity->tag == NULL ||
+               *(enum poliqarp_entity_type *)entity->tag != POLIQARP_ENTITY_ATTR)
+            {
+               poliqarp_error_message_set(error, _("entity '%s' is not an attribute"), item);
+               goto error;
+            }
+            attr = entity->data;
+            /* create new attr instance */
+            if (poliqarp_new_attr_instance(this, pos, attr, is_optional) == NULL)
+               goto system_error;
+            if (is_optional) {
+               /* skip the token that follows an optional attribute
+                * (presumably the closing bracket) */
+               ++i;
+               is_optional = false;
+            }
+         }
+         break;
+
+      case SECTION_ATTR:
+         /* create new attribute */
+         attr = poliqarp_new_attr(this, variable.name);
+         if (attr == NULL) {
+            if (errno == EEXIST) {
+               poliqarp_error_message_set(error, _("unable to redefine attribute '%s'"),
+                  variable.name);
+               goto error;
+            }
+            else
+               goto system_error;
+         }
+
+         /* add values */
+         for (i = 0; i < args.num_items; ++i)
+            if (poliqarp_new_attr_value(this, attr, args.item[i].value) == NULL) {
+               if (errno == EEXIST) {
+                  poliqarp_error_message_set(error, _("duplicate attribute value '%s'"), args.item[i].value);
+                  goto error;
+               }
+               else
+                  goto system_error;
+            }
+         break;
+      case SECTION_NAMED_ENTITY:
+         entity = lookup_entity(&this->named_items, variable.name);
+
+         if (entity == NULL) {
+            if (errno == ENOENT) {
+               poliqarp_error_message_set(error, _("entity '%s' is undefined"), variable.name);
+               goto error;
+            }
+            else
+               goto system_error;
+         }
+         /* make aliases */
+         for (i = 0; i < args.num_items; ++i) {
+            result = new_entity_alias(&this->named_items, entity,
+               args.item[i].value);
+            if (result != NULL)
+               continue;
+            if (errno == EEXIST) {
+               poliqarp_error_message_set(error,
+                  _("unable to redefine '%s' as '%s'"), entity->name,
+                  args.item[i].value);
+               goto error;
+            }
+            else
+               goto system_error;
+         }
+         break;
+      case SECTION_QUERY_REWRITE_RULES:
+         if (args.num_items != 2) {
+            poliqarp_error_message_set(error, _("unable to create rule set %s; "
+               "each rewrite rule should consist of exactly 2 strings"), variable.name);
+            goto error;
+         }
+         {
+            struct poliqarp_query_rewrite *rewrite =
+               poliqarp_get_query_rewrite(&this->query_rewrite_table, variable.name, true);
+            if (rewrite == NULL)
+               goto system_error;
+            rc = poliqarp_add_query_rewrite_rule(rewrite, args.item[0].value, args.item[1].value);
+            if (rc != 0)
+               goto system_error;
+         }
+         break;
+      default:
+         break;
+   } /* switch (section) */
+   rc = 0;
+   goto done;
+system_error:
+   poliqarp_error_from_system(error, NULL);
+error:
+   rc = -1;
+done:
+   if (have_variable)
+      poliqarp_free_variable(&variable);
+   if (have_args)
+      args_free(&args);
+   return rc;
+}
diff --git a/poliqarp-library/sakura/backend-config.h b/poliqarp-library/sakura/backend-config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2caa9c7429432e7e0c8aa757749509c38b0017da
--- /dev/null
+++ b/poliqarp-library/sakura/backend-config.h
@@ -0,0 +1,165 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
#ifndef POLIQARP_BACKEND_CONFIG_H
#define POLIQARP_BACKEND_CONFIG_H

#include <poliqarp-config.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <sakura/cdf.h>
#include <sakura/common/hash-table.h>
#include <sakura/common/entity.h>
#include <sakura/common/args.h>
#include <sakura/exception.h>
#include <sakura/query-rewrite.h>

/** @defgroup poliqarp_backend_config Configuration Backend */
/** @{ */
/** @file backend-config.h Configuration Backend */

/* -------------------------------------------------------------- */

struct poliqarp_attr;

/** Single attribute value. */
struct poliqarp_attr_value {
   struct poliqarp_attr_value *next_value; /**< Next value in the list that
                                                starts at
                                                poliqarp_attr::first_value
                                                (presumably; confirm in
                                                backend-config.c). */
   struct poliqarp_attr *attr;             /**< Attribute pointer. */
   size_t id;                              /**< Identifier. */
   const struct entity *self;              /**< Represented entity. */
};

/** Single attribute name. */
struct poliqarp_attr {
   size_t num_values;                       /**< Number of values associated with
                                                 attribute. */
   struct poliqarp_attr_value *first_value; /**< Pointer to first value. */
   size_t id;                               /**< Identifier. */
   const struct entity *self;               /**< Represented entity. */
};

/** Instance of an attribute (for assembling parts of speech). */
struct poliqarp_attr_instance {
   struct poliqarp_attr_instance *next_instance; /**< Next instance; presumably
                                                      chained from
                                                      poliqarp_part_of_speech::
                                                      first_instance. */
   const struct poliqarp_attr *attr;             /**< Attribute being
                                                      instantiated. */
   struct poliqarp_part_of_speech *pos;          /**< Part of speech the
                                                      instance was created
                                                      for. */
   bool is_optional;                             /**< Value passed to
                                                      poliqarp_new_attr_instance();
                                                      the configuration parser
                                                      sets it for attributes
                                                      introduced by a "["
                                                      marker. */
};

/** Part of speech. */
struct poliqarp_part_of_speech {
   size_t num_instances;                          /**< Number of attribute
                                                       instances (also the size
                                                       of poliqarp_parsed_tag::
                                                       attr_value). */
   struct poliqarp_attr_instance *first_instance; /**< Pointer to first
                                                       instance. */
   size_t id;                                     /**< Identifier. */
   const struct entity *self;                     /**< Represented entity. */
};

/** Entity type.
 *
 * Entities are used to get a name-to-object association.
 * The type is used to describe the content of the object; the configuration
 * parser reads it through the entity's `tag` pointer.
 */
enum poliqarp_entity_type {
   POLIQARP_ENTITY_ATTR,       /**< Attribute name. */
   POLIQARP_ENTITY_ATTR_VALUE, /**< Attribute value name. */
   POLIQARP_ENTITY_POS,        /**< Part of speech name. */
   POLIQARP_ENTITY_ITEM_POS,   /**< This is the representation of 'pos'
                                    in expressions. */
   POLIQARP_ENTITY_ITEM_ORTH,  /**< Likewise 'orth'. */
   POLIQARP_ENTITY_ITEM_SPACE, /**< Likewise 'space'. */
   POLIQARP_ENTITY_ITEM_BASE,  /**< Likewise 'base name'. */
   POLIQARP_ENTITY_ITEM_TAG,   /**< Likewise 'tag'. */
   POLIQARP_ENTITY_ITEM_TYPE   /**< Likewise 'type'. */
};

/** Configuration backend structure. */
struct poliqarp_backend_config {
   struct marena arena;           /**< Memory arena used by entities. */
   struct entity_bag named_items; /**< Entity list. */
   struct entity_bag attr_values; /**< Attribute value list. */
   size_t num_attr;               /**< Number of all attributes. */
   size_t num_pos;                /**< Number of all parts of speech. */
   struct hash_table aliases;     /**< Alias manager. */
   struct poliqarp_query_rewrite_table query_rewrite_table;
                                  /**< Rules of query rewriting. */
   struct poliqarp_cdf_info cdf;  /**< Machine-specific configuration. */
};

/* -------------------------------------------------------------- */

/** Attribute constructor.
 *
 * @return the new attribute, or NULL on failure; the configuration parser
 * treats errno == EEXIST as "attribute already defined".
 */
struct poliqarp_attr *poliqarp_new_attr(struct poliqarp_backend_config *this,
   const char *name);

/** Attribute value constructor.
 *
 * @return the new value, or NULL on failure; the configuration parser
 * treats errno == EEXIST as "duplicate attribute value".
 */
struct poliqarp_attr_value *poliqarp_new_attr_value(
   struct poliqarp_backend_config *this, struct poliqarp_attr *attr, const char *name);

/** Part of speech constructor.
 *
 * @return the new part of speech, or NULL on failure; the configuration
 * parser treats errno == EEXIST as "part of speech already defined".
 */
struct poliqarp_part_of_speech *poliqarp_new_part_of_speech(
   struct poliqarp_backend_config *this, const char *name);

/** Attribute instance constructor.
 *
 * @return the new instance, or NULL on failure (reported by the
 * configuration parser as a system error).
 */
struct poliqarp_attr_instance *poliqarp_new_attr_instance(
   struct poliqarp_backend_config *this, struct poliqarp_part_of_speech *pos,
   struct poliqarp_attr *attr, bool is_optional);

/** Open the attribute backend by the given base name. */
int poliqarp_backend_config_open(struct poliqarp_backend_config *this,
   const char *base_name, struct poliqarp_error *error);

/** Close the attribute backend. */
void poliqarp_backend_config_close(struct poliqarp_backend_config *this);

/** Parsed tag representation. */
struct poliqarp_parsed_tag {
   struct poliqarp_part_of_speech *pos;     /**< Part of speech. */
   struct poliqarp_attr_value **attr_value; /**< Array of attributes.
                                                 The size of the array can be
                                                 retrieved from the part of speech
                                                 object. */
};

/**
 * Parse the tag on the fly.
 * @note the tag is destroyed
 * @return 0 on success, -1 otherwise.
 */
int poliqarp_backend_config_parse(const struct poliqarp_backend_config *this,
   struct poliqarp_parsed_tag *result, char *tag);

/**
 * Parse the tag on the fly.
 * Takes the tag as `const char *`, so unlike
 * poliqarp_backend_config_parse() it cannot modify the caller's string.
 * @return 0 on success, -1 otherwise.
 */
int poliqarp_backend_config_parse_copy(const struct poliqarp_backend_config *this,
   struct poliqarp_parsed_tag *result, const char *tag);

/** @} */

#endif /* POLIQARP_BACKEND_CONFIG_H */
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <foostring/foostring.h> + +#include <sakura/abi.h> +#include <sakura/backend-corpus.h> +#include <sakura/exception.h> +#include <sakura/common/file-reader.h> + +int poliqarp_backend_corpus_open(struct poliqarp_backend_corpus *this, + const char *base_name, struct poliqarp_error *error) +{ + int rc; + char *path = string_aformat(POLIQARP_CORPUS_IMAGE_FORMAT, base_name); + if (path == NULL) { + poliqarp_error_from_system(error, _("Unable to open corpus image")); + return -1; + } + rc = file_reader_create(&this->corpus, path, + sizeof(struct poliqarp_binary_segment)); + free(path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open corpus image (%s)"), + path); + return -1; + } + return 0; +} + +void poliqarp_backend_corpus_close(struct poliqarp_backend_corpus *this) +{ + file_reader_destroy(&this->corpus); +} diff --git a/poliqarp-library/sakura/backend-corpus.h b/poliqarp-library/sakura/backend-corpus.h new file mode 100644 index 
0000000000000000000000000000000000000000..179410be8bcdc953a00c7e7a1e8f314c2aec68ea --- /dev/null +++ b/poliqarp-library/sakura/backend-corpus.h @@ -0,0 +1,88 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file backend-corpus.h + * @brief The corpus backend -- loading and manipulations. + * + * This backend handles the corpus proper (*.poliqarp.corpus.image). + * The backend is currently implemented as a file-reader of fixed-size + * structures of type struct binary_corpus and provides a random-access + * API. + * + * TODO: The format will change in the near future to support Huffman-encoded + * corpus images; the random-access API will be retained, but a notion of + * an `access context' will be introduced, one context for searching thread + * and one for result retrieval (so as not to mix cache buffers). 
+ */ + +#ifndef POLIQARP_BACKEND_CORPUS_H +#define POLIQARP_BACKEND_CORPUS_H + +#include <sakura/common/file-reader.h> +#include <sakura/exception.h> +#include <sakura/abi.h> + +/* This is here because indexer needs it. */ +#define POLIQARP_CORPUS_IMAGE_FORMAT "%s.poliqarp.corpus.image" + +struct poliqarp_backend_corpus { + struct file_reader corpus; +}; + +int poliqarp_backend_corpus_open(struct poliqarp_backend_corpus *this, + const char *base_name, struct poliqarp_error *error); + +void poliqarp_backend_corpus_close(struct poliqarp_backend_corpus *this); + +static inline struct poliqarp_binary_segment poliqarp_backend_corpus_get( + struct poliqarp_backend_corpus *this, size_t index) +{ + struct poliqarp_binary_segment res = *( + (struct poliqarp_binary_segment *)file_reader_get(&this->corpus, index)); + POLIQARP_SEGMENT_LE_TO_HE(res); + return res; +} + +static inline uint32_t poliqarp_backend_corpus_size( + const struct poliqarp_backend_corpus *this) +{ + return this->corpus.num_items; +} + +static inline void poliqarp_backend_corpus_seek(struct poliqarp_backend_corpus *this, + size_t index) +{ + file_reader_seek(&this->corpus, index); +} + +static inline struct poliqarp_binary_segment poliqarp_backend_corpus_next( + struct poliqarp_backend_corpus *this) +{ + struct poliqarp_binary_segment res = *( + (struct poliqarp_binary_segment *)file_reader_next(&this->corpus)); + POLIQARP_SEGMENT_LE_TO_HE(res); + return res; +} + +#endif /* POLIQARP_BACKEND_CORPUS_H */ diff --git a/poliqarp-library/sakura/backend-document.c b/poliqarp-library/sakura/backend-document.c new file mode 100644 index 0000000000000000000000000000000000000000..4415f1107a8848c574ea1ee7546fca29775b44b6 --- /dev/null +++ b/poliqarp-library/sakura/backend-document.c @@ -0,0 +1,102 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. 
www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <stdlib.h> + +#include <foostring/foostring.h> +#include <sakura/backend-document.h> +#include <sakura/exception.h> + + +int poliqarp_backend_document_open(struct poliqarp_backend_document *this, + const char *basename, struct poliqarp_error *error) +{ + int rc; + char *path = string_aformat(POLIQARP_CORPUS_DOCUMENT_FORMAT, basename); + if (path == NULL) { + poliqarp_error_from_system(error, _("Unable to open corpus document")); + return -1; + } + rc = tinydb_open(&this->document, path, sizeof(struct poliqarp_document)); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open corpus document (%s)"), path); + } + free(path); + if (rc != 0) + return rc; + this->current = 0; + return 0; +} + +void poliqarp_backend_document_close(struct poliqarp_backend_document *this) +{ + tinydb_close(&this->document); +} + +int poliqarp_backend_document_next(struct poliqarp_backend_document *this, + struct poliqarp_document *document) +{ + int res = poliqarp_backend_document_fetch(this, this->current, document); + this->current++; + return res; +} + +int poliqarp_backend_document_fetch(const struct poliqarp_backend_document *this, + size_t n, struct poliqarp_document 
*document) +{ + if (n >= this->document.num_items) + return -1; + + *document = *(struct poliqarp_document *)tinydb_fetch_item(&this->document, n); + /* endian-neutralize */ + document->corpus_low = en4(document->corpus_low); + document->corpus_high = en4(document->corpus_high); + document->meta_low = en4(document->meta_low); + document->meta_high = en4(document->meta_high); + return 0; +} + +/* FIXME: This is code duplication (see poliqarp_subdocument_search), to be fixed + * someday... */ +void poliqarp_backend_document_search(struct poliqarp_backend_document *this, + uint32_t pos) +{ + struct poliqarp_document *arr = (struct poliqarp_document *)this->document.image; + size_t l = this->current, r = this->document.num_items - 1, m; + while (l < r) { + m = (l + r) / 2; + if (pos >= en4(arr[m].corpus_low) && pos < en4(arr[m].corpus_high)) { + this->current = m; + return; + } else if (pos < en4(arr[m].corpus_low)) { + r = m - 1; + } else { + l = m + 1; + } + } + if (pos >= en4(arr[r].corpus_high)) + this->current = r + 1; + else + this->current = r; +} + diff --git a/poliqarp-library/sakura/backend-document.h b/poliqarp-library/sakura/backend-document.h new file mode 100644 index 0000000000000000000000000000000000000000..11c5f647910dc8538a1b3203a92fc04b56e056e3 --- /dev/null +++ b/poliqarp-library/sakura/backend-document.h @@ -0,0 +1,48 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
#ifndef POLIQARP_BACKEND_DOCUMENT_H
#define POLIQARP_BACKEND_DOCUMENT_H

#include <sakura/common/tinydb.h>
#include <sakura/exception.h>
#include <sakura/abi.h>

/** printf-style pattern of the document table file name (argument: corpus
 *  base name). */
#define POLIQARP_CORPUS_DOCUMENT_FORMAT "%s.poliqarp.chunk.image"

/** Document backend: a table of struct poliqarp_document records plus a
 *  cursor. */
struct poliqarp_backend_document {
   struct tinydb document; /* table of fixed-size document records */
   uint32_t current;       /* cursor: index used by ..._next(), moved by
                              ..._search() */
};

/** Open the document table for `basename` and reset the cursor to 0.
 *  Returns 0 on success, non-zero otherwise (with `error` filled). */
int poliqarp_backend_document_open(struct poliqarp_backend_document *this,
   const char *basename, struct poliqarp_error *error);
/** Close the document table. */
void poliqarp_backend_document_close(struct poliqarp_backend_document *this);
/** Copy the n-th document (range fields converted with en4()) into
 *  `document`. Returns 0 on success, -1 if `n` is out of range. */
int poliqarp_backend_document_fetch(const struct poliqarp_backend_document *this,
   size_t n, struct poliqarp_document *document);
/** Fetch the document under the cursor, then advance the cursor.
 *  Returns the result of poliqarp_backend_document_fetch(). */
int poliqarp_backend_document_next(struct poliqarp_backend_document *this,
   struct poliqarp_document *document);
/** Binary-search, starting at the cursor, for the document whose
 *  [corpus_low, corpus_high) range contains `pos`, and move the cursor
 *  there. */
void poliqarp_backend_document_search(struct poliqarp_backend_document *this,
   uint32_t pos);

#endif /* POLIQARP_BACKEND_DOCUMENT_H */
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/backend-index.h> +#include <sakura/exception.h> +#include <foostring/foostring.h> + +int poliqarp_index_calculate(const char *desc) +{ + int res = 0; + while (*desc) { + switch (*desc) { + case 'o': + res |= POLIQARP_INDEX_ORTH; + break; + case 'd': + res |= POLIQARP_INDEX_DISAMB; + break; + case 'a': + res |= POLIQARP_INDEX_AMB; + break; + default: + return -1; + } + desc++; + } + return res; +} + +char *poliqarp_index_stringify(int indices) +{ + static char res[4]; + char *tmp = res; + + if (indices & POLIQARP_INDEX_ORTH) + *tmp++ = 'o'; + if (indices & POLIQARP_INDEX_DISAMB) + *tmp++ = 'd'; + if (indices & POLIQARP_INDEX_AMB) + *tmp++ = 'a'; + *tmp++ = 0; + return res; +} + +static int ibs_mem_backend_next_bit(void *bs) +{ + struct poliqarp_ibs_mem_backend *ibs = + (struct poliqarp_ibs_mem_backend *)bs; + int x = ibs->data[ibs->num_bit / 8] & (1 << (ibs->num_bit % 8)); + ibs->num_bit++; + return x; +} + +static struct poliqarp_rindex *open_rindex(const char *base_name, + size_t granularity, const char *format, struct poliqarp_error *error) +{ + char *path, *format2; + struct poliqarp_rindex *index; + bool have_image_map = false, have_offset_map = false; + int rc; + + index = malloc(sizeof *index); + 
if (index == NULL) + goto error_message; + + index->granularity = granularity; + + init_file_map(&index->image); + init_file_map(&index->offset); + + path = string_aformat(format, base_name); + if (path == NULL) + goto error_message; + + rc = create_file_map(&index->image, path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open rindex image (%s)"), + path); + } + free(path); + if (rc != 0) + goto error; + have_image_map = true; + + format2 = string_aformat("%s.offset", format); + if (format2 == NULL) + goto error_message; + path = string_aformat(format2, base_name); + free(format2); + if (path == NULL) + goto error_message; + + rc = create_file_map(&index->offset, path); + if (rc != 0) + poliqarp_error_from_system(error, _("Unable to open rindex offset (%s)"), + path); + free(path); + if (rc != 0) + goto error; + have_offset_map = true; + + index->backend.data = (uint8_t *)get_file_image(&index->image); + index->ibs.data = &index->backend; + index->ibs.next_bit = ibs_mem_backend_next_bit; + index->num_lists = get_file_size(&index->offset) / sizeof(uint32_t); + return index; +error_message: + poliqarp_error_from_system(error, _("Unable to initialize rindex backend")); +error: + if (have_image_map) + destroy_file_map(&index->image); + if (have_offset_map) + destroy_file_map(&index->offset); + free(index); + return NULL; +} + +void poliqarp_rindex_set(struct poliqarp_rindex *this, size_t which) +{ + uint32_t *offsets = (uint32_t *)get_file_image(&this->offset); + this->backend.num_bit = en4(offsets[which]); +} + +static void close_rindex(struct poliqarp_rindex *index) +{ + destroy_file_map(&index->offset); + destroy_file_map(&index->image); + free(index); +} + +int poliqarp_backend_index_open(struct poliqarp_backend_index *this, + const struct poliqarp_backend_config *config, const char *base_name, + struct poliqarp_error *error) +{ + this->orth_index = this->disamb_index = this->amb_index = NULL; + if (config->cdf.indices & POLIQARP_INDEX_ORTH) 
{ + this->orth_index = open_rindex(base_name, config->cdf.granularity, + POLIQARP_CORPUS_ORTH_INDEX_FORMAT, error); + if (this->orth_index == NULL) + goto error; + } + if (config->cdf.indices & POLIQARP_INDEX_DISAMB) { + this->disamb_index = open_rindex(base_name, config->cdf.granularity, + POLIQARP_CORPUS_DISAMB_INDEX_FORMAT, error); + if (this->disamb_index == NULL) + goto error; + } + if (config->cdf.indices & POLIQARP_INDEX_AMB) { + this->amb_index = open_rindex(base_name, config->cdf.granularity, + POLIQARP_CORPUS_AMB_INDEX_FORMAT, error); + if (this->amb_index == NULL) + goto error; + } + return 0; +error: + poliqarp_backend_index_close(this); + return -1; +} + +void poliqarp_backend_index_close(struct poliqarp_backend_index *this) +{ + if (this->orth_index) { + close_rindex(this->orth_index); + this->orth_index = NULL; + } + if (this->disamb_index) { + close_rindex(this->disamb_index); + this->disamb_index = NULL; + } + if (this->amb_index) { + close_rindex(this->amb_index); + this->amb_index = NULL; + } +} diff --git a/poliqarp-library/sakura/backend-index.h b/poliqarp-library/sakura/backend-index.h new file mode 100644 index 0000000000000000000000000000000000000000..cfca0bf966943373ba90157d78a46b535ebba857 --- /dev/null +++ b/poliqarp-library/sakura/backend-index.h @@ -0,0 +1,76 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
#ifndef POLIQARP_BACKEND_INDEX_H
#define POLIQARP_BACKEND_INDEX_H

#include <sakura/backend-config.h>

#include <sakura/common/bitstream.h>
#include <sakura/common/file-map.h>

/* printf-style patterns of the reverse-index image file names (argument:
 * corpus base name). The matching offset file has ".offset" appended. */
#define POLIQARP_CORPUS_ORTH_INDEX_FORMAT "%s.poliqarp.rindex.orth"
#define POLIQARP_CORPUS_DISAMB_INDEX_FORMAT "%s.poliqarp.rindex.disamb"
#define POLIQARP_CORPUS_AMB_INDEX_FORMAT "%s.poliqarp.rindex.amb"

#define POLIQARP_INDEX_ORTH   1 /**< Index of orths. */
#define POLIQARP_INDEX_DISAMB 2 /**< Index of disambiguated interpretations. */
#define POLIQARP_INDEX_AMB    4 /**< Index of ambiguous interpretations. */

/* Granularity limits; not referenced by the index backend itself --
 * presumably consumed by the indexer / configuration reader (confirm). */
#define POLIQARP_INDEX_DEFAULT_GRANULARITY 1024
#define POLIQARP_INDEX_MIN_GRANULARITY 100
#define POLIQARP_INDEX_MAX_GRANULARITY 1000000

/** In-memory bit source for the index bit stream. */
struct poliqarp_ibs_mem_backend {
   uint8_t *data;  /* start of the mapped index image */
   size_t num_bit; /* position of the next bit to read, counted from `data` */
};

/** One reverse index: two mapped files plus a bit stream over the image. */
struct poliqarp_rindex {
   struct file_map image;                   /* index image file */
   struct file_map offset;                  /* table of uint32_t bit offsets,
                                               one per list */
   struct ibs ibs;                          /* bit stream reading from
                                               `backend` */
   struct poliqarp_ibs_mem_backend backend; /* bit stream state */
   size_t num_lists;                        /* offset file size /
                                               sizeof(uint32_t) */
   size_t granularity;                      /* copied from the corpus
                                               configuration */
};

/** The three optional reverse indexes; a pointer is NULL when the index is
 *  not enabled or not open. */
struct poliqarp_backend_index {
   struct poliqarp_rindex *orth_index;
   struct poliqarp_rindex *disamb_index;
   struct poliqarp_rindex *amb_index;
};

/** Convert a description made of the letters 'o', 'd', 'a' into a
 *  POLIQARP_INDEX_* mask; returns -1 on any other character. */
int poliqarp_index_calculate(const char *desc);

/** Convert a POLIQARP_INDEX_* mask into its letter form. The result lives
 *  in a static buffer that the next call overwrites. */
char *poliqarp_index_stringify(int indices);

/** Open the reverse indexes enabled in config->cdf.indices.
 *  Returns 0 on success, -1 otherwise (everything opened so far is
 *  closed). */
int poliqarp_backend_index_open(struct poliqarp_backend_index *this,
   const struct poliqarp_backend_config *config, const char *base_name,
   struct poliqarp_error *error);

/** Close all open reverse indexes and reset their pointers to NULL. */
void poliqarp_backend_index_close(struct poliqarp_backend_index *this);

/** Position the index's bit stream at the start of list number `which`. */
void poliqarp_rindex_set(struct poliqarp_rindex *this, size_t which);

#endif /* POLIQARP_BACKEND_INDEX_H */
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/backend-interp.h> +#include <sakura/dict.h> +#include <sakura/exception.h> + +#include <foostring/foostring.h> + +int poliqarp_backend_interp_open(struct poliqarp_backend_interp *this, + const char *base_name, struct poliqarp_error *error) +{ + int rc; + rc = poliqarp_newdict_open(&this->dict__disamb, base_name, false, + POLIQARP_INTERP1_IMAGE_FORMAT, POLIQARP_INTERP1_OFFSET_FORMAT, + "Unable to open interp-1 dictionary", error); + if (rc != 0) + return rc; + rc = poliqarp_newdict_open(&this->dict__amb, base_name, false, + POLIQARP_INTERP2_IMAGE_FORMAT, POLIQARP_INTERP2_OFFSET_FORMAT, + "Unable to open interp-2 dictionary", error); + if (rc != 0) { + newdict_close(&this->dict__disamb); + return rc; + } + return 0; +} + +void poliqarp_backend_interp_close(struct poliqarp_backend_interp *this) +{ + newdict_close(&this->dict__disamb); + newdict_close(&this->dict__amb); +} + diff --git a/poliqarp-library/sakura/backend-interp.h b/poliqarp-library/sakura/backend-interp.h new file mode 100644 index 0000000000000000000000000000000000000000..848edcfbab338b3c29489e0d6cc9b6c38665a972 --- /dev/null +++ b/poliqarp-library/sakura/backend-interp.h @@ -0,0 +1,127 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
#ifndef POLIQARP_BACKEND_INTERP_H
#define POLIQARP_BACKEND_INTERP_H

#include <sakura/backend-config.h>
#include <sakura/abi.h>

#include <sakura/common/newdict.h>

/** Image file of the disambiguated-interpretation dictionary
 *  (printf-style pattern; argument: corpus base name). */
#define POLIQARP_INTERP1_IMAGE_FORMAT  "%s.poliqarp.interp1.image"
/** Offset file of the disambiguated-interpretation dictionary. */
#define POLIQARP_INTERP1_OFFSET_FORMAT "%s.poliqarp.interp1.offset"
/** Image file of the ambiguous-interpretation dictionary. */
#define POLIQARP_INTERP2_IMAGE_FORMAT  "%s.poliqarp.interp2.image"
/** Offset file of the ambiguous-interpretation dictionary. */
#define POLIQARP_INTERP2_OFFSET_FORMAT "%s.poliqarp.interp2.offset"

/** "subpos" variant of POLIQARP_INTERP1_IMAGE_FORMAT; presumably a legacy
 *  file name -- not used by the interp backend itself (confirm). */
#define POLIQARP_OLD_INTERP1_IMAGE_FORMAT  "%s.poliqarp.subpos1.image"
/** "subpos" variant of POLIQARP_INTERP1_OFFSET_FORMAT (see above). */
#define POLIQARP_OLD_INTERP1_OFFSET_FORMAT "%s.poliqarp.subpos1.offset"
/** "subpos" variant of POLIQARP_INTERP2_IMAGE_FORMAT (see above). */
#define POLIQARP_OLD_INTERP2_IMAGE_FORMAT  "%s.poliqarp.subpos2.image"
/** "subpos" variant of POLIQARP_INTERP2_OFFSET_FORMAT (see above). */
#define POLIQARP_OLD_INTERP2_OFFSET_FORMAT "%s.poliqarp.subpos2.offset"

/** Interpretation backend: one dictionary of disambiguated interpretation
 *  sets and one of ambiguous interpretation sets. */
struct poliqarp_backend_interp {
   struct newdict dict__disamb;
   struct newdict dict__amb;
};

/** Open both interpretation dictionaries for `base_name`.
 *  Returns 0 on success, non-zero otherwise (with `error` filled). */
int poliqarp_backend_interp_open(struct poliqarp_backend_interp *this,
   const char *base_name, struct poliqarp_error *error);

/** Close both interpretation dictionaries. */
void poliqarp_backend_interp_close(struct poliqarp_backend_interp *this);

/**
 * Get the number of disambiguated interpretation sets.
 */
static inline size_t poliqarp_backend_interp_num_items__disamb(
   const struct poliqarp_backend_interp *this)
{
   return GET_NUM_ITEMS(&this->dict__disamb);
}

/**
 * Get given disambiguated interpretation set.
 * `key` must be smaller than the number of sets (checked with assert).
 */
static inline const struct poliqarp_binary_interp *
poliqarp_backend_interp_fetch__disamb(
   const struct poliqarp_backend_interp *this, size_t key)
{
   assert(key < this->dict__disamb.num_items);
   return GET_ITEM(&this->dict__disamb, key);
}

/**
 * Get the length of given disambiguated interpretation.
 * Length is equal to number of interpretations in given set.
 */
static inline size_t poliqarp_backend_interp_length__disamb(
   const struct poliqarp_backend_interp *this, size_t key)
{
   assert(key < this->dict__disamb.num_items);
   /* GET_LENGTH is divided by the record size to obtain a record count. */
   return GET_LENGTH(&this->dict__disamb, key) /
      sizeof(struct poliqarp_binary_interp);
}

/**
 * Get the number of ambiguous interpretation sets.
 */
static inline size_t poliqarp_backend_interp_num_items__amb(
   const struct poliqarp_backend_interp *this)
{
   return GET_NUM_ITEMS(&this->dict__amb);
}

/**
 * Get given ambiguous interpretation set.
 * `key` must be smaller than the number of sets (checked with assert).
 */
static inline const struct poliqarp_binary_interp *
poliqarp_backend_interp_fetch__amb(
   const struct poliqarp_backend_interp *this, size_t key)
{
   assert(key < this->dict__amb.num_items);
   return GET_ITEM(&this->dict__amb, key);
}

/**
 * Get the length of given ambiguous interpretation.
 * Length is equal to number of interpretations in given set.
 */
static inline size_t poliqarp_backend_interp_length__amb(
   const struct poliqarp_backend_interp *this, size_t key)
{
   assert(key < this->dict__amb.num_items);
   /* GET_LENGTH is divided by the record size to obtain a record count. */
   return GET_LENGTH(&this->dict__amb, key) /
      sizeof(struct poliqarp_binary_interp);
}

#endif /* POLIQARP_BACKEND_INTERP_H */
+ */ + +#include <errno.h> +#include <sys/types.h> + +#include <sakura/backend-meta.h> +#include <sakura/exception.h> +#include <sakura/common/getline.h> +#include <sakura/dict.h> +#include <foostring/foostring.h> + +static void free_type_list(struct poliqarp_meta_type_list *head) +{ + while (head) { + struct poliqarp_meta_type_list *elem = head; + head = head->next; + free(elem->key); + free(elem); + } +} + +static int read_type_list(struct poliqarp_meta_type_list **head, + const char *file_name) +{ + FILE *fp = fopen(file_name, "rt"); + char *buf = NULL; + size_t buflen = 0; + + *head = NULL; + if (fp == NULL) + return -1; + for (;;) { + struct poliqarp_meta_type_list *elem; + enum poliqarp_meta_type type = POLIQARP_META_TYPE_STRING; + bool have_type = false; + size_t len; + + if (getline(&buf, &buflen, fp) == -1) + break; + len = strlen(buf); + if (len < 3) + continue; + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + switch (buf[0]) { + case 'S': + type = POLIQARP_META_TYPE_STRING; + have_type = true; + break; + case 'D': + type = POLIQARP_META_TYPE_DATE; + have_type = true; + break; + } + if (!have_type) + continue; + elem = malloc(sizeof *elem); + if (elem == NULL) + break; + elem->next = *head; + elem->type = type; + elem->key = strdup(buf + 2); + if (elem->key == NULL) { + free(elem); + break; + } + *head = elem; + } + free(buf); + if (!feof(fp)) { + int my_errno = errno; + fclose(fp); /* It's safe to ignore errors. */ + *head = NULL; + errno = my_errno; + return -1; + } + else { + fclose(fp); /* It's safe to ignore errors. 
*/ + return 0; + } +} + +int poliqarp_backend_meta_open(struct poliqarp_backend_meta *this, + const char *base_name, struct poliqarp_error *error) +{ + char *path; + bool have_key, have_value, have_meta, have_types; + int rc; + + have_key = have_value = have_meta = have_types = false; + + rc = poliqarp_newdict_open(&this->key, base_name, false, + POLIQARP_META_KEY_IMAGE_FORMAT, POLIQARP_META_KEY_OFFSET_FORMAT, + _("Unable to open meta-key dictionary"), error); + if (rc != 0) + goto error; + have_key = true; + + rc = poliqarp_newdict_open(&this->value, base_name, false, + POLIQARP_META_VALUE_IMAGE_FORMAT, POLIQARP_META_VALUE_OFFSET_FORMAT, + _("Unable to open meta-value dictionary"), error); + if (rc != 0) + goto error; + have_value = true; + + path = string_aformat(POLIQARP_META_IMAGE_FORMAT, base_name); + if (path == NULL) + goto error_message; + + rc = tinydb_open(&this->meta, path, + sizeof(struct poliqarp_binary_metadata)); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open meta image (%s)"), + path); + } + free(path); + if (rc != 0) + goto error; + have_meta = true; + + path = string_aformat(POLIQARP_META_CONFIG_FORMAT, base_name); + if (path == NULL) + goto error_message; + + rc = read_type_list(&this->types, path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open meta configuration" + " (%s)"), path); + } + free(path); + if (rc != 0) + goto error; + have_types = true; + + return 0; + +error_message: + poliqarp_error_from_system(error, _("Unable to initialize meta backend")); +error: + if (have_key) + newdict_close(&this->key); + if (have_value) + newdict_close(&this->value); + if (have_meta) + tinydb_close(&this->meta); + if (have_types) + free_type_list(this->types); + return -1; +} + +void poliqarp_backend_meta_close(struct poliqarp_backend_meta *this) +{ + newdict_close(&this->key); + newdict_close(&this->value); + tinydb_close(&this->meta); + free_type_list(this->types); + this->types = NULL; +} + +enum 
poliqarp_meta_type poliqarp_get_metadata_type( + const struct poliqarp_backend_meta *this, const char *key) +{ + struct poliqarp_meta_type_list *l; + for (l = this->types; l; l = l->next) + if (strcmp(l->key, key) == 0) + return l->type; + return POLIQARP_META_TYPE_UNKNOWN; +} diff --git a/poliqarp-library/sakura/backend-meta.h b/poliqarp-library/sakura/backend-meta.h new file mode 100644 index 0000000000000000000000000000000000000000..bcfe1d78be129d8316a6b4c485fffe8419ad1b3b --- /dev/null +++ b/poliqarp-library/sakura/backend-meta.h @@ -0,0 +1,130 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARP_BACKEND_META_H +#define POLIQARP_BACKEND_META_H + +#include <sakura/common/newdict.h> +#include <sakura/common/tinydb.h> +#include <sakura/exception.h> +#include <sakura/abi.h> + +#define POLIQARP_META_KEY_IMAGE_FORMAT "%s.poliqarp.meta-key.image" +#define POLIQARP_META_KEY_OFFSET_FORMAT "%s.poliqarp.meta-key.offset" +#define POLIQARP_META_VALUE_IMAGE_FORMAT "%s.poliqarp.meta-value.image" +#define POLIQARP_META_VALUE_OFFSET_FORMAT "%s.poliqarp.meta-value.offset" +#define POLIQARP_META_IMAGE_FORMAT "%s.poliqarp.meta.image" +#define POLIQARP_META_CONFIG_FORMAT "%s.meta.cfg" + +enum poliqarp_meta_type { + POLIQARP_META_TYPE_STRING, + POLIQARP_META_TYPE_DATE, + POLIQARP_META_TYPE_UNKNOWN +}; + +struct poliqarp_meta_type_list { + char *key; + enum poliqarp_meta_type type; + struct poliqarp_meta_type_list *next; +}; + +/** @todo */ +struct poliqarp_backend_meta { + struct newdict key; + struct newdict value; + struct tinydb meta; + struct poliqarp_meta_type_list *types; +}; + +/** @todo */ +int poliqarp_backend_meta_open(struct poliqarp_backend_meta *this, + const char *base_name, struct poliqarp_error *error); + +/** @todo */ +void poliqarp_backend_meta_close(struct poliqarp_backend_meta *this); + +static inline size_t poliqarp_backend_meta_num_keys( + const struct poliqarp_backend_meta *this) +{ + return GET_NUM_ITEMS(&this->key); +} + +static inline size_t poliqarp_backend_meta_num_values( + const struct poliqarp_backend_meta *this) +{ + return GET_NUM_ITEMS(&this->value); +} + +static inline size_t poliqarp_backend_meta_num_items( + const struct poliqarp_backend_meta *this) +{ + return this->meta.num_items; +} + +static inline const char *poliqarp_backend_meta_key_fetch( + const struct poliqarp_backend_meta *this, size_t index) +{ + assert(index < this->key.num_items); + return GET_ITEM(&this->key, index); +} + +static inline const char *poliqarp_backend_meta_value_fetch( + const struct poliqarp_backend_meta *this, size_t index) +{ + 
assert(index < this->value.num_items); + return GET_ITEM(&this->value, index); +} + +static inline struct poliqarp_binary_metadata poliqarp_backend_meta_fetch( + const struct poliqarp_backend_meta *this, size_t index) +{ + struct poliqarp_binary_metadata res; + assert(index < this->meta.num_items); + res = + *(struct poliqarp_binary_metadata *) + (tinydb_fetch_item(&this->meta, index)); + res.type = en4(res.type); + res.key = en4(res.key); + switch (res.type) { + case POLIQARP_METADATA_SINGLE: + case POLIQARP_METADATA_MULTI: + res.value_as.text = en4(res.value_as.text); + break; + case POLIQARP_METADATA_DATE: + res.value_as.date.year = en2(res.value_as.date.year); + break; + case POLIQARP_METADATA_UNDEFINED: + break; + default: + abort(); /* Should not happen for a valid corpus. */ + } + return res; +} + +/** + * Returns the type of given key of metadata. + */ +enum poliqarp_meta_type poliqarp_get_metadata_type( + const struct poliqarp_backend_meta *this, const char *key); + +#endif diff --git a/poliqarp-library/sakura/backend-orth.c b/poliqarp-library/sakura/backend-orth.c new file mode 100644 index 0000000000000000000000000000000000000000..4e2ec84f7d56d5f41a3c040205d087a4a79e0173 --- /dev/null +++ b/poliqarp-library/sakura/backend-orth.c @@ -0,0 +1,100 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/backend-orth.h> +#include <sakura/dict.h> +#include <sakura/exception.h> +#include <foostring/foostring.h> + +/*==========[ ORTH BACKEND ]==============================================*/ + +int poliqarp_backend_orth_open(struct poliqarp_backend_orth *this, + const char *base_name, struct poliqarp_error *error) +{ + char *path; + bool have_dict, have_afronte_index, have_atergo_index; + int rc; + + have_dict = have_afronte_index = have_atergo_index = false; + + rc = poliqarp_newdict_open(&this->dict, base_name, true, + POLIQARP_ORTH_IMAGE_FORMAT, POLIQARP_ORTH_OFFSET_FORMAT, + _("Unable to open text dictionary"), error); + if (rc != 0) + goto error; + have_dict = true; + + path = string_aformat(POLIQARP_ORTH_INDEX_FORMAT, base_name, "afronte"); + if (path == NULL) + goto error_message; + rc = newindex_open(&this->afronte_index, &this->dict, path); + if (rc != 0) { + /* try deprecated name */ + free(path); + path = string_aformat(POLIQARP_ORTH_INDEX_FORMAT, base_name, "alpha"); + if (path == NULL) + goto error_message; + rc = newindex_open(&this->afronte_index, &this->dict, path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open afronte index" + "(%s)"), path); + } + free(path); + if (rc != 0) + goto error; + } + have_afronte_index = true; + + path = string_aformat(POLIQARP_ORTH_INDEX_FORMAT, base_name, "atergo"); + if (path == NULL) + goto error_message; + + rc = newindex_open(&this->atergo_index, &this->dict, path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open atergo index (%s)"), 
+ path); + } + free(path); + if (rc != 0) + goto error; + have_atergo_index = true; + return 0; +error_message: + poliqarp_error_from_system(error, _("Unable to initialize orth backend")); +error: + if (have_dict) + newdict_close(&this->dict); + if (have_afronte_index) + newindex_close(&this->afronte_index); + if (have_atergo_index) + newindex_close(&this->atergo_index); + return -1; +} + +void poliqarp_backend_orth_close(struct poliqarp_backend_orth *this) +{ + newdict_close(&this->dict); + newindex_close(&this->afronte_index); + newindex_close(&this->atergo_index); +} + diff --git a/poliqarp-library/sakura/backend-orth.h b/poliqarp-library/sakura/backend-orth.h new file mode 100644 index 0000000000000000000000000000000000000000..cf3006bc69fc244c885ea24c301fca52cb4fd1ee --- /dev/null +++ b/poliqarp-library/sakura/backend-orth.h @@ -0,0 +1,85 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARP_BACKEND_ORTH_H +#define POLIQARP_BACKEND_ORTH_H + +#include <sakura/common/newdict.h> +#include <sakura/exception.h> + +/** @todo */ +#define POLIQARP_ORTH_IMAGE_FORMAT "%s.poliqarp.orth.image" +/** @todo */ +#define POLIQARP_ORTH_OFFSET_FORMAT "%s.poliqarp.orth.offset" +/** @todo */ +#define POLIQARP_ORTH_INDEX_FORMAT "%s.poliqarp.orth.index.%s" + +/** @todo */ +struct poliqarp_backend_orth { + struct newdict dict; + struct newindex afronte_index; + struct newindex atergo_index; +}; + +/** @todo */ +int poliqarp_backend_orth_open(struct poliqarp_backend_orth *this, + const char *base_name, struct poliqarp_error *error); + +/** @todo */ +void poliqarp_backend_orth_close(struct poliqarp_backend_orth *this); + +static inline size_t poliqarp_backend_orth_num_items( + const struct poliqarp_backend_orth *this) +{ + return GET_NUM_ITEMS(&this->dict); +} + +static inline const char *poliqarp_backend_orth_fetch( + const struct poliqarp_backend_orth *this, size_t key) +{ + assert(key < this->dict.num_items); + return GET_ITEM(&this->dict, key); +} + +static inline size_t poliqarp_backend_orth_length( + const struct poliqarp_backend_orth *this, size_t key) +{ + assert(key < this->dict.num_items); + return GET_LENGTH(&this->dict, key); +} + +static inline size_t poliqarp_backend_orth_afronte_fetch( + const struct poliqarp_backend_orth *this, size_t key) +{ + assert(key < this->dict.num_items); + return GET_INDEX_ITEM(&this->afronte_index, key); +} + +static inline size_t poliqarp_backend_orth_atergo_fetch( + const struct poliqarp_backend_orth *this, size_t key) +{ + assert(key < this->dict.num_items); + return GET_INDEX_ITEM(&this->atergo_index, key); +} + +#endif /* POLIQARP_BACKEND_ORTH_H */ diff --git a/poliqarp-library/sakura/backend-subdocument.c b/poliqarp-library/sakura/backend-subdocument.c new file mode 100644 index 0000000000000000000000000000000000000000..2c2154967405fe78c4f977389b429eaa2f3e510e --- /dev/null +++ 
b/poliqarp-library/sakura/backend-subdocument.c @@ -0,0 +1,162 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <stdlib.h> + +#include <foostring/foostring.h> +#include <sakura/backend-subdocument.h> +#include <sakura/exception.h> +#include <sakura/dict.h> + +static int poliqarp_subdocument_set_open(struct poliqarp_subdocument_set *set, + const char *basename, const char *setname, struct poliqarp_error *error) +{ + int rc; + char *fname = string_aformat(POLIQARP_SUBDOCUMENT_ITEM_FORMAT, basename, + setname); + if (fname == NULL) { + poliqarp_error_from_system(error, _("Unable to open subdocument set %s"), + setname); + return -1; + } + init_file_map(&set->map); + rc = create_file_map(&set->map, fname); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open subdocument set %s" + " (%s)"), setname, fname); + } + free(fname); + set->current = 0; + return rc; +} + +static void poliqarp_subdocument_set_close(struct poliqarp_subdocument_set *set) +{ + destroy_file_map(&set->map); +} + +int poliqarp_subdocument_next(struct poliqarp_subdocument_set *set, + struct poliqarp_subdocument *subdocument) +{ + if (set->current >= get_file_size(&set->map) / sizeof(*subdocument)) + return -1; + *subdocument = ((struct poliqarp_subdocument *)get_file_image(&set->map))[set->current++]; + /* endian-neutralize */ + subdocument->corpus_low = en4(subdocument->corpus_low); + subdocument->corpus_high = en4(subdocument->corpus_high); + return 0; +} + +void poliqarp_subdocument_search(struct poliqarp_subdocument_set *set, + uint32_t pos) +{ + struct poliqarp_subdocument *arr = (struct poliqarp_subdocument *)get_file_image(&set->map); + size_t l = set->current, r = (get_file_size(&set->map) / sizeof(*arr)) - 1, m; + while (l < r) { + m = (l + r) / 2; + if (pos >= en4(arr[m].corpus_low) && pos < en4(arr[m].corpus_high)) { + set->current = m; + return; + } else if (pos < en4(arr[m].corpus_low)) { + r = m - 1; + } else { + l = m + 1; + } + } + if (pos >= en4(arr[r].corpus_high)) + set->current = r + 1; + else + set->current = r; +} + +int 
poliqarp_backend_subdocument_open(struct poliqarp_backend_subdocument *this, + const char *base_name, struct poliqarp_error *error) +{ + size_t i; + int rc; + + rc = poliqarp_newdict_open(&this->name_dict, base_name, false, + POLIQARP_SUBDOCUMENT_IMAGE_FORMAT, POLIQARP_SUBDOCUMENT_OFFSET_FORMAT, + _("Unable to open subdocument name dictionary"), error); + if (rc != 0) + return -1; + + this->sets = malloc(sizeof(this->sets[0]) * GET_NUM_ITEMS(&this->name_dict)); + if (this->sets == NULL) { + poliqarp_error_from_system(error, _("Unable to initialize subdocument backend")); + goto error; + } + for (i = 0; i < GET_NUM_ITEMS(&this->name_dict); i++) { + const char *setname = (const char *)GET_ITEM(&this->name_dict, i); + rc = poliqarp_subdocument_set_open(this->sets + i, base_name, setname, error); + if (rc != 0) { + size_t j; + for (j = 0; j < i; j++) + poliqarp_subdocument_set_close(this->sets + j); + goto error; + } + } + return 0; +error: + free(this->sets); + newdict_close(&this->name_dict); + return -1; +} + +void poliqarp_backend_subdocument_close(struct poliqarp_backend_subdocument *this) +{ + size_t i; + for (i = 0; i < GET_NUM_ITEMS(&this->name_dict); i++) + poliqarp_subdocument_set_close(this->sets + i); + newdict_close(&this->name_dict); + free(this->sets); +} + +struct poliqarp_subdocument_set *poliqarp_backend_subdocument_lookup_set( + struct poliqarp_backend_subdocument *this, const char *set_name) +{ + size_t i; + for (i = 0; i < GET_NUM_ITEMS(&this->name_dict); i++) + if (strcmp(set_name, GET_ITEM(&this->name_dict, i)) == 0) + return this->sets + i; + return NULL; +} + +#ifdef TEST_SUBDOCUMENT + +int main(int argc, char *argv[]) +{ + struct poliqarp_subdocument_set set; + struct poliqarp_subdocument ch; + if (argc < 3) + return 1; + poliqarp_subdocument_set_open(&set, argv[1], argv[2]); + if (argc >= 4) + poliqarp_subdocument_search(&set, atoi(argv[3])); + while (poliqarp_subdocument_next(&set, &ch) == 0) { + printf("%d, %d\n", ch.corpus_low, 
ch.corpus_high); + } + return 0; +} + +#endif diff --git a/poliqarp-library/sakura/backend-subdocument.h b/poliqarp-library/sakura/backend-subdocument.h new file mode 100644 index 0000000000000000000000000000000000000000..099a2b4b5e0164a4f74e51c36cfca310720fbb11 --- /dev/null +++ b/poliqarp-library/sakura/backend-subdocument.h @@ -0,0 +1,56 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARP_BACKEND_SUBDOCUMENT_H +#define POLIQARP_BACKEND_SUBDOCUMENT_H + +#include <sakura/common/file-map.h> +#include <sakura/common/newdict.h> +#include <sakura/exception.h> +#include <sakura/abi.h> + +#define POLIQARP_SUBDOCUMENT_ITEM_FORMAT "%s.poliqarp.subchunk.item.%s" +#define POLIQARP_SUBDOCUMENT_IMAGE_FORMAT "%s.poliqarp.subchunk.image" +#define POLIQARP_SUBDOCUMENT_OFFSET_FORMAT "%s.poliqarp.subchunk.offset" + +struct poliqarp_subdocument_set { + struct file_map map; + uint32_t current; +}; + +struct poliqarp_backend_subdocument { + struct newdict name_dict; + struct poliqarp_subdocument_set *sets; +}; + +int poliqarp_subdocument_next(struct poliqarp_subdocument_set *set, + struct poliqarp_subdocument *subdocument); +void poliqarp_subdocument_search(struct poliqarp_subdocument_set *set, + uint32_t pos); +int poliqarp_backend_subdocument_open(struct poliqarp_backend_subdocument *this, + const char *base_name, struct poliqarp_error *error); +void poliqarp_backend_subdocument_close(struct poliqarp_backend_subdocument *this); +struct poliqarp_subdocument_set *poliqarp_backend_subdocument_lookup_set( + struct poliqarp_backend_subdocument *this, const char *set_name); + +#endif /* POLIQARP_BACKEND_SUBDOCUMENT_H */ diff --git a/poliqarp-library/sakura/backend-syntax.c b/poliqarp-library/sakura/backend-syntax.c new file mode 100644 index 0000000000000000000000000000000000000000..a767ee55f0540ce47e213132bd1ba8e63c7d87c8 --- /dev/null +++ b/poliqarp-library/sakura/backend-syntax.c @@ -0,0 +1,182 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/backend-syntax.h> + +#include <sakura/dict.h> +#include <foostring/foostring.h> + +static int syntax_next_bit(void *data) +{ + struct poliqarp_backend_syntax *syntax = + (struct poliqarp_backend_syntax *)data; + uint8_t *buffer = (uint8_t *)syntax->data.image; + int res = buffer[syntax->pos / 8] & (1 << (syntax->pos % 8)); + syntax->pos++; + return res; +} + +int poliqarp_backend_syntax_open(struct poliqarp_backend_syntax *this, + const char *base_name, struct poliqarp_error *error) +{ + bool have_dict, have_image, have_offsets, have_groups; + char *path; + int rc; + have_dict = have_image = have_offsets = have_groups = false; + rc = poliqarp_newdict_open(&this->dict_types, base_name, true, + "%s.poliqarp.grtype.image", "%s.poliqarp.grtype.offset", + _("Unable to open group types dictionary"), error); + if (rc != 0) { + if (errno == ENOENT) { + /* Presumably there's no syntax data. 
*/ + poliqarp_error_message_set(error, NULL); + this->syntax = false; + return 0; + } + goto error; + } + have_dict = true; + + path = string_aformat("%s.poliqarp.syntax.image", base_name); + if (path == NULL) + goto error_message; + rc = create_file_map(&this->data, path); + if (rc != 0) { + poliqarp_error_from_system(error, _("Unable to open syntax image" + " (%s)"), path); + } + free(path); + if (rc != 0) + goto error; + have_image = true; + + this->numbits = 8 * this->data.size - 8 + + ((uint8_t *)this->data.image)[this->data.size - 1]; + + path = string_aformat("%s.poliqarp.syntax.offset", base_name); + if (path == NULL) + goto error_message; + rc = tinydb_open(&this->offsets, path, sizeof(uint32_t)); + if (rc != 0) { + + } + free(path); + if (rc != 0) + goto error; + have_offsets = true; + + this->size = 4096; + this->groups = malloc(this->size * sizeof(struct poliqarp_syntax_group)); + if (this->groups == NULL) + goto error_message; + have_groups = true; + + this->bitstream.next_bit = syntax_next_bit; + this->bitstream.data = this; + this->pos = this->start = this->end = 0; + this->syntax = true; + return 0; + +error_message: + poliqarp_error_from_system(error, _("Unable initialize syntax backend")); +error: + if (have_dict) + newdict_close(&this->dict_types); + if (have_image) + destroy_file_map(&this->data); + if (have_offsets) + tinydb_close(&this->offsets); + if (have_groups) + free(this->groups); + return -1; +} + +void poliqarp_backend_syntax_close(struct poliqarp_backend_syntax *this) +{ + if (this->syntax) { + free(this->groups); + tinydb_close(&this->offsets); + destroy_file_map(&this->data); + newdict_close(&this->dict_types); + this->syntax = false; + } +} + +static int poliqarp_backend_syntax_next_internal(struct poliqarp_backend_syntax *this, + int recursive) +{ + int synchro; + size_t old_end = this->end; + struct poliqarp_syntax_group *group = this->groups + this->end; + if (this->pos >= this->numbits) + return -1; + synchro = 
this->bitstream.next_bit(this); + if (synchro) + group->from = decode_binary(&this->bitstream, 32); + else + group->from = this->lastfrom + decode_gamma(&this->bitstream) - 1; + this->lastfrom = group->from; + group->to = group->from + decode_gamma(&this->bitstream) - 1; + group->type = decode_gamma(&this->bitstream) - 1; + if (group->type == POLIQARP_SYNTAX_GROUP_COORD) + group->u.coord.length = decode_gamma(&this->bitstream); + else { + int len = clog2(group->to - group->from + 1); + int havesynh = this->bitstream.next_bit(this); + int havesemh = this->bitstream.next_bit(this); + group->u.noncoord.synh = havesynh + ? group->from + decode_binary(&this->bitstream, len) + : POLIQARP_SYNTAX_GROUP_UNKNOWN; + group->u.noncoord.semh = havesemh + ? group->from + decode_binary(&this->bitstream, len) + : POLIQARP_SYNTAX_GROUP_UNKNOWN; + } + this->end++; + if (this->end == this->size) + this->end = 0; + if (this->end == this->start) { + size_t oldsize = this->size; + this->size *= 2; + this->groups = realloc(this->groups, this->size * sizeof(struct poliqarp_syntax_group)); + memcpy(this->groups + oldsize, this->groups, this->start * sizeof(struct poliqarp_syntax_group)); + this->end += oldsize; + group = this->groups + old_end; + } + if (group->type == POLIQARP_SYNTAX_GROUP_COORD && recursive) { + size_t i; + for (i = 0; i < group->u.coord.length; i++) + if (poliqarp_backend_syntax_next_internal(this, 0) == -1) + return -1; + } + return 0; +} + +int poliqarp_backend_syntax_next(struct poliqarp_backend_syntax *this) +{ + return poliqarp_backend_syntax_next_internal(this, 1); +} + +void poliqarp_backend_syntax_reset(struct poliqarp_backend_syntax *this) +{ + this->start = this->end = this->pos = 0; +} diff --git a/poliqarp-library/sakura/backend-syntax.h b/poliqarp-library/sakura/backend-syntax.h new file mode 100644 index 0000000000000000000000000000000000000000..a4153adfef9bcdb3ef17fab458cb96bf4af5fe10 --- /dev/null +++ b/poliqarp-library/sakura/backend-syntax.h @@ -0,0 
+1,75 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_BACKEND_SYNTAX_H +#define POLIQARP_BACKEND_SYNTAX_H + +#include <poliqarp-config.h> + +#include <sakura/common/bitstream.h> +#include <sakura/common/newdict.h> +#include <sakura/common/tinydb.h> +#include <sakura/common/file-map.h> +#include <sakura/exception.h> + +struct poliqarp_syntax_group { + uint32_t type; + uint32_t from, to; + union { + struct { + uint32_t length; + } coord; + struct { + uint32_t synh; + uint32_t semh; + } noncoord; + } u; +}; + +struct poliqarp_backend_syntax { + bool syntax; /**< Whether we have syntax backend at all. */ + struct ibs bitstream; /**< Bitstream for input syntax file. */ + struct newdict dict_types; /**< Dictionary of group types. */ + struct file_map data; /**< Memory view of the syntax image file. */ + struct tinydb offsets; /**< Array of offsets providing random access. */ + struct poliqarp_syntax_group *groups; /**< Array of decoded groups. 
*/ + uint32_t type; + uint32_t lastfrom; /**< The 'from' attribute of last read group. */ + size_t numbits; /**< Total number of bits in the syntax image. */ + size_t pos; /**< Offset of the bit we're currently on. */ + size_t size; /**< Number of groups in the `groups' array. */ + size_t start; /**< Where the decoded groups start. */ + size_t end; /**< Where the decoded groups end. */ +}; + +#define POLIQARP_SYNTAX_GROUP_UNKNOWN ((uint32_t)(-1)) +#define POLIQARP_SYNTAX_GROUP_COORD 0 +#define POLIQARP_SYNTAX_GROUP_CONJUNCTION 1 + +int poliqarp_backend_syntax_open(struct poliqarp_backend_syntax *this, + const char *base_name, struct poliqarp_error *error); +void poliqarp_backend_syntax_close(struct poliqarp_backend_syntax *this); +int poliqarp_backend_syntax_next(struct poliqarp_backend_syntax *this); +void poliqarp_backend_syntax_reset(struct poliqarp_backend_syntax *this); + +#endif /* POLIQARP_BACKEND_SYNTAX_H */ diff --git a/poliqarp-library/sakura/backend-tag.c b/poliqarp-library/sakura/backend-tag.c new file mode 100644 index 0000000000000000000000000000000000000000..a27a402364c8d4e50dd67a548105025383377822 --- /dev/null +++ b/poliqarp-library/sakura/backend-tag.c @@ -0,0 +1,87 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/backend-tag.h> +#include <sakura/dict.h> +#include <sakura/exception.h> +#include <foostring/foostring.h> +#include <sys/types.h> + +int poliqarp_backend_tag_open(struct poliqarp_backend_tag *this, + struct poliqarp_backend_config *backend_config, const char *base_name, + struct poliqarp_error *error) +{ + size_t i; + size_t num_tags; + size_t num_attr; + int rc; + + rc = poliqarp_newdict_open(&this->dict, base_name, false, + POLIQARP_TAG_IMAGE_FORMAT, POLIQARP_TAG_OFFSET_FORMAT, + _("Unable to open tag dictionary"), error); + if (rc != 0) + return rc; + + this->parsed_tag = NULL; + this->parsed_tag_memory = NULL; + + num_tags = poliqarp_backend_tag_num_items(this); + num_attr = backend_config->num_attr; + + this->parsed_tag = malloc(sizeof *this->parsed_tag * num_tags); + if (this->parsed_tag != NULL) + this->parsed_tag_memory = + malloc(sizeof *this->parsed_tag[i].attr_value * num_attr * num_tags); + if (this->parsed_tag == NULL || this->parsed_tag_memory == NULL) { + poliqarp_error_from_system(error, _("Unable to parse tags")); + goto error; + } + for (i = 0; i < num_tags; ++i) { + const char *tag_name; + this->parsed_tag[i].attr_value = (struct poliqarp_attr_value **) + (this->parsed_tag_memory) + i * num_attr; + tag_name = poliqarp_backend_tag_fetch(this, i); + rc = poliqarp_backend_config_parse_copy(backend_config, + this->parsed_tag + i, tag_name); + if (rc != 0) { + if (errno == EINVAL || errno == ENOENT) + poliqarp_error_message_set(error, _("%s is not a valid tag"), tag_name); + else + poliqarp_error_from_system(error, _("Unable to parse tags")); + goto error; + } + } + return 0; +error: + newdict_close(&this->dict); + free(this->parsed_tag); + 
free(this->parsed_tag_memory); + return -1; +} + +void poliqarp_backend_tag_close(struct poliqarp_backend_tag *this) +{ + newdict_close(&this->dict); + free(this->parsed_tag); + free(this->parsed_tag_memory); +} diff --git a/poliqarp-library/sakura/backend-tag.h b/poliqarp-library/sakura/backend-tag.h new file mode 100644 index 0000000000000000000000000000000000000000..d8ecd5b48ad461cd939a75d61aa2d8df4f7fd32e --- /dev/null +++ b/poliqarp-library/sakura/backend-tag.h @@ -0,0 +1,90 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_BACKEND_TAG_H +#define POLIQARP_BACKEND_TAG_H + +#include <sakura/backend-config.h> + +#include <sakura/common/newdict.h> + +/* the %s is for encoding selected */ +#define POLIQARP_TAG_IMAGE_FORMAT "%s.poliqarp.tag.image" +#define POLIQARP_TAG_OFFSET_FORMAT "%s.poliqarp.tag.offset" + +/** + * Tag backend structure. + * Used to retrieve tags from storage + */ +struct poliqarp_backend_tag { + struct newdict dict; /**< Dictionary object that contains + all tags. 
*/ + void *parsed_tag_memory; /**< Memory used by all parsed tags. */ + struct poliqarp_parsed_tag *parsed_tag; /**< Parsed tags. */ +}; + +/** Open the tag backend for the given base name. + * + * The attribute backend is needed to parse the tags into a more meaningful + * structure + * + * @param this Uninitialized backend object. + * @param backend_config Initialized configuration backend. + * @param base_name Base name. + */ +int poliqarp_backend_tag_open(struct poliqarp_backend_tag *this, + struct poliqarp_backend_config *backend_config, const char *base_name, + struct poliqarp_error *error); + +/** + * Close the tag backend and release resources. + */ +void poliqarp_backend_tag_close(struct poliqarp_backend_tag *this); + +static inline size_t poliqarp_backend_tag_num_items( + const struct poliqarp_backend_tag *this) +{ + return GET_NUM_ITEMS(&this->dict); +} + +static inline const char *poliqarp_backend_tag_fetch( + const struct poliqarp_backend_tag *this, size_t key) +{ + assert(key < this->dict.num_items); + return GET_ITEM(&this->dict, key); +} + +static inline const struct poliqarp_parsed_tag *poliqarp_backend_parsed_tag_fetch( + const struct poliqarp_backend_tag *this, size_t key) +{ + assert(key < this->dict.num_items); + return this->parsed_tag + key; +} + +static inline size_t poliqarp_backend_tag_length( + const struct poliqarp_backend_tag *this, size_t key) +{ + return GET_LENGTH(&this->dict, key); +} + +#endif /* POLIQARP_BACKEND_TAG_H */ diff --git a/poliqarp-library/sakura/cdf.c b/poliqarp-library/sakura/cdf.c new file mode 100644 index 0000000000000000000000000000000000000000..fddaf26c1181b8159d2cb587759a8a7f82be1e03 --- /dev/null +++ b/poliqarp-library/sakura/cdf.c @@ -0,0 +1,151 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/cdf.h> +#include <sakura/config.h> +#include <sakura/abi.h> +#include <sakura/backend-index.h> +#include <sakura/exception.h> +#include <foostring/foostring.h> + +static const char filespec[] = "%s.cdf"; + +static const char spec_version[] = "version"; +static const char spec_endianness[] = "endianness"; +static const char spec_indices[] = "indices"; +static const char spec_granularity[] = "index-granularity"; +static const char spec_le[] = "little-endian"; +static const char spec_be[] = "big-endian"; + +static int handle_var(const struct poliqarp_variable *var, + struct poliqarp_cdf_info *cdf) +{ + if (strcmp(var->name, spec_version) == 0) { + cdf->version = atoi(var->value); + if (cdf->version < 1) + { + cdf->version = 1; + return -1; + } + return 0; + } + else if (strcmp(var->name, spec_endianness) == 0) { + if (strcmp(var->value, spec_le) == 0) + cdf->endianness = ENDIAN_LE; + else if (strcmp(var->value, spec_be) == 0) + cdf->endianness = ENDIAN_BE; + else + return -1; + } else if (strcmp(var->name, spec_indices) == 0) { + int indices = poliqarp_index_calculate(var->value); + if (indices == -1) + return -1; + else + cdf->indices = indices; + } else if (strcmp(var->name, spec_granularity) == 0) { + 
int gran = atoi(var->value); + if (gran < POLIQARP_INDEX_MIN_GRANULARITY) + return -1; + if (gran > POLIQARP_INDEX_MAX_GRANULARITY) + return -1; + cdf->granularity = gran; + } else + return -1; + return 0; +} + +void poliqarp_cdf_create(struct poliqarp_cdf_info *cdf) +{ + cdf->version = POLIQARP_ABI_VERSION; + cdf->indices = 0; + cdf->granularity = POLIQARP_INDEX_DEFAULT_GRANULARITY; + cdf->endianness = ENDIAN_LE; +} +/* Reads and parses "<name>.cdf" into cdf, starting from the defaults of poliqarp_cdf_create() with version forced to 1. Returns 0 on success; a malformed entry yields a non-zero result with errno set to EINVAL; other failures return -1. */ +int poliqarp_cdf_read(const char *name, struct poliqarp_cdf_info *cdf) +{ + char *pathname, *line; + FILE *f; + struct poliqarp_variable var; + int rc = 0; /* must start at 0: an empty file runs zero loop iterations, yet rc is tested right after the loop */ + + poliqarp_cdf_create(cdf); + cdf->version = 1; + pathname = string_aformat(filespec, name); + if (pathname == NULL) + return -1; + f = fopen(pathname, "rt"); + free(pathname); + if (f == NULL) + return -1; + while ((line = poliqarp_fetch_line(f)) != NULL) { + rc = poliqarp_parse_variable(&var, line); + free(line); + if (rc != 0) + break; + rc = handle_var(&var, cdf); + poliqarp_free_variable(&var); + if (rc != 0) + break; + } + if (rc == 0 && !feof(f)) /* poliqarp_fetch_line() failed */ { + /* Don't allow fclose() to overwrite errno. */ + int my_errno = errno; + fclose(f); + errno = my_errno; + return -1; + } + fclose(f); /* Safe to ignore errors. */ + if (rc != 0) + errno = EINVAL; + return rc; +} + +int poliqarp_cdf_write(const char *name, const struct poliqarp_cdf_info *cdf) +{ + char *pathname; + FILE *f; + + pathname = string_aformat(filespec, name); + if (pathname == NULL) + return -1; + f = fopen(pathname, "wt"); + free(pathname); + if (f == NULL) + return -1; + fprintf(f, "%s = %d\n", spec_version, POLIQARP_ABI_VERSION); + fprintf(f, "%s = %s\n", spec_endianness, + cdf->endianness == ENDIAN_LE ? spec_le : spec_be); + fprintf(f, "%s = %s\n", spec_indices, poliqarp_index_stringify(cdf->indices)); + fprintf(f, "%s = %" PRIuSIZE "\n", spec_granularity, cdf->granularity); + if (ferror(f)) { + /* Don't allow fclose() to overwrite errno. 
*/ + int my_errno = errno; + fclose(f); + errno = my_errno; + return -1; + } + if (fclose(f) != 0) + return -1; + return 0; +} diff --git a/poliqarp-library/sakura/cdf.h b/poliqarp-library/sakura/cdf.h new file mode 100644 index 0000000000000000000000000000000000000000..ecfb098569957db00d864e18b537f77c92347192 --- /dev/null +++ b/poliqarp-library/sakura/cdf.h @@ -0,0 +1,84 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file cdf.h + * @brief Corpus Description File -- reading, parsing, writing. + * + * A corpus description file (CDF in short; this is also the preferred + * extension for such files) contains information about the binary corpus + * that should not be changed by the user, such as presence and granularity + * of indices. Just like the .cfg file, it consists of lines of the form + * 'name = value'. + */ + +#ifndef POLIQARP_CDF_H +#define POLIQARP_CDF_H + +#include <stdlib.h> + +/** + * Endianness of an architecture. + */ +enum poliqarp_endianness { + ENDIAN_LE, /**< Little endian. 
*/ + ENDIAN_BE /**< Big endian. */ +}; + +/** + * Information contained in CDF files. + */ +struct poliqarp_cdf_info { + int version; /**< Version of binary format. */ + int indices; /**< Inverted indices built for this corpus. */ + size_t granularity; /**< Granularity of the indices. */ + enum poliqarp_endianness endianness; /**< Endianness of the architecture that this + corpus was built on. */ +}; + +/** + * Creates a CDF structure with the current corpus format. + * @param cdf The destination structure. + */ +void poliqarp_cdf_create(struct poliqarp_cdf_info *cdf); + +/** + * Reads and parses a CDF file. If some fields are not given, assumes some + * sensible defaults. + * @param name Base name of the corpus. + * @param cdf The destination structure. + * @return 0 if the file was successfully read, -1 otherwise. + * @note Even if -1 is returned, the destination structure will contain + * sensible defaults when this function returns. + */ +int poliqarp_cdf_read(const char *name, struct poliqarp_cdf_info *cdf); + +/** + * Writes a structure to a CDF file; the version written is POLIQARP_ABI_VERSION, not cdf->version. + * @param name Base name of the corpus. + * @param cdf Structure to be written. + * @return 0 if the structure was successfully written, -1 otherwise. + */ +int poliqarp_cdf_write(const char *name, const struct poliqarp_cdf_info *cdf); + +#endif /* POLIQARP_CDF_H */ diff --git a/poliqarp-library/sakura/common/args.c b/poliqarp-library/sakura/common/args.c new file mode 100644 index 0000000000000000000000000000000000000000..faed88bea7c4aad307870b81df5f82b9871562dc --- /dev/null +++ b/poliqarp-library/sakura/common/args.c @@ -0,0 +1,439 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <errno.h> + +#include <foostring/foostring.h> + +#include <sakura/common/args.h> + +enum STATE { + S_error, + S_start, + S_numeric, + S_bareword, + S_punct, + S_string +}; + +enum TOKEN { + TOK_error = -1, + TOK_end, + TOK_numeric, + TOK_bareword, + TOK_string, + TOK_string_error, + TOK_punct +}; + +struct lex_context { + size_t offset; + size_t size; + const char *buffer; + const char *token; + size_t token_size; +}; + +static void set_parse_buffer(struct lex_context *this, const char *buffer, + size_t size) +{ + this->offset = 0; + this->size = size; + this->buffer = buffer; + this->token = NULL; + this->token_size = 0; +} + +#define set_state(what) \ + do { \ + state = what; \ + goto loop; \ + } while (0) + +#define token(what) \ + do { \ + this->token_size = buf - this->token; \ + this->offset = buf - this->buffer; \ + return what; \ + } while (0) + +static char *duplicate_token(const struct lex_context *this) +{ + char *result; + result = malloc(this->token_size + 1); + if (result == NULL) + return NULL; + memcpy(result, this->token, this->token_size); + result[this->token_size] = 0; + return result; +} + +static enum TOKEN get_token(struct lex_context *this) +{ + const char 
*buf = this->buffer + this->offset; + const char *buf_end = this->buffer + this->size; + enum STATE state = S_start; + int c; + int quote_char = 0; + +loop: + switch (state) { + case S_error: /* ERRORS */ + token(TOK_error); + + case S_start: /* START */ + if (buf == buf_end) + token(TOK_end); + c = *buf; + + /* eat white space */ + if (ascii_isspace(c)) { + for (; buf < buf_end && ascii_isspace(*buf); ++buf) {} + set_state(S_start); + } + + /* detect... */ + + /* numbers */ + if (ascii_isdigit(c) || c == '-' || c == '+') { + this->token = buf; + set_state(S_numeric); + } + + /* quoted content */ + if (strchr("\"'`", c)) { + quote_char = c; + this->token = buf++; + set_state(S_string); + } + + /* words */ + if (c == '.' || ascii_isalpha(c)) { + this->token = buf; + set_state(S_bareword); + } + + /* almost everything else */ + if (ascii_isgraph(c)) { + this->token = buf; + set_state(S_punct); + } + + /* @FIXME can we ever get here? */ + + this->token = buf; + set_state(S_error); + + case S_numeric: /* NUMERIC */ + while (buf < buf_end && ascii_isdigit(*buf)) + ++buf; + if (buf < buf_end && (ascii_isalpha(*buf) || strchr("_-+/.", *buf))) /* never inspect *buf at end of input: strchr() always "finds" a NUL, which turned a trailing number into a bareword */ + set_state(S_bareword); + token(TOK_numeric); + case S_bareword: /* WORDS */ + while (buf < buf_end && (ascii_isalnum(*buf) || strchr("_-=/.", *buf))) + ++buf; + if (buf == this->token) ++buf; /* guarantee progress: a leading '+' reaches this state via S_numeric but is not in the bareword set, so the token would be empty and the caller would loop forever */ + token(TOK_bareword); + + case S_punct: /* PUNCT */ + while (buf < buf_end && (ascii_isgraph(*buf) && !ascii_isalnum(*buf))) + ++buf; + token(TOK_punct); + + case S_string: /* QUOTED CONTENT */ + c = -1; + for (; buf < buf_end && (c = *buf) != quote_char; ++buf) + if (c == '\\' && buf + 1 < buf_end) /* skip the escaped character only if there is one, so buf never moves past buf_end */ + buf++; + if (c == quote_char) { + buf++; + token(TOK_string); + } else + token(TOK_string_error); + default: + abort(); /* Should not happen. */ + } +} + +static char *interpolate(const char *buf, size_t size) +{ + int c; + char *result_buf; + char *result; + const char *buf_end; + + result_buf = result = malloc(size + 1); + if (result == NULL) + return NULL; + buf_end = buf[size - 1] == buf[0] ? 
buf + size - 1 : buf + size; + buf += 1; + switch (buf[-1]) { + case '/': case '"': + while (buf < buf_end) { + c = *buf++; + if (c == '\\') { + c = *buf++; + switch (c) { + case 'n': + *result++ = '\n'; + break; + case 'r': + *result++ = '\r'; + break; + case 't': + *result++ = '\t'; + break; + case 'v': + *result++ = '\v'; + break; + case 'b': + *result++ = '\b'; + break; + case 'a': + *result++ = '\a'; + break; + case 'f': + *result++ = '\f'; + break; + case 'e': + *result++ = 27; + break; + case 'x': + { + int n; + int old_c; + + old_c = c = *buf++; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c = c - 'a' + 10; /* hex digits a-f are worth 10..15 */ + else if (c >= 'A' && c <= 'F') + c = c - 'A' + 10; + else { + *result++ = 'x'; + *result++ = old_c; + break; + } + + n = c * 16; + + c = *buf++; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c = c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + c = c - 'A' + 10; + else { + *result++ = 'x'; + *result++ = old_c; + *result++ = c; + break; + } + + n += c; + *result++ = n; + break; /* without this the decoded byte was followed by a stray byte emitted by the default label */ + } + default: + *result++ = c; + } + } else + *result++ = c; + } + break; + case '\'': + while (buf < buf_end) { + c = *buf++; + if (c == '\\') + *result++ = *buf++; + else + *result++ = c; + } + break; + case '`': + while (buf < buf_end) { + c = *buf++; + if (c == '\\') { + c = *buf++; + switch (c) { + case 'n': + *result++ = '\n'; + break; + case 'r': + *result++ = '\r'; + break; + case 't': + *result++ = '\t'; + break; + case 'v': + *result++ = '\v'; + break; + case 'b': + *result++ = '\b'; + break; + case 'a': + *result++ = '\a'; + break; + case 'f': + *result++ = '\f'; + break; + case 'e': + *result++ = 27; + break; + case 'x': + { + int n; + int old_c; + + old_c = c = *buf++; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c = c - 'a' + 10; /* hex digits a-f are worth 10..15 */ + else if (c >= 'A' && c <= 'F') + c = c - 'A' + 10; + else { + *result++ = 'x'; + *result++ = old_c; + break; + } + + n = c * 16; + + c = *buf++; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' 
&& c <= 'f') + c = c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + c = c - 'A' + 10; + else { + *result++ = 'x'; + *result++ = old_c; + *result++ = c; + break; + } + + n += c; + *result++ = n; + break; /* without this the decoded byte was followed by a stray byte emitted by the default label */ + } + default: + *result++ = c; + } + } else + *result++ = c; + /* eat whitespace */ + if (ascii_isspace(result[-1])) { + if (ascii_isspace(*result_buf)) + --result; + else if (result > result_buf + 1 && ascii_isspace(result[-1]) + && ascii_isspace(result[-2])) { + --result; + *result = ' '; + } + } + } + if (result > result_buf && ascii_isspace(result[-1])) + --result; + break; + } + *result++ = 0; + return result_buf; +} + +/* Resets the argument vector to the empty state: no items and no storage. */ +void args_init(struct text_args *this) +{ + this->num_items = 0; + this->capacity = 0; + this->item = NULL; +} +/* Splits text into tokens, replacing any previously parsed items; returns 0 on success and -1 on allocation failure. */ +int args_parse(struct text_args *this, const char *text) +{ + struct lex_context context; + size_t i; + + for (i = 0; i < this->num_items; ++i) + free(this->item[i].value); + + set_parse_buffer(&context, text, strlen(text)); + + for (this->num_items = 0;; this->num_items++) { + struct text_value *this_item; + + if (this->num_items >= this->capacity) { + struct text_value *new_items; + size_t new_capacity = this->capacity * 2 + 1; + if (new_capacity <= this->capacity || new_capacity > SIZE_MAX / sizeof *this->item) { + errno = ENOMEM; + return -1; + } + new_items = realloc(this->item, new_capacity * sizeof *this->item); + if (new_items == NULL) + return -1; + this->item = new_items; + this->capacity = new_capacity; /* commit only after realloc() succeeded, so a failure leaves capacity matching the real allocation */ + } + this_item = this->item + this->num_items; + switch (get_token(&context)) { + case 0: + return 0; + case TOK_bareword: + case TOK_punct: + this_item->value = duplicate_token(&context); + this_item->flags = 0; + break; + case TOK_numeric: + this_item->value = duplicate_token(&context); + this_item->flags = TV_NUMERIC; + break; + case TOK_string: + this_item->value = interpolate(context.token, context.token_size); + this_item->flags = 0; + break; + case TOK_string_error: + this_item->value = interpolate(context.token, context.token_size); + this_item->flags = TV_ERROR; + break; + case TOK_error: + 
return 0; + } + if (this_item->value == NULL) + return -1; + } +} + +void args_free(struct text_args *this) +{ + size_t i; + for (i = 0; i < this->num_items; ++i) + free(this->item[i].value); + free(this->item); +} diff --git a/poliqarp-library/sakura/common/args.h b/poliqarp-library/sakura/common/args.h new file mode 100644 index 0000000000000000000000000000000000000000..7294c8d9b4db6571c5068e14f16370fa3b646c5c --- /dev/null +++ b/poliqarp-library/sakura/common/args.h @@ -0,0 +1,51 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef ARGS_H +#define ARGS_H + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define TV_NUMERIC 1 +#define TV_ERROR 2 + +struct text_value { + char *value; + unsigned flags; +}; + +struct text_args { + size_t num_items; + size_t capacity; + struct text_value *item; +}; + +void args_init(struct text_args *this); + +int args_parse(struct text_args *this, const char *text); + +void args_free(struct text_args *this); + +#endif /* ARGS_H */ diff --git a/poliqarp-library/sakura/common/bit-routines.h b/poliqarp-library/sakura/common/bit-routines.h new file mode 100644 index 0000000000000000000000000000000000000000..5124af948951521c5a40985e01e006fd70ec05c6 --- /dev/null +++ b/poliqarp-library/sakura/common/bit-routines.h @@ -0,0 +1,63 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file bit-routines.h + * @brief Operations on bit sequences. + * + * The macros defined in this header allow to treat arrays of integer values + * as packed arrays of bits. 
+ */ + +#ifndef BIT_ROUTINES_H +#define BIT_ROUTINES_H + +#include <limits.h> + +/** + * Returns the size of the given bit array in bytes. + * @param base Pointer to the beginning of the array. + * @param size Number of bits in the array. + */ +#define BIT_ARRAY_LENGTH_BYTES(base, size) \ + (((size) + sizeof *(base) * CHAR_BIT - 1) / CHAR_BIT) + +/** + * Sets the value of a particular bit in the array to 1. + * @param base Pointer to the beginning of the array. + * @param index Zero-based index of the bit. + */ +#define BIT_ARRAY_SET(base, index) \ + ((base)[(index) / (sizeof *(base) * CHAR_BIT)] |= \ + 1 << (index) % (sizeof *(base) * CHAR_BIT)) + +/** + * Returns true iff a particular bit in the array is set to 1. + * @param base Pointer to the beginning of the array. + * @param index Zero-based index of the bit. + */ +#define BIT_ARRAY_GET(base, index) \ + ((base)[(index) / (sizeof *(base) * CHAR_BIT)] & \ + (1 << (index) % (sizeof *(base) * CHAR_BIT)) ? true : false) + +#endif /* BIT_ROUTINES_H */ diff --git a/poliqarp-library/sakura/common/bitstream.c b/poliqarp-library/sakura/common/bitstream.c new file mode 100644 index 0000000000000000000000000000000000000000..dcb10c086b298c99e78a3f02d45f75b3d9bfee55 --- /dev/null +++ b/poliqarp-library/sakura/common/bitstream.c @@ -0,0 +1,128 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + +#include <sakura/common/bitstream.h> +/* Emits x - 1 one-bits followed by a zero-bit; x must be at least 1, because x == 0 wraps around in --x. */ +void encode_unary(struct obs *bs, uint32_t x) +{ + while (--x) + bs->add_bit(bs->data, 1); + bs->add_bit(bs->data, 0); +} +/* Counts one-bits up to the terminating zero-bit and returns that count plus one. */ +uint32_t decode_unary(struct ibs *bs) +{ + int x = 1; + while (bs->next_bit(bs->data)) + x++; + return x; +} +/* Emits the low `bits` bits of x, most significant first. */ +void encode_binary(struct obs *bs, uint32_t x, int bits) +{ + int i; + for (i = 0; i < bits; i++) { + bs->add_bit(bs->data, (x >> (bits - 1)) & 1); /* pass a clean 0/1 like encode_unary() does, and avoid shifting a signed 1 into the sign bit */ + x <<= 1; + } +} +/* Reads `bits` bits, most significant first, and returns them as an unsigned value. */ +uint32_t decode_binary(struct ibs *bs, int bits) +{ + uint32_t res = 0; + while (bits--) + if (bs->next_bit(bs->data)) + res += ((uint32_t)1 << bits); /* unsigned shift: 1 << 31 on a signed int is undefined */ + return res; +} + +void encode_gamma(struct obs *bs, uint32_t x) +{ + int nb = flog2(x); + encode_unary(bs, 1 + nb); + encode_binary(bs, x - ((uint32_t)1 << nb), nb); +} + +uint32_t decode_gamma(struct ibs *bs) +{ + int nb = decode_unary(bs) - 1; + uint32_t res = decode_binary(bs, nb); + return ((uint32_t)1 << nb) + res; +} + +void encode_delta(struct obs *bs, uint32_t x) +{ + int nb = flog2(x); + encode_gamma(bs, 1 + nb); + encode_binary(bs, x - ((uint32_t)1 << nb), nb); +} + +uint32_t decode_delta(struct ibs *bs) +{ + int nb = decode_gamma(bs) - 1; + uint32_t res = decode_binary(bs, nb); + return ((uint32_t)1 << nb) + res; +} +/* Fixed-point evaluation of roughly ceil(ln(2) * n / m); m must be non-zero because it is used as a divisor. */ +uint32_t get_golomb_parameter(uint32_t n, uint32_t m) +{ + uint64_t ln2_shift32 = 2977044472UL; /* log(2.0) << 32 */ + uint64_t m_shift32 = ((uint64_t)m << 32) - 1; + uint32_t b = ((ln2_shift32 * n + m_shift32) / m) >> 32; + return b; +} + +void encode_golomb(struct 
obs *bs, uint32_t x, uint32_t b) +{ + int q = (x - 1) / b, r = x - q * b - 1, cl = clog2(b), limit = (1 << cl) - b; + encode_unary(bs, q + 1); + if (r < limit) + encode_binary(bs, r, cl - 1); + else + encode_binary(bs, r + limit, cl); +} + +uint32_t decode_golomb(struct ibs *bs, uint32_t b) +{ + int q = decode_unary(bs) - 1, cl, limit, r; + if (b == 1) + return q + 1; + cl = clog2(b); + limit = (1 << cl) - b; + r = decode_binary(bs, cl - 1); + if (r >= limit) { + r = 2 * r - limit; + if (bs->next_bit(bs->data)) + r++; + } + return q * b + r + 1; +} + + diff --git a/poliqarp-library/sakura/common/bitstream.h b/poliqarp-library/sakura/common/bitstream.h new file mode 100644 index 0000000000000000000000000000000000000000..b651f1ddc8896a0ee25c84e6f11f2e17b1d8cfd7 --- /dev/null +++ b/poliqarp-library/sakura/common/bitstream.h @@ -0,0 +1,157 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file bitstream.h + * @brief Input/output streams of bits. 
/**
 * Output bitstream: an opaque backend paired with the function that
 * appends one bit to it.
 */
struct obs {
    void *data;                  /**< The underlying backend. */
    int (*add_bit)(void *, int); /**< Emits a single bit to the stream. */
};

/**
 * Input bitstream: an opaque backend paired with the function that
 * fetches the next bit from it.
 */
struct ibs {
    void *data;              /**< The underlying backend. */
    int (*next_bit)(void *); /**< Retrieves next bit from the stream.
                                  Returns a non-zero value iff it is 1. */
};

/**
 * Encodes a number in unary code.
 * @param bs the bitstream to use.
 * @param x the number to encode.
 */
void encode_unary(struct obs *bs, uint32_t x);

/**
 * Encodes a number in binary code.
 * @param bs the bitstream to use.
 * @param x the number to encode.
 * @param bits number of bits to use.
 */
void encode_binary(struct obs *bs, uint32_t x, int bits);

/**
 * Encodes a number in gamma code.
 * @param bs the bitstream to use.
 * @param x the number to encode.
 */
void encode_gamma(struct obs *bs, uint32_t x);

/**
 * Encodes a number in delta code.
 * @param bs the bitstream to use.
 * @param x the number to encode.
 */
void encode_delta(struct obs *bs, uint32_t x);

/**
 * Encodes a number in Golomb code.
 * @param bs the bitstream to use.
 * @param x the number to encode.
 * @param b parameter of the Golomb code.
 */
void encode_golomb(struct obs *bs, uint32_t x, uint32_t b);

/**
 * Reads a unary-coded number from the stream.
 * @param bs the bitstream to use.
 * @return the decoded number.
 */
uint32_t decode_unary(struct ibs *bs);

/**
 * Reads a binary-coded number from the stream.
 * @param bs the bitstream to use.
 * @param bits number of bits that the number consists of.
 * @return the decoded number.
 */
uint32_t decode_binary(struct ibs *bs, int bits);

/**
 * Reads a gamma-coded number from the stream.
 * @param bs the bitstream to use.
 * @return the decoded number.
 */
uint32_t decode_gamma(struct ibs *bs);

/**
 * Reads a delta-coded number from the stream.
 * @param bs the bitstream to use.
 * @return the decoded number.
 */
uint32_t decode_delta(struct ibs *bs);

/**
 * Calculates the parameter for the Golomb code.
 * @return an approximation of ceil(ln(2) * n / m).
 */
uint32_t get_golomb_parameter(uint32_t n, uint32_t m);

/**
 * Reads a Golomb-coded number from the stream.
 * @param bs the bitstream to use.
 * @param b parameter of the Golomb code.
 * @return the decoded number.
 */
uint32_t decode_golomb(struct ibs *bs, uint32_t b);

/**
 * Returns ceil(log2(x)), where log2(x) is the binary (base 2) logarithm
 * of x.
 *
 * Implemented by counting how many right shifts bring x - 1 down to zero.
 * For x == 0 the unsigned subtraction wraps around, so the result is 32.
 */
static inline int clog2(uint32_t x)
{
    int shifts = 0;
    uint32_t rest;

    for (rest = x - 1; rest != 0; rest >>= 1)
        ++shifts;
    return shifts;
}

/**
 * Returns floor(log2(x)) with log2 declared as above, computed as
 * clog2(x + 1) - 1. Yields -1 for x == 0.
 */
static inline int flog2(uint32_t x)
{
    const int ceil_of_successor = clog2(x + 1);

    return ceil_of_successor - 1;
}
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <stdlib.h> +#include <string.h> + +#include <sakura/common/bs-file-backend.h> + +int file_obs_create(struct file_obs *bs, size_t block_size, FILE *f) +{ + bs->block_size = block_size; + bs->bits_used = 0; + bs->blocks_written = 0; + bs->data = malloc(block_size); + if (bs->data == NULL) + return -1; + bs->file = f; + return 0; +} + +void file_obs_set_backend(struct obs *obs, struct file_obs *backend) +{ + obs->data = backend; + obs->add_bit = file_obs_add_bit; +} + +void file_obs_destroy(struct file_obs *bs) +{ + free(bs->data); +} + +int file_obs_flush(void *d) +{ + struct file_obs *bs = (struct file_obs *)d; + if (bs->bits_used == 0) + return 0; + if (fwrite(bs->data, bs->block_size, 1, bs->file) != 1) + return -1; + bs->blocks_written++; + bs->bits_used = 0; + return 0; +} + +int file_obs_flush_partial(void *d) +{ + struct file_obs *bs = (struct file_obs *)d; + if (bs->bits_used == 0) + return 0; + if (fwrite(bs->data, (bs->bits_used + 7) / 8, 1, bs->file) != 1) + return -1; + bs->blocks_written++; + bs->bits_used = 0; + return 0; +} + +int file_obs_halve_block(struct file_obs *bs) +{ + if (bs->block_size == 1) + return 0; + bs->block_size /= 2; + bs->blocks_written *= 2; + if (bs->bits_used >= 8 * 
bs->block_size) { + if (fwrite(bs->data, bs->block_size, 1, bs->file) != 1) + return -1; + bs->bits_used -= 8 * bs->block_size; + bs->blocks_written++; + memcpy(bs->data, bs->data + bs->block_size, bs->block_size); + } + return 0; +} + +void file_obs_add_bit_noflush(void *d, int x) +{ + struct file_obs *bs = (struct file_obs *)d; + size_t bit = bs->bits_used++; + if (x) + bs->data[bit / 8] |= (1 << (bit % 8)); + else + bs->data[bit / 8] &= ~(1 << (bit % 8)); +} + +int file_obs_add_bit(void *d, int x) +{ + struct file_obs *bs = (struct file_obs *)d; + file_obs_add_bit_noflush(d, x); + if (bs->bits_used == 8 * bs->block_size) + return file_obs_flush(d); + return 0; +} + diff --git a/poliqarp-library/sakura/common/bs-file-backend.h b/poliqarp-library/sakura/common/bs-file-backend.h new file mode 100644 index 0000000000000000000000000000000000000000..58ca09877745071e570560b16544540360c0777d --- /dev/null +++ b/poliqarp-library/sakura/common/bs-file-backend.h @@ -0,0 +1,138 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
/**
 * The file backend.
 */
struct file_obs {
    size_t block_size;     /**< Size of the internal buffer, in bytes. */
    size_t bits_used;      /**< Number of used bits in the buffer. */
    size_t blocks_written; /**< Number of blocks written so far. */
    uint8_t *data;         /**< The internal buffer. */
    FILE *file;            /**< File associated with this backend. */
};

/**
 * Creates a backend.
 * @param bs the structure to be filled.
 * @param block_size initial size of the buffer.
 * @param f file to be associated with this backend.
 *
 * @return 0 on success, -1 on error.
 */
int file_obs_create(struct file_obs *bs, size_t block_size, FILE *f);

/**
 * Associates a backend with the bitstream.
 * @param obs the bitstream to be initialized.
 * @param backend the backend.
 */
void file_obs_set_backend(struct obs *obs, struct file_obs *backend);

/**
 * Frees all resources allocated for a backend.
 * @param bs the backend to be destroyed.
 */
void file_obs_destroy(struct file_obs *bs);

/**
 * Flushes contents of a backend's internal buffer to the file associated
 * with it, emptying the buffer.
 *
 * @note If the number of bits in the buffer is > 0, the number of bytes
 * written is always equal to the size of the buffer, regardless of
 * whether the buffer was full or not.
 *
 * @param bs the backend to be flushed, cast to void *.
 *
 * @return 0 on success, -1 on error.
 *
 * @see file_obs_flush_partial
 */
int file_obs_flush(void *bs);

/**
 * Flushes contents of a backend's internal buffer to the file associated
 * with it, emptying the buffer.
 *
 * @note In contrast to file_obs_flush(), only the bits in the internal
 * buffer and at most 7 padding bits are written. The write still counts
 * as one block in the backend's blocks_written counter.
 *
 * @param bs the backend to be flushed, cast to void *.
 *
 * @return 0 on success, -1 on error.
 *
 * @see file_obs_flush
 */
int file_obs_flush_partial(void *bs);

/**
 * Cuts the size of the buffer by half. Can flush some data to the file
 * in case there was too much of it for the new size. Does nothing when
 * the buffer is already one byte long.
 * @param bs the backend to be shrunk.
 *
 * @return 0 on success, -1 on error.
 */
int file_obs_halve_block(struct file_obs *bs);

/**
 * Adds a single bit to this stream. Does not flush the buffer when it
 * becomes full, and does not check whether there is room left; the caller
 * is responsible for flushing in time.
 * @note This function returns void, whereas the add_bit member of
 * struct obs is declared as returning int, so it does not match that
 * function pointer type as-is; file_obs_add_bit() is the variant
 * installed by file_obs_set_backend().
 * @param bs The backend to write the bit to, cast to void *.
 * @param x Indicates whether to clear or set the bit.
 */
void file_obs_add_bit_noflush(void *bs, int x);

/**
 * Same as file_obs_add_bit_noflush(), except that this function flushes
 * the buffer when it becomes full.
 * @note This function can be used as an implementation of #obs.add_bit
 * (and in fact is).
 * @param bs The backend to write the bit to, cast to void *.
 * @param x Indicates whether to clear or set the bit.
 *
 * @return 0 on success, -1 on error.
 */
int file_obs_add_bit(void *bs, int x);
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <sakura/common/bs.h> + +void bitset_arena_create_dummy(bitset_arena *this) +{ + assert(this != NULL); + this->arena = NULL; + this->private_arena = NULL; +} + +int bitset_arena_create(bitset_arena *this, size_t num_bits, + struct marena *arena) +{ + if (arena) { + this->arena = arena; + this->private_arena = false; + } else { + this->arena = malloc(sizeof *this->arena); + if (this->arena == NULL) + return -1; + this->private_arena = true; + marena_create(this->arena); + } + this->num_bits = num_bits; + this->num_units = (num_bits + BS_BITS_PER_UNIT - 1) / BS_BITS_PER_UNIT; + this->num_bytes = this->num_units * sizeof(bitset_unit); + return 0; +} + +void bitset_arena_destroy(bitset_arena *this) +{ + assert(this != NULL); + /* detect if we've been constructed in the first place */ + if (this->arena && this->private_arena) { + marena_destroy(this->arena); + free(this->arena); + } +} + +bitset bitset_arena_alloc(bitset_arena *this) +{ + bitset bs; + assert(this != NULL); + bs = 
marena_alloc(this->arena, this->num_bytes); + if (bs == NULL) + return NULL; + memset(bs, 0, this->num_bytes); + return bs; +} + +bitset bitset_arena_alloc_ones(bitset_arena *this) +{ + bitset bs; + assert(this != NULL); + bs = marena_alloc(this->arena, this->num_bytes); + if (bs == NULL) + return NULL; + memset(bs, 255, this->num_bytes); + return bs; +} + +/* results are placed in bs1 */ +bitset bitset_arena_union(struct bitset_arena *this, bitset bs1, + const_bitset bs2) +{ + bitset_unit *dest; + const bitset_unit *src; + const bitset_unit *end; + + assert(this != NULL); + assert(bs1 != NULL); + assert(bs2 != NULL); + dest = bs1; + src = bs2; + end = bs2 + this->num_units; + while (src < end) { + *dest |= *src; + ++dest; + ++src; + } + return bs1; +} + +/* results are placed in bs1 */ +bitset bitset_arena_intersect(struct bitset_arena *this, bitset bs1, + const_bitset bs2) +{ + bitset_unit *dest; + const bitset_unit *src; + const bitset_unit *end; + + assert(this != NULL); + assert(bs1 != NULL); + assert(bs2 != NULL); + dest = bs1; + src = bs2; + end = bs2 + this->num_units; + while (src < end) { + *dest &= *src; + ++dest; + ++src; + } + return bs1; +} + +bitset bitset_arena_copy(bitset_arena *this, const_bitset bs) +{ + bitset result; + assert(this != NULL); + assert(bs != NULL); + result = bitset_arena_alloc(this); + if (result == NULL) + return NULL; + memcpy(result, bs, this->num_bytes); + return result; +} + +bitset bitset_arena_copy_to(bitset_arena *this, const_bitset from, bitset to) +{ + assert(this != NULL); + assert(from != NULL); + assert(to != NULL); + memcpy(to, from, this->num_bytes); + return to; +} + +void bitset_arena_print(const bitset_arena *this, const_bitset bs, + FILE *stream) +{ + assert(this != NULL); + if (bs != NULL) { + size_t i; + int items = 0; + fprintf(stream, "{ "); + for (i = 0; i < this->num_bits; ++i) + if (bitset_arena_get(this, bs, i)) { + fprintf(stream, "%" PRIuSIZE " ", i); + ++items; + } + fprintf(stream, __("} (%d 
item)\n", "} (%d items)\n", items), items); + } else + fputs("(null)\n", stream); +} + +size_t bitset_count_ones(const bitset_arena *this, const_bitset bs) +{ + size_t i, items = 0; + assert(this != NULL); + for (i = 0; i < this->num_bits; ++i) + if (bitset_arena_get(this, bs, i)) + items++; + return items; +} + +int bitset_arena_get(const bitset_arena *this, const_bitset bs, size_t i) +{ + assert(this != NULL); + assert(bs != NULL); + assert(i < this->num_bits); + return bs[i / BS_BITS_PER_UNIT] & ((bitset_unit)1 << (i % BS_BITS_PER_UNIT)) ? 1 : 0; +} + +void bitset_arena_set(const bitset_arena *this, bitset bs, size_t i) +{ + assert(this != NULL); + assert(bs != NULL); + assert(i < this->num_bits); + bs[i / BS_BITS_PER_UNIT] |= ((bitset_unit)1 << (i % BS_BITS_PER_UNIT)); +} + +void bitset_arena_clear(const bitset_arena *this, bitset bs, size_t i) +{ + assert(this != NULL); + assert(bs != NULL); + assert(i < this->num_bits); + bs[i / BS_BITS_PER_UNIT] &= ~((bitset_unit)1 << (i % BS_BITS_PER_UNIT)); +} + +void bitset_arena_fill(struct bitset_arena *this, bitset bs, int value) +{ + assert(this != NULL); + assert(bs != NULL); + memset(bs, value ? 255 : 0, this->num_bytes); +} + +void bitset_arena_flip(const bitset_arena *this, bitset bs, size_t i) +{ + assert(this != NULL); + assert(bs != NULL); + assert(i < this->num_bits); + bs[i / BS_BITS_PER_UNIT] ^= ~((bitset_unit)1 << (i % BS_BITS_PER_UNIT)); +} + +int bitset_arena_compare(const bitset_arena *this, const_bitset bs1, + const_bitset bs2) +{ + assert(this != NULL); + assert(bs1 != NULL); + assert(bs2 != NULL); + return memcmp(bs1, bs2, this->num_bytes); +} diff --git a/poliqarp-library/sakura/common/bs.h b/poliqarp-library/sakura/common/bs.h new file mode 100644 index 0000000000000000000000000000000000000000..0296bb9036ea5028f5b94169901a038326c9f906 --- /dev/null +++ b/poliqarp-library/sakura/common/bs.h @@ -0,0 +1,197 @@ +/* + * This file is part of the Poliqarp suite. 
typedef uint_fast32_t bitset_unit;       /**< The type of memory unit for bitsets. */
typedef bitset_unit *bitset;             /**< A bitset is an array of memory units. */
typedef const bitset_unit *const_bitset; /**< A constant bitset. */

#define BS_BITS_PER_UNIT (sizeof (bitset_unit) * CHAR_BIT)
#define BS_NONE -1

/** Bitset arena structure. */
typedef struct bitset_arena {
    size_t num_bits;      /**< Size of each block (in bits actually used). */
    size_t num_bytes;     /**< Size of each block (in bytes allocated). */
    size_t num_units;     /**< Size of each block (in units). */
    struct marena *arena; /**< Memory source. */
    bool private_arena;   /**< True if arena is private and should be freed. */
} bitset_arena;

/**
 * Create a dummy bitset arena object. The only legal operation on such an
 * arena is to destroy it.
 */
void bitset_arena_create_dummy(bitset_arena *this);

/**
 * Creates the bitset arena object.
 * From now on bitset_arena_alloc() can be called to obtain a new bitset.
 * Every bitset has the same size. The memory for each subsequent
 * bitset_arena_alloc() call is obtained from the specified arena object.
 *
 * @param this The bitset arena to initialize.
 * @param num_bits Size of each set in bits.
 * @param arena Memory arena to use as a backing storage. If it is NULL,
 * the bitset arena allocates a private arena for its own use.
 *
 * @return 0 on success, -1 on error.
 */
int bitset_arena_create(bitset_arena *this, size_t num_bits,
    struct marena *arena);

/**
 * Destructor.
 */
void bitset_arena_destroy(bitset_arena *this);

/**
 * Allocate a new, empty bitset of the size specified when creating this
 * arena.
 *
 * @return The allocated bitset or NULL.
 */
bitset bitset_arena_alloc(bitset_arena *this);

/**
 * Allocate a new, full bitset of the size specified when creating this
 * arena.
 *
 * @return The allocated bitset or NULL.
 */
bitset bitset_arena_alloc_ones(bitset_arena *this);

/**
 * Create a union of two bitsets of the same size.
 * The union is performed by bitwise or-ing the memory area; results
 * are placed in the first bitset.
 *
 * @return The first bitset.
 */
bitset bitset_arena_union(bitset_arena *this, bitset bs1, const_bitset bs2);

/**
 * Create an intersection of two bitsets of the same size.
 * The intersection is performed by bitwise and-ing the memory area;
 * results are placed in the first bitset.
 *
 * @return The first bitset.
 */
bitset bitset_arena_intersect(bitset_arena *this, bitset bs1, const_bitset bs2);

/**
 * Allocate a new bitset and copy all contents from the specified bitset.
 *
 * @return The allocated bitset or NULL.
 */
bitset bitset_arena_copy(bitset_arena *this, const_bitset bs);

/**
 * Copy contents of a bitset to another already allocated bitset.
 *
 * @return The destination bitset.
 */
bitset bitset_arena_copy_to(bitset_arena *this, const_bitset from, bitset to);

/**
 * Print the indices of all existing objects.
 * The printout looks like this:
 *    { 1 3 4 6 } (4 items)
 */
void bitset_arena_print(const bitset_arena *this, const_bitset bs,
    FILE *stream);

/**
 * Calculate number of bits set to one in the given bitset.
 */
size_t bitset_count_ones(const bitset_arena *this, const_bitset bs);

/**
 * Get bit.
 *
 * @return 0 or 1.
 */
int bitset_arena_get(const bitset_arena *arena, const_bitset bs, size_t i);

/**
 * Set bit (set to 1).
 */
void bitset_arena_set(const bitset_arena *arena, bitset bs, size_t i);

/**
 * Clear bit (set to 0).
 */
void bitset_arena_clear(const bitset_arena *arena, bitset bs, size_t i);

/**
 * Set all bits to 0 or 1.
 */
void bitset_arena_fill(struct bitset_arena *arena, bitset bs, int value);

/**
 * Flip bit (set to opposite value).
 */
void bitset_arena_flip(const bitset_arena *arena, bitset bs, size_t i);

/**
 * Compares the memory of two bitsets.
 *
 * @return The result of memcmp() over the allocated bytes: zero iff the
 * two given bitsets are equal (that is, contain exactly the same bits),
 * non-zero otherwise.
 */
int bitset_arena_compare(const bitset_arena *arena, const_bitset bs1,
    const_bitset bs2);

/**
 * Find the first set bit at or after the given index.
 * The iterator works as follows:
 *
 * @code
 * for (index = 0; (index = bitset_arena_iterate(this, bs, index)) != (size_t)-1; ++index)
 *    printf("object at index %" PRIuSIZE " exists\n", index);
 * @endcode
 *
 * @return The index of the bit found, or (size_t)-1 if no bit is set in
 * the range from index up to the end of the bitset.
 */
static inline size_t bitset_arena_iterate(const bitset_arena *this,
    const_bitset bs, size_t index)
{
    size_t pos;

    for (pos = index; pos < this->num_bits; ++pos) {
        if (bitset_arena_get(this, bs, pos) != 0)
            return pos;
    }
    return (size_t)-1;
}
+ */ + +#include <sakura/common/entity.h> +#include <errno.h> + +int create_entity_bag(struct entity_bag *this, struct marena *arena) +{ + assert(this != NULL); + this->first_entity = NULL; + this->num_items = 0; + if (arena) { + this->local_arena = false; + this->arena = arena; + } else { + this->local_arena = true; + this->arena = malloc(sizeof *this->arena); + if (this->arena == NULL) + return -1; + marena_create(this->arena); + } + return 0; +} + +void destroy_entity_bag(struct entity_bag *this) +{ + assert(this != NULL); + if (this->arena != NULL && this->local_arena) { + marena_destroy(this->arena); + free(this->arena); + } +} + +/* entity construction */ +struct entity *new_entity(struct entity_bag *this, const char *name, void *tag, + void *data) +{ + struct entity *last = NULL; + struct entity *current; + struct entity *result; + int cmp_result; + + assert(this != NULL); + for (current = this->first_entity; current != NULL; + current = current->next_entity) + { + cmp_result = strcmp(current->name, name); + if (cmp_result == 0) { + errno = EEXIST; + return NULL; + } + if (cmp_result > 0) + break; /* keep ordering */ + last = current; + } + + /* create new entity and link it with the rest */ + result = marena_alloc(this->arena, sizeof *result); + if (result == NULL) + return NULL; + + result->next_entity = current; + /* either NULL or last->next_entity so it's OK */ + + result->name = marena_strdup(this->arena, name); + if (result->name == NULL) + return NULL; + result->tag = tag; + result->data = data; + + if (last) + last->next_entity = result; + else + this->first_entity = result; + + /* count me */ + ++this->num_items; + + return result; +} + +/* entity lookup */ +struct entity *lookup_entity(struct entity_bag *this, const char *name) +{ + struct entity *entity; + int cmp_result; + + assert(this != NULL); + for (entity = this->first_entity; entity != NULL; + entity = entity->next_entity) + { + cmp_result = strcmp(entity->name, name); + if (cmp_result > 
0) { + /* ordering failed, what we're looking for isn't here */ + errno = ENOENT; + return NULL; + } + if (cmp_result != 0) + continue; + /* unalias */ + while (entity && entity->tag == entity) + entity = entity->data; + return entity; + } + errno = ENOENT; + return NULL; +} + +/* entity lookup */ +const struct entity *lookup_const_entity(const struct entity_bag *this, + const char *name) +{ + return lookup_entity((struct entity_bag *)this, name); +} + +/* entity aliasing */ +struct entity *new_entity_alias(struct entity_bag *this, struct entity *entity, + const char *name) +{ + struct entity *result; + result = new_entity(this, name, NULL, entity); + if (result) + result->tag = result; + return result; +} diff --git a/poliqarp-library/sakura/common/entity.h b/poliqarp-library/sakura/common/entity.h new file mode 100644 index 0000000000000000000000000000000000000000..94d1a1026208a4210e28deb014c112625f9bcefe --- /dev/null +++ b/poliqarp-library/sakura/common/entity.h @@ -0,0 +1,125 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
/**
 * A named entity.
 * @note When this->tag == this, entity is an alias and this->data is the
 * aliased entity.
 */
struct entity {
    struct entity *next_entity; /**< Entities are assembled together into a
                                     singly-linked list. */
    char *name;                 /**< Name of the entity. */
    void *tag;                  /**< Metadata mechanism. */
    void *data;                 /**< Data pointer. */
};

/**
 * Bag of entities.
 */
struct entity_bag {
    struct entity *first_entity; /**< First element of the list of entities in
                                      this bag. */
    size_t num_items;            /**< Number of items in this bag. */
    struct marena *arena;        /**< Memory arena used as a backing storage. */
    bool local_arena;            /**< If true, the arena is local for this bag
                                      and should be destroyed when the bag
                                      gets freed. */
};

/**
 * Creates an empty entity bag.
 * @param this The structure to be initialized.
 * @param arena Memory arena used as a backing storage. If this is NULL,
 * the bag will allocate an arena for its own use.
 * @return 0 on success, -1 on error.
 */
int create_entity_bag(struct entity_bag *this, struct marena *arena);

/**
 * Destroys an entity bag and all entities contained in it.
 * @param this The bag to be destroyed.
 */
void destroy_entity_bag(struct entity_bag *this);

/**
 * Constructs and returns a new entity.
 * @param this The bag that will contain the entity.
 * @param name Name of the entity.
 * @param tag Metadata for the entity.
 * @param data Data for the entity.
 * @return The newly-created entity, or NULL on failure. If an entity with
 * the given name already exists in the bag, errno is set to EEXIST.
 */
struct entity *new_entity(struct entity_bag *this, const char *name, void *tag,
    void *data);

/**
 * Searches for an entity in the entity bag. If the name refers to an
 * alias, the aliased entity is returned rather than the alias itself.
 * @param this The bag to search in.
 * @param name Name of the entity to search for.
 * @return The matching entity if found, or NULL (with errno set to ENOENT)
 * if there is no such entity.
 */
struct entity *lookup_entity(struct entity_bag *this, const char *name);

/**
 * Same as lookup_entity, except that the parameters must be constant.
 */
const struct entity *lookup_const_entity(const struct entity_bag *this,
    const char *name);

/**
 * Creates an alias for the existing entity.
 * @param this The bag that will contain the alias.
 * @param entity The entity that the alias will point to.
 * @param name Name of the new alias.
 * @return The resulting alias, or NULL if it could not be created.
 */
struct entity *new_entity_alias(struct entity_bag *this, struct entity *entity,
    const char *name);
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/* FIXME: Way too many #ifdefs in here! */ + +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#ifndef _WIN32 +#include <unistd.h> +#endif +#if defined _POSIX_MAPPED_FILES +#include <sys/mman.h> +#endif +#include <sakura/common/file-map.h> +#include <sakura/common/system-error.h> + +/** Cleanup mode used by destroy_file_map. 
*/
+enum file_map_cleanup_mode {
+   CLEANUP_NONE,   /* nothing to release; destroy_file_map() returns at once */
+   CLEANUP_MMAP,   /* image obtained with mmap(); released with munmap() + close() */
+   CLEANUP_WINMAP  /* image obtained with MapViewOfFileEx(); released with Win32 calls */
+};
+
+/**
+ * Put the map into the "nothing to release" state so that a later
+ * destroy_file_map() is harmless.
+ */
+void init_file_map(struct file_map *this)
+{
+   this->cleanup = CLEANUP_NONE;
+}
+
+/**
+ * Open `pathname` for reading and store its size in this->size.
+ * On Windows the handle goes to this->hfile, with POSIX mapped files the
+ * descriptor goes to this->fd.  When neither interface is compiled in, the
+ * function always fails.
+ *
+ * @return 0 on success, -1 on failure.
+ */
+static int prepare_mapped_file(struct file_map *this, const char *pathname)
+{
+#if defined _WIN32
+   this->hfile = CreateFile(pathname, GENERIC_READ, FILE_SHARE_READ, NULL,
+      OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+   if (this->hfile == INVALID_HANDLE_VALUE)
+   {
+      set_errno_from_last_error();
+      goto err;
+   }
+   this->size = GetFileSize(this->hfile, NULL);
+   return 0;
+#elif defined _POSIX_MAPPED_FILES
+   struct stat stat_data;
+   int fd;
+   int flags = O_RDONLY;
+
+   if ((fd = open(pathname, flags)) < 0)
+      goto err;
+   if (fstat(fd, &stat_data) != 0)
+      goto err_stat;
+   this->size = stat_data.st_size;
+   this->fd = fd;
+   return 0;
+
+err_stat:
+   {
+      /* report the fstat() failure, not whatever close() might set */
+      int my_errno = errno;
+      close(fd);
+      errno = my_errno;
+   }
+#endif
+err:
+   return -1;
+}
+
+/**
+ * Map the descriptor prepared by prepare_mapped_file() with mmap().
+ * Compiled to a plain failure on Windows and on systems without
+ * _POSIX_MAPPED_FILES.
+ *
+ * @return 0 on success (image and cleanup mode are set), -1 otherwise.
+ */
+static int file_map__mmap(struct file_map *this)
+{
+#ifndef _WIN32
+#if defined _POSIX_MAPPED_FILES
+   void *image;
+
+   image = mmap(NULL, this->size, PROT_READ, MAP_SHARED, this->fd, 0L);
+   if (image == MAP_FAILED)
+      goto err;
+
+   this->image = image;
+   this->cleanup = CLEANUP_MMAP;
+   return 0;
+
+   /* cleanup code */
+
+err:
+   this->cleanup = CLEANUP_NONE;
+#else
+#warning "Not using memory mapping interface of any kind!"
+#endif
+#endif
+   return -1;
+}
+
+/**
+ * Map the handle prepared by prepare_mapped_file() with the Win32 API.
+ * On failure the file handle is closed here.  Outside Windows this is a
+ * plain failure that touches nothing.
+ *
+ * @return 0 on success (handles, image and cleanup mode are set), -1 otherwise.
+ */
+static int file_map__windows(struct file_map *this, const char *pathname)
+{
+#ifdef _WIN32
+   HANDLE hmap;
+   void *image;
+
+   hmap = CreateFileMapping(this->hfile, NULL, PAGE_READONLY, 0, this->size,
+      NULL);
+   if (!hmap)
+   {
+      set_errno_from_last_error();
+      goto err_file;
+   }
+
+   image = MapViewOfFileEx(hmap, FILE_MAP_READ, (DWORD) 0, (DWORD) 0, 0, NULL);
+   if (!image)
+   {
+      set_errno_from_last_error();
+      goto err_file_map;
+   }
+
+   this->hmap = hmap;
+   this->image = image;
+   this->cleanup = CLEANUP_WINMAP;
+   return 0;
+
+   /* cleanup code */
+
+err_file_map:
+   CloseHandle(hmap);
+err_file:
+   CloseHandle(this->hfile);
+   this->cleanup = CLEANUP_NONE;
+#endif
+   return -1;
+}
+
+/**
+ * Create a read-only view of `pathname`.
+ *
+ * On failure the map is left in a state that is safe to pass to
+ * destroy_file_map(): image is NULL, size is 0 and the cleanup mode is
+ * CLEANUP_NONE, even if the caller never called init_file_map().
+ *
+ * @return 0 on success, -1 on failure.
+ */
+int create_file_map(struct file_map *this, const char *pathname)
+{
+   if (prepare_mapped_file(this, pathname) != 0)
+      goto err;
+   if (file_map__mmap(this) == 0 || file_map__windows(this, pathname) == 0)
+      return 0;
+#ifdef _POSIX_MAPPED_FILES
+   {
+      /* keep the errno of the failed mapping across close() */
+      int my_errno;
+      my_errno = errno;
+      close(this->fd);
+      errno = my_errno;
+   }
+#endif
+err:
+   this->image = NULL;
+   this->size = 0;
+   this->cleanup = CLEANUP_NONE;
+   return -1;
+}
+
+/**
+ * Release whatever create_file_map() acquired, according to the recorded
+ * cleanup mode, and reset the mode so that a second call does nothing.
+ */
+void destroy_file_map(struct file_map *this)
+{
+   if (this->cleanup == CLEANUP_NONE)
+      return;
+   switch (this->cleanup) {
+      case CLEANUP_MMAP:
+#ifdef _POSIX_MAPPED_FILES
+         munmap(this->image, this->size);
+         close(this->fd);
+#endif
+         break;
+      case CLEANUP_WINMAP:
+#ifdef _WIN32
+         UnmapViewOfFile(this->image);
+         CloseHandle(this->hmap);
+         CloseHandle(this->hfile);
+#endif
+         break;
+      default:
+         abort(); /* Should not happen. */
+         break;
+   }
+   this->cleanup = CLEANUP_NONE;
+}
+
+/** Return the start of the mapped image (NULL after a failed create). */
+const void *get_file_image(const struct file_map *this)
+{
+   return this->image;
+}
+
+/** Return the size recorded for the mapped file (0 after a failed create). */
+size_t get_file_size(const struct file_map * this)
+{
+   return this->size;
+}
diff --git a/poliqarp-library/sakura/common/file-map.h b/poliqarp-library/sakura/common/file-map.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7101fd056d906a25912203a0d2fee49fdd43c01
--- /dev/null
+++ b/poliqarp-library/sakura/common/file-map.h
@@ -0,0 +1,85 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+/**
+ * @file file-map.h
+ * @brief Cross-platform memory mapping routines.
+ *
+ * This file defines structures and functions that provide a consistent wrapper
+ * API around mmap (Linux) or MapViewOfFileEx (Win32).
+ */
+
+#ifndef FILE_MAP_H
+#define FILE_MAP_H
+
+#include <stdlib.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+/**
+ * Portable mmap replacement/wrapper.
+ * Allows to map a file if such a function is provided by the OS.
+ */
+struct file_map {
+#ifdef _WIN32
+   HANDLE hfile;  /**< Windows file handle. */
+   HANDLE hmap;   /**< Windows file map handle. */
+#endif
+   void *image;   /**< File image; NULL after a failed create_file_map(). */
+   size_t size;   /**< File size; 0 after a failed create_file_map(). */
+   int fd;        /**< File descriptor (set by the POSIX mapping path). */
+   int cleanup;   /**< Cleanup mode (an enum file_map_cleanup_mode value
+                       from file-map.c). */
+};
+
+/**
+ * Initialize file map so that it's safe to destroy it.
+ * It doesn't contain anything, any method other than destroy will produce
+ * undefined behaviour.
+ */
+void init_file_map(struct file_map *this);
+
+/**
+ * Create read-only view of a file.
+ * The file has to exist and be readable.
+ *
+ * @return 0 on success, -1 on failure.
+ */
+int create_file_map(struct file_map *this, const char *pathname);
+
+/**
+ * Destroy file map.
+ */
+void destroy_file_map(struct file_map *this);
+
+/**
+ * Return pointer to contents of mapped file.
+ */
+const void *get_file_image(const struct file_map *this);
+
+/**
+ * Return the size of mapped file.
+ */
+size_t get_file_size(const struct file_map *this);
+
+#endif /* FILE_MAP_H */
diff --git a/poliqarp-library/sakura/common/file-reader.c b/poliqarp-library/sakura/common/file-reader.c
new file mode 100644
index 0000000000000000000000000000000000000000..6f794f430821b7f63c4b9dfbeb32c3c7f6f0e5ee
--- /dev/null
+++ b/poliqarp-library/sakura/common/file-reader.c
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <poliqarp-config.h>
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sakura/common/file-reader.h>
+
+/* Address of the y-th element of size x, counted from ptr. */
+#define SHIFT(ptr, x, y) ((void *)(((char *)(ptr)) + (x) * (y)))
+
+typedef void (*cleanup_t)(void *);
+
+#ifndef USE_EXTENSIVE_MMAP
+/**
+ * Position the underlying file at item `pos` and refill the sequential
+ * buffer from there.  The file access is done under ns_mutex unless the
+ * build is single-threaded.
+ */
+static void file_reader_rewind(struct file_reader *reader, off64_t pos)
+{
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_cleanup_push((cleanup_t)pthread_mutex_unlock,
+      (void *)&reader->ns_mutex);
+   pthread_mutex_lock(&reader->ns_mutex);
+#endif
+   assert(pos >= 0);
+   fseeko(reader->file, pos * reader->item_size, SEEK_SET);
+   reader->seq_start = pos;
+   reader->seq_offset = 0;
+   fread(reader->buffer_seq, reader->item_size, reader->buffer_size,
+      reader->file);
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_cleanup_pop(1);
+#endif
+}
+
+/**
+ * Slot i of last_cache is laid out as an off64_t item offset followed by
+ * item_size bytes of data; this returns the offset part.
+ */
+static inline off64_t *last_cache_offset(struct file_reader *reader, size_t i)
+{
+   return (off64_t *)SHIFT(reader->last_cache, reader->item_size + sizeof(off64_t), i);
+}
+
+/** Data part of slot i of last_cache (see last_cache_offset()). */
+static inline void *last_cache_data(struct file_reader *reader, size_t i)
+{
+   return (void *)((char *)(last_cache_offset(reader, i)) + sizeof(off64_t));
+}
+#endif
+
+/**
+ * Open `filename` and set up `reader` for items of `item_size` bytes.
+ *
+ * With USE_EXTENSIVE_MMAP the file is mapped; otherwise it is opened with
+ * fopen() and three buffers of 1024 items each are allocated.
+ *
+ * @return 0 on success, -1 on failure.  On failure nothing stays acquired:
+ *         the file is closed, the buffers are freed and the mutex is
+ *         destroyed.  An item_size of 0 is rejected, since the item count
+ *         is computed by dividing by it.
+ */
+int file_reader_create(struct file_reader *reader, const char *filename,
+   size_t item_size)
+{
+#ifdef USE_EXTENSIVE_MMAP
+   struct file_map map;
+
+   if (item_size == 0)
+      return -1;
+   if (create_file_map(&map, filename) != 0)
+      return -1;
+   reader->map = map;
+   reader->item_size = item_size;
+   reader->num_items = get_file_size(&map) / item_size;
+   reader->seq_start = reader->seq_offset = 0;
+   return 0;
+#else
+   size_t i;
+   FILE *file;
+
+   if (item_size == 0)
+      return -1;
+   file = fopen(filename, "rb");
+   if (file == NULL)
+      return -1;
+   reader->file = file;
+   reader->item_size = item_size;
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_mutex_init(&reader->ns_mutex, NULL);
+#endif
+   reader->buffer_size = 1024;
+   reader->buffer_seq = (void *)malloc(item_size * reader->buffer_size);
+   if (reader->buffer_seq == NULL)
+      goto err_at_seq;
+   reader->buffer_rnd = (void *)malloc(item_size * reader->buffer_size);
+   if (reader->buffer_rnd == NULL)
+      goto err_at_rnd;
+   reader->last_cache = (void *)malloc((item_size + sizeof(off64_t)) *
+      reader->buffer_size);
+   if (reader->last_cache == NULL)
+      goto err_at_lc;
+   /* -1 marks a cache slot that holds no item yet */
+   for (i = 0; i < reader->buffer_size; i++)
+      *last_cache_offset(reader, i) = -1;
+   fseeko(file, 0, SEEK_END);
+   reader->num_items = ftello(file) / item_size;
+   file_reader_rewind(reader, 0);
+   reader->rnd_start = -1;
+   reader->need_seek = false;
+   return 0;
+err_at_lc:
+   free(reader->buffer_rnd);
+err_at_rnd:
+   free(reader->buffer_seq);
+err_at_seq:
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_mutex_destroy(&reader->ns_mutex);
+#endif
+   fclose(file);
+   return -1;
+#endif
+}
+
+/**
+ * Release everything file_reader_create() acquired.
+ *
+ * @return always 0.
+ */
+int file_reader_destroy(struct file_reader *reader)
+{
+#ifdef USE_EXTENSIVE_MMAP
+   destroy_file_map(&reader->map);
+#else
+   free(reader->buffer_seq);
+   free(reader->buffer_rnd);
+   free(reader->last_cache);
+   fclose(reader->file);
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_mutex_destroy(&reader->ns_mutex);
+#endif
+#endif
+   return 0;
+}
+
+/**
+ * Make item `pos` the one returned by the next file_reader_next().
+ * In buffered mode a position inside the current sequential buffer only
+ * moves the offset; any other position refills the buffer.
+ */
+void file_reader_seek(struct file_reader *reader, off64_t pos)
+{
+   assert(pos >= 0);
+#ifdef USE_EXTENSIVE_MMAP
+   reader->seq_start = pos;
+#else
+   if (pos >= reader->seq_start &&
+       pos < reader->seq_start + reader->buffer_size)
+   {
+      reader->seq_offset = pos - reader->seq_start;
+   } else {
+      file_reader_rewind(reader, pos);
+   }
+#endif
+}
+
+/**
+ * Return the item at the current sequential position and advance by one.
+ *
+ * @return pointer to the item, or NULL once the position reaches num_items.
+ *         Both build variants now behave this way; the mapped variant used
+ *         to return the same item forever.
+ */
+const void *file_reader_next(struct file_reader *reader)
+{
+#ifdef USE_EXTENSIVE_MMAP
+   const char *image = get_file_image(&reader->map);
+
+   if (reader->seq_start >= reader->num_items)
+      return NULL;
+   return image + (reader->seq_start++) * reader->item_size;
+#else
+   void *result;
+   if (reader->seq_start + reader->seq_offset >= reader->num_items)
+      return NULL;
+   if (reader->seq_offset == reader->buffer_size) {
+      /* sequential buffer exhausted: slide the window and refill it */
+      reader->seq_start += reader->seq_offset;
+      reader->seq_offset = 0;
+#ifndef POLIQARP_SINGLE_THREADED
+      pthread_cleanup_push((cleanup_t)pthread_mutex_unlock,
+         (void *)&reader->ns_mutex);
+      pthread_mutex_lock(&reader->ns_mutex);
+#endif
+      /* a random access moved the file position in the meantime */
+      if (reader->need_seek) {
+         reader->need_seek = false;
+         assert(reader->seq_start >= 0);
+         fseeko(reader->file, reader->seq_start * reader->item_size, SEEK_SET);
+      }
+      fread(reader->buffer_seq, reader->item_size, reader->buffer_size,
+         reader->file);
+#ifndef POLIQARP_SINGLE_THREADED
+      pthread_cleanup_pop(1);
+#endif
+   }
+   result = SHIFT(reader->buffer_seq, reader->item_size, reader->seq_offset++);
+   return result;
+#endif
+}
+
+/**
+ * Return item `i` without touching the sequential position.
+ *
+ * In buffered mode the lookup order is: the per-slot cache of recently
+ * returned items, then the random-access buffer, then the file itself
+ * (which refills the random-access buffer starting at `i`).
+ */
+const void *file_reader_get(struct file_reader *reader, off64_t i)
+{
+#ifdef USE_EXTENSIVE_MMAP
+   return (const char *)get_file_image(&reader->map) + i * reader->item_size;
+#else
+   off64_t delta = i - reader->rnd_start;
+   size_t cacheofs = i % reader->buffer_size;
+   void *retval;
+
+   assert(i >= 0);
+
+   if (*last_cache_offset(reader, cacheofs) == i)
+      return last_cache_data(reader, cacheofs);
+   if (reader->rnd_start == -1 || delta < 0 || delta >= reader->buffer_size)
+   {
+#ifndef POLIQARP_SINGLE_THREADED
+      pthread_cleanup_push((cleanup_t)pthread_mutex_unlock,
+         (void *)&reader->ns_mutex);
+      pthread_mutex_lock(&reader->ns_mutex);
+#endif
+      assert(i >= 0);
+      fseeko(reader->file, i * reader->item_size, SEEK_SET);
+      reader->rnd_start = i;
+      delta = 0;
+      fread(reader->buffer_rnd, reader->item_size, reader->buffer_size,
+         reader->file);
+      /* the sequential reader must re-seek before its next refill */
+      reader->need_seek = true;
+#ifndef POLIQARP_SINGLE_THREADED
+      pthread_cleanup_pop(1);
+#endif
+   }
+   retval = SHIFT(reader->buffer_rnd, reader->item_size, delta);
+   *last_cache_offset(reader, cacheofs) = i;
+   memcpy(last_cache_data(reader, cacheofs), retval, reader->item_size);
+   return retval;
+#endif
+}
diff --git a/poliqarp-library/sakura/common/file-reader.h b/poliqarp-library/sakura/common/file-reader.h
new file mode 100644
index 
0000000000000000000000000000000000000000..ec99c6f6043223ee7a3c6eeb04ffa60e7e890cd1
--- /dev/null
+++ b/poliqarp-library/sakura/common/file-reader.h
@@ -0,0 +1,127 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+/**
+ * @file file-reader.h
+ * @brief Lightweight mmap() workalike.
+ *
+ * The memory view of files provided by mmap() and lookalikes (see file-map.h)
+ * is appropriate when one needs random access to files. However, there are
+ * files that Poliqarp processes mostly sequentially, occasionally skipping
+ * some parts of the file and only rarely needing a random access. Moreover,
+ * the two kinds of access are performed simultaneously by different threads.
+ * Hence the need for a structure that provides easy access to file elements
+ * while at the same time allowing random access. Such a structure, called
+ * a file reader, and operations on it are what this header defines.
+ */
+
+#ifndef FILE_READER_H
+#define FILE_READER_H
+
+#include <poliqarp-config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#ifndef POLIQARP_SINGLE_THREADED
+#include <pthread.h>
+#endif
+
+#ifdef USE_EXTENSIVE_MMAP
+#include <sakura/common/file-map.h>
+#endif
+
+/**
+ * A lightweight memory mapping object.
+ */
+struct file_reader {
+#ifdef USE_EXTENSIVE_MMAP
+   struct file_map map;      /**< The underlying file map. */
+#else
+   FILE *file;               /**< The underlying file object. */
+   size_t buffer_size;       /**< Size of the caches, as number of items. */
+   void *buffer_seq;         /**< Cache for sequential access. */
+   void *buffer_rnd;         /**< Cache for random access. */
+   void *last_cache;         /**< Cache of recently fetched elements. */
+   off64_t rnd_start;        /**< Offset of the element corresponding to
+                                  current start of random cache. */
+   bool need_seek;           /**< Do we need to do a fseeko()? */
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_mutex_t ns_mutex; /**< Thread synchronizer. */
+#endif
+#endif
+   size_t item_size;         /**< Size of a single item, in bytes. */
+   off64_t num_items;        /**< Number of items in this file. */
+   off64_t seq_start;        /**< Offset of the element corresponding to
+                                  current start of sequential cache. */
+   off64_t seq_offset;       /**< Offset of the 'current' position in the
+                                  sequential cache. */
+};
+
+/**
+ * Creates a file reader for the specified file.
+ * @param filename Path of the file to be mapped.
+ * @param item_size Size of a single item in the file, in bytes.
+ * @return 0 upon successful creation, a nonzero value on failure.
+ * @note The specified file must exist, be readable; moreover, its size
+ * should be divisible by item_size.
+ */
+int file_reader_create(struct file_reader *reader, const char *filename,
+   size_t item_size);
+
+/**
+ * Frees resources assigned to a file reader, closing the file.
+ */
+int file_reader_destroy(struct file_reader *reader);
+
+/**
+ * Returns next element in the file.
+ * @note This routine pertains to sequential access.
+ */
+const void *file_reader_next(struct file_reader *reader);
+
+/**
+ * Causes the specified element to be returned by next call of
+ * file_reader_next(). Sequential reading of elements will continue from that
+ * element.
+ * @param pos Offset of the element in the file.
+ * @note This routine pertains to sequential access.
+ */
+void file_reader_seek(struct file_reader *reader, off64_t pos);
+
+/**
+ * Returns the specified element in the file. Doesn't influence the
+ * 'current' position of sequential access.
+ * @param i Offset of the element in the file.
+ * @note This routine pertains to random access.
+ */
+const void *file_reader_get(struct file_reader *reader, off64_t i);
+
+/**
+ * Returns the current position of sequential access. Analogous
+ * to ftello().
+ *
+ * NOTE(review): the sum of two off64_t fields is returned as size_t;
+ * confirm this cannot narrow on targets where size_t is 32 bits wide.
+ */
+static inline size_t file_reader_current(const struct file_reader *reader)
+{
+   return reader->seq_start + reader->seq_offset;
+}
+
+#endif /* FILE_READER_H */
diff --git a/poliqarp-library/sakura/common/getline.c b/poliqarp-library/sakura/common/getline.c
new file mode 100644
index 0000000000000000000000000000000000000000..49641ab9e257be9b104f6a4f77455c80f11a0a2f
--- /dev/null
+++ b/poliqarp-library/sakura/common/getline.c
@@ -0,0 +1,85 @@
+/*
+ * getdelim for uClibc
+ *
+ * Copyright (C) 2000 by Lineo, inc.
+ * Written by Erik Andersen <andersen@lineo.com>, <andersee@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Library General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Library General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <sys/types.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+
+#include <sakura/common/getline.h>
+
+/* Read characters from `file` into *linebuf up to and including the first
+   `delimiter`, and null-terminate the result.  *linebuf is either NULL or a
+   malloc'd buffer of *linebufsz bytes; it is (re)allocated as needed and
+   *linebufsz is updated to the real capacity.  When *linebuf is NULL the
+   incoming *linebufsz is ignored.  The delimiter is compared as an
+   unsigned char, so values above 127 match whether passed as 0..255 or as
+   a negative char.
+
+   Returns the number of characters stored (not counting the terminator),
+   or -1 on error or when end of file is hit before anything was read.
+   On allocation failure errno is ENOMEM and the caller's buffer is left
+   valid; on NULL arguments errno is EINVAL.  */
+ssize_t getdelim(char **linebuf, size_t *linebufsz, int delimiter, FILE *file)
+{
+   static const size_t GROWBY = 80; /* how large we will grow strings by */
+   int ch;
+   size_t idx = 0;
+
+   if (file == NULL || linebuf == NULL || linebufsz == NULL) {
+      errno = EINVAL;
+      return -1;
+   }
+   if (*linebuf == NULL || *linebufsz < 2) {
+      /* realloc() acts as malloc() for NULL and otherwise reuses the
+         caller's undersized buffer instead of leaking it */
+      char *fresh = realloc(*linebuf, GROWBY);
+      if (fresh == NULL) {
+         errno = ENOMEM;
+         return -1;
+      }
+      *linebuf = fresh;
+      *linebufsz = GROWBY;
+   }
+   while (1) {
+      ch = fgetc(file);
+      if (ch == EOF)
+         break;
+
+      /* keep room for this character and the terminating NUL */
+      if (idx > *linebufsz - 2) {
+         char *bigger = realloc(*linebuf, *linebufsz + GROWBY);
+         if (bigger == NULL) {
+            errno = ENOMEM;
+            return -1;
+         }
+         *linebuf = bigger;
+         *linebufsz += GROWBY;
+      }
+      (*linebuf)[idx++] = (char) ch;
+      if ((unsigned char) ch == (unsigned char) delimiter)
+         break;
+   }
+   /* nothing stored means EOF (or a read error) came first */
+   if (idx == 0)
+      return -1;
+   (*linebuf)[idx] = 0;
+   return (ssize_t) idx;
+}
+
+/* Basically getdelim() with the delimiter hard wired to '\n' */
+ssize_t getline(char **linebuf, size_t *n, FILE *file)
+{
+   return (getdelim(linebuf, n, '\n', file));
+}
+
diff --git a/poliqarp-library/sakura/common/getline.h b/poliqarp-library/sakura/common/getline.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f8e3402941037d5d796319ebf2a87582350729a
--- /dev/null
+++ 
b/poliqarp-library/sakura/common/getline.h @@ -0,0 +1,56 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file getline.h + * @brief getline() replacement for systems that lack it, taken from + * uClibc. + */ + +#ifndef GETLINE_H +#define GETLINE_H + +#include <poliqarp-config.h> + +#include <sys/types.h> +#include <stdio.h> + +/** + * getdelim() with delimiter hardwired to '\n'. + * + * @see getdelim() + */ +ssize_t getline(char **linebuf, size_t *n, FILE *file); + +/** + * Reads up to (and including) a TERMINATOR from STREAM into *LINEPTR + * (and null-terminate it). *LINEPTR is a pointer returned from malloc (or + * NULL), pointing to *N characters of space. It is realloc'd as + * necessary. + * + * @return the number of characters read (not including the null delimiter), + * or -1 on error or EOF. 
+ */ +ssize_t getdelim(char **linebuf, size_t *linebufsz, int delimiter, FILE *file); + +#endif /* GETLINE_H */ diff --git a/poliqarp-library/sakura/common/graph.c b/poliqarp-library/sakura/common/graph.c new file mode 100644 index 0000000000000000000000000000000000000000..30ee852a9eba4d166c281ffd87de7f458acdb79f --- /dev/null +++ b/poliqarp-library/sakura/common/graph.c @@ -0,0 +1,609 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */
+
+#include <errno.h>
+
+#include <sakura/common/graph.h>
+
+/**
+ * Reset the NFS environment to "no nodes" and remember the arena that
+ * graph_nfs_alloc() and graph_nfs_link() will allocate from.
+ */
+void graph_nfs_create(struct nfs_env *this, struct marena *arena)
+{
+   this->first_node = NULL;
+   this->node_array = NULL;
+   this->num_nodes = 0;
+   this->arena = arena;
+}
+
+/**
+ * Build the DFS graph from the NFS graph rooted at `root`.
+ *
+ * Steps: index all NFS nodes by id in node_array, size the bitset arena to
+ * the node count, create the DFS root from the closure of root.start, build
+ * the rest recursively and finally compute distances from the root.
+ *
+ * @return 0 on success, nonzero on failure.  An environment without any
+ *         NFS node is rejected with errno set to EINVAL.
+ */
+int graph_nfs_to_dfs(struct graph_env *this, struct nfs_graph root)
+{
+   struct nfs_node *node;
+   struct nfs_node **slot;
+   bitset closure;
+   int rc;
+
+   this->nfs.root = root;
+   if (this->nfs.num_nodes == 0) {
+      /* nothing to index; avoids a zero-sized array and a pointer
+         formed in front of it */
+      errno = EINVAL;
+      return -1;
+   }
+
+   /* create node array */
+   this->nfs.node_array = malloc(sizeof *this->nfs.node_array *
+      this->nfs.num_nodes);
+   if (this->nfs.node_array == NULL)
+      return -1;
+   /* the node list is newest-first, so fill the array back to front */
+   slot = this->nfs.node_array + this->nfs.num_nodes;
+   for (node = this->nfs.first_node; node != NULL; node = node->next)
+      *--slot = node;
+
+   /* initialize bitset */
+   bitset_arena_create(&this->bitset_arena, this->nfs.num_nodes, &this->arena);
+
+   /* transform graphs */
+   rc = graph_nfs_get_closure(&closure, this, this->nfs.root.start);
+   if (rc != 0)
+      return rc;
+   this->dfs.root = graph_dfs_alloc(this, closure);
+   if (this->dfs.root == NULL)
+      return -1;
+
+   rc = graph_dfs_build(this, this->dfs.root);
+   if (rc != 0)
+      return rc;
+   graph_dfs_calc_dist(this->dfs.root, 0);
+   return 0;
+}
+
+/**
+ * Free the node index built by graph_nfs_to_dfs().  The pointer is cleared
+ * so that calling this twice is harmless.
+ */
+void graph_nfs_destroy(struct nfs_env *this)
+{
+   free(this->node_array);
+   this->node_array = NULL;
+}
+
+/**
+ * Reset the DFS environment to "no nodes" and remember the arena that
+ * DFS nodes and links will be allocated from.
+ */
+void graph_dfs_create(struct dfs_env *this, struct marena *arena)
+{
+   this->first_node = NULL;
+   this->num_nodes = 0;
+   this->arena = arena;
+}
+
+/**
+ * Counterpart of graph_dfs_create(); releases nothing itself.
+ */
+void graph_dfs_destroy(struct dfs_env *this)
+{
+   /* nothing is needed here */
+   (void) this;
+}
+
+/**
+ * Initialise a graph environment: the memory arena, the symbol set, a
+ * placeholder bitset arena and both (empty) graph environments.
+ *
+ * @return 0 on success; otherwise the error from set_create(), with the
+ *         arena already destroyed again.
+ */
+int graph_create(struct graph_env *this, set_compare_fn symbol_compare,
+   set_free_fn symbol_free)
+{
+   int rc;
+   marena_create(&this->arena);
+   rc = set_create(&this->symbol_set, symbol_compare, symbol_free, &this->arena);
+   if (rc != 0) {
+      marena_destroy(&this->arena);
+      return rc;
+   }
+   bitset_arena_create_dummy(&this->bitset_arena);
+   graph_nfs_create(&this->nfs, &this->arena);
+   graph_dfs_create(&this->dfs, &this->arena);
+   return 0;
+}
+
+/**
+ * Tear down everything graph_create() set up.
+ */
+void graph_destroy(struct graph_env *this)
+{
+   set_destroy(&this->symbol_set);
+   bitset_arena_destroy(&this->bitset_arena);
+   graph_nfs_destroy(&this->nfs);
+   graph_dfs_destroy(&this->dfs);
+   marena_destroy(&this->arena);
+}
+
+/**
+ * Put `symbol` into the environment's symbol set.
+ *
+ * @return the item held by the resulting set node, or NULL if set_put()
+ *         failed.
+ */
+void *graph_store_symbol(struct graph_env *this, void *symbol)
+{
+   struct set_node *node = set_put(&this->symbol_set, symbol);
+   if (node == NULL)
+      return NULL;
+   return node->item;
+}
+
+/**
+ * Allocate a fresh NFS node from the arena, give it the next free id and
+ * push it onto the front of the node list.
+ *
+ * @return the new node, or NULL if the arena allocation failed.
+ */
+struct nfs_node *graph_nfs_alloc(struct graph_env *this)
+{
+   struct nfs_node *result;
+
+   result = marena_alloc(this->nfs.arena, sizeof *result);
+   if (result == NULL)
+      return NULL;
+   result->next = this->nfs.first_node;
+   result->first_link = NULL;
+   result->id = this->nfs.num_nodes++;
+   result->closure = NULL;
+   result->clone = NULL;
+   result->flags.is_marked = 0;
+
+   this->nfs.first_node = result;
+   return result;
+}
+
+/**
+ * Add a link from `from` to `to` labelled with `symbol`.  If such a link
+ * already exists, `flags` are OR-ed into it instead of creating a duplicate.
+ *
+ * @return 0 on success, -1 if the arena allocation failed.
+ */
+int graph_nfs_link(struct graph_env *this, struct nfs_node *from,
+   struct nfs_node *to, void *symbol, unsigned flags)
+{
+   struct nfs_link *link;
+
+   assert(from != NULL);
+   assert(to != NULL);
+
+   for (link = from->first_link; link; link = link->next)
+      if (link->to == to && link->symbol == symbol) {
+         link->flags |= flags;
+         return 0;
+      }
+
+   link = marena_alloc(this->nfs.arena, sizeof *link);
+   if (link == NULL)
+      return -1;
+   link->next = from->first_link;
+   link->to = to;
+   link->symbol = symbol;
+   link->flags = flags;
+   from->first_link = link;
+   return 0;
+}
+
+/**
+ * Recursive worker of graph_nfs_get_closure(): record `node` in the closure
+ * bitset of `orig` and follow every epsilon link to nodes not recorded yet.
+ */
+void graph_nfs_closure_partial(struct graph_env *env, struct nfs_node *node,
+   struct nfs_node *orig)
+{
+   struct nfs_link *link;
+
+   assert(node != NULL);
+
+   /* mark this node as set */
+   bitset_arena_set(&env->bitset_arena, orig->closure, node->id);
+
+   /* create the closure recursively */
+   for (link = node->first_link; link; link = link->next)
+      if (link->symbol == SYMBOL_EPSILON &&
+          bitset_arena_get(&env->bitset_arena, orig->closure,
+             link->to->id) == 0)
+      {
+         graph_nfs_closure_partial(env, link->to, orig);
+      }
+}
+
+/**
+ * Store in *result the epsilon closure of `node`, computing and caching it
+ * in node->closure on first use.
+ *
+ * @return 0 on success, -1 if the bitset could not be allocated.
+ */
+int graph_nfs_get_closure(bitset *result, struct graph_env *env, struct nfs_node *node)
+{
+   assert(node != NULL);
+
+   if (node->closure == NULL) {
+      node->closure = bitset_arena_alloc(&env->bitset_arena);
+      if (node->closure == NULL)
+         return -1;
+      graph_nfs_closure_partial(env, node, node);
+   }
+   *result = node->closure;
+   return 0;
+}
+
+/**
+ * Build the two-node graph start --symbol--> end.
+ *
+ * @return 0 on success, -1 on allocation failure.
+ */
+int graph_nfs_atomic(struct nfs_graph *result, struct graph_env *this, void *symbol)
+{
+   result->start = graph_nfs_alloc(this);
+   if (result->start == NULL)
+      return -1;
+   result->end = graph_nfs_alloc(this);
+   if (result->end == NULL)
+      return -1;
+   return graph_nfs_link(this, result->start, result->end, symbol, 0);
+}
+
+/**
+ * Concatenate `a` and `b` by an epsilon link (carrying `flags`) from a.end
+ * to b.start.
+ *
+ * @return 0 on success, -1 on allocation failure.
+ */
+int graph_nfs_concat(struct nfs_graph *result, struct graph_env *this, struct
+   nfs_graph a, struct nfs_graph b, unsigned flags)
+{
+   result->start = a.start;
+   result->end = b.end;
+   return graph_nfs_link(this, a.end, b.start, SYMBOL_EPSILON, flags);
+}
+
+/**
+ * Build the alternative of `a` and `b`: a new start node with epsilon links
+ * to both starts and a new end node reached by epsilon links from both ends.
+ * The `flags` argument is not used.
+ *
+ * @return 0 on success, nonzero on allocation failure.
+ */
+int graph_nfs_union(struct nfs_graph *result, struct graph_env *this, struct
+   nfs_graph a, struct nfs_graph b, unsigned flags)
+{
+   (void) flags;
+   result->start = graph_nfs_alloc(this);
+   if (result->start == NULL)
+      return -1;
+   result->end = graph_nfs_alloc(this);
+   if (result->end == NULL)
+      return -1;
+   return
+      graph_nfs_link(this, result->start, a.start, SYMBOL_EPSILON, 0) ||
+      graph_nfs_link(this, result->start, b.start, SYMBOL_EPSILON, 0) ||
+      graph_nfs_link(this, a.end, result->end, SYMBOL_EPSILON, 0) ||
+      graph_nfs_link(this, b.end, result->end, SYMBOL_EPSILON, 0);
+}
+
+/**
+ * Recursive worker of graph_nfs_copy(): duplicate every link leaving `node`
+ * onto node->clone, creating clones of the targets on demand, and descend
+ * into targets that are not marked yet.
+ *
+ * @return 0 on success, nonzero on allocation failure.
+ */
+int graph_nfs_copy_partial(struct graph_env *this, struct nfs_node *node)
+{
+   int rc;
+   struct nfs_link *link;
+
+   /* been here */
+   node->flags.is_marked = 1;
+
+   for (link = node->first_link; link; link = link->next) {
+      if (link->to->clone == NULL) {
+         link->to->clone = graph_nfs_alloc(this); /* duplicate destination */
+         if (link->to->clone == NULL)
+            return -1;
+      }
+
+      /* duplicate the link */
+      rc = graph_nfs_link(this, node->clone, link->to->clone, link->symbol,
+         link->flags);
+      if (rc != 0)
+         return rc;
+
+      if (link->to->flags.is_marked == 0) {
+         rc = graph_nfs_copy_partial(this, link->to);
+         if (rc != 0)
+            return rc;
+      }
+   }
+   return 0;
+}
+
+/**
+ * Copy the part of the graph reachable from g.start.  g.end is marked up
+ * front, so the walk does not continue past it.
+ *
+ * @return 0 on success, nonzero on allocation failure.
+ */
+int graph_nfs_copy(struct nfs_graph *result, struct graph_env *this, struct nfs_graph g)
+{
+   struct nfs_node *node;
+   struct nfs_node *clone;
+   int rc;
+
+   /* forget marks and clones left over from an earlier copy */
+   for (node = this->nfs.first_node; node; node = node->next) {
+      node->flags.is_marked = 0;
+      node->clone = NULL;
+   }
+
+   clone = graph_nfs_alloc(this);
+   if (clone == NULL)
+      return -1;
+
+   g.start->clone = clone;
+   g.end->flags.is_marked = 1;
+
+   /* mumbo jumbo */
+   rc = graph_nfs_copy_partial(this, g.start);
+   if (rc != 0)
+      return rc;
+
+   result->start = g.start->clone;
+   result->end = g.end->clone;
+
+   return 0;
+}
+
+/**
+ * Apply a repetition to `g`.
+ *
+ * @param n  minimum number of repetitions.
+ * @param m  maximum number of repetitions, or -1 for "no upper bound".
+ *
+ * Supported forms: {0,} ('*'), {0,1} ('?'), {1,} ('+'), {n,} with n >= 2
+ * and {n,m} with m >= 1 and n <= m.  Everything else ({0}, n > m, a
+ * negative n without an upper bound, m < 1 other than -1) yields a NULL
+ * graph, errno EINVAL and -1; previously some of those combinations wrote
+ * past a zero-sized allocation.
+ *
+ * @return 0 on success, nonzero on failure.
+ */
+int graph_nfs_quantify(struct nfs_graph *result, struct graph_env *this,
+   struct nfs_graph g, int n, int m)
+{
+   struct nfs_graph *copy = NULL;
+   int rc;
+   int i;
+
+   if (n == 0 && m == -1) {
+      /* '*' */
+      *result = g;
+      rc =
+         graph_nfs_link(this, result->start, result->end, SYMBOL_EPSILON, 0) ||
+         graph_nfs_link(this, result->end, result->start, SYMBOL_EPSILON, 0);
+      return rc;
+   }
+   if (n == 0 && m == 1) {
+      /* '?' */
+      *result = g;
+      return graph_nfs_link(this, result->start, result->end, SYMBOL_EPSILON, 0);
+   }
+   if (n == 1 && m == -1) {
+      /* '+' */
+      *result = g;
+      return graph_nfs_link(this, result->end, result->start, SYMBOL_EPSILON, 0);
+   }
+   if (n >= 2 && m == -1) {
+      /* {n,}: n mandatory copies followed by one looping copy */
+      copy = malloc(sizeof *copy * ((size_t) n + 1));
+      if (copy == NULL)
+         return -1;
+
+      copy[0] = g;
+
+      for (i = 1; i <= n; ++i) {
+         rc = graph_nfs_copy(copy + i, this, g);
+         if (rc != 0)
+            goto out;
+      }
+      for (i = 0; i < n; ++i) {
+         rc = graph_nfs_link(this, copy[i].end, copy[i + 1].start,
+            SYMBOL_EPSILON, 0);
+         if (rc != 0)
+            goto out;
+      }
+
+      rc = graph_nfs_link(this, copy[n].end, copy[n].start, SYMBOL_EPSILON, 0);
+      if (rc != 0)
+         goto out;
+
+      result->start = copy[0].start;
+      result->end = copy[n].start;
+      goto out;
+   }
+   if (m >= 1 && n <= m) {
+      /* {n,m}: m chained copies; copies from the n-th on may be skipped */
+      copy = malloc(sizeof *copy * (size_t) m);
+      if (copy == NULL)
+         return -1;
+
+      copy[0] = g;
+      for (i = 1; i < m; ++i) {
+         rc = graph_nfs_copy(copy + i, this, g);
+         if (rc != 0)
+            goto out;
+      }
+
+      rc = -1;
+      result->start = graph_nfs_alloc(this);
+      if (result->start == NULL)
+         goto out;
+      result->end = graph_nfs_alloc(this);
+      if (result->end == NULL)
+         goto out;
+
+      /* link start -> first */
+      rc = graph_nfs_link(this, result->start, copy[0].start, SYMBOL_EPSILON, 0);
+      if (rc != 0)
+         goto out;
+
+      /* link last -> end */
+      rc = graph_nfs_link(this, copy[m - 1].end, result->end, SYMBOL_EPSILON, 0);
+      if (rc != 0)
+         goto out;
+
+      for (i = 0; i < m; ++i) {
+         if (i > 0) {
+            rc = graph_nfs_link(this, copy[i - 1].end, copy[i].start,
+               SYMBOL_EPSILON, 0);
+            if (rc != 0)
+               goto out;
+         }
+         if (i >= n) {
+            rc = graph_nfs_link(this, copy[i].start, result->end, SYMBOL_EPSILON, 0);
+            if (rc != 0)
+               goto out;
+         }
+      }
+      rc = 0;
+      goto out;
+   }
+
+   /* {0} is erroneous, and so is every other unsupported combination */
+   result->start = result->end = NULL;
+   errno = EINVAL;
+   return -1;
+
+out:
+   free(copy);
+   return rc;
+}
+
+/**
+ * Get an existing node with this description or allocate it.
+ */
+struct dfs_node *graph_dfs_fetch(struct graph_env *this, bitset env_id)
+{
+   struct dfs_node *node;
+
+   for (node = this->dfs.first_node; node; node = node->next)
+      if (bitset_arena_compare(&this->bitset_arena, env_id, node->env_id) == 0)
+         return node;
+
+   return graph_dfs_alloc(this, env_id);
+}
+
+/**
+ * Allocate a node with given description.
+ * The node is final when its description contains the end node of the NFS
+ * root graph.  Its distance starts at 0 so that graph_dfs_calc_dist()
+ * never compares an indeterminate value.
+ */
+struct dfs_node *graph_dfs_alloc(struct graph_env *env, bitset env_id)
+{
+   struct dfs_node *result = marena_alloc(env->dfs.arena, sizeof *result);
+   if (result == NULL)
+      return NULL;
+
+   assert(env_id != NULL);
+
+   result->next = env->dfs.first_node;
+   result->first_link = NULL;
+   result->dot_link = NULL;
+   result->num_links = 0;
+   result->env_id = env_id;
+   result->id = env->dfs.num_nodes++;
+   result->flags.is_final = bitset_arena_get(&env->bitset_arena, env_id,
+      env->nfs.root.end->id) ? 1 : 0;
+   result->flags.is_complete = 0;
+   result->visited = 0;
+   result->distance = 0;
+
+   env->dfs.first_node = result;
+
+   return result;
+}
+
+
+/**
+ * Recursive friend of graph_nfs_2_dfs.
+ */ +/* + * Marks d_node complete, then walks every non-epsilon link of every NFS node + * in d_node->env_id. One DFS link is created per distinct symbol (plus at + * most one dot link, which is appended last), and the epsilon closure of each + * NFS link target is merged into that DFS link's path_id. Finally the target + * node of every DFS link is fetched or allocated, and targets that are not + * yet complete are built recursively. + * + * Returns 0 on success and a non-zero value if an allocation or a closure + * computation fails. + */ +int graph_dfs_build(struct graph_env *this, struct dfs_node *d_node) +{ + size_t index; + struct nfs_link *n_link; + struct dfs_link *d_link = NULL; + struct nfs_node *n_node; + unsigned flags = 0; + + assert(this != NULL); + assert(d_node != NULL); + + /* been here */ + d_node->flags.is_complete = 1; + + /* for every nfs-node in the bitset */ + for (index = 0; + (index = bitset_arena_iterate(&this->bitset_arena, d_node->env_id, + index)) != (size_t)-1; + ++index) + { + assert(index < this->nfs.num_nodes); + n_node = this->nfs.node_array[index]; + /* for every link in that nfs-node */ + for (n_link = n_node->first_link; n_link != NULL; n_link = n_link->next) + { + /* skip EPSILONs */ + if (n_link->symbol == SYMBOL_EPSILON) { + flags |= n_link->flags; /* but collect their flags */ + continue; + } + /* treat dot symbol specially */ + if (n_link->symbol == SYMBOL_DOT) { + if (d_node->dot_link != NULL) { + d_link = d_node->dot_link; + goto skip_create_d_link; + } + + d_link = d_node->dot_link = marena_alloc(this->dfs.arena, + sizeof *d_node->dot_link); + if (d_link == NULL) + return -1; + d_link->next = NULL; + d_link->to = NULL; + d_link->symbol = SYMBOL_DOT; + d_link->flags = flags; + d_link->path_id = bitset_arena_alloc(&this->bitset_arena); + if (d_link->path_id == NULL) + return -1; + d_node->num_links++; + } else { + /* find the corresponding link in dfs-node */ + for (d_link = d_node->first_link; d_link != NULL; + d_link = d_link->next) + { + if (d_link->symbol == n_link->symbol) + goto skip_create_d_link; + } + /* didn't find link with this symbol? create it!
*/ + d_link = marena_alloc(this->dfs.arena, sizeof *d_link); + if (d_link == NULL) + return -1; + d_link->next = d_node->first_link; + d_link->to = NULL; + d_link->symbol = n_link->symbol; + d_link->flags = flags; + d_link->path_id = bitset_arena_alloc(&this->bitset_arena); + if (d_link->path_id == NULL) + return -1; + d_node->first_link = d_link; + d_node->num_links++; + } +skip_create_d_link: + /* AND FINALLY: mark nfs-node in the path-id */ + assert(d_link != NULL); + assert(d_link->path_id != NULL); + { + bitset closure; + int rc = graph_nfs_get_closure(&closure, this, n_link->to); + if (rc != 0) + return rc; + bitset_arena_union(&this->bitset_arena, d_link->path_id, closure); + } + d_link->flags |= n_link->flags; + } + } + + /* make dot link the very last link */ + if (d_node->dot_link) { + d_link = d_node->first_link; + if (d_link == NULL) + d_node->first_link = d_node->dot_link; + else { + while (d_link->next != NULL) + d_link = d_link->next; + d_link->next = d_node->dot_link; + } + } + + /* create all targets */ + for (d_link = d_node->first_link; d_link != NULL; d_link = d_link->next) { + d_link->to = graph_dfs_fetch(this, d_link->path_id); + if (d_link->to == NULL) + return -1; + } + + /* build all unbuilt targets */ + for (d_link = d_node->first_link; d_link != NULL; d_link = d_link->next) + if (d_link->to->flags.is_complete == 0) { + int rc = graph_dfs_build(this, d_link->to); + if (rc != 0) + return rc; + } + + return 0; +} + +/* + * Recursively store in each node its path length from the node the walk + * started at. A node that is reached again after being visited gets + * distance (size_t)-1, and that marker is propagated along its outgoing + * links until a node already marked (size_t)-1 is met. + */ +void graph_dfs_calc_dist(struct dfs_node *node, size_t distance) +{ + struct dfs_link *link; + if (distance == (size_t)-1) { + if (node->distance == (size_t)-1) + return; + else { + node->distance = (size_t)-1; + for (link = node->first_link; link != NULL; link = link->next) + graph_dfs_calc_dist(link->to, (size_t)-1); + } + } else { + if (node->visited) + graph_dfs_calc_dist(node, (size_t)-1); + else { + node->visited = 1; + node->distance = distance; + for (link = node->first_link; link != NULL; link = link->next) +
graph_dfs_calc_dist(link->to, distance + 1); + } + } +} diff --git a/poliqarp-library/sakura/common/graph.h b/poliqarp-library/sakura/common/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..edbe0d222136456d558ebf654b777db20d5815db --- /dev/null +++ b/poliqarp-library/sakura/common/graph.h @@ -0,0 +1,242 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file graph.h + * @brief Finite-state automata represented as graphs. + * + * This file declares structures representing finite-state automata in + * two flavours: non-deterministic (called NFS throughout the rest of these + * comments) and deterministic (DFS). The automata are used to represent + * compiled Poliqarp queries. + */ + +#ifndef HAVE_GRAPH_H +#define HAVE_GRAPH_H + +#include <sakura/common/set.h> +#include <sakura/common/bs.h> + +/** Symbol used to represent epsilon. */ +#define SYMBOL_EPSILON NULL + +/** Symbol used to represent anything. */ +#define SYMBOL_DOT (void *)(-1) + +/** Link between two NFS nodes. 
*/ +struct nfs_link { + struct nfs_link *next; /**< Next link in the originating node. */ + struct nfs_node *to; /**< Destination node. */ + void *symbol; /**< Symbol. */ + unsigned flags; /**< Flags associated with this link. */ +}; + +/** Link between two DFS nodes. */ +struct dfs_link { + struct dfs_link *next; /**< Next link in the originating node. */ + struct dfs_node *to; /**< Destination node. */ + void *symbol; /**< Symbol. */ + bitset path_id; /**< Bitset describing which nfs-nodes + are reachable via this symbol. */ + unsigned flags; /**< Flags associated with this link. */ +}; + +/** Node in an NFS graph. */ +struct nfs_node { + struct nfs_node *next; /**< Next node in this graph environment. */ + struct nfs_link *first_link; /**< List of links. */ + int id; /**< My unique id. */ + struct nfs_node *clone; /**< My clone. This is used to create + copies of graphs. */ + bitset closure; /**< Closure of this node. Everything + reachable via SYMBOL_EPSILON. */ + struct { + unsigned is_marked:1; /**< Used to create copies of graphs. */ + } flags; /**< Flags associated with this node. */ +}; + +/** Node in an DFS graph. */ +struct dfs_node { + struct dfs_node *next; /**< Next node in this graph environment. */ + struct dfs_link *first_link; /**< Normal links. */ + struct dfs_link *dot_link; /**< Special dot link. */ + int num_links; /**< Number of links. */ + const_bitset env_id; /**< Bitset describing nfs nodes + composing this node. */ + int id; /**< My unique id. */ + struct { + unsigned is_final:1; /**< This is a final node (one that can + successfully end the graph). */ + unsigned is_complete:1; /**< This node has been built. */ + } flags; /**< Flags associated with this node. */ + size_t distance; /**< Length of path from root to this node. + (size_t)-1 if cycle */ + int visited; /**< Needed when calculating the above. */ +}; + +/** NFS graph structure. */ +struct nfs_graph { + struct nfs_node *start; /**< Initial node. 
*/ + struct nfs_node *end; /**< Final node. */ +}; + +/** DFS graph structure. */ +struct dfs_graph { + struct dfs_node *start; /**< Initial node. */ + struct dfs_node *end; /**< Final node. */ +}; + +/** NFS graph environment. Keeps graph data together. */ +struct nfs_env { + struct nfs_node *first_node; /**< List of nodes. */ + struct nfs_node **node_array; /**< Array of nodes (when nfs->dfs + transformation begins). */ + size_t num_nodes; /**< Number of nodes. */ + struct nfs_graph root; /**< The graph that contains all nodes. */ + struct marena *arena; /**< Memory arena that supplies memory + for all internal allocations. */ +}; + +/** DFS graph environment. Keeps graph data together. */ +struct dfs_env { + struct dfs_node *first_node; /**< List of nodes. */ + size_t num_nodes; /**< Number of nodes. */ + struct dfs_node *root; /**< The graph that contains all nodes. */ + struct marena *arena; /**< Memory arena that supplies memory + for all internal allocations. */ +}; + +/** Graph environment. Keeps graph data together. */ +struct graph_env { + struct marena arena; /**< Graph wide memory arena. */ + struct set symbol_set; /**< Set of symbols. */ + struct bitset_arena bitset_arena; /**< Bitset arena. */ + struct nfs_env nfs; /**< NFS graph subsystem. */ + struct dfs_env dfs; /**< DFS graph subsystem. */ +}; + +/** + * Create a graph with the given symbol comparator and destructor. + * @param this Uninitialized graph object. + * @param symbol_compare Symbol comparator. + * @param symbol_free Symbol destructor. + */ +int graph_create(struct graph_env *this, set_compare_fn symbol_compare, + set_free_fn symbol_free); + +/** Graph destructor. */ +void graph_destroy(struct graph_env *this); + +/** Store symbol in the graph environment. */ +void *graph_store_symbol(struct graph_env *this, void *symbol); + +/** Allocate an NFS node. */ +struct nfs_node *graph_nfs_alloc(struct graph_env *this); + +/** Link two NFS nodes. 
*/ +int graph_nfs_link(struct graph_env *this, struct nfs_node *from, + struct nfs_node *to, void *symbol, unsigned flags); + +/** Calculate bitset of all nodes reachable via epsilon from the given node. */ +int graph_nfs_get_closure(bitset *result, struct graph_env *this, + struct nfs_node *node); + +/** + * Create atomic NFS graph of two nodes. The nodes are connected with + * a single link that has the given symbol associated with it + */ +int graph_nfs_atomic(struct nfs_graph *result, struct graph_env *this, + void *symbol); + +/** + * Concatenate two graphs together. The concatenation is achieved by adding + * a link from the final node of graph 'a' to the initial node of the + * graph 'b'. The link has the epsilon symbol associated with it and is + * given 'flags'. + * + * @return the graph containing a's initial node and b's final node + */ +int graph_nfs_concat(struct nfs_graph *result, struct graph_env *this, + struct nfs_graph a, struct nfs_graph b, unsigned flags); + +/** + * Create a union of two graphs. The union is performed by creating two nodes + * (that later become the resulting graph) and linking them in the following + * fashion: + * + * <ul> + * <li>new initial node is linked with a's initial node</li> + * <li>new initial node is linked with b's initial node</li> + * <li>a's final node is linked with new final node</li> + * <li>b's final node is linked with new final node</li> + * </ul> + * + * All links are associated with the epsilon symbol and the given flags + */ +int graph_nfs_union(struct nfs_graph *result, struct graph_env *this, + struct nfs_graph a, struct nfs_graph b, unsigned flags); + +/** + * Return a graph containing a copy of all the nodes and links + * of the given graph. + * + * @bug This function is rather messy and inefficient + */ +int graph_nfs_copy(struct nfs_graph *result, struct graph_env *this, + struct nfs_graph g); + +/** + * Quantify the given graph. + * + * @bug Quantification is messy and inefficient.
+ * @param n Minimum number of occurrences. + * @param m Maximum number of occurrences. + */ +int graph_nfs_quantify(struct nfs_graph *result, struct graph_env *this, + struct nfs_graph g, int n, int m); + +/** + * Get node by id. + * + * Get a node with this description if it exists or + * create a new node and assign it this id + * + * @return the node with matching id + */ +struct dfs_node *graph_dfs_fetch(struct graph_env *this, bitset env_id); + +/** Allocate a node with given description. */ +struct dfs_node *graph_dfs_alloc(struct graph_env *this, bitset env_id); + +/** Recursive friend of nfs_to_dfs. */ +int graph_dfs_build(struct graph_env *this, struct dfs_node *node); + +/** Convert from nfs to dfs. */ +int graph_nfs_to_dfs(struct graph_env *this, struct nfs_graph root); + +/** Calculate path lengths. */ +void graph_dfs_calc_dist(struct dfs_node *node, size_t distance); + +#endif + +/** @} */ diff --git a/poliqarp-library/sakura/common/hash-table.c b/poliqarp-library/sakura/common/hash-table.c new file mode 100644 index 0000000000000000000000000000000000000000..b429ab3e5065af31aadb955a06868e90055d700c --- /dev/null +++ b/poliqarp-library/sakura/common/hash-table.c @@ -0,0 +1,157 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <string.h> + +#include <sakura/common/hash-table.h> + +/** + * Initialize a hash table with 'size' buckets. + * + * A requested size of 0 is raised to 1: the lookup functions reduce the hash + * modulo table->size, and calloc(0, ...) is allowed to return NULL, which + * would be misreported as out of memory. If 'arena' is NULL, a private arena + * is allocated here and released by destroy_hash_table(). + * + * @return 0 on success, -1 if an allocation fails. + */ +int create_hash_table(struct hash_table *table, size_t size, int flags, + struct marena *arena) +{ + if (size == 0) + size = 1; /* never leave the table with zero buckets */ + table->size = size; + table->num_items = 0; + table->flags = flags; + table->node_array = calloc(size, sizeof *table->node_array); + if (table->node_array == NULL) + return -1; + if (arena) { + table->arena = arena; + table->private_arena = false; + } else { + table->arena = malloc(sizeof(*arena)); + if (table->arena == NULL) { + free(table->node_array); + return -1; + } + marena_create(table->arena); + table->private_arena = true; + } + return 0; +} + +int destroy_hash_table(struct hash_table *table, void (*free_fn)(void *)) +{ + size_t i; + struct hash_table_node *node, *next_node; + if (free_fn) { + for (i = 0; i < table->size; ++i) { + for (node = table->node_array[i]; node; node = next_node) { + next_node = node->next; + free_fn(node->value); + } + } + } + if (table->private_arena) { + marena_destroy(table->arena); + free(table->arena); + } + free(table->node_array); + return 0; +} + +int hash_table_set(struct hash_table *table, const char *key, void *value) +{ + string_hash_t hash; + struct hash_table_node *node; + size_t index; + + hash = hash_string(key); + index = hash % table->size; + node = table->node_array[index]; + for (; node != NULL; node = node->next) + if (hash == node->hash && strcmp(key, node->key) == 0) { + node->value = value; + return 0; + } + node = marena_alloc(table->arena, sizeof *node); + if (!node) + return -1; + node->key = (table->flags & HASHTABLE_DUPLICATE_KEYS) ?
+ marena_strdup(table->arena, key) : (char *)key; + if (node->key == NULL) + return -1; + node->value = value; + node->hash = hash; + node->next = table->node_array[index]; + table->node_array[index] = node; + table->num_items++; + return 0; +} + +/* + * Unlinks the node stored under 'key' from its bucket and calls free() on the + * stored value, so it may only be used on tables whose values came from + * malloc() (or are NULL). The node itself is arena memory and is not + * returned to the arena. + * + * Returns 0 if the key was found and removed, -1 if there was no such key. + */ +int hash_table_unset(struct hash_table *table, const char *key) +{ + string_hash_t hash; + struct hash_table_node *node, *prev; + size_t index; + + hash = hash_string(key); + index = hash % table->size; + node = table->node_array[index]; + prev = NULL; + for (; node != NULL; prev = node, node = node->next) + if (hash == node->hash && strcmp(key, node->key) == 0) { + if (prev == NULL) + table->node_array[index] = node->next; + else + prev->next = node->next; + free(node->value); + table->num_items--; + return 0; + } + return -1; + } + +/* + * Looks 'key' up in its bucket, comparing the cached hash first and the key + * text second. Returns the stored value, or NULL when the key is absent. + */ +void *hash_table_get(struct hash_table *table, const char *key) +{ + string_hash_t hash; + struct hash_table_node *node; + hash = hash_string(key); + node = table->node_array[hash % table->size]; + for (; node != NULL; node = node->next) + if (hash == node->hash && strcmp(key, node->key) == 0) + return node->value; + return NULL; +} + +/* Const-qualified wrapper around hash_table_get(). */ +const void *hash_table_const_get(const struct hash_table *table, + const char *key) +{ + return hash_table_get((struct hash_table *) table, key); +} + +/* Calls 'iterator' once per stored mapping, bucket by bucket. */ +void hash_table_iterate(const struct hash_table *table, void *env, + void (*iterator)(const char *, const void *, void *)) +{ + size_t i; + struct hash_table_node *node; + for (i = 0; i < table->size; i++) { + for (node = table->node_array[i]; node != NULL; node = node->next) { + iterator(node->key, node->value, env); + } + } +} + +/* Returns the number of mappings currently counted in the table. */ +size_t hash_table_num_items(const struct hash_table *table) +{ + return table->num_items; +} diff --git a/poliqarp-library/sakura/common/hash-table.h b/poliqarp-library/sakura/common/hash-table.h new file mode 100644 index 0000000000000000000000000000000000000000..0f2e826f3bb138887da217631790a10fb86ae776 --- /dev/null +++ b/poliqarp-library/sakura/common/hash-table.h @@ -0,0 +1,146
@@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file hash-table.h + * @brief Hash tables with textual keys. + * + * This file defines hash tables (with standard operations) that map strings + * to arbitrary values. Hash conflicts are resolved by means of linked lists. + */ + +#ifndef HASH_TABLE_H +#define HASH_TABLE_H + +#include <poliqarp-config.h> + +#include <stdlib.h> + +#include <sakura/common/memory-arena.h> +#include <sakura/common/string-hash.h> + +/* Hashtable creation flags. */ +#define HASHTABLE_DUPLICATE_KEYS 1 /**< Shall the keys be duplicated? */ + +/** + * A single node in the table. + */ +struct hash_table_node { + char *key; /**< Key for this mapping. */ + void *value; /**< Value for this mapping. */ + struct hash_table_node *next; /**< Next node with the same hash value. */ + string_hash_t hash; /**< Hash value of the key. */ +}; + +/** + * The hash table. + */ +struct hash_table { + size_t size; /**< Number of nodes. 
*/ + size_t num_items; /**< Number of items currently stores in the table. */ + int flags; /**< Table flags, bitwise OR of HASHTABLE_*. */ + struct hash_table_node **node_array; /**< Array of nodes. */ + struct marena *arena; /**< Memory source. */ + bool private_arena; /**< True iff arena is private for this hash table. */ +}; + +/** + * Constructs an empty hash table with the given size. + * + * @param table the structure to be initialized. + * @param size size of the table (number of distinct hash values possible). + * @param flags creation flags of the table, with the following meanings: + * <dl> + * <dt><tt>HASHTABLE_DUPLICATE_KEYS</tt></dt> + * <dd>If this is set, the hash_table_set() function will make a private + * copy of the key parameter passed to it, so that the original can + * be freed immediately. Otherwise, a pointer to the original key + * is stored. + * </dd> + * </dl> + * @param arena the memory arena to be used as the source of memory. If + * this is NULL, a private arena will be allocated and freed upon + * destruction of this hash table. + * @return 0 upon successful creation, a non-zero value on error. + */ +int create_hash_table(struct hash_table *table, size_t size, int flags, + struct marena *arena); + +/** + * Destroys the given hash table. + * + * @param table the structure to be destroyed. + * @param free_fn pointer to a function that frees a single value. + * @return 0 upon successful completion, a non-zero value on error. + */ +int destroy_hash_table(struct hash_table *table, void (*free_fn)(void *)); + +/** + * Creates a key-to-value mapping in the table. If a mapping for this key + * already existed, it is replaced by the new one (the old mapping is lost). + * + * @return 0 upon success, a non-zero value on error. + */ +int hash_table_set(struct hash_table *table, const char *key, void *value); + +/** + * Removes a mapping for the given key from the table. 
+ * + * @return 0 upon success, a non-zero value on error or if there were no such + * mapping. + */ +int hash_table_unset(struct hash_table *table, const char *key); + +/** + * Returns the value associated with the given key, or NULL if there is no + * such value. + */ +void *hash_table_get(struct hash_table *table, const char *key); + +/** + * Same as hash_table_get(), except that it operates on a constant hash table. + * + * @see hash_table_get + */ +const void *hash_table_const_get(const struct hash_table *table, + const char *key); + +/** + * Invokes the given function (the iterator) on all mappings contained + * in the hash table. + * + * @param env The environment, an arbitrary value provided for the + * key-value mappings to communicate with outside world. + * @param iterator The iterator function. It must accept three arguments: + * key, value, and environment (env is passed as a third argument to + * the iterator). + */ +void hash_table_iterate(const struct hash_table *table, void *env, + void (*iterator)(const char *, const void *, void *)); + +/** + * Returns number of items in the given hash table. + */ +size_t hash_table_num_items(const struct hash_table *table); + +#endif diff --git a/poliqarp-library/sakura/common/memory-arena.c b/poliqarp-library/sakura/common/memory-arena.c new file mode 100644 index 0000000000000000000000000000000000000000..b2748d2d8421c3d8d9fab5b40bf6ede2b1d1ac03 --- /dev/null +++ b/poliqarp-library/sakura/common/memory-arena.c @@ -0,0 +1,234 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <sakura/common/memory-arena.h> + +static inline size_t align_request(size_t request, size_t size) +{ + return ((request + size - 1) / size) * size; +} + +/* + * Allocates one regular arena block: a marena_node header immediately + * followed by 'size' bytes of payload, obtained with a single malloc(). + * Returns NULL when malloc() fails. + */ +struct marena_node *marena_alloc_node(size_t size) +{ + struct marena_node *this; + +#if MARENA_DEBUG + /* size_t must not be passed to %d; print it as unsigned long instead */ + fprintf(stderr, "marena_alloc_node: %lu\n", (unsigned long)size); +#endif + + this = malloc(sizeof *this + size); + if (this == NULL) + return NULL; /* out of memory */ + this->next = NULL; + this->used = 0; + this->capacity = size; + this->memory = ((char *)this) + sizeof *this; + assert((((size_t)this->memory) & (MARENA_ALIGNMENT - 1)) == 0); + return this; +} + +/* + * Allocates one big node: a marena_bignode header immediately followed by + * 'size' bytes of payload, obtained with a single malloc(). + * Returns NULL when malloc() fails. + */ +struct marena_bignode *marena_alloc_bignode(size_t size) +{ + struct marena_bignode *this; + +#if MARENA_DEBUG + /* size_t must not be passed to %d; print it as unsigned long instead */ + fprintf(stderr, "marena_alloc_bignode: %lu\n", (unsigned long)size); +#endif + + this = malloc(sizeof *this + size); + if (this == NULL) + return NULL; /* out of memory */ + this->next = NULL; + this->size = size; + this->memory = ((char *)this) + sizeof *this; + assert((((size_t)this->memory) & (MARENA_ALIGNMENT - 1)) == 0); + return this; +} + +void *marena_get_node_memory(struct marena_node *this, size_t size) +{ + if (this->used + size <= this->capacity) { + void *result = ((char *)
this->memory) + this->used; + this->used += size; + return result; + } else + return NULL; +} + +void marena_create(struct marena *this) +{ + this->first = NULL; + this->current = NULL; + this->first_big = NULL; +#if MARENA_TRACK_ALLOC + this->n_alloc = 0; +#endif + this->block_size = MARENA_DEF_BLOCK_SIZE; +} + +void *marena_alloc(struct marena *this, size_t size) +{ + void *result; + assert(this != NULL); + +#if MARENA_DEBUG + /* size_t must not be passed to %d; print it as unsigned long instead */ + fprintf(stderr, "marena_alloc: %lu -> %lu\n", (unsigned long)size, + (unsigned long)align_request(size, MARENA_ALIGNMENT)); +#endif + + /* align the request */ + size = align_request(size, MARENA_ALIGNMENT); + +#if MARENA_TRACK_ALLOC + this->n_alloc++; +#endif + + /* allocate large chunks separately */ + if (size > this->block_size) { + struct marena_bignode *bignode = marena_alloc_bignode(size); + if (bignode == NULL) + return NULL; /* out of memory */ + bignode->next = this->first_big; + this->first_big = bignode; + return bignode->memory; + } + + /* make sure we have a chain ready */ + if (this->current == NULL) { + this->current = this->first = + marena_alloc_node(align_request(size, this->block_size)); + if (this->current == NULL) + return NULL; /* out of memory */ + } + + /* try to allocate memory */ + result = marena_get_node_memory(this->current, size); + if (result != NULL) + return result; + + /* allocate another chain element */ + if (this->current->next == NULL) { + this->current->next = + marena_alloc_node(align_request(size, this->block_size)); + if (this->current->next == NULL) + return NULL; /* out of memory */ + } + this->current = this->current->next; + + /* try to allocate memory again */ + result = marena_get_node_memory(this->current, size); + assert(result != NULL); + return result; +} + +/* + * Makes the arena reusable without giving its regular blocks back to the + * system: every block is marked empty and allocation restarts at the first + * block. Big nodes are freed outright, so the big-node list head is cleared + * as well; otherwise a later marena_release() or marena_destroy() would walk + * the stale list and free the same nodes a second time. + */ +void marena_release(struct marena *this) +{ + struct marena_node *n = this->first; + struct marena_bignode *bn, *bnext; + for (bn = this->first_big; bn; bn = bnext) { + bnext = bn->next; + free(bn); + } + this->first_big = NULL; + while (n) { + n->used = 0; + n = n->next; + } + this->current
= this->first; +} + +char *marena_strdup(struct marena *this, const char *string) +{ + size_t length; + char *memory; + + length = strlen(string) + 1; + memory = marena_alloc(this, length); + if (memory == NULL) + return NULL; /* out of memory */ + memcpy(memory, string, length); + return memory; +} + +char *marena_strndup(struct marena *this, const char *string, size_t length) +{ + char *memory; + const char *end; + + end = memchr(string, 0, length); + if (end) + length = end - string; + memory = marena_alloc(this, length + 1); + if (memory == NULL) + return NULL; /* out of memory */ + memcpy(memory, string, length); + memory[length] = 0; + return memory; +} + +void marena_destroy(struct marena *this) +{ + struct marena_node *n, *next; + struct marena_bignode *bn, *bnext; + for (n = this->first; n; n = next) { + next = n->next; + free(n); + } + for (bn = this->first_big; bn; bn = bnext) { + bnext = bn->next; + free(bn); + } + this->first = NULL; + this->current = NULL; + this->first_big = NULL; +} + +/* + * Sets a run-time option of the arena. + * + * MARENA_OPT_BLOCKSIZE stores 'value' as the block size, or + * MARENA_DEF_BLOCK_SIZE when 'value' is 0. + * + * Returns 0 when the option was applied and -1 for an unknown option. + */ +int marena_setopt(struct marena *this, int opt, size_t value) +{ + switch (opt) { + case MARENA_OPT_BLOCKSIZE: + this->block_size = value ? value : MARENA_DEF_BLOCK_SIZE; + break; + default: + return -1; + } + return 0; /* option recognised and stored */ +} + +size_t marena_getopt(struct marena * this, int opt) +{ + switch (opt) { + case MARENA_OPT_BLOCKSIZE: + return this->block_size; + case MARENA_OPT_NALLOC: +#if MARENA_TRACK_ALLOC + return this->n_alloc; +#else + return 0; +#endif + default: + return -1; + } + return -1; +} diff --git a/poliqarp-library/sakura/common/memory-arena.h b/poliqarp-library/sakura/common/memory-arena.h new file mode 100644 index 0000000000000000000000000000000000000000..3c2241d49312494d267ab0e1a828cbba2bdee238 --- /dev/null +++ b/poliqarp-library/sakura/common/memory-arena.h @@ -0,0 +1,139 @@ +/* + * This file is part of the Poliqarp suite.
/**
 * @file memory-arena.h
 * @brief Quick allocator for small objects.
 *
 * A common case when allocating objects is that we want to allocate many
 * objects, then free them all at once. It could be done with malloc() and
 * free(), but that would be costly, especially in case of a very large
 * amount of tiny objects. To resolve this difficulty, Poliqarp provides a
 * memory allocator for such objects, called a memory arena.
 *
 * Memory arenas somewhat resemble 'obstacks' defined by GNU glibc: they
 * pre-allocate chunks of memory of fixed size and place objects on them,
 * so that allocation is fast. When one doesn't need the memory allocated
 * on the arena anymore, one can free it all at once, or release it
 * (the memory then won't be freed, but made available for allocation
 * again).
 */

#ifndef MEMORY_ARENA_H
#define MEMORY_ARENA_H

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

/* Memory arenas are tunable via these compile-time parameters. */

#define MARENA_ALIGNMENT 4         /**< Align requests to this boundary. */
#define MARENA_DEF_BLOCK_SIZE 4096 /**< Default block allocation size. */
#define MARENA_TRACK_ALLOC 0       /**< Track number of allocations. */
#define MARENA_DEBUG 0             /**< Display debugging information. */

/** Memory arena node. */
struct marena_node {
   struct marena_node *next;  /**< Next such node. */
   size_t used;               /**< Number of bytes used. */
   size_t capacity;           /**< Capacity of this node. */
   void *memory;              /**< Memory allocated for this node. */
};

/** Memory arena big node. */
struct marena_bignode {
   struct marena_bignode *next; /**< Next such node. */
   size_t size;                 /**< Capacity (all used). */
   void *memory;                /**< Memory allocated for this node. */
};

/** Memory arena. */
struct marena {
   struct marena_node *first;        /**< List of nodes. */
   struct marena_node *current;      /**< Current (last) node. */
   struct marena_bignode *first_big; /**< List of big nodes. */
#if MARENA_TRACK_ALLOC
   /* Present only when allocation tracking is compiled in, so the layout of
    * this struct depends on MARENA_TRACK_ALLOC. */
   size_t n_alloc;                   /**< Number of allocations. */
#endif
   size_t block_size;                /**< Block size. */
};

/**
 * Creates a memory arena.
 * @param this Pointer to uninitialized object.
 */
void marena_create(struct marena *this);

/**
 * Allocates memory from arena.
 * @param this Arena to allocate from.
 * @param size Number of bytes requested.
 * @return Pointer to the allocated memory.
 *         NOTE(review): failure is presumably reported as NULL; confirm
 *         against the implementation.
 */
void *marena_alloc(struct marena *this, size_t size);

/**
 * Releases all the memory allocated by this arena.
 * When the memory allocated from an arena is no longer used, this function
 * can be called. It doesn't free the memory, but releases it so that it
 * can be reused.
 */
void marena_release(struct marena *this);

/**
 * Creates a duplicate of a string getting necessary memory from arena.
 * @param this   Arena providing the memory.
 * @param string String to duplicate.
 */
char *marena_strdup(struct marena *this, const char *string);

/**
 * Same as marena_strdup, except that it copies at most 'length' characters.
 * When the string is longer than 'length', only 'length' characters are
 * copied, and a null byte is added.
 */
char *marena_strndup(struct marena *this, const char *string, size_t length);

/**
 * Destroys this arena and frees all allocated memory.
 */
void marena_destroy(struct marena *this);

/* Options used to tune the arena at run-time. */

#define MARENA_OPT_BLOCKSIZE 1 /**< Block allocation size. */
#define MARENA_OPT_NALLOC 2    /**< Get number of allocations for
                                    this arena. Available only when
                                    support for this option is enabled. */

/**
 * Sets a run-time option for an arena.
 * @param this  Arena to configure.
 * @param opt   Option number, one of MARENA_OPT_*.
 * @param value Value of the option.
 * @return Status code. NOTE(review): the meaning of the return value is not
 *         visible from this header; confirm against the implementation.
 */
int marena_setopt(struct marena *this, int opt, size_t value);

/**
 * Gets the value of a run-time option.
 * @param this Arena to query.
 * @param opt  Option number, one of MARENA_OPT_*.
 */
size_t marena_getopt(struct marena *this, int opt);

#endif /* MEMORY_ARENA_H */
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <errno.h> + +#include <sakura/common/newdict.h> + +static void skip(void *foo) +{ +} + +int newdict_open(struct newdict *this, const char *image_name, + const char *offset_name, bool want_index) +{ + int my_errno; + + init_file_map(&this->freq_map); + this->freq = NULL; + + if (create_file_map(&this->offset_map, offset_name) != 0) + { + my_errno = errno; + goto err; + } + + this->num_items = get_file_size(&this->offset_map) / + sizeof(newdict_offset_t); + this->item = malloc(sizeof *this->item * this->num_items); + if (this->item == NULL) + { + my_errno = errno; + goto err_at_malloc; + } + + if (create_file_map(&this->dict_map, image_name) != 0) + { + my_errno = errno; + goto err_at_image; + } + + { + const void **item = this->item; + const char *image = get_file_image(&this->dict_map); + const newdict_offset_t *offset = get_file_image(&this->offset_map); + + size_t num_items = this->num_items; + size_t i; + for (i = 0; i < num_items; ++i) + item[i] = image + en4(offset[i]); + if (want_index) { + this->index = malloc(sizeof *(this->index)); + if (this->index == NULL) + { + my_errno = errno; + goto err_at_image; + } + if (create_hash_table(this->index, num_items, 0, NULL)) + { + my_errno = errno; + goto err_at_index_create; + } + for (i = 0; i < num_items; ++i) + if (hash_table_set(this->index, (const char *)item[i], + (void *)((intptr_t)(i + 1)))) + { + my_errno = errno; + goto err_at_index; + } + } else + this->index = NULL; + } + return 0; + + /* cleanup code */ + +err_at_index: + destroy_hash_table(this->index, skip); +err_at_index_create: + free(this->index); +err_at_image: + free(this->item); +err_at_malloc: + destroy_file_map(&this->offset_map); +err: + init_file_map(&this->dict_map); + init_file_map(&this->offset_map); + this->item = NULL; + errno = my_errno; + return -1; +} + +int 
newdict_open2(struct newdict *this, const char *image_name, + const char *offset_name, const char *freq_name) +{ + int code; + + code = create_file_map(&this->freq_map, freq_name); + if (code != 0) + return code; + + code = newdict_open(this, image_name, offset_name, false); + if (code != 0) { + int my_errno = errno; + destroy_file_map(&this->freq_map); + errno = my_errno; + return code; + } + this->freq = get_file_image(&this->freq_map); + return 0; +} + +void newdict_close(struct newdict *this) +{ + destroy_file_map(&this->dict_map); + destroy_file_map(&this->offset_map); + destroy_file_map(&this->freq_map); + if (this->index) { + destroy_hash_table(this->index, skip); + free(this->index); + } + free(this->item); +} + +int newindex_open(struct newindex *this, struct newdict *dict, + const char *index_name) +{ + int code; + code = create_file_map(&this->index_map, index_name); + if (code != 0) + return code; + + this->num_items = get_file_size(&this->index_map) / sizeof(newdict_offset_t); + if (this->num_items != dict->num_items) { + destroy_file_map(&this->index_map); + errno = EINVAL; + return -1; + } + this->index = get_file_image(&this->index_map); + return 0; +} + +void newindex_close(struct newindex *this) +{ + destroy_file_map(&this->index_map); +} diff --git a/poliqarp-library/sakura/common/newdict.h b/poliqarp-library/sakura/common/newdict.h new file mode 100644 index 0000000000000000000000000000000000000000..a5a9f78b69820ef270f754123cd31599b611791c --- /dev/null +++ b/poliqarp-library/sakura/common/newdict.h @@ -0,0 +1,138 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
/**
 * @file newdict.h
 * @brief On-disk dictionaries and dictionary indices.
 *
 * Dictionaries are sequences of data of variable size (such as strings)
 * stored on disk in two files: one (the image file) contains actual data
 * as one block, the other (the offset file) contains offsets of those
 * data stored as unsigned 32-bit little-endian integers.
 *
 * Dictionary indices are integer-to-integer on-disk maps which contain
 * indices to dictionaries and allow to iterate over elements of a dictionary
 * in a certain order.
 */

#ifndef HAVE_NEWDICT_H
#define HAVE_NEWDICT_H

#include <poliqarp-config.h>

#include <stdlib.h>

#include <sakura/common/file-map.h>
#include <sakura/common/hash-table.h>

/**
 * Offset used inside dictionary file.
 */
typedef uint32_t newdict_offset_t;

/**
 * Dictionary structure. Some fields are optional (can be NULL).
 */
struct newdict {
   const void **item;         /**< Array containing the data. */
   const uint32_t *freq;      /**< Array containing (optionally) frequencies
                                   of items. */
   size_t num_items;          /**< Number of items in this dictionary. */
   struct hash_table *index;  /**< Optional inverted index mapping strings
                                   to their positions. */
   struct file_map dict_map;   /**< Image file map object. */
   struct file_map offset_map; /**< Offset file map object. */
   struct file_map freq_map;   /**< Frequency file map object. */
};

/**
 * Open dictionary.
 * @param this Uninitialized dictionary object.
 * @param image_name File name of the dictionary image.
 * @param offset_name File name of the dictionary offset table.
 * @param want_index Whether to create an inverted index for this dictionary.
 * @return 0 on success, -1 on failure (errno is set).
 */
int newdict_open(struct newdict *this, const char *image_name,
   const char *offset_name, bool want_index);

/**
 * Open dictionary with frequency table.
 * @param this Uninitialized dictionary object.
 * @param image_name File name of the dictionary image.
 * @param offset_name File name of the dictionary offset table.
 * @param freq_name File name of the dictionary frequency table.
 * @return 0 on success, non-zero on failure (errno is set).
 */
int newdict_open2(struct newdict *this, const char *image_name,
   const char *offset_name, const char *freq_name);

/**
 * Close dictionary.
 */
void newdict_close(struct newdict *this);

/**
 * Dictionary index.
 * Allows to iterate over elements in specified order.
 */
struct newindex {
   const uint32_t *index;      /**< Array of the indices. */
   struct file_map index_map;  /**< Index file map object. */
   size_t num_items;           /**< Number of items. */
};

/**
 * Open dictionary index.
 * @param this Uninitialized index object.
 * @param dict Dictionary this index belongs to.
 * @param index_name File name of the index.
 * @return 0 on success, non-zero on failure (errno is set).
 */
int newindex_open(struct newindex *this, struct newdict *dict,
   const char *index_name);

/**
 * Close dictionary index.
 */
void newindex_close(struct newindex *this);

/**
 * Get item (pointer to data) from dictionary.
 * No bounds checking is performed on key.
 */
#define GET_ITEM(dict, key) ((dict)->item[key])

/**
 * Get frequency of item from dictionary.
 * Only meaningful when the dictionary has a frequency table (freq != NULL).
 */
#define GET_FREQ(dict, key) ((dict)->freq[key])

/**
 * Get content length of item.
 * Reads the newdict_offset_t stored immediately before the item's data in
 * the image and decodes it with en4().
 */
#define GET_LENGTH(dict, key) \
   (en4(*(newdict_offset_t *)(((char *)GET_ITEM(dict,key)) - \
   sizeof (newdict_offset_t))))

/**
 * Get the KEY-th entry of a dictionary index.
 */
#define GET_INDEX_ITEM(INDEX,KEY) ((INDEX)->index[KEY])

/**
 * Get number of items in dictionary/index.
 */
#define GET_NUM_ITEMS(DICT_OR_INDEX) ((DICT_OR_INDEX)->num_items)

#endif /* HAVE_NEWDICT_H */
+ */ + +#include <sakura/common/set.h> + +int set_create(struct set *this, set_compare_fn compare, set_free_fn free, + struct marena *arena) +{ + this->first_node = NULL; + this->num_items = 0; + this->item_compare = compare; + this->item_free = free; + if (arena) { + this->private_arena = false; + this->arena = arena; + } else { + this->arena = malloc(sizeof *this->arena); + if (this->arena == NULL) + return -1; + marena_create(this->arena); + } + return 0; +} + +void set_destroy(struct set *this) +{ + struct set_node *curr; + for (curr = this->first_node; curr; curr = curr->next) + this->item_free(curr->item); + if (this->private_arena) { + marena_destroy(this->arena); + free(this->arena); + } +} + +struct set_node *set_put(struct set *this, void *item) +{ + int cmp_result; + struct set_node *current; + struct set_node *last; + struct set_node *new; + + for (last = NULL, current = this->first_node; current != NULL; + current = current->next) + { + cmp_result = this->item_compare(current->item, item); + if (cmp_result > 0) /* new element would break ordering */ + break; + if (cmp_result == 0) { /* identical element */ + this->item_free(item); + return current; + } + last = current; + } + new = marena_alloc(this->arena, sizeof *new); + if (new == NULL) + return NULL; + new->next = current; + new->item = item; + new->id = this->num_items++; + + if (last) + last->next = new; + else + this->first_node = new; + return new; +} + +void *set_get(struct set *this, int id) +{ + struct set_node *current; + + for (current = this->first_node; current != NULL; current = current->next) { + if (current->id > id) + return NULL; + if (current->id == id) /* identical element */ + return current->item; + } + return NULL; +} diff --git a/poliqarp-library/sakura/common/set.h b/poliqarp-library/sakura/common/set.h new file mode 100644 index 0000000000000000000000000000000000000000..2425761bab03b08fd2da81082663b9be8988b5ab --- /dev/null +++ b/poliqarp-library/sakura/common/set.h @@ -0,0 
#ifndef HAVE_SET_H
#define HAVE_SET_H

#include <poliqarp-config.h>

#include <stdlib.h>
#include <stddef.h>

#include <sakura/common/memory-arena.h>

/**
 * Item ordering function. A result of 0 marks the items as identical; a
 * positive result for (stored, new) means the new item sorts before the
 * stored one.
 */
typedef int (*set_compare_fn)(const void *item1, const void *item2);

/** Item destructor. */
typedef void (*set_free_fn)(void *item);

/** Node of the singly linked list backing a set. */
struct set_node {
   struct set_node *next; /**< Next node in comparison order. */
   void *item;            /**< Stored item. */
   int id;                /**< Sequence number assigned at insertion. */
};

/** Ordered set of items kept in a sorted linked list. */
struct set {
   struct set_node *first_node;  /**< Head of the sorted list. */
   int num_items;                /**< Number of stored items; also the next id. */
   set_compare_fn item_compare;  /**< Ordering function. */
   set_free_fn item_free;        /**< Item destructor. */
   bool private_arena;           /**< Whether the arena is owned by the set. */
   struct marena *arena;         /**< Arena used for node allocation. */
};

/**
 * Create a set.
 * @param this    Uninitialized set object.
 * @param compare Item ordering function.
 * @param free    Item destructor.
 * @param arena   Arena to allocate nodes from, or NULL to let the set
 *                allocate its own.
 * @return 0 on success, -1 on allocation failure.
 */
int set_create(struct set *this, set_compare_fn compare, set_free_fn free,
   struct marena *arena);

/** Destroy given set. */
void set_destroy(struct set *this);

/**
 * Put item in the set.
 * Duplicates are destroyed.
 * @return Node holding the item (the pre-existing node for a duplicate), or
 *         NULL when a node cannot be allocated.
 */
struct set_node *set_put(struct set *this, void *item);

/**
 * Get the item that was given the specified id on insertion.
 * @return The item, or NULL when not found.
 */
void *set_get(struct set *this, int id);

#endif /* HAVE_SET_H */
#ifndef STRING_HASH_H
#define STRING_HASH_H

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>

typedef uint32_t string_hash_t;

#if defined(__GNUC__) && defined(__i386__)
/* x86 architecture supports unaligned 16-bit word access. */
static inline uint16_t hash_string_16(const char *string)
{
   return *(const uint16_t *) string;
}
#else
/* More portable but slower: combine two bytes, first byte in the high half. */
static inline uint16_t hash_string_16(const char *string)
{
   const uint8_t high = (uint8_t) string[0];
   const uint8_t low = (uint8_t) string[1];

   return (uint16_t) ((high << 8) | low);
}
#endif

/**
 * Paul Hsieh hash function.
 * @param string string to hash (must not be NULL)
 * @param length number of bytes of the string to hash
 * @return 32-bit hash of the first 'length' bytes
 * See http://www.azillionmonkeys.com/qed/hash.html for details.
 */
static inline string_hash_t hash_string_n(const char *string, size_t length)
{
   assert(string != NULL);
   const size_t tail_length = length % 4;
   size_t words = length / 4;
   string_hash_t hash = (string_hash_t) words;

   /* Main loop: consume the input four bytes at a time. */
   for (; words > 0; words--, string += 4) {
      string_hash_t mixed;

      hash += hash_string_16(string);
      mixed = (hash_string_16(string + 2) << 11) ^ hash;
      hash = (hash << 16) ^ mixed;
      hash += hash >> 11;
   }

   /* Fold in the 1-3 bytes that did not fill a whole word. */
   if (tail_length == 3) {
      hash += hash_string_16(string);
      hash ^= hash << 16;
      hash ^= (uint8_t) string[2] << 18;
      hash += hash >> 11;
   } else if (tail_length == 2) {
      hash += hash_string_16(string);
      hash ^= hash << 11;
      hash += hash >> 17;
   } else if (tail_length == 1) {
      hash += (uint8_t) string[0];
      hash ^= hash << 10;
      hash += hash >> 1;
   }

   /* Final avalanche of the accumulated bits. */
   hash ^= hash << 3;
   hash += hash >> 5;
   hash ^= hash << 4;
   hash += hash >> 17;
   hash ^= hash << 25;
   hash += hash >> 6;
   return hash;
}

/**
 * Paul Hsieh hash function for null-terminated strings.
 * @param string null-terminated string to hash
 * @see hash_string_n
 * See http://www.azillionmonkeys.com/qed/hash.html for details.
 */
static inline string_hash_t hash_string(const char *string)
{
   const size_t length = strlen(string);

   return hash_string_n(string, length);
}

#endif /* STRING_HASH_H */
#ifdef _WIN32
#include <windows.h>
#include <winerror.h>
#endif
#include <errno.h>

/**
 * Translate the calling thread's last Windows error into errno.
 *
 * On Windows the value of GetLastError() is looked up in a fixed table of
 * ERROR_* codes; codes without an entry set errno to 0. On every other
 * platform errno is simply set to 0.
 */
void set_errno_from_last_error()
{
#ifdef _WIN32
   /* ERROR_* code -> errno value pairs. */
   static const struct {
      int win_code;
      int errno_value;
   } table[] = {
      { ERROR_ACCESS_DENIED, EACCES },
      { ERROR_ALREADY_EXISTS, EEXIST },
      { ERROR_BAD_DEVICE, ENODEV },
      { ERROR_DIRECTORY, ENOTDIR },
      { ERROR_DISK_FULL, ENOSPC },
      { ERROR_FILE_EXISTS, EEXIST },
      { ERROR_FILE_INVALID, ENXIO },
      { ERROR_FILE_NOT_FOUND, ENOENT },
      { ERROR_INVALID_HANDLE, EBADF },
      { ERROR_NOACCESS, EFAULT },
      { ERROR_NOT_ENOUGH_MEMORY, ENOMEM },
      { ERROR_OPEN_FAILED, EIO },
      { ERROR_OUTOFMEMORY, ENOMEM },
      { ERROR_PATH_NOT_FOUND, ENOENT },
      { ERROR_SHARING_VIOLATION, EBUSY },
      { ERROR_TOO_MANY_OPEN_FILES, EMFILE },
      { ERROR_WRITE_PROTECT, EROFS },
      { ERROR_DISK_CORRUPT, EIO },
      { ERROR_FILE_CORRUPT, EIO },
      { ERROR_HANDLE_DISK_FULL, ENOSPC },
      { ERROR_INVALID_NAME, ENOENT },
   };
   const int last_error = GetLastError();
   size_t i;

   for (i = 0; i < sizeof table / sizeof table[0]; i++) {
      if (table[i].win_code == last_error) {
         errno = table[i].errno_value;
         return;
      }
   }
   /* No mapping for this code. */
   errno = 0;
#else
   errno = 0;
#endif
}
#ifndef SYSTEM_ERROR_H
#define SYSTEM_ERROR_H

/**
 * Set errno from the platform's "last error" value.
 *
 * On Windows the result of GetLastError() is mapped to an errno value
 * (0 when there is no mapping); on other platforms errno is set to 0.
 */
void set_errno_from_last_error(void);

#endif /* SYSTEM_ERROR_H */
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <errno.h> + +#include <sakura/common/tinydb.h> + +int tinydb_open(struct tinydb *this, const char *image_pathname, + size_t item_size) +{ + int code; + code = create_file_map(&this->file_map, image_pathname); + if (code != 0) + return code; + this->num_bytes = get_file_size(&this->file_map); + if (this->num_bytes % item_size != 0) { + destroy_file_map(&this->file_map); + errno = EINVAL; + return -1; + } + this->num_items = this->num_bytes / item_size; + this->item_size = item_size; + this->image = get_file_image(&this->file_map); + return 0; +} + +void tinydb_close(struct tinydb *this) +{ + destroy_file_map(&this->file_map); +} diff --git a/poliqarp-library/sakura/common/tinydb.h b/poliqarp-library/sakura/common/tinydb.h new file mode 100644 index 0000000000000000000000000000000000000000..54336f91fb0e015fa83a7e9a493516a659f1ac50 --- /dev/null +++ b/poliqarp-library/sakura/common/tinydb.h @@ -0,0 +1,59 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef TINYDB_H +#define TINYDB_H + +#include <poliqarp-config.h> + +#include <stdlib.h> +#include <assert.h> + +#include <sakura/common/file-map.h> + +struct tinydb { + struct file_map file_map; + const void *image; + size_t item_size; + size_t num_items; + size_t num_bytes; +}; + +int tinydb_open(struct tinydb *this, const char *image_pathname, + size_t item_size); + +void tinydb_close(struct tinydb *this); + +static inline const void *tinydb_fetch_item(const struct tinydb *this, + size_t index) +{ + assert(index < this->num_items); + return ((char *)this->image) + this->item_size * index; +} + +static inline const void *tinydb_fetch_image(const struct tinydb *this) +{ + return this->image; +} + +#endif /* TINYDB_H */ diff --git a/poliqarp-library/sakura/config.c b/poliqarp-library/sakura/config.c new file mode 100644 index 0000000000000000000000000000000000000000..f67517c31c4a177a8b71334cec6b6f4e4e9b863e --- /dev/null +++ b/poliqarp-library/sakura/config.c @@ -0,0 +1,206 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). 
All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <poliqarp-config.h> + +#include <assert.h> +#include <errno.h> +#include <string.h> + +#include <sakura/config.h> +#include <sakura/common/getline.h> +#include <foostring/foostring.h> + +/** + * Trim the string. + * @param s the string to trim + * @param start pointer to the start of the trimmed part of string + * @param end pointer to the end of (i.e. null byte) of the trimmed part of string + */ +static inline void poliqarp_trim_helper(const char *s, const char **start, const char **end) +{ + assert(s != NULL); + *start = s; + *end = strchr(s, '\0'); + /* Skip leading white space. */ + while (*start < *end && ascii_isspace(**start)) + (*start)++; + /* Skip trailing white space. */ + while (*start < *end && ascii_isspace((*end)[-1])) + (*end)--; + assert(*start <= *end); +} + +/** + * Trim the string in-place. + * @param s the string to trim + */ +static void poliqarp_trim_inplace(char *s) +{ + const char *start, *end; + poliqarp_trim_helper(s, &start, &end); + s[end - s] = '\0'; + if (start != s) + memmove(s, start, end - start + 1); +} + +/** + * Trim the string. 
+ * @param s the string to trim + * @return the trimmed string, allocated by malloc(), or NULL in case of error + */ +static char *poliqarp_trim(char *s) +{ + char *result; + const char *start, *end; + poliqarp_trim_helper(s, &start, &end); + result = malloc(end - start + 1); + if (result == NULL) + return NULL; + memcpy(result, start, end - start); + result[end - start] = '\0'; + return result; +} + + +char *poliqarp_fetch_line(FILE *stream) +{ + size_t n = 0; + char *line = NULL; + ssize_t read; + + do { + if ((read = getline(&line, &n, stream)) == -1) { + free(line); + return NULL; + } + /* Cut comments, if any: */ + char *ptr = strchr(line, '#'); + if (ptr != NULL) + *ptr = '\0'; + poliqarp_trim_inplace(line); + } while (*line == '\0'); + return line; +} + +int poliqarp_parse_variable(struct poliqarp_variable *this, char *text) +{ + char *ptr = strchr(text, '='); + this->name = this->value = NULL; + if (ptr == NULL) { + errno = EINVAL; + return -1; + } + *ptr = '\0'; + this->name = poliqarp_trim(text); + if (this->name == NULL) + return -1; + this->value = poliqarp_trim(ptr + 1); + if (this->value == NULL) { + free(this->name); + this->name = NULL; + return -1; + } + return 0; +} + +void poliqarp_free_variable(struct poliqarp_variable *this) +{ + free(this->name); + free(this->value); +} + +static int poliqarp_get_section_id(const char *name, + const struct poliqarp_config_section *sections) +{ + const struct poliqarp_config_section *section = sections; + for (; section->name; section++) + if (strcasecmp(section->name, name) == 0) + return section->id; + return -1; +} + +int poliqarp_parse_config_file(void *extra, const char *path, + const struct poliqarp_config_section *sections, + poliqarp_config_handler handler, struct poliqarp_error *error) +{ + FILE *fp = fopen(path, "rt"); + if (fp == NULL) { + poliqarp_error_from_system(error, _("Unable to open configuration file (%s)"), path); + return -1; + } + + int rc = 0; + int section = -1; + char *line; + while 
((line = poliqarp_fetch_line(fp)) != NULL) { + if (line[0] == '[') { /* section declaration */ + const char *section_name; + char *ptr = strrchr(line, ']'); + if (ptr != NULL) + *ptr = 0; + else { + poliqarp_error_message_set(error, + _("Malformed configuration file (%s):" + " no ']' after section name"), path); + rc = -1; + break; + } + section_name = line + 1; + section = poliqarp_get_section_id(section_name, sections); + if (section < 0) { + poliqarp_error_message_set(error, + _("Malformed configuration file (%s):" + " unknown section %s"), path, section_name); + rc = -1; + break; + } + } else if (section < 0) { + poliqarp_error_message_set(error, + _("Malformed configuration file (%s):" + " statement outside of any section"), path); + break; + } else { + rc = handler(extra, section, line, error); + if (rc != 0) { + char *old_message = strdup(poliqarp_error_message_get(error)); + poliqarp_error_message_set(error, + _("Unable to process configuration file (%s): %s"), + path, old_message == NULL ? + _("possibly cannot allocate memory") : old_message); + break; + free(old_message); + } + } + free(line); + line = NULL; + } + if (rc == 0 && !feof(fp)) /* poliqarp_fetch_line() failed */ { + poliqarp_error_from_system(error, + "Unable to read configuration file (%s)", path); + rc = -1; + } + free(line); + fclose(fp); /* Safe to ignore errors. */ + return rc; +} diff --git a/poliqarp-library/sakura/config.h b/poliqarp-library/sakura/config.h new file mode 100644 index 0000000000000000000000000000000000000000..16b6901895698b7e890a27cfecc39e7e70806a99 --- /dev/null +++ b/poliqarp-library/sakura/config.h @@ -0,0 +1,66 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_CONFIG_H +#define POLIQARP_CONFIG_H 1 + +#include <poliqarp-config.h> + +#include <stdio.h> + +#include <sakura/exception.h> + +/** + * Read one line, trim whitespace and comments. + * @return a string allocated by malloc(), or NULL in case of error or EOF + */ +char *poliqarp_fetch_line(FILE *stream); + +/** @todo */ +struct poliqarp_variable { + char *name; + char *value; +}; + +/** + * Parse a name = value pair. 
+ * @return 0 on success, -1 on failure + * */ +int poliqarp_parse_variable(struct poliqarp_variable *this, char *text); + +/** @todo */ +void poliqarp_free_variable(struct poliqarp_variable *this); + +typedef int (*poliqarp_config_handler)(void *extra, int section, + char *text, struct poliqarp_error *error); + +struct poliqarp_config_section { + const char *name; + int id; +}; + +int poliqarp_parse_config_file(void *extra, const char *path, + const struct poliqarp_config_section *sections, + poliqarp_config_handler handler, struct poliqarp_error *error); + +#endif /* POLIQARP_CONFIG_H */ diff --git a/poliqarp-library/sakura/corpus.c b/poliqarp-library/sakura/corpus.c new file mode 100644 index 0000000000000000000000000000000000000000..33afbfbf989209aa826aecd1222e4293d0048d46 --- /dev/null +++ b/poliqarp-library/sakura/corpus.c @@ -0,0 +1,213 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <poliqarp-config.h> + +#include <sakura/poliqarp.h> +#include <stdio.h> + +#define CONFIG_CLEANUP_FLAG 1 +#define INTERP_CLEANUP_FLAG 2 +#define CORPUS_CLEANUP_FLAG 4 +#define DOCUMENT_CLEANUP_FLAG 8 +#define SUBDOCUMENT_CLEANUP_FLAG 16 +#define META_CLEANUP_FLAG 32 +#define ORTH_CLEANUP_FLAG 64 +#define BASE_CLEANUP_FLAG 128 +#define TAG_CLEANUP_FLAG 256 +#define INDEX_CLEANUP_FLAG 512 +#define SYNTAX_CLEANUP_FLAG 1024 + +#ifdef POLIQARP_SINGLE_THREADED + +/* disable progress */ +#define progress_reset(x) do {} while (0) +#define progress_advance(x, y) do {} while (0) +/* disable cancellation testing */ +#define pthread_allow_cancel() do {} while (0) +#undef pthread_setcancelstate +#define pthread_setcancelstate(x, y) do {} while (0) +#undef pthread_cleanup_push +#define pthread_cleanup_push(x, y) do {} while (0) +#undef pthread_cleanup_pop +#define pthread_cleanup_pop(x) do {} while (0) + +#else + +#define pthread_allow_cancel() do { \ + pthread_setcancelstate(cancel_state, NULL); \ + pthread_testcancel(); \ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); \ +} while (0) + +static void poliqarp_open_corpus_cleanup(void *data) +{ + struct poliqarp_corpus *corpus = data; + poliqarp_close_corpus(corpus); +} + +#endif + +int poliqarp_open_corpus(struct poliqarp_corpus *corpus, const char *name, + progress_t *progress, struct poliqarp_error *error) +{ + int cancel_state; + int rc = 0; + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state); + corpus->cleanup_flags = 0; + corpus->base_name = strdup(name); + if (corpus->base_name == NULL) { + poliqarp_error_from_system(error, "Unable to open corpus"); + rc = -1; + } + progress_reset(progress); + pthread_cleanup_push(poliqarp_open_corpus_cleanup, corpus); + while (rc == 0) { + /* config */ + rc = poliqarp_backend_config_open(&corpus->config, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= CONFIG_CLEANUP_FLAG; + progress_advance(progress, 9); /* 9 */ + 
pthread_allow_cancel(); + /* interp */ + rc = poliqarp_backend_interp_open(&corpus->interp, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= INTERP_CLEANUP_FLAG; + progress_advance(progress, 9); /* 18 */ + pthread_allow_cancel(); + /* corpus image */ + rc = poliqarp_backend_corpus_open(&corpus->corpus, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= CORPUS_CLEANUP_FLAG; + progress_advance(progress, 9); /* 27 */ + pthread_allow_cancel(); + /* corpus document */ + rc = poliqarp_backend_document_open(&corpus->document, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= DOCUMENT_CLEANUP_FLAG; + progress_advance(progress, 9); /* 36 */ + pthread_allow_cancel(); + /* subdocument */ + rc = poliqarp_backend_subdocument_open(&corpus->subdocument, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= SUBDOCUMENT_CLEANUP_FLAG; + progress_advance(progress, 9); /* 45 */ + pthread_allow_cancel(); + /* meta document */ + rc = poliqarp_backend_meta_open(&corpus->meta, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= META_CLEANUP_FLAG; + progress_advance(progress, 10); /* 55 */ + pthread_allow_cancel(); + /* orth */ + rc = poliqarp_backend_orth_open(&corpus->orth, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= ORTH_CLEANUP_FLAG; + progress_advance(progress, 9); /* 64 */ + pthread_allow_cancel(); + /* base */ + rc = poliqarp_backend_base_open(&corpus->base, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= BASE_CLEANUP_FLAG; + progress_advance(progress, 9); /* 73 */ + pthread_allow_cancel(); + /* tag */ + rc = poliqarp_backend_tag_open(&corpus->tag, &corpus->config, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= TAG_CLEANUP_FLAG; + progress_advance(progress, 9); /* 82 */ + pthread_allow_cancel(); + /* index */ + rc = poliqarp_backend_index_open(&corpus->index, &corpus->config, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= INDEX_CLEANUP_FLAG; 
+ progress_advance(progress, 9); /* 91 */ + pthread_allow_cancel(); + /* syntax */ + rc = poliqarp_backend_syntax_open(&corpus->syntax, name, error); + if (rc != 0) + break; + corpus->cleanup_flags |= SYNTAX_CLEANUP_FLAG; + progress_advance(progress, 9); /* 100 */ + break; + }; + if (rc != 0) + poliqarp_close_corpus(corpus); + pthread_cleanup_pop(0); + pthread_setcancelstate(cancel_state, NULL); + return rc; +} + +int poliqarp_close_corpus(struct poliqarp_corpus *corpus) +{ + assert(corpus != NULL); + + if (corpus->cleanup_flags & INDEX_CLEANUP_FLAG) + poliqarp_backend_index_close(&corpus->index); + + if (corpus->cleanup_flags & INTERP_CLEANUP_FLAG) + poliqarp_backend_interp_close(&corpus->interp); + + if (corpus->cleanup_flags & CORPUS_CLEANUP_FLAG) + poliqarp_backend_corpus_close(&corpus->corpus); + + if (corpus->cleanup_flags & DOCUMENT_CLEANUP_FLAG) + poliqarp_backend_document_close(&corpus->document); + + if (corpus->cleanup_flags & SUBDOCUMENT_CLEANUP_FLAG) + poliqarp_backend_subdocument_close(&corpus->subdocument); + + if (corpus->cleanup_flags & META_CLEANUP_FLAG) + poliqarp_backend_meta_close(&corpus->meta); + + if (corpus->cleanup_flags & ORTH_CLEANUP_FLAG) + poliqarp_backend_orth_close(&corpus->orth); + + if (corpus->cleanup_flags & BASE_CLEANUP_FLAG) + poliqarp_backend_base_close(&corpus->base); + + if (corpus->cleanup_flags & CONFIG_CLEANUP_FLAG) + poliqarp_backend_config_close(&corpus->config); + + if (corpus->cleanup_flags & TAG_CLEANUP_FLAG) + poliqarp_backend_tag_close(&corpus->tag); + + if (corpus->cleanup_flags & SYNTAX_CLEANUP_FLAG) + poliqarp_backend_syntax_close(&corpus->syntax); + + corpus->cleanup_flags = 0; + free(corpus->base_name); + corpus->base_name = NULL; + + return 0; +} diff --git a/poliqarp-library/sakura/corpus.h b/poliqarp-library/sakura/corpus.h new file mode 100644 index 0000000000000000000000000000000000000000..19452b2c1cb8a7b6a0a44bbd503825a7fab7ec2a --- /dev/null +++ b/poliqarp-library/sakura/corpus.h @@ -0,0 +1,73 @@ +/* + 
* This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** @defgroup poliqarp_corpus Corpus Module */ +/** @{ */ +/** @file corpus.h Corpus Module */ + +#ifndef POLIQARP_CORPUS_H +#define POLIQARP_CORPUS_H + +#include <progress/progress.h> +#include <sakura/backend-corpus.h> +#include <sakura/backend-document.h> +#include <sakura/backend-subdocument.h> +#include <sakura/backend-meta.h> +#include <sakura/backend-orth.h> +#include <sakura/backend-base.h> +#include <sakura/backend-tag.h> +#include <sakura/backend-interp.h> +#include <sakura/backend-config.h> +#include <sakura/backend-index.h> +#include <sakura/backend-syntax.h> + +#include <sakura/common/hash-table.h> + +/** + * Corpus structure. + * Holds all backends together + */ +struct poliqarp_corpus { + struct poliqarp_backend_document document; /**< Document backend. */ + struct poliqarp_backend_subdocument subdocument; /**< Subdocument backend. */ + struct poliqarp_backend_meta meta; /**< Meta backend. */ + struct poliqarp_backend_corpus corpus; /**< Corpus backend. 
*/ + struct poliqarp_backend_orth orth; /**< Orth backend. */ + struct poliqarp_backend_base base; /**< Base backend. */ + struct poliqarp_backend_config config; /**< Configuration backend. */ + struct poliqarp_backend_tag tag; /**< Tag backend. */ + struct poliqarp_backend_interp interp; /**< Interpretation backend. */ + struct poliqarp_backend_index index; /**< Index backend. */ + struct poliqarp_backend_syntax syntax; /**< Syntax backend. */ + char *base_name; /**< Copy of the base name. */ + unsigned cleanup_flags; /**< Cleanup flags. Each backend gets its own flag + here if it opened correctly. */ +}; + +#define poliqarp_get_backend(corpus, type) (&(corpus)->type) +#define poliqarp_get_const_backend(corpus, type) \ + ((const struct poliqarp_backend_##type *) poliqarp_get_backend(corpus, type)) + +#endif /* POLIQARP_CORPUS_H */ + +/** @} */ diff --git a/poliqarp-library/sakura/date-span.h b/poliqarp-library/sakura/date-span.h new file mode 100644 index 0000000000000000000000000000000000000000..7999ebd919b7ed8c0acae2fe8bd64063d20c34a2 --- /dev/null +++ b/poliqarp-library/sakura/date-span.h @@ -0,0 +1,43 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_DATE_SPAN_H +#define POLIQARP_DATE_SPAN_H + +#include <stdlib.h> +#include <sakura/abi.h> + +/** @todo */ +struct poliqarp_date_span { + struct poliqarp_date_span *next; + struct poliqarp_meta_date start; + struct poliqarp_meta_date end; +}; + +/** @todo */ +struct poliqarp_date_span *poliqarp_date_span_create(struct poliqarp_meta_date date); + +/** @todo */ +void poliqarp_date_span_destroy(struct poliqarp_date_span *this); + +#endif diff --git a/poliqarp-library/sakura/dict.c b/poliqarp-library/sakura/dict.c new file mode 100644 index 0000000000000000000000000000000000000000..1c7efb93b479e3109717316550f78d83fd5b7221 --- /dev/null +++ b/poliqarp-library/sakura/dict.c @@ -0,0 +1,58 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <foostring/foostring.h> + +#include <sakura/dict.h> +#include <sakura/exception.h> + +int poliqarp_newdict_open(struct newdict *dict, + const char *base_name, bool want_index, + const char *image_path_format, const char *offset_path_format, + const char *error_message, struct poliqarp_error *error) +{ + char *image_path = NULL, *offset_path = NULL; + int rc; + image_path = string_aformat(image_path_format, base_name); + if (image_path == NULL) + goto error; + offset_path = string_aformat(offset_path_format, base_name); + if (offset_path == NULL) + goto error; + rc = newdict_open(dict, image_path, offset_path, want_index); + if (rc == 0) + goto done; +error: + if (image_path == NULL || offset_path == NULL) { + poliqarp_error_from_system(error, "%s", error_message); + } else { + poliqarp_error_from_system(error, _("%s (%s or %s)"), error_message, + image_path, offset_path); + } + rc = -1; +done: + free(image_path); + free(offset_path); + return rc; +} + diff --git a/poliqarp-library/sakura/dict.h b/poliqarp-library/sakura/dict.h new file mode 100644 index 0000000000000000000000000000000000000000..368773ac495fe40d0ec443924172f8b9aea737fa --- /dev/null +++ b/poliqarp-library/sakura/dict.h @@ -0,0 +1,35 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_DICT_H +#define POLIQARP_DICT_H 1 + +#include <sakura/exception.h> +#include <sakura/common/newdict.h> + +int poliqarp_newdict_open(struct newdict *dict, + const char *base_name, bool want_index, + const char *image_path_format, const char *offset_path_format, + const char *error_message, struct poliqarp_error *error); + +#endif diff --git a/poliqarp-library/sakura/exception.c b/poliqarp-library/sakura/exception.c new file mode 100644 index 0000000000000000000000000000000000000000..02b7f06d820e8d253c8429479627b8e9d8ddc970 --- /dev/null +++ b/poliqarp-library/sakura/exception.c @@ -0,0 +1,131 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include <foostring/foostring.h>
+
+#include <sakura/exception.h>
+#include <sakura/regexp.h>
+
+/* Static fallback used when a message cannot be built; never free()d
+ * because it is always stored with malloced == false. */
+static char *surrogate_message = "Possibly cannot allocate memory";
+
+/**
+ * Move the error state from `that` into `this`; `that` is reset to
+ * poliqarp_error_none afterwards.
+ */
+void poliqarp_error_set(struct poliqarp_error *this, struct poliqarp_error *that)
+{
+   *this = *that;
+   *that = poliqarp_error_none;
+}
+
+/**
+ * Replace the message of `this` with a printf-style formatted string.
+ * A NULL `fmt` clears the error. The previous message is released before
+ * the new one is formatted. errno is preserved across the call.
+ */
+void poliqarp_error_message_set(struct poliqarp_error *this, const char *fmt, ...)
+{
+   int my_errno = errno;
+   if (this->malloced)
+      free(this->message);
+   if (fmt == NULL) {
+      *this = poliqarp_error_none;
+      errno = my_errno; /* free() above may have changed it */
+      return;
+   }
+   va_list ap;
+   va_start(ap, fmt);
+   this->message = string_avformat(fmt, ap);
+   va_end(ap);
+   if (this->message == NULL) {
+      this->message = surrogate_message;
+      this->malloced = false;
+   } else
+      this->malloced = true;
+   errno = my_errno;
+}
+
+/**
+ * @return the current message of `this` (may be NULL when no error is set)
+ */
+const char *poliqarp_error_message_get(const struct poliqarp_error *this)
+{
+   return this->message;
+}
+
+/**
+ * Set the message of `this` to "<formatted fmt>: <description of errnum>",
+ * or to just the description when `fmt` is NULL.
+ *
+ * @param errnum the error number to describe; the ambient errno is NOT
+ *    consulted, since it may already have been changed (e.g. by free())
+ * @param this the error object to update; its previous message is released
+ * @param fmt printf-style prefix, or NULL
+ *
+ * errno is preserved across the call.
+ */
+void poliqarp_error_from_system_n(int errnum, struct poliqarp_error *this, const char *fmt, ...)
+{
+   int my_errno = errno;
+   if (this->malloced)
+      free(this->message);
+   const char *error_message;
+#ifdef HAVE_STRERROR_R
+   char error_message_buf[BUFSIZ];
+#ifdef STRERROR_R_CHAR_P
+   /* GNU variant: returns the message, which may or may not be in the buffer. */
+   error_message = strerror_r(errnum, error_message_buf,
+      sizeof error_message_buf);
+#else
+   /* XSI variant: returns a status and fills the buffer; the buffer
+    * contents are unspecified on failure, so substitute a fixed text. */
+   if (strerror_r(errnum, error_message_buf, sizeof error_message_buf) != 0)
+      strcpy(error_message_buf, "Unknown error");
+   error_message = error_message_buf;
+#endif
+#else
+   error_message = strerror(errnum);
+   /* FIXME: strerror() might not be thread-safe. */
+#endif
+   string_t message = string_create();
+   if (fmt != NULL) {
+      va_list ap;
+      va_start(ap, fmt);
+      string_vformat(message, fmt, ap);
+      va_end(ap);
+      string_append_str(message, ": ");
+   }
+   string_append_str(message, error_message);
+   this->message = string_free_and_get_buffer(message);
+   if (this->message == NULL) {
+      this->message = surrogate_message;
+      this->malloced = false;
+   } else
+      this->malloced = true;
+   errno = my_errno;
+}
+
+/**
+ * Set the message of `this` to "<formatted fmt>: <regexp->message>", or to
+ * just the regexp message when `fmt` is NULL. When the regexp carries no
+ * message, the static surrogate message is used instead.
+ *
+ * @param this the error object to update; its previous message is released
+ * @param regexp a regexp whose status is non-zero (asserted)
+ * @param fmt printf-style prefix, or NULL
+ *
+ * errno is preserved across the call.
+ */
+void poliqarp_error_from_regexp(struct poliqarp_error *this,
+   const struct poliqarp_regexp *regexp, const char *fmt, ...)
+{
+   int my_errno = errno;
+   assert(regexp->status != 0);
+   /* Release the previous message on every path, as the other setters do. */
+   if (this->malloced)
+      free(this->message);
+   if (regexp->message == NULL) {
+      this->message = surrogate_message;
+      this->malloced = false;
+      errno = my_errno;
+      return;
+   }
+   string_t message = string_create();
+   if (fmt != NULL) {
+      va_list ap;
+      va_start(ap, fmt);
+      string_vformat(message, fmt, ap);
+      va_end(ap);
+      string_append_str(message, ": ");
+   }
+   string_append_str(message, regexp->message);
+   this->message = string_free_and_get_buffer(message);
+   if (this->message == NULL) {
+      this->message = surrogate_message;
+      this->malloced = false;
+   } else
+      this->malloced = true;
+   errno = my_errno;
+}
diff --git a/poliqarp-library/sakura/exception.h b/poliqarp-library/sakura/exception.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7e64cfa688337f92e74108a659038d958cf739a
--- /dev/null
+++ b/poliqarp-library/sakura/exception.h
@@ -0,0 +1,55 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file.
(See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_EXCEPTION_H +#define POLIQARP_EXCEPTION_H + +#include <poliqarp-config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> + +struct poliqarp_error { + char *message; + bool malloced; +}; + +static const struct poliqarp_error poliqarp_error_none = { + .message = NULL, + .malloced = false +}; + +const char *poliqarp_error_message_get(const struct poliqarp_error *); +void poliqarp_error_set(struct poliqarp_error *, struct poliqarp_error *); +void poliqarp_error_message_set(struct poliqarp_error *, const char *, ...); +void poliqarp_error_from_system_n(int, struct poliqarp_error *, + const char *, ...); + +#define poliqarp_error_from_system(...) \ + do { \ + int __errnum = errno; \ + poliqarp_error_from_system_n(__errnum, __VA_ARGS__); \ + } while (0) + +#endif diff --git a/poliqarp-library/sakura/expression.c b/poliqarp-library/sakura/expression.c new file mode 100644 index 0000000000000000000000000000000000000000..8aac6489a60eec3d9096afb02d3d3efadf3b18f7 --- /dev/null +++ b/poliqarp-library/sakura/expression.c @@ -0,0 +1,432 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <errno.h> + +#include <sakura/expression.h> + +struct poliqarp_expression *poliqarp_and_expression(struct poliqarp_expression *left, + struct poliqarp_expression *right) +{ + if (left->type == POLIQARP_EXPRESSION_CONSTANT) { + /* left side is constant, return either constant false or + * right expression */ + if (left->as.constant == false) { + poliqarp_expression_destroy(right); + return left; + } else { + poliqarp_expression_destroy(left); + return right; + } + } else if (right->type == POLIQARP_EXPRESSION_CONSTANT) { + /* right side is constant, return either constant false or + * left expression */ + if (right->as.constant == false) { + poliqarp_expression_destroy(left); + return right; + } else { + poliqarp_expression_destroy(right); + return left; + } + } else { + /* both are ordinary values, create extra logical node */ + struct poliqarp_expression *this = malloc(sizeof *this); + if (this == NULL) + return NULL; + this->type = POLIQARP_EXPRESSION_AND; + this->as.expression.negate = false; + this->as.expression.left = left; + this->as.expression.right = right; + if (poliqarp_expression_type(this) == POLIQARP_EXPRESSION_INVALID) { + free(this); + return NULL; + } + return this; + } +} + +struct 
poliqarp_expression *poliqarp_or_expression(struct poliqarp_expression *left,
+   struct poliqarp_expression *right)
+{ /* Build "left OR right"; a constant operand is folded away instead of allocating a node. */
+   if (left->type == POLIQARP_EXPRESSION_CONSTANT) {
+      /* left side is constant, return either constant true or
+       * right expression */
+      if (left->as.constant == true) {
+         poliqarp_expression_destroy(right);
+         return left;
+      } else {
+         poliqarp_expression_destroy(left);
+         return right;
+      }
+   } else if (right->type == POLIQARP_EXPRESSION_CONSTANT) {
+      /* right side is constant, return either constant true or
+       * left expression */
+      if (right->as.constant == true) {
+         poliqarp_expression_destroy(left);
+         return right;
+      } else {
+         poliqarp_expression_destroy(right);
+         return left;
+      }
+   } else {
+      /* both are ordinary values, create extra logical node */
+      struct poliqarp_expression *this = malloc(sizeof *this);
+      if (this == NULL)
+         return NULL; /* left and right are not destroyed on this path */
+      this->type = POLIQARP_EXPRESSION_OR;
+      this->as.expression.negate = false;
+      this->as.expression.left = left;
+      this->as.expression.right = right;
+      if (poliqarp_expression_type(this) == POLIQARP_EXPRESSION_INVALID) {
+         free(this); /* only the new node is released; operands are left alone */
+         return NULL;
+      }
+      return this;
+   }
+}
+/**
+ * Allocate a PHRASE node that stores the two given sub-expressions and the
+ * `same` / `all` flags; the node starts out non-negated.
+ * @return the new node, or NULL if malloc() fails
+ */
+struct poliqarp_expression *poliqarp_phrase_expression(struct poliqarp_expression *synh,
+   struct poliqarp_expression *semh, bool same, bool all)
+{
+   struct poliqarp_expression *this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   this->type = POLIQARP_EXPRESSION_PHRASE;
+   this->as.phrase.same = same;
+   this->as.phrase.all = all;
+   this->as.phrase.synh = synh;
+   this->as.phrase.semh = semh;
+   this->as.phrase.negate = false;
+   return this;
+}
+/**
+ * Allocate a VARIABLE node with the given id. The `children` array pointer
+ * is stored as-is (not copied); `n_children` must be greater than zero.
+ * @return the new node, or NULL if malloc() fails
+ */
+struct poliqarp_expression *poliqarp_variable_expression(size_t n_children,
+   struct poliqarp_expression **children, int id)
+{
+   struct poliqarp_expression *this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   this->type = POLIQARP_EXPRESSION_VARIABLE;
+   assert(n_children > 0);
+   this->as.variable.n_children = n_children;
+   this->as.variable.children = children;
+   this->as.variable.id = id;
+   return this;
+}
+/**
+ * Negate an expression in place and return it: the negate flag of AND/OR,
+ * VALUE and PHRASE nodes is toggled, a CONSTANT is flipped, and for a
+ * VARIABLE node every child is negated recursively.
+ */
+struct poliqarp_expression *poliqarp_not_expression(struct poliqarp_expression *this)
+{
+   switch (this->type) {
+      case POLIQARP_EXPRESSION_AND:
+      case POLIQARP_EXPRESSION_OR:
+         this->as.expression.negate ^= true;
+         break;
+      case POLIQARP_EXPRESSION_VALUE:
+         this->as.value.negate ^= true;
+         break;
+      case POLIQARP_EXPRESSION_CONSTANT:
+         this->as.constant ^= true;
+         break;
+      case POLIQARP_EXPRESSION_PHRASE:
+         this->as.phrase.negate ^= true;
+         break;
+      case POLIQARP_EXPRESSION_VARIABLE:
+         {
+            size_t i;
+            for (i = 0; i < this->as.variable.n_children; i++)
+               poliqarp_not_expression(this->as.variable.children[i]);
+         }
+         break;
+      case POLIQARP_EXPRESSION_INVALID:
+         abort(); /* should not happen */
+   }
+   return this;
+}
+
+/**
+ * Allocate a CONSTANT node holding the given boolean.
+ * @return the new node, or NULL if malloc() fails
+ */
+struct poliqarp_expression *poliqarp_expression_create_constant(bool value)
+{
+   struct poliqarp_expression *this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   this->type = POLIQARP_EXPRESSION_CONSTANT;
+   this->as.constant = value;
+   return this;
+}
+/**
+ * Allocate a non-negated VALUE node that stores an opaque `value` pointer
+ * together with the callbacks used to evaluate, compare and destroy it.
+ * @return the new node, or NULL if malloc() fails
+ */
+struct poliqarp_expression *poliqarp_expression_create_value(void *value,
+   poliqarp_value_eval_fn eval, poliqarp_value_compare_fn compare,
+   poliqarp_value_destroy_fn destroy)
+{
+   struct poliqarp_expression *this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   this->type = POLIQARP_EXPRESSION_VALUE;
+   this->as.value.value = value;
+   this->as.value.eval = eval;
+   this->as.value.compare = compare;
+   this->as.value.destroy = destroy;
+   this->as.value.negate = false;
+
+   return this;
+}
+/**
+ * Evaluate an AND node: the short-circuit conjunction of both operands,
+ * XOR-ed with the node's negate flag.
+ */
+static bool poliqarp_expression_eval__and(const struct poliqarp_expression *this,
+   const struct poliqarp_corpus *corpus,
+   const void *argument, const void *bindings)
+{
+   assert(this != NULL);
+   return this->as.expression.negate ^
+      (poliqarp_expression_eval(this->as.expression.left, corpus, argument, bindings) &&
+       poliqarp_expression_eval(this->as.expression.right, corpus, argument, bindings));
+}
+
+static bool
poliqarp_expression_eval__or(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings) +{ + assert(this != NULL); + return this->as.expression.negate ^ + (poliqarp_expression_eval(this->as.expression.left, corpus, argument, bindings) || + poliqarp_expression_eval(this->as.expression.right, corpus, argument, bindings)); +} + +static bool poliqarp_expression_eval__value(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings) +{ + assert(this != NULL); + return this->as.value.negate ^ this->as.value.eval(this->as.value.value, + corpus, argument); +} + +static bool poliqarp_expression_eval__const(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings) +{ + assert(this != NULL); + return this->as.constant; +} + +static bool poliqarp_expression_eval__variable(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings) +{ + assert(bindings != NULL); + assert(this != NULL); + const size_t *variable_bindings = bindings; + size_t nth = variable_bindings[this->as.variable.id]; + return poliqarp_expression_eval(this->as.variable.children[nth], corpus, + argument, bindings); +} + +bool poliqarp_expression_eval(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings) +{ + assert(this != NULL); + assert(corpus != NULL); + assert(argument != NULL); + switch (this->type) { + case POLIQARP_EXPRESSION_AND: + return poliqarp_expression_eval__and(this, corpus, argument, bindings); + case POLIQARP_EXPRESSION_OR: + return poliqarp_expression_eval__or(this, corpus, argument, bindings); + case POLIQARP_EXPRESSION_VALUE: + return poliqarp_expression_eval__value(this, corpus, argument, bindings); + case POLIQARP_EXPRESSION_CONSTANT: + return 
poliqarp_expression_eval__const(this, corpus, argument, bindings); + case POLIQARP_EXPRESSION_PHRASE: + return true; + case POLIQARP_EXPRESSION_VARIABLE: + return poliqarp_expression_eval__variable(this, corpus, argument, bindings); + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. */ + } + abort(); /* Should not happen. */ +} + +int poliqarp_expression_compare(const struct poliqarp_expression *this, + const struct poliqarp_expression *that) +{ + size_t i; + int res; + if (this == that) + return 0; + if (this == NULL) + return 1; + if (that == NULL) + return -1; + if (this->type != that->type) + return this->type - that->type; + + switch (this->type) { + case POLIQARP_EXPRESSION_OR: + case POLIQARP_EXPRESSION_AND: + if (this->as.expression.negate != that->as.expression.negate) + return this->as.expression.negate - that->as.expression.negate; + res = poliqarp_expression_compare(this->as.expression.left, + that->as.expression.left); + if (res) + return res; + return poliqarp_expression_compare(this->as.expression.right, + that->as.expression.right); + case POLIQARP_EXPRESSION_VALUE: + if (this->as.value.negate != that->as.value.negate) + return this->as.value.negate - that->as.value.negate; + return this->as.value.compare(this->as.value.value, + that->as.value.value); + case POLIQARP_EXPRESSION_CONSTANT: + return this->as.constant - that->as.constant; + case POLIQARP_EXPRESSION_PHRASE: + if (this->as.phrase.same != that->as.phrase.same) + return this->as.phrase.same - that->as.phrase.same; + if (this->as.phrase.all != that->as.phrase.all) + return this->as.phrase.all - that->as.phrase.all; + if (this->as.phrase.negate != that->as.phrase.negate) + return this->as.phrase.negate - that->as.phrase.negate; + res = poliqarp_expression_compare(this->as.phrase.synh, that->as.phrase.synh); + if (res) + return res; + return poliqarp_expression_compare(this->as.phrase.semh, that->as.phrase.semh); + case POLIQARP_EXPRESSION_VARIABLE: + if (this->as.variable.id 
!= that->as.variable.id) + return this->as.variable.id - that->as.variable.id; + assert(this->as.variable.n_children == that->as.variable.n_children); + for (i = 0; i < this->as.variable.n_children; i++) + { + res = poliqarp_expression_compare( + this->as.variable.children[i], + that->as.variable.children[i]); + if (res) + return res; + } + return 0; + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. */ + } + abort(); /* Should not happen. */ +} + +void poliqarp_expression_destroy(struct poliqarp_expression *this) +{ + size_t i; + if (this == NULL) + return; + switch (this->type) { + case POLIQARP_EXPRESSION_AND: + case POLIQARP_EXPRESSION_OR: + poliqarp_expression_destroy(this->as.expression.left); + poliqarp_expression_destroy(this->as.expression.right); + break; + case POLIQARP_EXPRESSION_VALUE: + this->as.value.destroy(this->as.value.value); + break; + case POLIQARP_EXPRESSION_CONSTANT: + break; + case POLIQARP_EXPRESSION_PHRASE: + if (this->as.phrase.synh != NULL) + poliqarp_expression_destroy(this->as.phrase.synh); + if (this->as.phrase.semh != NULL && !this->as.phrase.same) + poliqarp_expression_destroy(this->as.phrase.semh); + break; + case POLIQARP_EXPRESSION_VARIABLE: + for (i = 0; i < this->as.variable.n_children; i++) + poliqarp_expression_destroy(this->as.variable.children[i]); + free(this->as.variable.children); + break; + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. 
*/ + } + free(this); +} + +size_t poliqarp_expression_variable_ranges( + const struct poliqarp_expression *this, size_t *ranges) +{ + assert(this != NULL); + switch (this->type) { + case POLIQARP_EXPRESSION_CONSTANT: + case POLIQARP_EXPRESSION_PHRASE: + case POLIQARP_EXPRESSION_VALUE: + return 0; + case POLIQARP_EXPRESSION_OR: + case POLIQARP_EXPRESSION_AND: + { + size_t i; + size_t j; + i = poliqarp_expression_variable_ranges(this->as.expression.left, ranges); + j = poliqarp_expression_variable_ranges(this->as.expression.right, ranges); + return (i > j) ? i : j; + } + case POLIQARP_EXPRESSION_VARIABLE: + ranges[this->as.variable.id] = this->as.variable.n_children; + return this->as.variable.id; + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. */ + } + abort(); /* Should not happen. */ +} + +enum poliqarp_expression_type poliqarp_expression_type( + const struct poliqarp_expression *this) +{ + assert(this != NULL); + switch (this->type) { + case POLIQARP_EXPRESSION_CONSTANT: + case POLIQARP_EXPRESSION_PHRASE: + return this->type; + case POLIQARP_EXPRESSION_VALUE: + /* this is ugly (relies on poliqarp_value_compare not being inline function). 
+ * FIXME: Change at some time */ + if (this->as.value.compare == (poliqarp_value_compare_fn)poliqarp_value_compare && + ((struct poliqarp_value *)this->as.value.value)->domain == + POLIQARP_DOMAIN_TYPE) + { + return POLIQARP_EXPRESSION_PHRASE; + } else + return POLIQARP_EXPRESSION_VALUE; + case POLIQARP_EXPRESSION_OR: + case POLIQARP_EXPRESSION_AND: + { + enum poliqarp_expression_type tleft, tright; + tleft = poliqarp_expression_type(this->as.expression.left); + tright = poliqarp_expression_type(this->as.expression.right); + if (tleft == POLIQARP_EXPRESSION_CONSTANT) + return tright; + else if (tright == POLIQARP_EXPRESSION_CONSTANT) + return tleft; + else if (tleft == tright) + return tleft; + else { + errno = EINVAL; + return POLIQARP_EXPRESSION_INVALID; + } + } + case POLIQARP_EXPRESSION_VARIABLE: + return poliqarp_expression_type(this->as.variable.children[0]); + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. */ + } + abort(); /* Should not happen. */ +} diff --git a/poliqarp-library/sakura/expression.h b/poliqarp-library/sakura/expression.h new file mode 100644 index 0000000000000000000000000000000000000000..6b1c120356f167aa6a455011e59aaa5213d0d658 --- /dev/null +++ b/poliqarp-library/sakura/expression.h @@ -0,0 +1,153 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_EXPRESSION_H +#define POLIQARP_EXPRESSION_H + +#include <sakura/corpus.h> +#include <sakura/value.h> + +/** Expression node type. */ +enum poliqarp_expression_type { + POLIQARP_EXPRESSION_VALUE, /**< Expression contains some (calulatable) + logic value. */ + POLIQARP_EXPRESSION_AND, /**< Logical AND expression. */ + POLIQARP_EXPRESSION_OR, /**< Logical OR expression. */ + POLIQARP_EXPRESSION_CONSTANT, /**< Expression contains some constant + logic value. */ + POLIQARP_EXPRESSION_PHRASE, /**< Expression contains a phrase + heads description. */ + POLIQARP_EXPRESSION_VARIABLE, /**< Expression contains a variable */ + POLIQARP_EXPRESSION_INVALID = -1 /**< It is not possible to determine + expression type */ +}; + +/** + * Evaluation function type. + */ +typedef bool (*poliqarp_value_eval_fn)(void *this, + const struct poliqarp_corpus *corpus, const void *argument); + +/** + * Comparator function type. + */ +typedef int (*poliqarp_value_compare_fn)(const void *this, const void *that); + +/** + * Destructor function type. + */ +typedef void (*poliqarp_value_destroy_fn)(void *this); + +/** Expression node structure. */ +struct poliqarp_expression { + enum poliqarp_expression_type type; /**< Type of the expression. */ + union { + struct { + bool negate; /**< Whether to negate calculated + value. */ + void *value; /**< Value pointer. */ + poliqarp_value_eval_fn eval; /**< Evaluator function. */ + poliqarp_value_compare_fn compare; /**< Comparator function. */ + poliqarp_value_destroy_fn destroy; /**< Destructor function. 
*/ + } value; /**< Value stuff. */ + struct { + bool negate; /**< Whether to negate calculated + value. */ + struct poliqarp_expression *left; /**< Left hand expression. */ + struct poliqarp_expression *right; /**< Right hand expression. */ + } expression; /**< Expression stuff. */ + struct { + bool same; /**< Whether syntactic and semantic + heads must be the same + segment. */ + bool all; /**< Whether all members of a + coordination phrase should + match. */ + bool negate; /**< Whether to negate calculated + value. */ + struct poliqarp_expression *synh; /**< Syntactic head. */ + struct poliqarp_expression *semh; /**< Semantic head. */ + } phrase; + struct { + const void* type; + struct poliqarp_expression **children; + size_t n_children; + int id; + } variable; /**< Variable wrapper. */ + bool constant; /**< Constant value. */ + } as; /**< Data union. */ +}; + +#define POLIQARP_MAX_VARIABLES 16 + +/** Create logical 'and' expression. */ +struct poliqarp_expression *poliqarp_and_expression(struct poliqarp_expression *left, + struct poliqarp_expression *right); + +/** Create logical 'or' expression. */ +struct poliqarp_expression *poliqarp_or_expression(struct poliqarp_expression *left, + struct poliqarp_expression *right); + +/** Create logical 'not' expression. */ +struct poliqarp_expression *poliqarp_not_expression(struct poliqarp_expression *this); + +/** Create a phrasal expression with given head specifications. */ +struct poliqarp_expression *poliqarp_phrase_expression(struct poliqarp_expression *synh, + struct poliqarp_expression *semh, bool same, bool all); + +/** Create an expression with a variable. */ +struct poliqarp_expression *poliqarp_variable_expression(size_t n_children, + struct poliqarp_expression **children, int id); + +/** Create constant expression. */ +struct poliqarp_expression *poliqarp_expression_create_constant(bool value); + +/** Create value expression. 
*/ +struct poliqarp_expression *poliqarp_expression_create_value(void *value, + poliqarp_value_eval_fn eval, poliqarp_value_compare_fn compare, + poliqarp_value_destroy_fn destroy); + +/** Evaluate expression. */ +bool poliqarp_expression_eval(const struct poliqarp_expression *this, + const struct poliqarp_corpus *corpus, + const void *argument, const void *bindings); + +/** Compare two expressions. */ +int poliqarp_expression_compare(const struct poliqarp_expression *this, + const struct poliqarp_expression *that); + +/** Free expression. */ +void poliqarp_expression_destroy(struct poliqarp_expression *this); + +/** Checks the type of an expression. Return POLIQARP_EXPRESSION_INVALID if the + * types don't match. */ +enum poliqarp_expression_type poliqarp_expression_type( + const struct poliqarp_expression *this); + +/** + * FIXME: write a proper comment for this function + */ +size_t poliqarp_expression_variable_ranges( + const struct poliqarp_expression *this, size_t *ranges); + +#endif diff --git a/poliqarp-library/sakura/lexer.h b/poliqarp-library/sakura/lexer.h new file mode 100644 index 0000000000000000000000000000000000000000..15e68f96e9b9aa8e2ac037d3ab1d6b15f379bd3f --- /dev/null +++ b/poliqarp-library/sakura/lexer.h @@ -0,0 +1,35 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_LEXER_H +#define POLIQARP_LEXER_H + +typedef void* yyscan_t; + +int yy_scan_string(const char *string, yyscan_t scanner); + +int yylex_init(yyscan_t *scanner); + +int yylex_destroy(yyscan_t scanner); + +#endif diff --git a/poliqarp-library/sakura/lexer.y b/poliqarp-library/sakura/lexer.y new file mode 100644 index 0000000000000000000000000000000000000000..15dbfc8a5691ef35cafa470331a2f5d7b91fa296 --- /dev/null +++ b/poliqarp-library/sakura/lexer.y @@ -0,0 +1,361 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +%{ + +#include <poliqarp-config.h> + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#include <sakura/corpus.h> +#include <sakura/expression.h> +#include <sakura/value.h> +#include <sakura/meta-value.h> +#include <sakura/common/graph.h> +#include <sakura/parser.h> +#include <foostring/foostring.h> +#include <unibits/unibits.h> + +static int string_to_int(const char *s) +{ + long result; + char *end_ptr; + errno = 0; + result = strtol(s, &end_ptr, 10); + if (result < INT_MIN) + result = INT_MIN; + if (result > INT_MAX || *end_ptr != '\0') + result = INT_MAX; + return result; +} + +%} + +%x META_MODE +%x STAT_MODE +%x STAT_SORT_MODE +%x STAT_INTERP_MODE +%x DQUOTE +%x SQUOTE +%option noyywrap +%option reentrant bison-bridge +%option 8bit +%option nounput +%option stack + +DIGIT [[:digit:]] +SPACE [[:space:]]+ + +/* Note: flex (not to mention the other lexes) is brain damaged. It doesn't + * use the regex library (or regex facilities of libc), thus the regular + * expression below does not support multibyte characters. Because we expect + * UTF-8 input, and because the characters 128-255 can only be parts of + * multibyte characters above U+0080, we explicitly allow them all to be + * parts of identifiers. 
*/ +ID [:_.a-zA-Z0-9\x80-\xff-] +ID_EXT [*?+] + +%% + + string_t string_buf; + string_buf = string_create(); /* just to shut up compilers */ + string_free(string_buf); + +<INITIAL>(?i:meta) { + BEGIN(META_MODE); + return META; +} + +<INITIAL,META_MODE>(?i:within) { + return WITHIN; +} + +<INITIAL,META_MODE>(?i:group{SPACE}by) { + BEGIN(STAT_MODE); + return GROUP_BY; +} + +<INITIAL>(?i:head) { + return HEAD; +} + +<INITIAL>(?i:synh) { + return SYNH; +} + +<INITIAL>(?i:semh) { + return SEMH; +} + +<STAT_MODE>(?i:interp) { + /* (?i:...) makes the group case-insensitive, as in the sibling keyword rules. */ + BEGIN(STAT_INTERP_MODE); + return STAT_INTERP; +} + +<STAT_INTERP_MODE>(?i:random) { + BEGIN(STAT_MODE); + return STAT_INTERP_RANDOM; +} + +<STAT_INTERP_MODE>(?i:combine) { + BEGIN(STAT_MODE); + return STAT_INTERP_COMBINE; +} + +<STAT_MODE>(?i:sort{SPACE}a{SPACE}fronte) { + return STAT_SORT_AFRONTE; +} + +<STAT_MODE>(?i:sort{SPACE}by) { + BEGIN(STAT_SORT_MODE); + return STAT_SORT_BY; +} + +<STAT_SORT_MODE>(?i:freq) { + BEGIN(STAT_MODE); + return STAT_SORT_BY_FREQ; +} + +<STAT_SORT_MODE>(?i:cp) { + BEGIN(STAT_MODE); + return STAT_SORT_BY_CP; +} + +<STAT_SORT_MODE>(?i:scp) { + BEGIN(STAT_MODE); + return STAT_SORT_BY_SCP; +} + +<STAT_SORT_MODE>(?i:maxcp) { + BEGIN(STAT_MODE); + return STAT_SORT_BY_MAXCP; +} + +<STAT_SORT_MODE>(?i:dice) { + BEGIN(STAT_MODE); + return STAT_SORT_BY_DICE; +} + +<STAT_MODE>(?i:bias) { + return STAT_BIAS; +} + +<STAT_MODE>(?i:min) { + return STAT_MIN; +} + +<STAT_MODE>(?i:count) { + return STAT_COUNT; +} + +<META_MODE>"<" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_SMALLER; + yylval->as_mop.negate = false; + return MOP; +} + +<META_MODE>"<=" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_SMALLER_EQUAL; + yylval->as_mop.negate = false; + return MOP; +} + +<META_MODE>">" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_GREATER; + yylval->as_mop.negate = false; + return MOP; +} + +<META_MODE>">=" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_GREATER_EQUAL; + yylval->as_mop.negate = false; + return MOP; +} +
+<META_MODE>"=" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_EQUAL; + yylval->as_mop.negate = false; + return MOP; +} + +<META_MODE>"!=" { + yylval->as_mop.strategy = POLIQARP_STRATEGY_EQUAL; + yylval->as_mop.negate = true; + return MOP; +} + +<INITIAL>[$]{DIGIT}+ { + yylval->as_int = string_to_int(yytext + 1); + return VARIABLE; +} + +<INITIAL,STAT_MODE>{DIGIT}+ { + yylval->as_int = string_to_int(yytext); + return INTEGER; +} + +<STAT_MODE>[+-]{DIGIT}+ { + yylval->as_int = string_to_int(yytext); + return SIGNED_INTEGER; +} + +<STAT_MODE>[+-]?{DIGIT}+[.]{DIGIT}+ { + char *end_ptr; + errno = 0; + yylval->as_double = strtod(yytext, &end_ptr); + if (*end_ptr != '\0' || errno == ERANGE) + yylval->as_double = NAN; + return REAL_NUMBER; +} + +<INITIAL,META_MODE,STAT_MODE>{ID}+({ID}|{ID_EXT})* { + yylval->as_text = strdup(yytext); + return IDENT; +} + +<INITIAL>~ { + yylval->as_op.strategy = POLIQARP_STRATEGY_ANY; + yylval->as_op.use_disamb = false; + yylval->as_op.negate = false; + return OP; +} + +<INITIAL>!~ { + yylval->as_op.strategy = POLIQARP_STRATEGY_ANY; + yylval->as_op.use_disamb = false; + yylval->as_op.negate = true; + return OP; +} + +<INITIAL>~~ { + yylval->as_op.strategy = POLIQARP_STRATEGY_ALL; + yylval->as_op.use_disamb = false; + yylval->as_op.negate = false; + return OP; +} + +<INITIAL>!~~ { + yylval->as_op.strategy = POLIQARP_STRATEGY_ALL; + yylval->as_op.use_disamb = false; + yylval->as_op.negate = true; + return OP; +} + +<INITIAL>= { + yylval->as_op.strategy = POLIQARP_STRATEGY_ANY; + yylval->as_op.use_disamb = true; + yylval->as_op.negate = false; + return OP; +} + +<INITIAL>!= { + yylval->as_op.strategy = POLIQARP_STRATEGY_ANY; + yylval->as_op.use_disamb = true; + yylval->as_op.negate = true; + return OP; +} + +<INITIAL>== { + yylval->as_op.strategy = POLIQARP_STRATEGY_ALL; + yylval->as_op.use_disamb = true; + yylval->as_op.negate = false; + return OP; +} + +<INITIAL>!== { + yylval->as_op.strategy = POLIQARP_STRATEGY_ALL; + 
yylval->as_op.use_disamb = true; + yylval->as_op.negate = true; + return OP; +} + +<INITIAL,META_MODE,STAT_MODE>"\'" { + yy_push_state(SQUOTE, yyscanner); + string_buf = string_create(); +} + +<INITIAL,META_MODE,STAT_MODE>"\"" { + yy_push_state(DQUOTE, yyscanner); + string_buf = string_create(); +} + +<DQUOTE>\" | +<SQUOTE>\' { + yy_pop_state(yyscanner); + yylval->as_text = string_free_and_get_buffer(string_buf); + return STRING; +} + +<SQUOTE,DQUOTE>{ + "\\n" string_append_char(string_buf, '\n'); + "\\r" string_append_char(string_buf, '\r'); + "\\t" string_append_char(string_buf, '\t'); + "\\v" string_append_char(string_buf, '\v'); + "\\b" string_append_char(string_buf, '\b'); + "\\f" string_append_char(string_buf, '\f'); + "\\\\" string_append_char(string_buf, '\\'); + "\\\"" string_append_char(string_buf, '\"'); + "\\\'" string_append_char(string_buf, '\''); + "\\\n" string_append_char(string_buf, yytext[1]); + \\(x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U00[0-9a-fA-F]{6}) { + unsigned long code = strtoul(yytext + 2, NULL, 16); + if (code == 0 || code > 0x10ffff) + code = 0xfffd; + Tcl_UniChar ch = code; /* no overflow expected */ + Tcl_DString dstring; + Tcl_DStringInit(&dstring); + char * string = Tcl_UniCharToUtfDString(&ch, 1, &dstring); + string_append_str(string_buf, string); + Tcl_DStringFree(&dstring); + } + \\. string_append_strn(string_buf, yytext, yyleng); +} + +<DQUOTE>[^\\\n\"]+ | +<SQUOTE>[^\\\n\']+ { + string_append_strn(string_buf, yytext, yyleng); +} + +<SQUOTE,DQUOTE>{ + \n | + <<EOF>> { + yy_pop_state(yyscanner); + yylval->as_text = string_free_and_get_buffer(string_buf); + return STRING_OPEN; + } +} + +<INITIAL,META_MODE,STAT_MODE>{SPACE} + +<<EOF>> { + BEGIN(INITIAL); + yyterminate(); +} + +<INITIAL,META_MODE,STAT_MODE>. 
{ + return *yytext; +} + +%% diff --git a/poliqarp-library/sakura/meta-value.c b/poliqarp-library/sakura/meta-value.c new file mode 100644 index 0000000000000000000000000000000000000000..14c48df6ae70783dfca8cfe653b7fb7539ff0191 --- /dev/null +++ b/poliqarp-library/sakura/meta-value.c @@ -0,0 +1,258 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <errno.h> + +#include <sakura/meta-value.h> + +static size_t meta_key_lookup(const struct poliqarp_corpus *corpus, + const char *key) +{ + const struct poliqarp_backend_meta *backend_meta = + poliqarp_get_const_backend(corpus, meta); + size_t num_keys = poliqarp_backend_meta_num_keys(backend_meta); + size_t i; + + for (i = 0; i < num_keys; ++i) + if (strcmp(key, poliqarp_backend_meta_key_fetch(backend_meta, i)) == 0) + break; + if (i == num_keys) { + errno = ENOENT; + return (size_t)-1; + } + + return i; +} + +struct poliqarp_meta_value *poliqarp_meta_value_create_date( + const struct poliqarp_corpus *corpus, const char *key_text, + struct poliqarp_meta_date date, enum poliqarp_meta_match_strategy strategy) +{ + struct poliqarp_meta_value *this; + size_t key; + + key = meta_key_lookup(corpus, key_text); + if (key == (size_t)-1) + return NULL; + + this = malloc(sizeof *this); + this->domain = POLIQARP_DOMAIN_DATE; + this->strategy = strategy; + this->key = key; + this->value_as.date = date; + + return this; +} + +struct poliqarp_meta_value *poliqarp_meta_value_create_undefined( + const struct poliqarp_corpus *corpus, const char *key_text) +{ + struct poliqarp_meta_value *this; + size_t key; + + key = meta_key_lookup(corpus, key_text); + if (key == (size_t)-1) + return NULL; + + this = malloc(sizeof *this); + this->domain = POLIQARP_DOMAIN_UNDEFINED; + this->strategy = POLIQARP_STRATEGY_EQUAL; + this->key = key; + + return this; +} + +struct poliqarp_meta_value *poliqarp_meta_value_create_text( + const struct poliqarp_corpus *corpus, const char *key_text, + const struct poliqarp_regexp *pattern) +{ + struct poliqarp_meta_value *this; + size_t key; + + key = meta_key_lookup(corpus, key_text); + if (key == (size_t)-1) + return NULL; + + this = malloc(sizeof *this); + this->domain = POLIQARP_DOMAIN_TEXT; + this->strategy = POLIQARP_STRATEGY_EQUAL; + this->key = key; + this->value_as.value_pattern = *pattern; + + return this; +} + +void 
poliqarp_meta_value_destroy(struct poliqarp_meta_value *this) +{ + switch (this->domain) { + case POLIQARP_DOMAIN_DATE: + break; + case POLIQARP_DOMAIN_TEXT: + poliqarp_regexp_destroy(&this->value_as.value_pattern); + break; + case POLIQARP_DOMAIN_UNDEFINED: + break; + } + free(this); +} + +bool poliqarp_meta_value_can_be_optimized(const struct poliqarp_meta_value *this, + const struct poliqarp_meta_value *that, enum poliqarp_logic_operator oper) +{ + /* TODO */ + return false; +} + +struct poliqarp_value *poliqarp_meta_value_optimize( + const struct poliqarp_meta_value *this, const struct poliqarp_meta_value *that, + enum poliqarp_logic_operator oper) +{ + abort(); /* See the TODO above. */ +} + +int poliqarp_meta_value_compare(const struct poliqarp_meta_value *this, + const struct poliqarp_meta_value *that) +{ + return this == that ? 0 : 1; +} + +static int compare_date(struct poliqarp_meta_date d1, struct poliqarp_meta_date d2) +{ + return d1.year != d2.year ? d1.year - d2.year : + d1.month != d2.month ? 
d1.month - d2.month : + d1.day - d2.day; +} + +bool poliqarp_meta_value_eval(const struct poliqarp_meta_value *this, + const struct poliqarp_corpus *corpus, const struct poliqarp_document *document) +{ + const struct poliqarp_backend_meta *backend_meta = + poliqarp_get_const_backend(corpus, meta); + struct poliqarp_binary_metadata meta; + int i; + + switch (this->domain) { + case POLIQARP_DOMAIN_DATE: + for (i = document->meta_high - document->meta_low; i > 0; --i) { + meta = poliqarp_backend_meta_fetch(backend_meta, document->meta_high - i); + if (meta.type == POLIQARP_METADATA_DATE && meta.key == this->key) + switch (this->strategy) { + case POLIQARP_STRATEGY_EQUAL: + return compare_date(meta.value_as.date, + this->value_as.date) == 0; + case POLIQARP_STRATEGY_GREATER: + return compare_date(meta.value_as.date, + this->value_as.date) > 0; + case POLIQARP_STRATEGY_GREATER_EQUAL: + return compare_date(meta.value_as.date, + this->value_as.date) >= 0; + case POLIQARP_STRATEGY_SMALLER: + return compare_date(meta.value_as.date, + this->value_as.date) < 0; + case POLIQARP_STRATEGY_SMALLER_EQUAL: + return compare_date(meta.value_as.date, + this->value_as.date) <= 0; + default: + abort(); /* Should not happen. 
*/ + } + } + return false; + + case POLIQARP_DOMAIN_TEXT: + for (i = document->meta_high - document->meta_low; i > 0; --i) { + meta = poliqarp_backend_meta_fetch(backend_meta, document->meta_high - i); + if ((meta.type == POLIQARP_METADATA_SINGLE || + meta.type == POLIQARP_METADATA_MULTI) && meta.key == this->key) + { + if (poliqarp_regexp_match(&this->value_as.value_pattern, + poliqarp_backend_meta_value_fetch(backend_meta, + meta.value_as.text))) + { + return true; + } + } + } + return false; + + case POLIQARP_DOMAIN_UNDEFINED: + for (i = document->meta_high - document->meta_low; i > 0; --i) { + meta = poliqarp_backend_meta_fetch(backend_meta, document->meta_high - i); + if (meta.type == POLIQARP_METADATA_UNDEFINED && meta.key == this->key) + return true; + } + return false; + + default: + abort(); /* Should not happen. */ + } /* switch(this->domain) */ +} + +void poliqarp_meta_value_modify(struct poliqarp_meta_value *this, + enum poliqarp_meta_match_strategy strategy) +{ + this->strategy = strategy; +} + +/** + * Parses the text of the form 'yearSEPmonthSEPday' to produce a + * metadata date value. SEP can be any non-digit character. + * If day or month and day are omitted, the result has 0 on the respective + * field. + * + * Returns 0 if the expression was successfully formed, -1 if a parse error + * occurred. 
+ */ +int poliqarp_meta_date_parse(const char *text, struct poliqarp_meta_date *date) +{ + char *ptr; + unsigned long val; + + date->year = date->month = date->day = 0; + + if (text[0] == '\0') + return -1; + val = strtoul(text, &ptr, 10); + date->year = val; + if (date->year != val || val == 0) + return -1; /* integer overflow or 0 */ + if (ptr[0] == '\0') + return 0; + if (ptr[1] == '\0') + return -1; /* no digits after separator */ + val = strtoul(ptr + 1, &ptr, 10); + date->month = val; + if (date->month != val || val == 0 || val > 12) + return -1; /* integer overflow or invalid month */ + if (ptr[0] == '\0') + return 0; + if (ptr[1] == '\0') + return -1; /* no digits after separator */ + val = strtoul(ptr + 1, &ptr, 10); + date->day = val; + if (date->day != val || val == 0 || val > 31) + return -1; /* integer overflow or invalid day */ + if (ptr[0] == '\0') + return 0; + else + return -1; /* trailing garbage */ +} diff --git a/poliqarp-library/sakura/meta-value.h b/poliqarp-library/sakura/meta-value.h new file mode 100644 index 0000000000000000000000000000000000000000..bbb90c0b8fefe1a19970817456e9a17b77de7fd8 --- /dev/null +++ b/poliqarp-library/sakura/meta-value.h @@ -0,0 +1,103 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_META_VALUE_H +#define POLIQARP_META_VALUE_H + +#include <stdlib.h> +#include <sakura/corpus.h> +#include <sakura/abi.h> +#include <sakura/value.h> +#include <sakura/regexp.h> +#include <sakura/date-span.h> + +/** @todo */ +enum poliqarp_meta_match_domain { + POLIQARP_DOMAIN_DATE, + POLIQARP_DOMAIN_TEXT, + POLIQARP_DOMAIN_UNDEFINED +}; + +/** @todo */ +enum poliqarp_meta_match_strategy { + POLIQARP_STRATEGY_EQUAL, + POLIQARP_STRATEGY_GREATER, + POLIQARP_STRATEGY_GREATER_EQUAL, + POLIQARP_STRATEGY_SMALLER, + POLIQARP_STRATEGY_SMALLER_EQUAL +}; + +/** @todo */ +struct poliqarp_meta_value { + enum poliqarp_meta_match_domain domain; /**< operator domain. */ + enum poliqarp_meta_match_strategy strategy; /**< operator match strategy. 
*/ + size_t key; + struct { + struct poliqarp_regexp value_pattern; + struct poliqarp_meta_date date; + struct poliqarp_date_span date_span; + } value_as; +}; + +/** @todo */ +struct poliqarp_meta_value *poliqarp_meta_value_create_date( + const struct poliqarp_corpus *corpus, const char *key_text, + struct poliqarp_meta_date date, enum poliqarp_meta_match_strategy strategy); + +/** @todo */ +struct poliqarp_meta_value *poliqarp_meta_value_create_undefined( + const struct poliqarp_corpus *corpus, const char *key_text); + +/** @todo */ +struct poliqarp_meta_value *poliqarp_meta_value_create_text( + const struct poliqarp_corpus *corpus, const char *key_text, + const struct poliqarp_regexp *pattern); + +/** @todo */ +void poliqarp_meta_value_destroy(struct poliqarp_meta_value *this); + +/** @todo */ +bool poliqarp_meta_value_can_be_optimized(const struct poliqarp_meta_value *this, + const struct poliqarp_meta_value *that, enum poliqarp_logic_operator oper); + +/** @todo */ +struct poliqarp_value *poliqarp_meta_value_optimize( + const struct poliqarp_meta_value *this, const struct poliqarp_meta_value *that, + enum poliqarp_logic_operator oper); + +/** @todo */ +int poliqarp_meta_value_compare(const struct poliqarp_meta_value *this, + const struct poliqarp_meta_value *that); + +/** @todo */ +bool poliqarp_meta_value_eval(const struct poliqarp_meta_value *this, + const struct poliqarp_corpus *corpus, const struct poliqarp_document *document); + +/** @todo */ +void poliqarp_meta_value_modify(struct poliqarp_meta_value *this, + enum poliqarp_meta_match_strategy strategy); + +int poliqarp_meta_date_parse(const char *text, struct poliqarp_meta_date *date); + + +#endif diff --git a/poliqarp-library/sakura/parser.y b/poliqarp-library/sakura/parser.y new file mode 100644 index 0000000000000000000000000000000000000000..fcc68d86ade2afd1ebdc52c1fa4321dacacf1ebb --- /dev/null +++ b/poliqarp-library/sakura/parser.y @@ -0,0 +1,1024 @@ +/* + * This file is part of the Poliqarp suite. 
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+%{
+#include <poliqarp-config.h>
+
+#include <errno.h>
+#include <math.h>
+
+#include <sakura/lexer.h>
+#include <sakura/corpus.h>
+#include <sakura/query.h>
+#include <sakura/expression.h>
+#include <sakura/value.h>
+#include <sakura/value-orth.h>
+#include <sakura/value-space.h>
+#include <sakura/value-base.h>
+#include <sakura/value-tag.h>
+#include <sakura/value-pos.h>
+#include <sakura/value-attr.h>
+#include <sakura/value-interp.h>
+#include <sakura/value-type.h>
+#include <sakura/meta-value.h>
+#include <sakura/common/graph.h>
+#include <sakura/common/hash-table.h>
+
+/*
+ * Called by yyparse on error: stores the translated message in the query's
+ * error object.
+ *
+ * poliqarp_error_message_set() is used with printf-style arguments
+ * elsewhere in this grammar, so the message is passed through "%s" and is
+ * never interpreted as a format string itself.
+ */
+void yyerror(yyscan_t scanner, struct poliqarp_query *query, const char *message)
+{
+   poliqarp_error_message_set(query->error, "%s", _(message));
+}
+
+int yylex(void *, void *);
+
+/*
+ * Replacement for Bison's yytnamerr: copies the translated token name into
+ * yyres (when yyres is not NULL) and returns the translation's length.
+ */
+static size_t yytnamerr(char *yyres, const char *yystr)
+{
+   const char *translation = _(yystr);
+   size_t length = strlen(translation);
+   if (yyres != NULL)
+      strcpy(yyres, translation);
+   return length;
+}
+#if 0
+   M_("$undefined");
+   M_("$end");
+#endif
+#define yytnamerr yytnamerr
+
+#define YY_ _
+
+/*
+ * Wraps `value` with poliqarp_value_interp_create() and destroys the
+ * original `value`.  Returns whatever the wrapper constructor returned
+ * (callers in this grammar treat NULL as failure).
+ */
+struct poliqarp_value *poliqarp_to_interp_value(const struct poliqarp_corpus *corpus,
+   struct poliqarp_value *value)
+{
+   struct poliqarp_value *result;
+   result = poliqarp_value_interp_create(corpus, value);
+   poliqarp_value_destroy(value);
+   return result;
+}
+
+/*
+ * Returns true when `expression` is non-NULL.  Otherwise fills `error`
+ * (a dedicated message for errno == EINVAL, the system error text for
+ * anything else) and returns false.
+ */
+static bool validate_expression(void *expression, struct poliqarp_error *error)
+{
+   if (expression == NULL) {
+      if (errno == EINVAL)
+         poliqarp_error_message_set(error, _("Phrasal and segment constraints cannot be combined"));
+      else
+         poliqarp_error_from_system(error, NULL);
+      return false;
+   }
+   return true;
+}
+
+/*
+ * Returns true when the corpus has syntax data (corpus->syntax.syntax);
+ * otherwise sets an error message and returns false.
+ */
+static bool require_syntax(struct poliqarp_corpus *corpus, struct poliqarp_error *error)
+{
+   if (corpus->syntax.syntax)
+      return true;
+   else {
+      poliqarp_error_message_set(error, _("Phrasal queries are not allowed for this corpus"));
+      return false;
+   }
+}
+
+%}
+
+%pure_parser
+%lex-param { yyscan_t yyscanner }
+%parse-param { void *yyscanner }
+%parse-param { struct poliqarp_query *query }
+%expect 30
+%error-verbose
+
+%union {
+   char *as_text;
+   struct {
+      enum poliqarp_match_strategy strategy;
+      bool use_disamb;
+      bool negate;
+   } as_op;
+   struct {
+      enum poliqarp_meta_match_strategy strategy;
+      bool negate;
+   } as_mop;
+   struct {
+      unsigned int flags;
+      unsigned int xflags;
+      char *text;
+   } as_flagged_regexp;
+   int as_int;
+   double as_double;
+   struct nfs_graph as_query_part;
+   struct poliqarp_expression *as_expression;
+   struct poliqarp_query *as_query;
+   struct poliqarp_within *as_within;
+   struct {
+      int min;
+      int max;
+   } as_quantifier;
+}
+
+%token <as_text> IDENT
+%token <as_text> STRING
+%token <as_text> STRING_OPEN
+%token <as_int> VARIABLE
+%token <as_int> INTEGER
+%token <as_int> SIGNED_INTEGER
+%token <as_double> REAL_NUMBER
+%token <as_op> OP
+%token <as_mop> MOP
+%token <void> META
+%token <void> WITHIN
+%token <void> HEAD
+%token <void> SYNH
+%token <void> SEMH
+%token <void> GROUP_BY
+%token <void>
STAT_INTERP
+%token <void> STAT_INTERP_RANDOM
+%token <void> STAT_INTERP_COMBINE
+%token <void> STAT_SORT_AFRONTE
+%token <void> STAT_SORT_BY
+%token <void> STAT_SORT_BY_FREQ
+%token <void> STAT_SORT_BY_CP
+%token <void> STAT_SORT_BY_SCP
+%token <void> STAT_SORT_BY_MAXCP
+%token <void> STAT_SORT_BY_DICE
+%token <void> STAT_BIAS
+%token <void> STAT_MIN
+%token <void> STAT_COUNT
+
+%left '|'
+%left '&'
+%left '!'
+%left CONCAT
+%nonassoc OPERATOR
+
+%type <as_text> expanded_alias
+%type <as_text> ident_string
+%type <as_int> signed_integer
+%type <as_int> unsigned_integer
+%type <as_double> real_number
+%type <as_flagged_regexp> flagged_regexp
+%type <as_flagged_regexp> flagged_regexp_m
+%type <as_expression> simple_exp
+%type <as_expression> exp
+%type <as_expression> exp_m
+%type <as_expression> exp_phrase
+%type <as_query_part> query_part
+%type <as_query> query
+%type <as_quantifier> quantifier
+%type <as_expression> meta
+%type <as_within> within
+
+%destructor
+{
+   free($$);
+} IDENT STRING expanded_alias ident_string
+
+%destructor
+{
+   free($$.text);
+} flagged_regexp flagged_regexp_m
+
+%destructor
+{
+   poliqarp_expression_destroy($$);
+} simple_exp exp exp_m exp_phrase meta
+
+%destructor
+{
+   free($$);
+} within
+
+%%
+
+/* Top-level query: segment part, optional "within", optional metadata part,
+ * optional (unsupported) statistical part. */
+query:
+   query_part within meta group_by
+      {
+         if (query->rewrite_in_progress)
+         {
+            /* nested parse of a rewritten query: only hand the graph back */
+            query->rewrite_graph = $1;
+         } else {
+            poliqarp_query_set_within(query, $2);
+            poliqarp_query_set_meta_expression(query, $3);
+            graph_nfs_to_dfs(&query->graph, $1);
+         }
+      }
+;
+
+meta:
+      { $$ = NULL; }
+   | META exp_m
+      {
+         if (query->rewrite_in_progress) {
+            poliqarp_error_message_set(query->error, _("Metadata part is not allowed in a rewritten query"));
+            poliqarp_expression_destroy($2);
+            YYABORT;
+         }
+         $$ = $2;
+      }
+;
+
+within:
+      { $$ = NULL; }
+   | WITHIN expanded_alias
+      {
+         if (query->rewrite_in_progress) {
+            poliqarp_error_message_set(query->error, _("Within part is not allowed in a rewritten query"));
+            free($2);
+            YYABORT;
+         }
+         struct poliqarp_subdocument_set *set =
+            poliqarp_backend_subdocument_lookup_set(&query->corpus->subdocument, $2);
+         if (set == NULL) {
+            poliqarp_error_message_set(query->error, _("There is no such structure %s"), $2);
+            free($2);
+            YYABORT;
+         }
+         /* The name was only needed for the lookup.  Bison does not run
+          * %destructor for right-hand-side symbols of a completed action,
+          * so release it here to avoid leaking it on the success path. */
+         free($2);
+         $$ = poliqarp_within_create_subdocument(set);
+      }
+   | WITHIN '[' exp ']'
+      {
+         if (poliqarp_expression_type($3) != POLIQARP_EXPRESSION_PHRASE) {
+            poliqarp_error_message_set(query->error, _("Within expression does not denote a phrase"));
+            poliqarp_expression_destroy($3);
+            YYABORT;
+         }
+         $$ = poliqarp_within_create_phrase($3);
+      }
+;
+
+/* The statistical ("group by") syntax is parsed but always rejected. */
+group_by:
+      { }
+   | GROUP_BY stat_attr_list stat_aux_attr_list stat_interp stat_sort stat_min stat_count
+      {
+         poliqarp_error_message_set(query->error, _("Statistical queries are not supported"));
+         YYABORT;
+      }
+;
+
+stat_attr:
+   ident_string
+      { }
+   | signed_integer '.' ident_string
+      { }
+;
+
+stat_attr_list:
+   stat_attr
+      { }
+   | stat_attr ',' stat_attr_list
+      { }
+;
+
+stat_aux_attr_list:
+      { }
+   | ';' stat_attr_list
+      { }
+;
+
+stat_interp:
+      { }
+   | STAT_INTERP STAT_INTERP_RANDOM
+      { }
+   | STAT_INTERP STAT_INTERP_COMBINE
+      { }
+;
+
+stat_sort:
+      { }
+   | STAT_SORT_AFRONTE
+      { }
+   | STAT_SORT_BY STAT_SORT_BY_FREQ
+      { }
+   | STAT_SORT_BY stat_collating_function stat_bias
+      { }
+;
+
+stat_collating_function:
+   STAT_SORT_BY_CP
+      { }
+   | STAT_SORT_BY_SCP
+      { }
+   | STAT_SORT_BY_MAXCP
+      { }
+   | STAT_SORT_BY_DICE
+      { }
+;
+
+stat_bias:
+      { }
+   | STAT_BIAS real_number
+      { }
+;
+
+stat_min:
+      { }
+   | STAT_MIN unsigned_integer
+      { }
+;
+
+stat_count:
+      { }
+   | STAT_COUNT unsigned_integer
+      { }
+;
+
+/* A bracketed segment constraint; "[]" yields NULL, which query_part maps
+ * to SYMBOL_DOT. */
+simple_exp:
+   '[' ']'
+      { $$ = NULL; }
+   | '[' exp ']'
+      { $$ = $2; }
+;
+
+exp_phrase:
+   HEAD OP simple_exp simple_exp
+      {
+         if (!require_syntax(query->corpus, query->error)) {
+            poliqarp_expression_destroy($3);
+            poliqarp_expression_destroy($4);
+            YYABORT;
+         }
+         $$ = poliqarp_phrase_expression($3, $4, false, ($2.strategy == POLIQARP_STRATEGY_ALL));
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+
}
+   | HEAD OP simple_exp
+      {
+         if (!require_syntax(query->corpus, query->error)) {
+            poliqarp_expression_destroy($3);
+            YYABORT;
+         }
+         $$ = poliqarp_phrase_expression($3, $3, true, ($2.strategy == POLIQARP_STRATEGY_ALL));
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+      }
+   | SYNH OP simple_exp
+      {
+         if (!require_syntax(query->corpus, query->error)) {
+            poliqarp_expression_destroy($3);
+            YYABORT;
+         }
+         $$ = poliqarp_phrase_expression($3, NULL, false, ($2.strategy == POLIQARP_STRATEGY_ALL));
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+      }
+   | SEMH OP simple_exp
+      {
+         if (!require_syntax(query->corpus, query->error)) {
+            poliqarp_expression_destroy($3);
+            YYABORT;
+         }
+         $$ = poliqarp_phrase_expression(NULL, $3, false, ($2.strategy == POLIQARP_STRATEGY_ALL));
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+      }
+;
+
+/* The segment part of a query, built up as an NFS graph. */
+query_part:
+   flagged_regexp
+      {
+         /* A bare regexp is either rewritten into another query (parsed by
+          * a nested yyparse call) or matched against the orth layer. */
+         struct poliqarp_value *value;
+         struct poliqarp_expression *expr;
+         char *rewritten_query_text = NULL;
+         if (!query->rewrite_in_progress)
+            rewritten_query_text = poliqarp_rewrite_query(query->rewrite, $1.text, $1.flags, $1.xflags);
+         if (rewritten_query_text == NULL) {
+            struct poliqarp_regexp regexp;
+            int rc = poliqarp_regexp_create(&regexp, $1.text, $1.flags, $1.xflags);
+            if (rc != 0) {
+               poliqarp_error_from_regexp(query->error, &regexp,
+                  _("Error in regular expression '%s'"), $1.text);
+               poliqarp_regexp_destroy(&regexp);
+               free($1.text);
+               YYABORT;
+            }
+            /* NOTE(review): a NULL result here is not handled separately;
+             * it falls into the "rewritten" branch below with a NULL
+             * rewritten_query_text.  Confirm whether create_orth can fail. */
+            value = poliqarp_value_create_orth(query->corpus, &regexp);
+            poliqarp_regexp_destroy(&regexp);
+            poliqarp_value_use_strategy(value, POLIQARP_STRATEGY_ANY);
+            poliqarp_value_use_disamb(value, true);
+         }
+         else
+            value = NULL;
+         free($1.text);
+         if (value != NULL) {
+            expr = poliqarp_expression_create_value(value,
+               (poliqarp_value_eval_fn)poliqarp_value_eval,
+               (poliqarp_value_compare_fn)poliqarp_value_compare,
+               (poliqarp_value_destroy_fn)poliqarp_value_destroy);
+            graph_nfs_atomic(&$$, &query->graph,
+               graph_store_symbol(&query->graph, expr));
+         } else {
+            int rc;
+            yyscan_t scanner;
+            yylex_init(&scanner);
+            yy_scan_string(rewritten_query_text, scanner);
+            /* the flag makes the nested `query` rule store its graph in
+             * query->rewrite_graph instead of finalizing the query */
+            query->rewrite_in_progress = true;
+            rc = yyparse(scanner, query);
+            query->rewrite_in_progress = false;
+            yylex_destroy(scanner);
+            free(rewritten_query_text);
+            switch (rc) {
+               case 0:
+                  $$ = query->rewrite_graph;
+                  break;
+               case 2:
+                  /* yyparse returns 2 on memory exhaustion */
+                  errno = ENOMEM;
+                  poliqarp_error_from_system(query->error, NULL);
+                  /* fall through */
+               case 1:
+                  YYABORT;
+               default:
+                  abort(); /* should not happen */
+            }
+         }
+      }
+   | '(' query_part ')'
+      { $$ = $2; }
+   | simple_exp
+      {
+         void *symbol;
+         if ($1 != NULL) {
+            symbol = graph_store_symbol(&query->graph, $1);
+            if (symbol == NULL) {
+               poliqarp_error_from_system(query->error, NULL);
+               poliqarp_expression_destroy($1);
+               YYABORT;
+            }
+         }
+         else
+            symbol = SYMBOL_DOT; /* "[]" matches any segment */
+         int rc = graph_nfs_atomic(&$$, &query->graph, symbol);
+         if (rc != 0) {
+            poliqarp_error_from_system(query->error, NULL);
+            poliqarp_expression_destroy($1);
+            YYABORT;
+         }
+      }
+   | '[' exp error
+      {
+         poliqarp_expression_destroy($2);
+         poliqarp_error_message_set(query->error, _("Missing ]"));
+         YYABORT;
+      }
+   | query_part '|' query_part
+      {
+         int rc = graph_nfs_union(&$$, &query->graph, $1, $3, 0);
+         if (rc != 0) {
+            poliqarp_error_from_system(query->error, NULL);
+            YYABORT;
+         }
+      }
+   | query_part '|' error
+      {
+         poliqarp_error_message_set(query->error, _("Parse error after |"));
+         YYABORT;
+      }
+   | query_part '^' query_part %prec CONCAT
+      {
+         query->eflags |= POLIQARP_QEFLAG_HAS_ALIGN;
+         int rc = graph_nfs_concat(&$$, &query->graph, $1, $3, 1);
+         if (rc != 0) {
+            poliqarp_error_from_system(query->error, NULL);
+            YYABORT;
+         }
+      }
+   | query_part query_part %prec CONCAT
+      {
+         int rc = graph_nfs_concat(&$$, &query->graph, $1, $2, 0);
+         if (rc != 0) {
+            poliqarp_error_from_system(query->error, NULL);
+            YYABORT;
+         }
+      }
+   | query_part quantifier
+      {
+         int rc = graph_nfs_quantify(&$$, &query->graph, $1, $2.min, $2.max);
+         if (rc != 0) {
+            if (errno == EINVAL)
+
poliqarp_error_message_set(query->error, _("Incorrect quantification"));
+            else
+               poliqarp_error_from_system(query->error, NULL);
+            YYABORT;
+         }
+      }
+;
+
+/* Metadata expression: `key MOP value`, combined with |, &, ! and (). */
+exp_m:
+   IDENT MOP flagged_regexp_m
+      {
+         const struct poliqarp_corpus *corpus = query->corpus;
+         struct poliqarp_meta_value *value;
+         char *value_string = $3.text;
+         unsigned flags = $3.flags;
+         unsigned xflags = $3.xflags;
+
+         enum poliqarp_meta_type mtype = poliqarp_get_metadata_type(
+            poliqarp_get_const_backend(corpus, meta), $1);
+         if (mtype == POLIQARP_META_TYPE_UNKNOWN) {
+            poliqarp_error_message_set(query->error, _("Unknown metadata type"));
+            free($1); free(value_string);
+            YYABORT;
+         }
+         if (strlen(value_string) == 0)
+            value = poliqarp_meta_value_create_undefined(corpus, $1);
+         else if (mtype == POLIQARP_META_TYPE_STRING) {
+            /* strings only support (in)equality, matched as a regexp */
+            if ($2.strategy == POLIQARP_STRATEGY_EQUAL) {
+               struct poliqarp_regexp regexp;
+               int rc = poliqarp_regexp_create(&regexp, value_string, flags, xflags);
+               if (rc != 0) {
+                  poliqarp_error_from_regexp(query->error, &regexp,
+                     _("Error in regular expression '%s'"), value_string);
+                  poliqarp_regexp_destroy(&regexp);
+                  free($1); free(value_string);
+                  YYABORT;
+               }
+               /* NOTE(review): regexp is not destroyed here; presumably
+                * the meta value keeps it -- confirm in meta-value.c. */
+               value = poliqarp_meta_value_create_text(corpus, $1, &regexp);
+            } else {
+               poliqarp_error_message_set(query->error, _("Unordered metadata type"));
+               free($1); free(value_string);
+               YYABORT;
+            }
+         } else { /* mtype == META_TYPE_DATE */
+            struct poliqarp_meta_date date;
+            if (poliqarp_meta_date_parse(value_string, &date) == -1) {
+               poliqarp_error_message_set(query->error, _("Malformed date"));
+               free($1); free(value_string);
+               YYABORT;
+            }
+            /* for > and <= an unspecified month/day is replaced by 99 */
+            if ($2.strategy == POLIQARP_STRATEGY_GREATER ||
+                $2.strategy == POLIQARP_STRATEGY_SMALLER_EQUAL) {
+               if (date.month == 0) date.month = 99;
+               if (date.day == 0) date.day = 99;
+            }
+            if ($2.strategy == POLIQARP_STRATEGY_EQUAL) {
+               if (date.year && date.month && date.day)
+                  value = poliqarp_meta_value_create_date(corpus, $1,
+                     date, $2.strategy);
+               else {
+                  /* equality with a partial date becomes the range
+                   * (>= date) & (<= date with missing parts set to 99) */
+                  struct poliqarp_meta_value *vlow, *vhigh;
+                  vlow = poliqarp_meta_value_create_date(corpus, $1,
+                     date, POLIQARP_STRATEGY_GREATER_EQUAL);
+                  if (date.month == 0) date.month = 99;
+                  if (date.day == 0) date.day = 99;
+                  vhigh = poliqarp_meta_value_create_date(corpus, $1,
+                     date, POLIQARP_STRATEGY_SMALLER_EQUAL);
+                  $$ = poliqarp_and_expression(
+                     poliqarp_expression_create_value(vlow,
+                        (poliqarp_value_eval_fn)poliqarp_meta_value_eval,
+                        (poliqarp_value_compare_fn)poliqarp_meta_value_compare,
+                        (poliqarp_value_destroy_fn)poliqarp_meta_value_destroy),
+                     poliqarp_expression_create_value(vhigh,
+                        (poliqarp_value_eval_fn)poliqarp_meta_value_eval,
+                        (poliqarp_value_compare_fn)poliqarp_meta_value_compare,
+                        (poliqarp_value_destroy_fn)poliqarp_meta_value_destroy));
+                  goto expression_created;
+               }
+            } else {
+               value = poliqarp_meta_value_create_date(corpus, $1,
+                  date, $2.strategy);
+            }
+         }
+
+         /* create expression */
+         $$ = poliqarp_expression_create_value(value,
+            (poliqarp_value_eval_fn)poliqarp_meta_value_eval,
+            (poliqarp_value_compare_fn)poliqarp_meta_value_compare,
+            (poliqarp_value_destroy_fn)poliqarp_meta_value_destroy);
+expression_created:
+         /* negate if necessary */
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+         /* cleanup */
+         free($1);
+         free($3.text);
+      }
+   | exp_m '|' exp_m
+      {
+         $$ = poliqarp_or_expression($1, $3);
+         if (!validate_expression($$, query->error))
+            YYABORT;
+      }
+   | exp_m '&' exp_m
+      {
+         $$ = poliqarp_and_expression($1, $3);
+         if (!validate_expression($$, query->error))
+            YYABORT;
+      }
+   | '(' exp_m ')'
+      { $$ = $2; }
+   | '!'
exp_m
+      { $$ = poliqarp_not_expression ($2); }
+;
+
+/* Segment expression: `attribute OP value`, combined with |, &, ! and (). */
+exp:
+   exp_phrase
+      { $$ = $1; }
+   | IDENT OP INTEGER
+      {
+         const struct poliqarp_corpus *corpus = query->corpus;
+         const struct entity *entity;
+         struct poliqarp_value *value;
+
+         /* lookup name */
+         entity = lookup_const_entity(&corpus->config.named_items, $1);
+         if (entity == NULL || entity->tag == NULL) {
+            poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+            free($1);
+            YYABORT;
+         }
+
+         /* create things based on entity type */
+         switch (*(enum poliqarp_entity_type *)entity->tag) {
+            case POLIQARP_ENTITY_ITEM_SPACE:
+               /* a nonzero integer means "space"; negation flips it */
+               value = poliqarp_value_create_space(corpus, ($3 != 0) != $2.negate);
+               break;
+            case POLIQARP_ENTITY_ITEM_POS:
+            case POLIQARP_ENTITY_ITEM_BASE:
+            case POLIQARP_ENTITY_ITEM_TAG:
+            case POLIQARP_ENTITY_ITEM_TYPE:
+            case POLIQARP_ENTITY_ATTR:
+               poliqarp_error_message_set(query->error, _("Integers are not valid values for the attribute: %s"), $1);
+               free($1);
+               YYABORT;
+               break;
+            default:
+               poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+               free($1);
+               YYABORT;
+               break;
+         }
+
+         /* create expression */
+         $$ = poliqarp_expression_create_value(value,
+            (poliqarp_value_eval_fn)poliqarp_value_eval,
+            (poliqarp_value_compare_fn)poliqarp_value_compare,
+            (poliqarp_value_destroy_fn)poliqarp_value_destroy);
+         /* cleanup */
+         free($1);
+      }
+   | IDENT OP flagged_regexp
+      {
+         const struct poliqarp_corpus *corpus = query->corpus;
+         const struct entity *entity;
+         struct poliqarp_value *value;
+         char *value_string = $3.text;
+         unsigned flags = $3.flags;
+         unsigned xflags = $3.xflags;
+
+         /* lookup name */
+         entity = lookup_const_entity(&corpus->config.named_items, $1);
+         if (entity == NULL || entity->tag == NULL) {
+            poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+            free($1); free(value_string);
+            YYABORT;
+         }
+
+         struct poliqarp_regexp regexp;
+         int rc = poliqarp_regexp_create(&regexp, value_string, flags, xflags);
+         if (rc != 0) {
+            poliqarp_error_from_regexp(query->error, &regexp,
+               _("Error in regular expression '%s'"), value_string);
+            poliqarp_regexp_destroy(&regexp);
+            free($1); free(value_string);
+            YYABORT;
+         }
+
+         /* create things based on entity type */
+         switch (*(enum poliqarp_entity_type *)entity->tag) {
+            case POLIQARP_ENTITY_ITEM_ORTH:
+               value = poliqarp_value_create_orth(corpus, &regexp);
+               break;
+            case POLIQARP_ENTITY_ITEM_POS:
+               value = poliqarp_value_create_pos(corpus, &regexp);
+               break;
+            case POLIQARP_ENTITY_ITEM_BASE:
+               value = $2.use_disamb ?
+                  poliqarp_value_create_base__disambiguated(corpus, &regexp) :
+                  poliqarp_value_create_base__ambiguous(corpus, &regexp);
+               break;
+            case POLIQARP_ENTITY_ITEM_TAG:
+               value = poliqarp_value_create_tag(corpus, &regexp);
+               break;
+            case POLIQARP_ENTITY_ITEM_TYPE:
+               if (!require_syntax(query->corpus, query->error)) {
+                  poliqarp_regexp_destroy(&regexp);
+                  free($1); free(value_string);
+                  YYABORT;
+               }
+               value = poliqarp_value_create_type(corpus, &regexp);
+               break;
+            case POLIQARP_ENTITY_ATTR:
+               value = poliqarp_value_create_attr(corpus, $1, &regexp);
+               break;
+            default:
+               poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+               poliqarp_regexp_destroy(&regexp);
+               free($1); free(value_string);
+               YYABORT;
+               break;
+         }
+
+         /* apply operator effects */
+         poliqarp_value_use_strategy(value, $2.strategy);
+         poliqarp_value_use_disamb(value, $2.use_disamb);
+
+         /* everything except orth and type is wrapped in an interp value */
+         switch (*(enum poliqarp_entity_type *)entity->tag) {
+            case POLIQARP_ENTITY_ITEM_ORTH:
+            case POLIQARP_ENTITY_ITEM_TYPE:
+               break;
+            case POLIQARP_ENTITY_ITEM_POS:
+            case POLIQARP_ENTITY_ITEM_BASE:
+            case POLIQARP_ENTITY_ITEM_TAG:
+            case POLIQARP_ENTITY_ATTR:
+               value = poliqarp_to_interp_value(corpus, value);
+               break;
+            default:
+               abort(); /* Should not happen. */
+         }
+
+         /* create expression */
+         $$ = poliqarp_expression_create_value(value,
+            (poliqarp_value_eval_fn)poliqarp_value_eval,
+            (poliqarp_value_compare_fn)poliqarp_value_compare,
+            (poliqarp_value_destroy_fn)poliqarp_value_destroy);
+
+         /* negate if necessary */
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+
+         /* cleanup */
+         poliqarp_regexp_destroy(&regexp);
+         free($1);
+         free(value_string);
+      }
+   | IDENT OP VARIABLE
+      {
+         const struct poliqarp_corpus *corpus = query->corpus;
+         const struct entity *entity;
+         struct poliqarp_value **values;
+         struct poliqarp_expression **children = NULL;
+         size_t i, j, n_values;
+
+         if ($3 >= POLIQARP_MAX_VARIABLES || $3 < 0) {
+            poliqarp_error_message_set(query->error, _("Variable number is too large"));
+            free($1);
+            YYABORT;
+         }
+
+         /* lookup name */
+         entity = lookup_const_entity(&corpus->config.named_items, $1);
+         if (entity == NULL || entity->tag == NULL) {
+            poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+            free($1);
+            YYABORT;
+         }
+
+         /* create things based on entity type */
+         switch (*(enum poliqarp_entity_type *)entity->tag) {
+            case POLIQARP_ENTITY_ITEM_POS:
+               values = poliqarp_value_create_all_pos(corpus, &n_values);
+               break;
+            case POLIQARP_ENTITY_ATTR:
+               values = poliqarp_value_create_all_attr(corpus, entity->name, &n_values);
+               break;
+            default:
+               poliqarp_error_message_set(query->error, _("There is no such attribute: %s"), $1);
+               free($1);
+               YYABORT;
+               break;
+         }
+         free($1);
+         if (values == NULL) {
+            poliqarp_error_from_system(query->error, NULL);
+            YYABORT;
+         }
+         for (i = 0; i < n_values; i++) {
+            /* apply operator effects */
+            poliqarp_value_use_strategy(values[i], $2.strategy);
+            poliqarp_value_use_disamb(values[i], $2.use_disamb);
+            values[i] = poliqarp_to_interp_value(corpus, values[i]);
+            if (values[i] == NULL)
+               break;
+         }
+         if (i != n_values) {
+            poliqarp_error_from_system(query->error, NULL);
+            /* values[i] is NULL (its original was already destroyed by
+             * poliqarp_to_interp_value); release everything around it,
+             * including the not-yet-wrapped values after the failure */
+            for (j = 0; j < i; j++)
+               poliqarp_value_destroy(values[j]);
+            for (j = i + 1; j < n_values; j++)
+               poliqarp_value_destroy(values[j]);
+            free(values);
+            YYABORT;
+         }
+         children = malloc(n_values * sizeof(*children));
+         if (children == NULL) {
+            poliqarp_error_from_system(query->error, NULL);
+            /* do not leak the values created above */
+            for (j = 0; j < n_values; j++)
+               poliqarp_value_destroy(values[j]);
+            free(values);
+            YYABORT;
+         }
+
+         for (i = 0; i < n_values; i++)
+            children[i] = NULL;
+         for (i = 0; i < n_values; i++) {
+            children[i] = poliqarp_expression_create_value(values[i],
+               (poliqarp_value_eval_fn)poliqarp_value_eval,
+               (poliqarp_value_compare_fn)poliqarp_value_compare,
+               (poliqarp_value_destroy_fn)poliqarp_value_destroy);
+            if (children[i] == NULL)
+               break;
+            values[i] = NULL; /* now owned by children[i] */
+         }
+         if (i != n_values) {
+            poliqarp_error_from_system(query->error, NULL);
+            for (j = 0; j < i; j++)
+               poliqarp_expression_destroy(children[j]);
+            for (j = i; j < n_values; j++)
+               poliqarp_value_destroy(values[j]);
+            free(children);
+            free(values);
+            YYABORT;
+         }
+
+         free(values);
+
+         /* create expression */
+         $$ = poliqarp_variable_expression(n_values, children, $3);
+         if ($$ == NULL) {
+            poliqarp_error_from_system(query->error, NULL);
+            for (i = 0; i < n_values; i++)
+               poliqarp_expression_destroy(children[i]);
+            /* NOTE(review): assumes a failed poliqarp_variable_expression()
+             * does not take ownership of the array, consistent with the
+             * children being destroyed here -- confirm in expression.c. */
+            free(children);
+            YYABORT;
+         }
+         /* negate if necessary */
+         if ($2.negate)
+            $$ = poliqarp_not_expression($$);
+         /* check variable types, ranges etc. */
+         query->eflags |= POLIQARP_QEFLAG_HAS_VARIABLES;
+         if (query->variable_types[$3] == NULL)
+            query->variable_types[$3] = entity->name;
+         else if (query->variable_types[$3] != entity->name) {
+            poliqarp_error_message_set(query->error, _("Incompatible variable types"));
+            poliqarp_expression_destroy($$);
+            YYABORT;
+         }
+         poliqarp_expression_variable_ranges($$, query->variable_ranges);
+         /* cleanup */
+      }
+   | exp '|' exp
+      {
+         $$ = poliqarp_or_expression($1, $3);
+         if (!validate_expression($$, query->error))
+            YYABORT;
+      }
+   | exp '&' exp
+      {
+         $$ = poliqarp_and_expression($1, $3);
+         if (!validate_expression($$, query->error))
+            YYABORT;
+      }
+   | '(' exp ')'
+      { $$ = $2; }
+   | '!'
exp
+      { $$ = poliqarp_not_expression ($2); }
+;
+
+/* quantifiers; max == -1 is used for the open-ended forms */
+quantifier:
+   '{' '}'
+      { $$.min = 1; $$.max = 1; }
+   | '{' INTEGER ',' INTEGER '}'
+      { $$.min = $2; $$.max = $4; }
+   | '{' INTEGER '}'
+      { $$.min = $2; $$.max = $2; }
+   | '{' ',' INTEGER '}'
+      { $$.min = 0; $$.max = $3; }
+   | '{' INTEGER ',' '}'
+      { $$.min = $2; $$.max = -1; }
+   | '+'
+      { $$.min = 1; $$.max = -1; }
+   | '?'
+      { $$.min = 0; $$.max = 1; }
+   | '*'
+      { $$.min = 0; $$.max = -1; }
+   | '{' error
+      {
+         poliqarp_error_message_set(query->error, _("Parse error after {"));
+         YYABORT;
+      }
+;
+
+
+/* Regexp of a metadata condition: defaults come from the META_I / META_X
+ * query flags, REG_NOSUB is always set, "/flags" may add more. */
+flagged_regexp_m:
+   expanded_alias
+      {
+         $$.flags = (query->flags & POLIQARP_QFLAG_META_I) ? REG_ICASE : 0;
+         $$.flags |= REG_NOSUB;
+         $$.xflags = (query->flags & POLIQARP_QFLAG_META_X) ? POLIQARP_REG_NO_ANCHORS : 0;
+         $$.text = $1;
+      }
+   | expanded_alias '/' IDENT
+      {
+         $$.flags = (query->flags & POLIQARP_QFLAG_META_I) ? REG_ICASE : 0;
+         $$.flags |= REG_NOSUB;
+         $$.xflags = (query->flags & POLIQARP_QFLAG_META_X) ? POLIQARP_REG_NO_ANCHORS : 0;
+         poliqarp_parse_regexp_flags($3, &$$.flags, &$$.xflags); /* extra flags */
+         free($3);
+         $$.text = $1;
+      }
+   | expanded_alias '/' error
+      {
+         poliqarp_error_message_set(query->error, _("Missing flags"));
+         free($1);
+         YYABORT;
+      }
+;
+
+/* Regexp of a segment condition: defaults come from the QUERY_I / QUERY_X
+ * query flags, "/flags" may add more. */
+flagged_regexp:
+   expanded_alias
+      {
+         $$.flags = (query->flags & POLIQARP_QFLAG_QUERY_I) ? REG_ICASE : 0;
+         $$.xflags = (query->flags & POLIQARP_QFLAG_QUERY_X) ? POLIQARP_REG_NO_ANCHORS : 0;
+         $$.text = $1;
+      }
+   | expanded_alias '/' IDENT
+      {
+         $$.flags = (query->flags & POLIQARP_QFLAG_QUERY_I) ? REG_ICASE : 0;
+         $$.xflags = (query->flags & POLIQARP_QFLAG_QUERY_X) ? POLIQARP_REG_NO_ANCHORS : 0;
+         poliqarp_parse_regexp_flags($3, &$$.flags, &$$.xflags); /* extra flags */
+         free($3);
+         $$.text = $1;
+      }
+;
+
+/* An identifier or string, replaced by a copy of its alias value when
+ * query->aliases defines one.  The result is always heap-allocated and
+ * owned by the parser. */
+expanded_alias:
+   ident_string
+      {
+         const char *text = NULL;
+         if (query->aliases)
+            text = hash_table_const_get(query->aliases, $1);
+         if (text == NULL)
+            $$ = $1;
+         else {
+            char *expansion = strdup(text);
+            if (expansion == NULL) {
+               poliqarp_error_from_system(query->error, NULL);
+               free($1);
+               YYABORT;
+            }
+            /* the original spelling is no longer needed; previously it
+             * was leaked whenever an alias was substituted */
+            free($1);
+            $$ = expansion;
+         }
+      }
+;
+
+ident_string:
+   IDENT
+   | STRING
+      { $$ = $1; }
+   | STRING_OPEN error
+      {
+         poliqarp_error_message_set(query->error, _("No closing quote after string: %s"), $1);
+         free($1);
+         YYABORT;
+      }
+;
+
+unsigned_integer:
+   INTEGER
+      {
+         $$ = $1;
+      }
+;
+
+signed_integer:
+   unsigned_integer
+      {
+         $$ = $1;
+      }
+   | SIGNED_INTEGER
+      {
+         $$ = $1;
+      }
+;
+
+real_number:
+   signed_integer
+      {
+         $$ = $1;
+      }
+   | REAL_NUMBER
+      {
+         if (isnan($1)) {
+            poliqarp_error_message_set(query->error, _("Invalid floating-point number"));
+            YYABORT;
+         }
+         $$ = $1;
+      }
+;
+
+%%
+
+
diff --git a/poliqarp-library/sakura/poliqarp-private.h b/poliqarp-library/sakura/poliqarp-private.h
new file mode 100644
index 0000000000000000000000000000000000000000..9666ec3061379cfcd0a82d0733138925540bd27d
--- /dev/null
+++ b/poliqarp-library/sakura/poliqarp-private.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+/**
+ * @file poliqarp-private.h
+ * @brief Private structures of libpoliqarp.
+ * + * This file includes several other files that together define private + * structures of sakura. + */ + +#ifndef POLIQARP_PRIVATE_H +#define POLIQARP_PRIVATE_H + +#include <sakura/abi.h> +#include <sakura/corpus.h> +#include <sakura/query.h> + +#endif /* POLIQARP_PRIVATE_H */ diff --git a/poliqarp-library/sakura/poliqarp.c b/poliqarp-library/sakura/poliqarp.c new file mode 100644 index 0000000000000000000000000000000000000000..7d1a5c5bc595c09ff273ebd1bc122d6652ec37c8 --- /dev/null +++ b/poliqarp-library/sakura/poliqarp.c @@ -0,0 +1,425 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */
+
+#include <locale.h>
+#include <stdio.h>
+
+#include <foostring/foostring.h>
+#include <sakura/poliqarp.h>
+
+#include <unibits/strcoll.h>
+
+#define POLIQARP_MAJOR_VERSION 1
+#define POLIQARP_MINOR_VERSION 3
+#define POLIQARP_REVISION_NUMBER 11
+#define POLIQARP_LIBRARY_NAME "sakura"
+
+const int poliqarp_major_version = POLIQARP_MAJOR_VERSION;
+const int poliqarp_minor_version = POLIQARP_MINOR_VERSION;
+const int poliqarp_revision_number = POLIQARP_REVISION_NUMBER;
+const char poliqarp_library_name[] = POLIQARP_LIBRARY_NAME;
+
+/*
+ * Initializes the library: sets the requested locale and then tries a
+ * series of fallbacks until both UTF-8 validation routines succeed:
+ * the ".UTF-8" variant of LC_CTYPE, the LC_COLLATE locale, its ".UTF-8"
+ * variant, and finally "en_US.UTF-8".
+ *
+ * Returns 0 on success, -1 (with `error` filled in) on failure.
+ *
+ * NOTE(review): `a() || b()` evaluates to 0 or 1 in C, so the `rc < 0`
+ * checks below cannot fire as written.  Confirm the return convention of
+ * the two validate functions before changing how rc is computed.
+ */
+int poliqarp_create(const char *locale, struct poliqarp_error *error)
+{
+   int rc;
+   setlocale(LC_ALL, locale);
+   rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
+   if (rc < 0)
+      goto error;
+   if (rc > 0) {
+      /* Maybe UTF-8 variant of LC_CTYPE is fine? */
+      const char *locale = setlocale(LC_CTYPE, NULL);
+      if (locale != NULL) {
+         /* keep the part before the codeset and append ".UTF-8" */
+         const char *locale_end = locale;
+         while (*locale_end != '\0' && *locale_end != '.')
+            locale_end++;
+         size_t length = locale_end - locale;
+         char *new_locale = malloc(length + 7);
+         if (new_locale == NULL)
+            goto error;
+         new_locale[0] = '\0';
+         strncat(new_locale, locale, length);
+         strcat(new_locale + length, ".UTF-8");
+         setlocale(LC_CTYPE, new_locale);
+         free(new_locale);
+         rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
+         if (rc < 0)
+            goto error;
+      }
+   }
+   if (rc > 0) {
+      /* Maybe LC_COLLATE is fine? */
+      const char *locale = setlocale(LC_COLLATE, NULL);
+      if (locale != NULL) {
+         /* +7 leaves room for replacing the codeset with ".UTF-8" below */
+         char *new_locale = malloc(strlen(locale) + 7);
+         if (new_locale == NULL)
+            goto error;
+         strcpy(new_locale, locale);
+         setlocale(LC_CTYPE, new_locale);
+         rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
+         if (rc < 0) {
+            free(new_locale); /* was leaked on this path */
+            goto error;
+         }
+         if (rc > 0) {
+            /* Maybe UTF-8 variant of LC_COLLATE is fine? */
+            char *new_locale_end = new_locale;
+            while (*new_locale_end != '\0' && *new_locale_end != '.')
+               new_locale_end++;
+            strcpy(new_locale_end, ".UTF-8");
+            setlocale(LC_CTYPE, new_locale);
+            rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
+            if (rc < 0) {
+               free(new_locale); /* was leaked on this path */
+               goto error;
+            }
+         }
+         free(new_locale);
+      }
+   }
+   if (rc > 0) {
+      /* Maybe en_US.UTF-8 is available? */
+      setlocale(LC_CTYPE, "en_US.UTF-8");
+      rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
+   }
+   if (rc != 0) {
+      poliqarp_error_message_set(error, _("Unable to set a UTF-8 locale"));
+      return -1;
+   }
+   return 0;
+error:
+   poliqarp_error_from_system(error, _("Unable to initialize the Poliqarp library"));
+   return -1;
+}
+
+/* Counterpart of poliqarp_create(); currently has nothing to release. */
+int poliqarp_destroy(void)
+{
+   return 0;
+}
+
+/* Fills `info` with the segment, type, lemma and tag counts of `corpus`. */
+int poliqarp_get_corpus_info(const struct poliqarp_corpus *corpus,
+   struct poliqarp_corpus_info *info)
+{
+   info->num_segments = poliqarp_backend_corpus_size(&corpus->corpus);
+   info->num_types = poliqarp_backend_orth_num_items(
+      poliqarp_get_const_backend(corpus, orth));
+   info->num_lemmata = poliqarp_backend_base_num_items__disamb(
+      poliqarp_get_const_backend(corpus, base));
+   info->num_tags = poliqarp_backend_tag_num_items(
+      poliqarp_get_const_backend(corpus, tag));
+   return 0;
+}
+
+/*
+ * Fetches the segment at `index`.  The bounds check (returning -1) is
+ * compiled in only when NDEBUG is not defined.
+ */
+int poliqarp_get_segment(struct poliqarp_segment *segment,
+   struct poliqarp_corpus *corpus, size_t index)
+{
+#ifndef NDEBUG
+   if (index >= poliqarp_backend_corpus_size(&corpus->corpus))
+      return -1;
+#endif
+
+   segment->corpus = corpus;
+   segment->segment = poliqarp_backend_corpus_get(&corpus->corpus, index);
+
+   return 0;
+}
+
+/*
+ * Decodes orth_space_id: the lowest bit is the space-before flag, the
+ * remaining bits index the orth backend.
+ */
+int poliqarp_get_segment_info(const struct poliqarp_segment *segment,
+   struct poliqarp_segment_info *info)
+{
+   info->space_before = segment->segment.orth_space_id & 1;
+   info->text = poliqarp_backend_orth_fetch(
+      poliqarp_get_const_backend(segment->corpus, orth),
+      segment->segment.orth_space_id >> 1);
+   return 0;
+}
+
+int poliqarp_get_disambiguated_interpretations(
+   const struct poliqarp_segment
*segment, + struct poliqarp_interpretation_set *set) +{ + set->corpus = segment->corpus; + set->set = segment->segment.interp_disamb_id; + set->disamb = true; + return 0; +} + +int poliqarp_get_ambiguous_interpretations( + const struct poliqarp_segment *segment, + struct poliqarp_interpretation_set *set) +{ + set->corpus = segment->corpus; + set->set = segment->segment.interp_amb_id; + set->disamb = false; + return 0; +} + +int poliqarp_get_interpretation_set_info( + const struct poliqarp_interpretation_set *set, + struct poliqarp_interpretation_set_info *info) +{ + info->size = set->disamb ? + poliqarp_backend_interp_length__disamb( + poliqarp_get_const_backend(set->corpus, interp), set->set) : + poliqarp_backend_interp_length__amb( + poliqarp_get_const_backend(set->corpus, interp), set->set); + return 0; +} + +int poliqarp_get_interpretation(const struct poliqarp_interpretation_set *set, + struct poliqarp_interpretation *interp, size_t index) +{ + const struct poliqarp_binary_interp *binterp; + binterp = set->disamb ? + poliqarp_backend_interp_fetch__disamb( + poliqarp_get_const_backend(set->corpus, interp), set->set) : + poliqarp_backend_interp_fetch__amb( + poliqarp_get_const_backend(set->corpus, interp), set->set); + interp->corpus = set->corpus; + interp->disamb = set->disamb; + interp->interp = binterp[index]; + POLIQARP_INTERP_LE_TO_HE(interp->interp); + return 0; +} + +int poliqarp_get_interpretation_info( + const struct poliqarp_interpretation *interp, + struct poliqarp_interpretation_info *info) +{ + info->base = interp->disamb ? 
+ poliqarp_backend_base_fetch__disamb( + poliqarp_get_const_backend(interp->corpus, base), + interp->interp.base_id) : + poliqarp_backend_base_fetch__amb( + poliqarp_get_const_backend(interp->corpus, base), + interp->interp.base_id); + info->tag = poliqarp_backend_tag_fetch( + poliqarp_get_const_backend(interp->corpus, tag), interp->interp.tag_id); + return 0; +} + +int poliqarp_define_alias(struct poliqarp_corpus *corpus, const char *name, + const char *value) +{ + if (hash_table_set(&(poliqarp_get_backend(corpus, config)->aliases), name, + strdup(value))) + { + return -1; + } + return 0; +} + +int poliqarp_delete_alias(struct poliqarp_corpus *corpus, const char *name) +{ + if (hash_table_unset(&(poliqarp_get_backend(corpus, config)->aliases), name)) + return -1; + return 0; +} + +static void get_aliases_iterator(const char *key, const void *value, + void *env) +{ + struct poliqarp_alias **alias = (struct poliqarp_alias **)env; + (*alias)->name = key; + (*alias)->value = (const char *)value; + (*alias)++; +} + +int poliqarp_get_aliases(const struct poliqarp_corpus *corpus, + struct poliqarp_alias_list *aliases) +{ + const struct hash_table *table = + &(poliqarp_get_const_backend(corpus, config)->aliases); + struct poliqarp_alias *tmp; + aliases->num_aliases = hash_table_num_items(table); + tmp = aliases->aliases = malloc(aliases->num_aliases * + sizeof(struct poliqarp_alias)); + hash_table_iterate(table, &tmp, get_aliases_iterator); + return 0; +} + +int poliqarp_free_aliases(struct poliqarp_alias_list *aliases) +{ + free(aliases->aliases); + return 0; +} + +int poliqarp_get_metadata_set(const struct poliqarp_corpus *corpus, + size_t id, struct poliqarp_metadata_set *meta) +{ + struct poliqarp_document document; + if (poliqarp_backend_document_fetch(&corpus->document, id, &document) == -1) + return -1; + meta->corpus = corpus; + meta->low = document.meta_low; + meta->high = document.meta_high; + return 0; +} + +size_t poliqarp_metadata_count(const struct 
poliqarp_metadata_set *meta) +{ + return meta->high - meta->low; +} + +int poliqarp_get_metadata(const struct poliqarp_metadata_set *set, + size_t index, struct poliqarp_metadata *meta) +{ + index += set->low; + meta->corpus = set->corpus; + meta->meta = poliqarp_backend_meta_fetch(poliqarp_get_const_backend( + set->corpus, meta), index); + return 0; +} + +int poliqarp_get_metadata_types(struct poliqarp_metadata_types *types, + const struct poliqarp_corpus *corpus) +{ + int num = 0; + struct poliqarp_meta_type_list *mtypes = corpus->meta.types; + + while (mtypes) { + ++num; + mtypes = mtypes->next; + } + types->types = malloc(num * sizeof *(types->types)); + if (types->types == NULL) + return -1; + types->num_types = num; + + mtypes = corpus->meta.types; + while (num) { + num--; + types->types[num].key = mtypes->key; + types->types[num].type = + (mtypes->type == POLIQARP_META_TYPE_STRING) ? POLIQARP_META_TEXT : + POLIQARP_META_DATE; + mtypes = mtypes->next; + } + return 0; +} + +int poliqarp_free_metadata_types(struct poliqarp_metadata_types *types) +{ + free(types->types); + return 0; +} + +int poliqarp_get_metadata_info(const struct poliqarp_metadata *meta, + struct poliqarp_metadata_info *info) +{ + info->key = poliqarp_backend_meta_key_fetch(poliqarp_get_const_backend( + meta->corpus, meta), meta->meta.key); + switch (meta->meta.type) { + case POLIQARP_METADATA_SINGLE: + case POLIQARP_METADATA_MULTI: + info->type = POLIQARP_META_TEXT; + info->value.text = poliqarp_backend_meta_value_fetch( + poliqarp_get_const_backend(meta->corpus, meta), + meta->meta.value_as.text); + break; + case POLIQARP_METADATA_DATE: + info->type = POLIQARP_META_DATE; + info->value.date.year = meta->meta.value_as.date.year; + info->value.date.month = meta->meta.value_as.date.month; + info->value.date.day = meta->meta.value_as.date.day; + break; + case POLIQARP_METADATA_UNDEFINED: + info->type = POLIQARP_META_UNDEFINED; + break; + default: + abort(); /* Should not happen. 
*/ + } + return 0; +} + +int poliqarp_get_tagset_info(const struct poliqarp_corpus *corpus, + struct poliqarp_tagset_info *info) +{ + const struct poliqarp_backend_config *cfg = + poliqarp_get_const_backend(corpus, config); + struct entity *entity; + struct poliqarp_attr *attr; + struct poliqarp_attr_value *aval; + struct poliqarp_part_of_speech *pos; + struct poliqarp_attr_instance *ainst; + string_t s; + + /* first pass: gather number of classes and categories */ + info->num_categories = info->num_classes = 0; + for (entity = cfg->named_items.first_entity; entity; + entity = entity->next_entity) + { + switch (*(enum poliqarp_entity_type *)entity->tag) { + case POLIQARP_ENTITY_POS: info->num_classes++; break; + case POLIQARP_ENTITY_ATTR: info->num_categories++; break; + default: break; + } + } + + /* allocate memory */ + info->classes = malloc(info->num_classes * sizeof(*(info->classes))); + info->categories = malloc(info->num_categories * sizeof(*(info->categories))); + + /* second pass: retrieve the info */ + info->num_categories = info->num_classes = 0; + for (entity = cfg->named_items.first_entity; entity; + entity = entity->next_entity) + { + switch (*(enum poliqarp_entity_type *)entity->tag) { + case POLIQARP_ENTITY_POS: + s = string_create(); + string_append_str(s, entity->name); + pos = (struct poliqarp_part_of_speech *)entity->data; + for (ainst = pos->first_instance; ainst; + ainst = ainst->next_instance) + { + string_append_str(s, " "); + if (ainst->is_optional) + string_append_str(s, "["); + string_append_str(s, ainst->attr->self->name); + if (ainst->is_optional) + string_append_str(s, "]"); + } + info->classes[info->num_classes++] = string_free_and_get_buffer(s); + break; + case POLIQARP_ENTITY_ATTR: + s = string_create(); + string_append_str(s, entity->name); + attr = (struct poliqarp_attr *)entity->data; + for (aval = attr->first_value; aval; aval = aval->next_value) { + string_append_str(s, " "); + string_append_str(s, aval->self->name); + } + 
info->categories[info->num_categories++] = string_free_and_get_buffer(s); + break; + default: + break; + } + } + return 0; +} + +void poliqarp_free_tagset_info(struct poliqarp_tagset_info *info) +{ + size_t i; + for (i = 0; i < info->num_classes; i++) + free(info->classes[i]); + for (i = 0; i < info->num_categories; i++) + free(info->categories[i]); +} diff --git a/poliqarp-library/sakura/poliqarp.h b/poliqarp-library/sakura/poliqarp.h new file mode 100644 index 0000000000000000000000000000000000000000..a779d677f2e0b6c0678e39883e6cbfe2a756640d --- /dev/null +++ b/poliqarp-library/sakura/poliqarp.h @@ -0,0 +1,670 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * @file poliqarp.h + * @brief Public API of libpoliqarp. + * + * This file defines a set of low-level corpus structures and functions. + * Any library that implements them can be linked with the Poliqarp daemon and + * used with Poliqarp clients. The default implementation is codenamed + * `sakura'. 
+ */ + +#ifndef POLIQARP_H +#define POLIQARP_H + +#include <poliqarp-config.h> + +#include <stdlib.h> +#include <progress/progress.h> + +/** Version number and name of the library. */ +extern const int poliqarp_major_version; +extern const int poliqarp_minor_version; +extern const int poliqarp_revision_number; +extern const char poliqarp_library_name[]; + +/* Declaration of private structures. */ + +/** + * Logically, a corpus is just a sequence of segments. However, this file does + * not tell how it is precisely defined -- that is the job of + * poliqarp-private.h, included below. Instead, it declares functions that + * return a particular segment in the corpus, or number of segments contained + * therein, etc. + */ +struct poliqarp_corpus; + +/** + * A parsed query can have just about any internal structure. + */ +struct poliqarp_query; + + +/** + * An error message wrapper. + */ +struct poliqarp_error; + +/** + * A match buffer is an array holding results of a query. It generally + * has a fixed size, but can be enlarged or shrunk as needed. + */ +struct poliqarp_match_buffer; + +/** + * A segment is a triple <O, D, A>, where O is a string (denoting orthographic + * form of the segment -- the form as it appears in the text) and D and A are + * sets of interpretations (see below), corresponding to disambiguated and + * ambiguous interpretations, respectively. + */ +struct poliqarp_segment; + +/** + * Straightforwardly enough, an interpretation set is a collection of + * interpretations (which see below). + */ +struct poliqarp_interpretation_set; + +/** + * An interpretation is a pair <b, t> of strings, representing the base form + * and tag of a segment, respectively. As far as this API is concerned, tags + * do not have internal structure; they are just strings. + */ +struct poliqarp_interpretation; + +/** + * This is a collection of pieces of metadata that can be specified for a + * document. 
+ */ +struct poliqarp_metadata_set; + +/** + * A single element of the metadata set. + */ +struct poliqarp_metadata; + +/** + * State of a pseudo-random number generator. + */ +struct poliqarp_random_state; + +/* General library routines. */ + +/** + * Initializes the library. Must be used once before any other library + * functions are called. + * @param locale name to be passed to setlocale() + * @return 0 if the initialization was successful, -1 otherwise. + */ +int poliqarp_create(const char *locale, struct poliqarp_error *error); + +/** + * Frees all resources used internally by the library. + * @return 0 if the cleanup was successful, -1 otherwise. + */ +int poliqarp_destroy(void); + +/** + * Initializes per-thread private data of the library. Must be called once + * before any other library functions are called by every thread that uses + * them. + * @return 0 if the initialization was successful, -1 otherwise. + */ +int poliqarp_thread_init(void **data); + +/** + * Destructive counterpart of poliqarp_thread_init(). + * @return 0 if the cleanup was successful, -1 otherwise. + */ +int poliqarp_thread_done(void *data); + +/* Corpus-related routines. */ + +/** + * Public corpus information. + */ +struct poliqarp_corpus_info { + size_t num_segments; /**< Number of segments in this corpus. */ + size_t num_types; /**< Number of types of segments in this corpus. */ + size_t num_lemmata; /**< Number of types of lemmata in this corpus. */ + size_t num_tags; /**< Number of types of tags in this corpus. */ +}; + +/** + * Public tagset information. + */ +struct poliqarp_tagset_info { + size_t num_categories; /**< Number of grammatical categories. */ + char **categories; /**< Array of category descriptions. */ + size_t num_classes; /**< Number of grammatical classes. */ + char **classes; /**< Array of class descriptions. */ +}; + +/** + * Opens a corpus. + * @param corpus The structure to be initialized. + * @param name Some kind of name for the corpus. 
It could be a file name, a + * common prefix of several file names (as is the case with sakura), etc. + * @param progress Indicator of the progress of this operation. + * @return 0 if the corpus was opened successfully, -1 in case of an error. + * @note It is safe to call this function in a thread that can be cancelled + * with a deferred cancellation request. + */ +int poliqarp_open_corpus(struct poliqarp_corpus *corpus, const char *name, + progress_t *progress, struct poliqarp_error *error); + +/** + * Closes a corpus and frees all resources associated with it. + * @param corpus The corpus to be closed. + * @return 0 if the corpus was closed successfully, -1 in case of an error. + * @note Think of this function as a hint to the library that a particular + * corpus should be closed. In particular, the library might as well leave + * it open if it so desires, or close it after a certain amount of time + * (for instance if many users seem to be using this corpus intensively). + */ +int poliqarp_close_corpus(struct poliqarp_corpus *corpus); + +/** + * Retrieves information about the corpus. + * @param corpus The corpus to extract information from. + * @param info The destination structure. + * @return 0 upon successful completion, -1 upon error. + */ +int poliqarp_get_corpus_info(const struct poliqarp_corpus *corpus, + struct poliqarp_corpus_info *info); + +/** + * Retrieves the tagset used by the corpus. + * @param corpus The corpus to extract tagset information from. + * @param info The destination structure. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_get_tagset_info(const struct poliqarp_corpus *corpus, + struct poliqarp_tagset_info *info); + +/** + * Frees all resources allocated by poliqarp_get_tagset_info(). + */ +void poliqarp_free_tagset_info(struct poliqarp_tagset_info *info); + +/* Query-related routines. */ + +/** + * Query flags. 
+ */ +#define POLIQARP_QFLAG_QUERY_I 1 /**< Query case-insensitive */ +#define POLIQARP_QFLAG_QUERY_X 2 /**< Query: not whole words */ +#define POLIQARP_QFLAG_META_I 4 /**< Metadata case-insensitive */ +#define POLIQARP_QFLAG_META_X 8 /**< Metadata: not whole words */ + +/** + * Analyzes query text and initializes a query structure. + * @param query The query structure to be initialized. + * @param text Text of the query. + * @param corpus The corpus that this query will be run on. + * @param flags Query flags, a combination of QFLAG_*. + * @param rewrite Name of query rewriting rules or NULL. + * @return 0 upon successful completion, -1 otherwise (e.g. + * in case of parse error). + */ +int poliqarp_create_query(struct poliqarp_query *query, const char *text, + struct poliqarp_corpus *corpus, int flags, const char *rewrite, + struct poliqarp_random_state *random_state, + struct poliqarp_error *error); + +/** + * Destroys a query object. + * @param query The query to be destroyed. + * @return 0 upon successful destroy, -1 if an error occurred. + */ +int poliqarp_destroy_query(struct poliqarp_query *query); + +/** + * Sends a message to the client, notifying it that several new + * results of a query have been found. This function does not belong to + * the corpus library, but is implemented in the daemon and declared here + * as a means for the search routine to communicate with the outside world. + * @param session The session parameter passed to poliqarp_produce. + */ +extern void async_notify_new_results(void *session); + +/** + * Executes a query on the corpus, producing at most a given number of + * results. This function can be called multiple times to increasingly + * produce more results. + * @param buffer The match buffer to store results in. + * @param count Maximum number of results to be produced. + * @param query The query to be executed. + * @param progress Structure indicating progress of the operation. 
+ * @param session The session to which send messages about new results. + * @param notify_step If this is non-zero, asynchronous messages are sent + * to the session specified by the former parameter each time this many + * new results are found. Otherwise, no messages are sent. + * @param max_match_length Maximum permissible match length, in segments. + * @return 0 upon successful completion, -1 on error. + * @note It is safe to call this function in a thread that can be cancelled + * with a deferred cancellation request. + */ +int poliqarp_produce(struct poliqarp_match_buffer *buffer, size_t count, + struct poliqarp_query *query, progress_t *progress, void *session, + size_t notify_step, size_t max_match_length); + +/* Match buffer operations. */ + +/** + * Match column. Designates one of the columns available in the match for + * sorting. + */ +enum poliqarp_column { + POLIQARP_COLUMN_LEFT_CONTEXT, /**< Selects left context for sorting. */ + POLIQARP_COLUMN_LEFT_MATCH, /**< Selects left match for sorting. */ + POLIQARP_COLUMN_MATCH, /**< Selects entire match for sorting. */ + POLIQARP_COLUMN_RIGHT_MATCH, /**< Selects right match for sorting. */ + POLIQARP_COLUMN_RIGHT_CONTEXT /**< Selects right context for sorting. */ +}; + +/** + * Sorting criteria. Required by sorting routine. + */ +struct poliqarp_sort_info { + enum poliqarp_column column; /**< What to sort by. */ + bool ascending; /**< True iff sorting in ascending order. */ + bool atergo; /**< True iff sorting a tergo. */ + size_t context; /**< Width of match context, in segments. + Used when sorting by context. */ +}; + +/** + * Public information about the match buffer. + */ +struct poliqarp_match_buffer_info { + size_t capacity; /**< Size of buffer. */ + size_t used; /**< Number of stored results. */ + size_t num_results; + /**< Number of results spotted during query execution. */ +}; + +/** + * Creates a match buffer. + * @param buffer The buffer structure to be initialized. 
+ * @param size Size of buffer. + * @return 0 upon successful creation, -1 otherwise. + */ +int poliqarp_create_match_buffer(struct poliqarp_match_buffer *buffer, + size_t size); + +/** + * Destroys a match buffer. + * @param buffer The buffer to be destroyed. + * @return 0 upon successful destruction, -1 otherwise. + */ +int poliqarp_destroy_match_buffer(struct poliqarp_match_buffer *buffer); + +/** + * Retrieves information about match buffer. + * @param buffer The buffer to extract information from. + * @param info Structure that will hold the result. + */ +int poliqarp_get_match_buffer_info(struct poliqarp_match_buffer *buffer, + struct poliqarp_match_buffer_info *info); + +/** + * Sorts match buffer according to the given criteria. + * @param buffer The buffer to sort results in. + * @param criteria Criteria of sorting. + * @param progress Structure indicating progress of the operation. + * @return 0 if sorting succeeded, -1 if it failed. + * @note This routine performs a stable sort, which makes it possible to sort + * the buffer using multiple criteria. To do that, it suffices to call this + * function several times, starting with least significant criteria and + * finishing with most significant ones. + * @note It is safe to call this function in a thread that can be cancelled + * with a deferred cancellation request. + */ +int poliqarp_sort_match_buffer(struct poliqarp_match_buffer *buffer, + const struct poliqarp_sort_info *criteria, progress_t *progress); + +/** + * Removes all matches from a match buffer. The size of the buffer remains + * unchanged. + * @param buffer The buffer to be cleared. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_forget(struct poliqarp_match_buffer *buffer); + +/** + * Resizes a match buffer, possibly dropping several matches. + * When the buffer gets enlarged, empty slots are added at the end. 
+ * When the buffer gets shrunk, the effect is twofold: first, empty + * elements (if any) are removed from the end of the buffer; if that is not + * sufficient, oldest matches are dropped to match the new size. + * @param buffer The buffer to be resized. + * @param size New size of the buffer. + * @return 0 if the buffer got successfully resized, -1 on failure. + */ +int poliqarp_resize_match_buffer(struct poliqarp_match_buffer *buffer, + size_t size); + +/* Match operations. */ + +/** + * Structure of a match is invariant to Poliqarp's design, so here it is -- + * basically a range with a point between its borders. + */ +struct poliqarp_match { + size_t start; /**< Offset of the first segment that belongs to this + match. */ + size_t end; /**< Offset of one-past-end segment in this match. */ + size_t focus; /**< Offset (relative to start of corpus) of focus point. */ + size_t document; /**< Document identifier associated with this match. */ +}; + +/** + * Retrieves a match from a match buffer. + * @param buffer The buffer to retrieve the match from. + * @param match The match to be retrieved. + * @param index Index of the match in buffer. + * @return 0 upon successful retrieval, -1 on error. + * @note Does not do range checking if NDEBUG is defined. + */ +int poliqarp_get_match(const struct poliqarp_match_buffer *buffer, + struct poliqarp_match *match, size_t index); + +/** + * Creates a match for the whole document. + * @param corpus Corpus that contains the document. + * @param document Index of the document (valid values for this argument are + * values of 'document' field from struct poliqarp_match). + * @param match The match to be created. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_get_match_for_document(const struct poliqarp_corpus *corpus, + size_t document, struct poliqarp_match *match); + +/* Segment operations. */ + +/** + * Public segment information. 
+ */ +struct poliqarp_segment_info { + const char *text; /**< Orthographic form of a segment. */ + bool space_before; /**< Is there a space before this segment? */ +}; + +/** + * Retrieves segment by index from a corpus. + * @param segment Where to store the retrieved segment. + * @param corpus The corpus to retrieve the segment from. + * @param index Index of the segment to be retrieved. + * @return 0 upon successful retrieval, -1 on error. + * @note Does not do range checking if NDEBUG is defined. + */ +int poliqarp_get_segment(struct poliqarp_segment *segment, + struct poliqarp_corpus *corpus, size_t index); + +/** + * Retrieves information about a segment. + * @param segment Segment to extract information from. + * @param info The destination structure. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_segment_info(const struct poliqarp_segment *segment, + struct poliqarp_segment_info *info); + +/* Interpretation set operations. */ + +/** + * Public information about interpretation set. + */ +struct poliqarp_interpretation_set_info { + size_t size; /**< Number of interpretations. */ +}; + +/** + * Retrieves set of disambiguated interpretations of a segment. + * @param segment The segment to extract the set from. + * @param set The destination structure. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_disambiguated_interpretations( + const struct poliqarp_segment *segment, + struct poliqarp_interpretation_set *set); + +/** + * Retrieves set of ambiguous interpretations of a segment. + * @param segment The segment to extract the set from. + * @param set The destination structure. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_ambiguous_interpretations( + const struct poliqarp_segment *segment, + struct poliqarp_interpretation_set *set); + +/** + * Retrieves information about a set of interpretations. + * @param set The set to extract the information from. 
+ * @param info The destination structure. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_interpretation_set_info( + const struct poliqarp_interpretation_set *set, + struct poliqarp_interpretation_set_info *info); + +/* Interpretation operations. */ + +/** + * Public information about an interpretation. + */ +struct poliqarp_interpretation_info { + const char *base; /**< Base form of the segment. */ + const char *tag; /**< Unparsed tag. */ +}; + +/** + * Retrieves an interpretation from a set. + * @param set The set to extract the interpretation from. + * @param interp The destination structure. + * @param index Index of the interpretation in a set. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_interpretation(const struct poliqarp_interpretation_set *set, + struct poliqarp_interpretation *interp, size_t index); + +/** + * Retrieves information about an interpretation. + * @param interp The interpretation to extract information from. + * @param info The destination structure. + * @return 0 upon successful retrieval, -1 on error. + */ +int poliqarp_get_interpretation_info( + const struct poliqarp_interpretation *interp, + struct poliqarp_interpretation_info *info); + +/* Alias operations. */ + +/** + * Public alias information. + */ +struct poliqarp_alias { + const char *name; + const char *value; +}; + +/** + * Public alias list information. + */ +struct poliqarp_alias_list { + struct poliqarp_alias *aliases; + size_t num_aliases; +}; + +/** + * Defines an alias for an open corpus. From now on, all occurrences of + * `name' when querying this corpus will be replaced by `value'. + * @param corpus Corpus to define alias for. + * @param name Name of the alias. + * @param value Value of the alias. + * @return 0 upon successful completion, -1 on error. 
+ */ +int poliqarp_define_alias(struct poliqarp_corpus *corpus, const char *name, + const char *value); + +/** + * Deletes an alias that is currently defined for this corpus. + * @param corpus Corpus to define alias for. + * @param name Name of the alias that is being deleted. + * @return 0 upon successful completion, -1 on error (e.g. no such alias + * exists). + */ +int poliqarp_delete_alias(struct poliqarp_corpus *corpus, const char *name); + +/** + * Retrieves the list of aliases available for an open corpus. + * @param corpus Corpus to retrieve aliases for. + * @param aliases Pointer to a structure that will contain the aliases. + * @return 0 on successful completion, -1 on error. + */ +int poliqarp_get_aliases(const struct poliqarp_corpus *corpus, + struct poliqarp_alias_list *aliases); + +/** + * Frees the memory allocated for the alias list by poliqarp_get_aliases(). + * @param aliases Structure to be released. + * @return 0 on successful completion, -1 on error. + */ +int poliqarp_free_aliases(struct poliqarp_alias_list *aliases); + +/* Metadata operations. */ + +/** + * A piece of metadata is a 'key-value' pair. The key can be an arbitrary + * string, whereas values come in two flavours: textual and date. This + * enum defines the type of metadata. + */ +enum poliqarp_metadata_type { + POLIQARP_META_TEXT, /**< Textual piece of metadata. */ + POLIQARP_META_DATE, /**< Date-holding piece of metadata. */ + POLIQARP_META_UNDEFINED /**< This information has not been defined + for this corpus. */ +}; + +/** + * One possible value type for a metadata can be a date. This is useful when + * specifying information such as date of creation, date of first publication, + * etc. + */ +struct poliqarp_date { + int year; /**< Year. */ + int month; /**< Month. */ + int day; /**< Day. */ +}; + +/** + * A binding of key name to type of metadata. + */ +struct poliqarp_metadata_type_binding { + char *key; /**< Name of the key. 
*/ + enum poliqarp_metadata_type type; /**< Type of values for this key. */ +}; + +/** + * The set of metadata types: an array of bindings of metadata keys to + * values. + */ +struct poliqarp_metadata_types { + struct poliqarp_metadata_type_binding *types; /**< The set proper. */ + size_t num_types; /**< Number of types. */ +}; + +/** + * Public information about a piece of metadata. + */ +struct poliqarp_metadata_info { + enum poliqarp_metadata_type type; /**< Type of this metadata. */ + const char *key; /**< Name of key. */ + union { + const char *text; /**< Textual value. */ + struct poliqarp_date date; /**< Date value. */ + } value; /**< Value union. */ +}; + +/** + * Retrieves the set of metadata for a given document. + * @param corpus Corpus that contains the metadata. + * @param document Index of the document (valid values for this argument are + * values of 'document' field from struct poliqarp_match). + * @param set The destination structure. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_get_metadata_set(const struct poliqarp_corpus *corpus, + size_t document, struct poliqarp_metadata_set *set); + +/** + * Retrieves the set of metadata types defined for a given corpus. + * @param corpus Corpus that contains the metadata. + * @param types The structure to contain the set. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_get_metadata_types(struct poliqarp_metadata_types *types, + const struct poliqarp_corpus *corpus); + +/** + * Frees the resources allocated by poliqarp_get_metadata_types. + * @param types The structure to be freed. + * @return 0 upon successful completion, -1 on error. + */ +int poliqarp_free_metadata_types(struct poliqarp_metadata_types *types); + +/** + * Returns number of pieces of metadata in a set. + * @param set The set of metadata. + */ +size_t poliqarp_metadata_count(const struct poliqarp_metadata_set *set); + +/** + * Returns a single piece of metadata from a set. 
+ * @param set The set of metadata. + * @param index Index of the piece of metadata to retrieve. + * @param meta The destination structure. + */ +int poliqarp_get_metadata(const struct poliqarp_metadata_set *set, + size_t index, struct poliqarp_metadata *meta); + +/** + * Retrieves information about a single piece of metadata. + * @param meta The metadata to query about. + * @param info The public information structure. + * @return 0 on successful retrieval, -1 on failure. + */ +int poliqarp_get_metadata_info(const struct poliqarp_metadata *meta, + struct poliqarp_metadata_info *info); + +/* + * Finally, include the definitions of private structures so that their sizes + * are known to the user. + */ +#include <sakura/poliqarp-private.h> + +#endif /* POLIQARP_H */ diff --git a/poliqarp-library/sakura/query-rewrite.c b/poliqarp-library/sakura/query-rewrite.c new file mode 100644 index 0000000000000000000000000000000000000000..661bf41aec232aefbe6161dea34ad0177e1b659d --- /dev/null +++ b/poliqarp-library/sakura/query-rewrite.c @@ -0,0 +1,206 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sakura/regexp.h>
+#include <sakura/query-rewrite.h>
+#include <foostring/foostring.h>
+
+/**
+ * A named rewrite: a singly linked list of rules. Rules are appended at the
+ * tail and tried from the head, so the first rule added has precedence.
+ */
+struct poliqarp_query_rewrite {
+   struct poliqarp_query_rewrite_rule *first; /**< Head of the rule list. */
+   struct poliqarp_query_rewrite_rule *last;  /**< Tail of the rule list. */
+};
+
+/**
+ * A single rewrite rule: a compiled pattern and the replacement template
+ * that is expanded when the pattern matches the query.
+ */
+struct poliqarp_query_rewrite_rule {
+   struct poliqarp_regexp pattern;           /**< Compiled pattern. */
+   char *replacement;                        /**< Owned copy of the template. */
+   struct poliqarp_query_rewrite_rule *next; /**< Next rule, or NULL. */
+};
+
+/**
+ * Initializes the table of named rewrites.
+ * @param this The table to initialize.
+ * @return The result of create_hash_table().
+ */
+int poliqarp_create_query_rewrite_table(
+   struct poliqarp_query_rewrite_table *this)
+{
+   return create_hash_table(&this->hash_table, 7, HASHTABLE_DUPLICATE_KEYS, NULL);
+}
+
+/**
+ * Allocates an empty rewrite (no rules).
+ * @return The new rewrite, or NULL if the allocation failed.
+ */
+static struct poliqarp_query_rewrite *poliqarp_create_query_rewrite(void)
+{
+   struct poliqarp_query_rewrite *this;
+   this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   this->first = this->last = NULL;
+   return this;
+}
+
+/**
+ * Allocates a rule, compiling the pattern and copying the replacement.
+ * @param pattern Regular expression source; must not be NULL.
+ * @param replacement Replacement template; must not be NULL.
+ * @return The new rule, or NULL on failure. errno is set to EINVAL when the
+ *         pattern is rejected and to ENOMEM when the replacement cannot be
+ *         copied.
+ */
+static struct poliqarp_query_rewrite_rule *poliqarp_create_query_rewrite_rule(
+   const char *pattern, const char *replacement)
+{
+   int rc;
+   struct poliqarp_query_rewrite_rule *this;
+   assert(pattern != NULL);
+   assert(replacement != NULL);
+   this = malloc(sizeof *this);
+   if (this == NULL)
+      return NULL;
+   rc = poliqarp_regexp_create(&this->pattern, pattern, 0, POLIQARP_REG_NO_ANCHORS);
+   if (rc != 0) {
+      free(this);
+      errno = EINVAL;
+      return NULL;
+   }
+   this->replacement = strdup(replacement);
+   if (this->replacement == NULL) {
+      /* A rule without a replacement would be dereferenced later by
+       * poliqarp_rewrite_query(), so fail here and undo the compilation. */
+      poliqarp_regexp_destroy(&this->pattern);
+      free(this);
+      errno = ENOMEM; /* the cleanup calls above may have clobbered errno */
+      return NULL;
+   }
+   this->next = NULL;
+   return this;
+}
+
+/**
+ * Releases a single rule.
+ * @param this The rule to release; must not be NULL.
+ * @return The rule that followed it in the list (possibly NULL), so that the
+ *         caller can keep iterating.
+ */
+static struct poliqarp_query_rewrite_rule *poliqarp_destroy_query_rewrite_rule(struct poliqarp_query_rewrite_rule *this)
+{
+   struct poliqarp_query_rewrite_rule *next;
+   assert(this != NULL);
+   poliqarp_regexp_destroy(&this->pattern);
+   free(this->replacement);
+   next = this->next;
+   free(this);
+   return next;
+}
+
+/**
+ * Releases a rewrite together with all of its rules. The parameter is
+ * untyped because the function is passed to destroy_hash_table() as the
+ * per-value callback.
+ * @param this The rewrite (struct poliqarp_query_rewrite *); must not be NULL.
+ */
+static void poliqarp_destroy_query_rewrite(void *this)
+{
+   struct poliqarp_query_rewrite_rule *rule;
+   struct poliqarp_query_rewrite *rewrite = this;
+   assert(rewrite != NULL);
+   rule = rewrite->first;
+   while (rule != NULL)
+      rule = poliqarp_destroy_query_rewrite_rule(rule);
+   free(this);
+}
+
+/**
+ * Destroys the table and every rewrite stored in it.
+ * @param this The table to destroy; must not be NULL.
+ */
+void poliqarp_destroy_query_rewrite_table(
+   struct poliqarp_query_rewrite_table *this)
+{
+   assert(this != NULL);
+   destroy_hash_table(&this->hash_table, poliqarp_destroy_query_rewrite);
+}
+
+/**
+ * Looks up a rewrite by name, optionally creating it.
+ * @param this The table; must not be NULL.
+ * @param name Name of the rewrite; NULL yields NULL.
+ * @param create Whether to create and register an empty rewrite when none
+ *        is registered under this name.
+ * @return The rewrite, or NULL if name is NULL, if nothing is registered and
+ *         create is false, or if a new rewrite could not be allocated.
+ */
+struct poliqarp_query_rewrite *poliqarp_get_query_rewrite(
+   struct poliqarp_query_rewrite_table *this, const char *name, bool create)
+{
+   struct poliqarp_query_rewrite *rewrite;
+   assert(this != NULL);
+   if (name == NULL)
+      return NULL;
+   rewrite = hash_table_get(&this->hash_table, name);
+   if (rewrite == NULL && create) {
+      rewrite = poliqarp_create_query_rewrite();
+      /* Never register a NULL value: the destroy callback asserts that
+       * every stored rewrite is non-NULL.
+       * NOTE(review): the result of hash_table_set() is still ignored, as
+       * before; its failure semantics are not visible from here. */
+      if (rewrite != NULL)
+         hash_table_set(&this->hash_table, name, rewrite);
+   }
+   return rewrite;
+}
+
+/**
+ * Appends a rule to the end of a rewrite's rule list.
+ * @param this The rewrite; must not be NULL.
+ * @param pattern Regular expression source; must not be NULL.
+ * @param replacement Replacement template; must not be NULL.
+ * @return 0 on success, -1 if the rule could not be created (errno is set by
+ *         the rule constructor when the pattern is rejected or the
+ *         replacement cannot be copied).
+ */
+int poliqarp_add_query_rewrite_rule(struct poliqarp_query_rewrite *this,
+   const char *pattern, const char *replacement)
+{
+   struct poliqarp_query_rewrite_rule *rule;
+   assert(this != NULL);
+   rule = poliqarp_create_query_rewrite_rule(pattern, replacement);
+   if (rule == NULL)
+      return -1;
+   if (this->last == NULL) {
+      assert(this->first == NULL);
+      this->first = rule;
+   }
+   else
+      this->last->next = rule;
+   this->last = rule;
+   return 0;
+}
+
+/**
+ * Rewrites a query with the first rule whose pattern matches it.
+ *
+ * The replacement template is copied verbatim except for these sequences:
+ *   $$        a literal '$';
+ *   $< and $> ".*" when xflags contains POLIQARP_REG_NO_ANCHORS, otherwise
+ *             nothing;
+ *   $i and $I "/i" when flags contains REG_ICASE, otherwise nothing;
+ *   $1 .. $9  the text of the corresponding submatch, with every '"', '\''
+ *             and '\\' preceded by a backslash; nothing if the group did
+ *             not participate in the match.
+ * Any other character after '$' is copied together with the '$'. A lone '$'
+ * at the very end of the template is copied as well.
+ *
+ * @param this The rewrite to apply; NULL yields NULL.
+ * @param query The query text to match and to take submatches from.
+ * @param flags Regular expression flags of the query (REG_ICASE is examined).
+ * @param xflags Extended flags (POLIQARP_REG_NO_ANCHORS is examined).
+ * @return The rewritten query as returned by string_free_and_get_buffer()
+ *         (presumably owned by the caller -- confirm in foostring), or NULL
+ *         if this is NULL or no rule matches.
+ */
+char *poliqarp_rewrite_query(struct poliqarp_query_rewrite *this,
+   const char *query, unsigned int flags, unsigned int xflags)
+{
+   struct poliqarp_query_rewrite_rule *rule;
+   if (this == NULL)
+      return NULL;
+   for (rule = this->first; rule != NULL; rule = rule->next) {
+      regmatch_t submatches[10];
+      if (poliqarp_regexp_match_ex(&rule->pattern, query, 10, submatches)) {
+         string_t buffer;
+         const char *replacement;
+         bool dollar = false; /* true when the previous character was '$' */
+         buffer = string_create();
+         for (replacement = rule->replacement; *replacement; replacement++) {
+            if (dollar) {
+               switch (*replacement) {
+                  case '$':
+                     string_append_char(buffer, '$');
+                     break;
+                  case '<': case '>':
+                     if ((xflags & POLIQARP_REG_NO_ANCHORS) != 0)
+                        string_append_str(buffer, ".*");
+                     break;
+                  case 'i': case 'I':
+                     if ((flags & REG_ICASE) != 0)
+                        string_append_str(buffer, "/i");
+                     break;
+                  case '1': case '2': case '3':
+                  case '4': case '5': case '6':
+                  case '7': case '8': case '9':
+                  {
+                     regmatch_t *submatch = submatches + (*replacement - '0');
+                     if (submatch->rm_so != -1) {
+                        regoff_t i;
+                        for (i = submatch->rm_so; i < submatch->rm_eo; i++) {
+                           const char ch = query[i];
+                           switch (ch) {
+                              case '"':
+                              case '\'':
+                              case '\\':
+                                 string_append_char(buffer, '\\');
+                                 /* fall through */
+                              default:
+                                 string_append_char(buffer, ch);
+                           }
+                        }
+                     }
+                     break;
+                  }
+                  default:
+                     string_append_char(buffer, '$');
+                     string_append_char(buffer, *replacement);
+                     break;
+               }
+               dollar = false;
+            } else {
+               if (*replacement == '$')
+                  dollar = true;
+               else
+                  string_append_char(buffer, *replacement);
+            }
+         }
+         /* A '$' that ends the template has nothing to escape; keep it
+          * literally, consistently with unknown "$x" sequences, instead of
+          * silently dropping it. */
+         if (dollar)
+            string_append_char(buffer, '$');
+         return string_free_and_get_buffer(buffer);
+      }
+   }
+   return NULL;
+}
diff --git a/poliqarp-library/sakura/query-rewrite.h b/poliqarp-library/sakura/query-rewrite.h
new file mode 100644
index 0000000000000000000000000000000000000000..059d200c97b8ebd1f2e2901ad7108f76ed111847
--- /dev/null
+++ b/poliqarp-library/sakura/query-rewrite.h
@@ -0,0 +1,52 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information).
Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+
+#ifndef POLIQARP_QUERY_REWRITE_H
+#define POLIQARP_QUERY_REWRITE_H
+
+/* NOTE(review): this header uses 'bool' but includes nothing that visibly
+ * declares it; presumably it arrives through hash-table.h -- confirm. */
+#include <sakura/common/hash-table.h>
+
+/**
+ * Table of named query rewrites. The implementation looks rewrites up in the
+ * embedded hash table by name and stores pointers to
+ * struct poliqarp_query_rewrite as values.
+ */
+struct poliqarp_query_rewrite_table {
+   struct hash_table hash_table; /**< Name -> rewrite mapping. */
+};
+
+/** A named rewrite (opaque here): an ordered list of rewrite rules. */
+struct poliqarp_query_rewrite;
+
+/** A single rewrite rule (opaque here): a pattern and a replacement. */
+struct poliqarp_query_rewrite_rule;
+
+/**
+ * Initializes a rewrite table.
+ * @return The result of the underlying create_hash_table() call.
+ */
+int poliqarp_create_query_rewrite_table(struct poliqarp_query_rewrite_table *);
+
+/**
+ * Destroys a rewrite table, releasing every rewrite stored in it together
+ * with its rules. The table pointer must not be NULL (asserted).
+ */
+void poliqarp_destroy_query_rewrite_table(
+   struct poliqarp_query_rewrite_table *);
+
+/**
+ * Looks up a rewrite by name.
+ * @param name Name of the rewrite; NULL yields NULL.
+ * @param create If true and nothing is registered under the name, an empty
+ *        rewrite is created and registered.
+ * @return The rewrite, or NULL when it does not exist (and create is false),
+ *         when name is NULL, or when a new rewrite could not be allocated.
+ */
+struct poliqarp_query_rewrite *poliqarp_get_query_rewrite(
+   struct poliqarp_query_rewrite_table *, const char *name, bool create);
+
+/**
+ * Compiles a pattern and appends the resulting rule at the end of the
+ * rewrite's rule list.
+ * @param pattern Regular expression source; must not be NULL (asserted).
+ * @param replacement Replacement template; must not be NULL (asserted).
+ * @return 0 on success, -1 if the rule could not be created (errno is
+ *         EINVAL when the pattern is rejected).
+ */
+int poliqarp_add_query_rewrite_rule(struct poliqarp_query_rewrite *,
+   const char *pattern, const char *replacement);
+
+/**
+ * Rewrites a query using the first rule, in insertion order, whose pattern
+ * matches it. In the replacement template "$$" stands for '$', "$<" and
+ * "$>" expand to ".*" when xflags contains POLIQARP_REG_NO_ANCHORS, "$i" and
+ * "$I" expand to "/i" when flags contains REG_ICASE, and "$1".."$9" expand
+ * to the corresponding submatch with quotes and backslashes
+ * backslash-escaped.
+ * @param this The rewrite to apply; NULL yields NULL.
+ * @param query The query text.
+ * @param flags Regular expression flags of the query.
+ * @param xflags Extended (Poliqarp-specific) regular expression flags.
+ * @return The rewritten query (presumably heap-allocated and owned by the
+ *         caller -- confirm in foostring), or NULL if no rule matches.
+ */
+char *poliqarp_rewrite_query(struct poliqarp_query_rewrite *this,
+   const char *query, unsigned int flags, unsigned int xflags);
+
+#endif
diff --git a/poliqarp-library/sakura/query.c b/poliqarp-library/sakura/query.c
new file mode 100644
index 0000000000000000000000000000000000000000..84135531c77800e839e8333998277028a03eb97d
--- /dev/null
+++ b/poliqarp-library/sakura/query.c
@@ -0,0 +1,1381 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ *
+ * This file may be distributed and/or modified under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation and appearing in the file gpl.txt included in the packaging
+ * of this file. (See http://www.gnu.org/licenses/translations.html for
+ * unofficial translations.)
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <errno.h> + +#include <sakura/random.h> +#include <sakura/query.h> +#include <sakura/lexer.h> +#include <sakura/common/bitstream.h> + +#define POLIQARP_UNINDEX_THRESHOLD 200 + +int yyparse(yyscan_t scanner, struct poliqarp_query *query); + +struct poliqarp_within *poliqarp_within_create_subdocument( + struct poliqarp_subdocument_set *set) +{ + struct poliqarp_within *result = malloc(sizeof(*result)); + result->type = WITHIN_SUBDOCUMENT; + result->as.subdocument = set; + return result; +} + +struct poliqarp_within *poliqarp_within_create_phrase( + struct poliqarp_expression *phrase) +{ + struct poliqarp_within *result = malloc(sizeof(*result)); + result->type = WITHIN_PHRASE; + result->as.phrase = phrase; + return result; +} + +int poliqarp_create_query(struct poliqarp_query *this, + const char *query_text, struct poliqarp_corpus *corpus, int flags, + const char *rewrite, struct poliqarp_random_state *random_state, + struct poliqarp_error *error) +{ + yyscan_t scanner; + size_t i; + int rc; + + assert(this != NULL); + assert(corpus != NULL); + assert(query_text != NULL); + + this->aliases = &(poliqarp_get_const_backend(corpus, config)->aliases); + bitset_arena_create_dummy(&this->area.arena); + graph_create(&this->graph, (set_compare_fn)poliqarp_expression_compare, + (set_free_fn)poliqarp_expression_destroy); + this->corpus = corpus; + this->error = error; + this->random_state = random_state; + + yylex_init(&scanner); + this->query_text = strdup(query_text); + yy_scan_string(this->query_text, scanner); + this->meta_expression = NULL; + 
this->eflags = 0; + this->flags = flags; + memset(this->variable_ranges, 0, sizeof this->variable_ranges); + for (i = 0; i < POLIQARP_MAX_VARIABLES; i++) + this->variable_types[i] = NULL; + this->rewrite = poliqarp_get_query_rewrite( + &corpus->config.query_rewrite_table, + rewrite, false); + this->rewrite_in_progress = false; + switch (yyparse(scanner, this)) { + case 0: + break; + case 2: + errno = ENOMEM; + poliqarp_error_from_system(this->error, NULL); + /* fall through */ + case 1: + rc = -1; + poliqarp_destroy_query(this); + goto cleanup; + default: + abort(); /* should not happen */ + } + this->have_last_context = false; + this->max_segment = 0; + this->progress = 0; + this->progress_helper = 0; + poliqarp_create_search_area(this); + + /* reset corpus backends */ + corpus->document.current = 0; + if (this->within && this->within->type == WITHIN_SUBDOCUMENT) + this->within->as.subdocument->current = 0; + if (corpus->syntax.syntax) + poliqarp_backend_syntax_reset(&corpus->syntax); + rc = 0; +cleanup: + yylex_destroy(scanner); + return rc; +} + +void poliqarp_query_set_meta_expression(struct poliqarp_query *this, + struct poliqarp_expression *exp) +{ + assert(this != NULL); + + if (this->meta_expression) + poliqarp_expression_destroy(this->meta_expression); + this->meta_expression = exp; +} + +void poliqarp_query_set_within(struct poliqarp_query *this, + struct poliqarp_within *within) +{ + assert(this != NULL); + + this->within = within; +} + +int poliqarp_destroy_query(struct poliqarp_query *this) +{ + assert(this != NULL); + + if (this->corpus) { + free(this->query_text); + graph_destroy(&this->graph); + if (this->meta_expression) + poliqarp_expression_destroy(this->meta_expression); + this->corpus = NULL; + bitset_arena_destroy(&this->area.arena); + } + return 0; +} + +enum xmode { cleanup, look, found, phrase_found, not_found, add, boundary, finish }; + +struct produce_helper { + size_t last_segment; + const struct poliqarp_searchable_area *area; + const 
struct dfs_node *root; + void *session; + int notify_step; + progress_t *progress; + size_t progress_helper; + bool document_fixup; +}; + +static inline uint64_t update_progress(struct poliqarp_query *this, + progress_t *progress, size_t pos, uint64_t helper) +{ + if (pos > this->max_segment) { + uint32_t num_segments = + poliqarp_backend_corpus_size(&this->corpus->corpus); + helper += (uint64_t)100 * (pos - this->max_segment); + this->max_segment = pos; + while (helper >= num_segments) { + helper -= num_segments; + progress_advance(progress, 1); + } + } + return helper; +} + +struct return_stack { + size_t size; + size_t pointer; + struct poliqarp_search_context *data; +}; + +static void return_stack_create(struct return_stack *stack) +{ + stack->size = 16; + stack->pointer = 0; + stack->data = malloc(stack->size * sizeof(stack->data[0])); +} + +static void return_stack_destroy(struct return_stack *stack) +{ + free(stack->data); +} + +static void return_stack_push(struct return_stack *stack, + const struct poliqarp_search_context *p) +{ + if (stack->pointer == stack->size) { + stack->size *= 2; + stack->data = realloc(stack->data, stack->size * sizeof(stack->data[0])); + } + stack->data[stack->pointer++] = *p; +} + +static void return_stack_push_link(struct return_stack *stack, + const struct poliqarp_search_context *p) +{ + struct poliqarp_search_context q = *p; + q.link = q.link->next; + return_stack_push(stack, &q); +} + +static void return_stack_push_phrase(struct return_stack *stack, + const struct poliqarp_search_context *p) +{ + struct poliqarp_search_context q = *p; + q.phrase++; + return_stack_push(stack, &q); +} + +static struct poliqarp_search_context return_stack_pop(struct return_stack *stack) +{ + return stack->data[--stack->pointer]; +} + +static bool return_stack_empty(const struct return_stack *stack) +{ + return (stack->pointer == 0); +} + +static void return_stack_wipe(struct return_stack *stack) +{ + stack->pointer = 0; +} + +static inline 
struct poliqarp_binary_segment get_segment( + struct poliqarp_backend_corpus *this, size_t index) +{ + if (index != file_reader_current(&this->corpus)) + poliqarp_backend_corpus_seek(this, index); + return poliqarp_backend_corpus_next(this); +} + +static bool phrase_expression_eval_single(const struct poliqarp_expression *expression, + struct poliqarp_corpus *corpus, const struct poliqarp_syntax_group *phrase) +{ + bool res = true; + struct poliqarp_binary_segment pos; + if (expression->as.phrase.synh) { + if (phrase->u.noncoord.synh != POLIQARP_SYNTAX_GROUP_UNKNOWN) { + pos = get_segment(&corpus->corpus, phrase->u.noncoord.synh); + res = poliqarp_expression_eval(expression->as.phrase.synh, corpus, &pos, NULL); + } else { + res = false; + } + } + if (!res) + return false; + if (expression->as.phrase.same) + return phrase->u.noncoord.synh == phrase->u.noncoord.semh; + if (expression->as.phrase.semh) { + if (phrase->u.noncoord.semh != POLIQARP_SYNTAX_GROUP_UNKNOWN) { + pos = get_segment(&corpus->corpus, phrase->u.noncoord.semh); + return poliqarp_expression_eval(expression->as.phrase.semh, corpus, &pos, NULL); + } else { + return false; + } + } else + return true; +} + +static bool phrase_expression_eval(const struct poliqarp_expression *expression, + struct poliqarp_corpus *corpus, size_t i) +{ + bool result = false; + struct poliqarp_syntax_group *groups = corpus->syntax.groups; + switch (expression->type) { + case POLIQARP_EXPRESSION_CONSTANT: + return expression->as.constant; + case POLIQARP_EXPRESSION_VALUE: + return BIT_ARRAY_GET(((struct poliqarp_value *)(expression->as.value.value))->bits, + groups[i].type); + case POLIQARP_EXPRESSION_AND: + return expression->as.expression.negate ^ + (phrase_expression_eval(expression->as.expression.left, corpus, i) && + phrase_expression_eval(expression->as.expression.right, corpus, i)); + case POLIQARP_EXPRESSION_OR: + return expression->as.expression.negate ^ + (phrase_expression_eval(expression->as.expression.left, 
corpus, i) || + phrase_expression_eval(expression->as.expression.right, corpus, i)); + case POLIQARP_EXPRESSION_PHRASE: + if (groups[i].type == POLIQARP_SYNTAX_GROUP_COORD) { + int len = groups[i].u.coord.length; + result = expression->as.phrase.negate ^ expression->as.phrase.all; + while (--len) { + if (++i == corpus->syntax.size) + i = 0; + if (groups[i].type == POLIQARP_SYNTAX_GROUP_COORD || groups[i].type == POLIQARP_SYNTAX_GROUP_CONJUNCTION) + continue; + if (expression->as.phrase.all) { + if (!phrase_expression_eval_single(expression, corpus, groups + i)) { + result = !result; + break; + } + } else { + if (phrase_expression_eval_single(expression, corpus, groups + i)) { + result = !result; + break; + } + } + } + } else { + result = expression->as.phrase.negate ^ + phrase_expression_eval_single(expression, corpus, groups + i); + } + return result; + case POLIQARP_EXPRESSION_VARIABLE: + return phrase_expression_eval(expression->as.variable.children[0], + corpus, i); + /* FIXME */ + case POLIQARP_EXPRESSION_INVALID: + abort(); /* Should not happen. 
*/ + } + return false; /* muffle potential warnings */ +} + +static size_t find_phrase_satisfying_expression(struct poliqarp_corpus *corpus, + uint32_t offset, size_t phrase, const struct poliqarp_expression *expr) +{ + struct poliqarp_backend_syntax *this = &corpus->syntax; + if (phrase == (size_t)-1 || offset / 1024 > this->groups[phrase].to / 1024) { + this->start = this->end = phrase = 0; + this->pos = en4(*((uint32_t *)tinydb_fetch_item(&this->offsets, offset / 1024))); + } + while (phrase == this->end || this->groups[phrase].to < offset || + !phrase_expression_eval(expr, corpus, phrase)) + { + if (phrase == this->end) { + if (poliqarp_backend_syntax_next(this) == -1) + return -1; + this->start = phrase; + } else { + if (++phrase == this->size) + phrase = 0; + } + } + return phrase; +} + +static int check_boundaries(struct poliqarp_search_context *ctx, + struct poliqarp_query *this) +{ + int check_index = (ctx->index % this->area.granularity == 0); + for (;;) { + if (check_index) { + size_t i = ctx->index / this->area.granularity; + while (i < this->area.num_bits && + !bitset_arena_get(&this->area.arena, this->area.area, i)) + { + i++; + } + if (ctx->index < i * this->area.granularity) + ctx->index = i * this->area.granularity; + if (ctx->index >= poliqarp_backend_corpus_size(&this->corpus->corpus)) + return -1; + } + if (ctx->index >= ctx->document.corpus_high) { + /* try to read next document */ + if (poliqarp_backend_document_next(&this->corpus->document, &ctx->document) == -1) + return -1; + if (ctx->index < ctx->document.corpus_low || ctx->index >= ctx->document.corpus_high) { + poliqarp_backend_document_search(&this->corpus->document, ctx->index); + if (poliqarp_backend_document_next(&this->corpus->document, &ctx->document) == -1) + return -1; + } + while (this->meta_expression && + poliqarp_expression_eval(this->meta_expression, this->corpus, &ctx->document, NULL) == false) + { + if (poliqarp_backend_document_next(&this->corpus->document, 
&ctx->document) == -1) + return -1; + } + if (ctx->index < ctx->document.corpus_low) + ctx->index = ctx->document.corpus_low; + check_index = 1; + continue; + } + if (this->within && this->within->type == WITHIN_SUBDOCUMENT && + ctx->index >= ctx->subdocument.corpus_high) + { + /* try to read next subdocument */ + if (poliqarp_subdocument_next(this->within->as.subdocument, &ctx->subdocument) == -1) + return -1; + if (ctx->index >= ctx->subdocument.corpus_high) { + poliqarp_subdocument_search(this->within->as.subdocument, ctx->index); + if (poliqarp_subdocument_next(this->within->as.subdocument, &ctx->subdocument) == -1) + return -1; + } + if (ctx->index < ctx->subdocument.corpus_low) + ctx->index = ctx->subdocument.corpus_low; + check_index = 1; + continue; + } + if (this->within && this->within->type == WITHIN_PHRASE) { + /* This function is called in the circumstances where we can safely + reset the phrase list. We want to scroll to the next phrase that + satisfies the current `within phrase' expression. 
*/ + struct poliqarp_backend_syntax *syntax = &this->corpus->syntax; + ctx->within_phrase = find_phrase_satisfying_expression(this->corpus, ctx->index, + ctx->within_phrase, this->within->as.phrase); + ctx->phrase = syntax->start; + if (ctx->within_phrase == (size_t)-1) + return -1; + if (ctx->index < syntax->groups[ctx->within_phrase].from) { + ctx->index = syntax->groups[ctx->within_phrase].from; + check_index = 1; + continue; + } + } + return 0; + } +} + +static bool end_of_within(struct poliqarp_search_context *ctx, + struct poliqarp_query *this) +{ + return + ((ctx->index == ctx->document.corpus_high) || + (this->within && this->within->type == WITHIN_SUBDOCUMENT && + ctx->index == ctx->subdocument.corpus_high) || + (this->within && this->within->type == WITHIN_PHRASE && + ctx->index == this->corpus->syntax.groups[ctx->within_phrase].to + 1)); +} + +static void poliqarp_produce_cleanup(void *data) +{ + struct return_stack *stack = data; + return_stack_destroy(stack); +} + +static bool poliqarp_reset_variable_bindings(struct poliqarp_query *this, + size_t *variable_bindings) +{ + size_t i; + for (i = 0; i < POLIQARP_MAX_VARIABLES; i++) + variable_bindings[i] = 0; + return true; +} + +static bool poliqarp_next_variable_bindings(struct poliqarp_query *this, + size_t *variable_bindings) +{ + size_t i; + for (i = 0; i < POLIQARP_MAX_VARIABLES; i++) + { + if (variable_bindings[i] > 0) + { + variable_bindings[i]--; + break; + } + else + { + size_t r = this->variable_ranges[i]; + variable_bindings[i] = (r > 0) ? 
(r - 1) : 0; + } + } + return i < POLIQARP_MAX_VARIABLES; +} + +static inline void poliqarp_swap_query_result( + struct poliqarp_match_buffer *this, size_t i) +{ + pthread_mutex_lock(&this->mutex); + assert(i < this->used); + struct poliqarp_match temp = this->match[i]; + this->match[i] = this->match[this->used - 1]; + this->match[this->used - 1] = temp; + pthread_mutex_unlock(&this->mutex); +} + +static inline void poliqarp_replace_query_result( + struct poliqarp_match_buffer *this, size_t i, + const struct poliqarp_match *match) +{ + pthread_mutex_lock(&this->mutex); + assert(i < this->used); + this->match[i] = *match; + pthread_mutex_unlock(&this->mutex); +} + +int poliqarp_produce(struct poliqarp_match_buffer *result, size_t max, + struct poliqarp_query *this, progress_t *progress, void *session, + size_t notify_step, size_t max_match_length) +{ + struct poliqarp_corpus *corpus = this->corpus; + size_t corpus_size; + enum xmode next_mode, mode; + struct poliqarp_search_context ctx; + struct return_stack stack; + struct poliqarp_binary_segment pos; + struct poliqarp_match match; + uint64_t progress_helper = 0; +#ifndef POLIQARP_SINGLE_THREADED + int cancel_state; +#endif + size_t variable_bindings[POLIQARP_MAX_VARIABLES]; + +#ifndef POLIQARP_SINGLE_THREADED + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state); +#endif + corpus_size = poliqarp_backend_corpus_size(&corpus->corpus); + pthread_mutex_lock(&progress->mutex); + result->corpus = corpus; + pthread_mutex_unlock(&progress->mutex); + return_stack_create(&stack); + if (this->have_last_context) + ctx = this->last_context; + else { + progress_reset(progress); + ctx.m_start = (size_t)-1; + ctx.phrase = 0; + ctx.document.corpus_high = ctx.subdocument.corpus_high = 0; + ctx.within_phrase = (size_t)-1; + } + + if (this->eflags & POLIQARP_QEFLAG_HAS_VARIABLES) + { + poliqarp_reset_variable_bindings(this, variable_bindings); + ctx.index = -1; + } + +#ifndef POLIQARP_SINGLE_THREADED + 
pthread_cleanup_push(poliqarp_produce_cleanup, &stack); +#endif + next_mode = cleanup; /* just to shut up compilers */ + for (mode = cleanup; ; mode = next_mode) { + switch (mode) { + case cleanup: + if (return_stack_empty(&stack)) { + ctx.node = this->graph.dfs.root; + ctx.link = NULL; + if (this->eflags & POLIQARP_QEFLAG_HAS_VARIABLES) + { + if (ctx.m_start != (size_t)-1) + ctx.index = ctx.m_start; + if (!poliqarp_next_variable_bindings(this, variable_bindings)) + ctx.index++; + } + else + ctx.index = ctx.m_start + 1; + ctx.m_start = ctx.m_end = ctx.m_focus = ctx.c_focus = (size_t)-1; + } else { + ctx = return_stack_pop(&stack); + next_mode = look; /* skip checking boundaries if backtracking */ + break; + } + /* fall through */ + + case boundary: + if (check_boundaries(&ctx, this) == -1 || ctx.index >= corpus_size) { + next_mode = finish; + break; + } + /* fall through */ + + case look: +#ifndef POLIQARP_SINGLE_THREADED + pthread_setcancelstate(cancel_state, NULL); + pthread_testcancel(); /* cancel opportunity */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); +#endif + next_mode = not_found; + if (ctx.link == NULL) + ctx.link = ctx.node->first_link; + pos = get_segment(&corpus->corpus, ctx.index); + while (ctx.link) { + if (ctx.m_start != (size_t)-1 && ctx.index - ctx.m_start >= max_match_length) + { + /* skip */ + } else if (ctx.link->symbol != SYMBOL_DOT && poliqarp_expression_type( + (const struct poliqarp_expression *)ctx.link->symbol) == POLIQARP_EXPRESSION_PHRASE) + { + if (!corpus->syntax.syntax) + break; + for (;;) { + if (ctx.phrase == corpus->syntax.end) { + if (poliqarp_backend_syntax_next(&corpus->syntax) == -1) + goto look_breakout; + } + if (corpus->syntax.groups[ctx.phrase].from < ctx.index) { + ctx.phrase++; + continue; + } else if (corpus->syntax.groups[ctx.phrase].from == ctx.index) { + if (phrase_expression_eval((const struct poliqarp_expression *)ctx.link->symbol, + corpus, ctx.phrase)) + { + next_mode = phrase_found; + goto 
look_breakout; + } else + ctx.phrase++; + } else + break; + } + } else if (ctx.link->symbol == SYMBOL_DOT || + (ctx.index < corpus_size && + poliqarp_expression_eval(ctx.link->symbol, corpus, &pos, variable_bindings))) + { + if (ctx.link->next) + return_stack_push_link(&stack, &ctx); + next_mode = found; + if (ctx.link->flags) + ctx.c_focus = ctx.index; + break; + } + ctx.link = ctx.link->next; + } +look_breakout: + if (ctx.node->flags.is_final) { + ctx.m_focus = ctx.c_focus; /* promote focus candidate */ + ctx.m_end = ctx.index; + } + break; + + case found: + ctx.node = ctx.link->to; /* follow link */ + ctx.link = NULL; + if (ctx.m_start == (size_t)-1) /* remember first item */ + ctx.m_start = ctx.index; + ctx.index++; + if (end_of_within(&ctx, this)) { + if (ctx.node->flags.is_final) + ctx.m_end = ctx.index; + next_mode = ctx.m_end != (size_t)-1 ? add : cleanup; + } else + next_mode = look; + progress_helper = update_progress(this, progress, ctx.index, + progress_helper); + break; + + case phrase_found: + /* FIXME: check whether this is not the last phrase */ + return_stack_push_phrase(&stack, &ctx); + ctx.node = ctx.link->to; /* follow link */ + ctx.link = NULL; + if (ctx.m_start == (size_t)-1) /* remember first item */ + ctx.m_start = ctx.index; + ctx.index = corpus->syntax.groups[ctx.phrase].to + 1; + if (end_of_within(&ctx, this)) { + if (ctx.node->flags.is_final) + ctx.m_end = ctx.index; + next_mode = ctx.m_end != (size_t)-1 ? 
add : cleanup; + } else + next_mode = look; + progress_helper = update_progress(this, progress, ctx.index, + progress_helper); + break; + + case not_found: + if (ctx.m_start != (size_t)-1) { + if (ctx.m_end == (size_t)-1 || ctx.m_start == ctx.m_end) + next_mode = cleanup; + else + next_mode = add; + /* note: we don't advance here */ + } else if (this->eflags & POLIQARP_QEFLAG_HAS_VARIABLES) { + next_mode = cleanup; + } else { + ctx.index++; + if ((ctx.index & 0xffff) == 0) { + /* From time to time, update the progress even if there are + * no matches. */ + progress_helper = update_progress(this, progress, ctx.index, + progress_helper); + } + next_mode = boundary; + } + break; + + case add: + match.start = ctx.m_start; + assert(ctx.m_end != (size_t)-1); + match.end = ctx.m_end; + match.focus = ctx.m_focus == (size_t)-1 ? ctx.m_start : ctx.m_focus; + match.document = corpus->document.current - 1; + pthread_mutex_lock(&result->mutex); + result->num_results++; + pthread_mutex_unlock(&result->mutex); + if (result->used < result->capacity) { + poliqarp_append_query_result(result, &match); + if (this->random_state) { + size_t i = poliqarp_random(this->random_state) % result->num_results; + poliqarp_swap_query_result(result, i); + } else { + if (notify_step > 0 && result->num_results % notify_step == 0) + async_notify_new_results(session); + if (result->num_results == max) + goto out; + } + } else { + if (this->random_state) { + size_t i = poliqarp_random(this->random_state) % result->num_results; + if (i < result->used) + poliqarp_replace_query_result(result, i, &match); + } else { + goto out; + } + } + + return_stack_wipe(&stack); + /* Comment out the following line for (slower) support of subresults, + i.e. results that are part of other results. 
*/ + ctx.m_start = ctx.m_end - 1; + next_mode = cleanup; + break; + + case finish: + goto out; + } + } + +out: + if (result->used > max) + result->used = max; + this->have_last_context = true; + this->last_context = ctx; +#ifndef POLIQARP_SINGLE_THREADED + pthread_cleanup_pop(1); + pthread_setcancelstate(cancel_state, NULL); +#else + poliqarp_produce_cleanup(&stack); +#endif + return 0; +} + +int poliqarp_create_match_buffer(struct poliqarp_match_buffer *this, + size_t capacity) +{ + assert(this != NULL); + + this->match = malloc(sizeof *this->match * capacity); + this->used = 0; + this->num_results = 0; + this->capacity = capacity; + this->sort_arena = NULL; + pthread_mutex_init(&this->mutex, NULL); + return 0; +} + +int poliqarp_destroy_match_buffer(struct poliqarp_match_buffer *this) +{ + assert(this != NULL); + free(this->match); + if (this->sort_arena) + marena_destroy(this->sort_arena); + pthread_mutex_destroy(&this->mutex); + return 0; +} + +int poliqarp_get_match_buffer_info(struct poliqarp_match_buffer *buffer, + struct poliqarp_match_buffer_info *info) +{ + pthread_mutex_lock(&buffer->mutex); + info->capacity = buffer->capacity; + info->used = buffer->used; + info->num_results = buffer->num_results; + pthread_mutex_unlock(&buffer->mutex); + return 0; +} + +int poliqarp_forget(struct poliqarp_match_buffer *buffer) +{ + buffer->used = 0; + buffer->num_results = 0; + return 0; +} + +int poliqarp_resize_match_buffer(struct poliqarp_match_buffer *buffer, + size_t size) +{ + if (size < buffer->used) { + memmove(buffer->match, buffer->match + buffer->used - size, + size * sizeof *buffer->match); + buffer->used = size; + } + buffer->match = realloc(buffer->match, size * sizeof *buffer->match); + buffer->capacity = size; + return 0; +} + +struct sort_helper_descriptor { + uint32_t *segments; + size_t length; +}; + +static inline int sort_cmp(struct sort_helper_descriptor *hdesc, uint32_t x, + uint32_t y, const struct poliqarp_sort_info *criteria, + struct 
poliqarp_corpus *corpus) +{ + int res = 0; + size_t xl = 0, yl = 0; + if (criteria->atergo) { + if (hdesc[x].length && hdesc[y].length) { + xl = hdesc[x].length - 1; + yl = hdesc[y].length - 1; + while (xl && yl) { + uint32_t xo = hdesc[x].segments[xl], yo = hdesc[y].segments[yl]; + size_t i1, i2; + i1 = poliqarp_backend_orth_atergo_fetch(poliqarp_get_const_backend( + corpus, orth), xo); + i2 = poliqarp_backend_orth_atergo_fetch(poliqarp_get_const_backend( + corpus, orth), yo); + res = i1 == i2 ? 0 : (i1 < i2 ? -1 : +1); + if (res) + break; + xl--; yl--; + } + } else { + xl = hdesc[x].length; + yl = hdesc[y].length; + } + } else { + while (xl < hdesc[x].length && yl < hdesc[y].length) { + uint32_t xo = hdesc[x].segments[xl], yo = hdesc[y].segments[yl]; + size_t i1, i2; + i1 = poliqarp_backend_orth_afronte_fetch(poliqarp_get_const_backend( + corpus, orth), xo); + i2 = poliqarp_backend_orth_afronte_fetch(poliqarp_get_const_backend( + corpus, orth), yo); + res = i1 == i2 ? 0 : (i1 < i2 ? -1 : +1); + if (res) + break; + xl++; yl++; + } + xl = hdesc[x].length - xl; + yl = hdesc[y].length - yl; + } + res = res ? res : xl ? 1 : yl ? 
-1 : 0; + if (!criteria->ascending) + res = -res; + return (int) res; +} + +static void sort_helper(struct sort_helper_descriptor *hdesc, uint32_t *arr, + size_t start, size_t end, uint32_t *scratch, + const struct poliqarp_sort_info *criteria, struct poliqarp_corpus *corpus, + progress_t *progress) +{ + size_t i = 0, length = end - start, mid = length / 2, l = start, r = l + mid; + if (start + 1 >= end) + return; + sort_helper(hdesc, arr, start, start + mid, scratch, criteria, corpus, + progress); + sort_helper(hdesc, arr, start + mid, end, scratch, criteria, corpus, + progress); + for (i = 0; i < length; i++) { + if (l < start + mid && + (r == end || sort_cmp(hdesc, arr[l], arr[r], criteria, corpus) <= 0)) + { + scratch[i] = arr[l++]; + } else { + scratch[i] = arr[r++]; + } + } + memcpy(arr + start, scratch, length * sizeof(*arr)); +#ifndef POLIQARP_SINGLE_THREADED + pthread_testcancel(); +#endif +} + +struct read_serializer { + size_t left; + size_t right; + size_t pos; +}; + +static int compare_serializers(const void *x, const void *y) +{ + const struct read_serializer *xx = (const struct read_serializer *)x, + *yy = (const struct read_serializer *)y; + return (long)xx->left - (long)yy->left; +} + +static struct sort_helper_descriptor *sort_prepare_helper( + struct poliqarp_match_buffer *buffer, + const struct poliqarp_sort_info *criteria) +{ + struct sort_helper_descriptor *result; + size_t i; + struct read_serializer *ser; + + if (buffer->sort_arena == NULL) { + buffer->sort_arena = malloc(sizeof(*(buffer->sort_arena))); + if (buffer->sort_arena == NULL) + return NULL; + marena_create(buffer->sort_arena); + } + result = malloc(buffer->used * sizeof(*result)); + if (result == NULL) + return NULL; + ser = malloc(buffer->used * sizeof(*ser)); + if (ser == NULL) { + free(result); + return NULL; + } + for (i = 0; i < buffer->used; i++) { + size_t left = 0, right = 0; + size_t segnum; + switch (criteria->column) { + case POLIQARP_COLUMN_LEFT_CONTEXT: + left = 
buffer->match[i].start < criteria->context ? 0 : + buffer->match[i].start - criteria->context; + right = buffer->match[i].start; + break; + case POLIQARP_COLUMN_LEFT_MATCH: + left = buffer->match[i].start; + right = buffer->match[i].focus; + break; + case POLIQARP_COLUMN_MATCH: + left = buffer->match[i].start; + right = buffer->match[i].end; + break; + case POLIQARP_COLUMN_RIGHT_MATCH: + left = buffer->match[i].focus; + right = buffer->match[i].end; + break; + case POLIQARP_COLUMN_RIGHT_CONTEXT: + left = buffer->match[i].end; + right = buffer->match[i].end + criteria->context; + segnum = poliqarp_backend_corpus_size(&buffer->corpus->corpus); + if (right > segnum) + right = segnum; + break; + default: + abort(); /* Should not happen. */ + } + ser[i].left = left; + ser[i].right = right; + ser[i].pos = i; + } + qsort(ser, buffer->used, sizeof(*ser), compare_serializers); + for (i = 0; i < buffer->used; i++) + { + size_t k = ser[i].pos, left = ser[i].left, right = ser[i].right; + result[k].length = right - left; + if (result[k].length > 0) { + size_t j; + result[k].segments = marena_alloc(buffer->sort_arena, + sizeof(uint32_t) * result[k].length); + if (result[k].segments == NULL) { + free(result); + return NULL; + } + for (j = 0; j < result[k].length; j++) { + struct poliqarp_binary_segment s = poliqarp_backend_corpus_get( + &buffer->corpus->corpus, left + j); + result[k].segments[j] = s.orth_space_id >> 1; + } + } else { + result[k].segments = NULL; + } + } + free(ser); + return result; +} + +struct poliqarp_sort_sandbox { + uint32_t *scratch; + uint32_t *array; + struct sort_helper_descriptor *helper_descriptor; + struct poliqarp_match_buffer *buffer; +}; + +static void poliqarp_sort_match_buffer_cleanup(void *data) +{ + struct poliqarp_sort_sandbox *cleanup_data = data; + free(cleanup_data->scratch); + free(cleanup_data->array); + free(cleanup_data->helper_descriptor); + if (cleanup_data->buffer->sort_arena != NULL) + 
marena_release(cleanup_data->buffer->sort_arena);
+}
+
+/*
+ * Main sorting routine -- implements mergesort.
+ *
+ * Note: DO NOT use qsort(), because:
+ * (a) this might not be portable; on some systems we might have to provide
+ *     a replacement anyway
+ * (b) it imposes a performance impact because of calls to comparator
+ * (c) it is not stable by default -- it CAN be made stable, but that means
+ *     performance costs
+ * (d) it does not support progress reporting
+ *
+ * Returns 0 on success (including the trivial case of fewer than two
+ * matches) and -1 when any of the temporary allocations fails; in the latter
+ * case the buffer is left in its original order.
+ *
+ * Unless POLIQARP_SINGLE_THREADED is defined, thread cancellation is disabled
+ * while the temporaries are set up and while the buffer is rearranged; it is
+ * restored to the caller's state only around the merge sort itself.
+ */
+int poliqarp_sort_match_buffer(struct poliqarp_match_buffer *buffer,
+   const struct poliqarp_sort_info *criteria, progress_t *progress)
+{
+   struct poliqarp_sort_sandbox sandbox = { NULL, NULL, NULL, NULL };
+   uint32_t i;
+   int res = 0;
+#ifndef POLIQARP_SINGLE_THREADED
+   int cancel_state;
+#endif
+
+   if (buffer->used < 2)
+      return 0;
+
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel_state);
+   pthread_cleanup_push(poliqarp_sort_match_buffer_cleanup, &sandbox);
+#endif
+   sandbox.buffer = buffer;
+   if ((sandbox.helper_descriptor = sort_prepare_helper(buffer, criteria)) == NULL)
+   {
+      /* sort_prepare_helper() only fails on allocation failure; report it
+       * like the other allocation failures below instead of returning 0 */
+      res = -1;
+      goto err;
+   }
+   sandbox.scratch = malloc(buffer->used * sizeof(*sandbox.scratch));
+   if (sandbox.scratch == NULL)
+   {
+      res = -1;
+      goto err;
+   }
+   sandbox.array = malloc(buffer->used * sizeof(*sandbox.array));
+   if (sandbox.array == NULL)
+   {
+      res = -1;
+      goto err;
+   }
+   /* start from the identity permutation */
+   for (i = 0; i < buffer->used; i++)
+      sandbox.array[i] = i;
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_setcancelstate(cancel_state, NULL);
+#endif
+   sort_helper(sandbox.helper_descriptor, sandbox.array, 0, buffer->used,
+      sandbox.scratch, criteria, buffer->corpus, progress);
+#ifndef POLIQARP_SINGLE_THREADED
+   pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+#endif
+
+   /* rearrange buffer */
+   /* scratch becomes the inverse of the sorted permutation held in array */
+   for (i = 0; i < buffer->used; i++)
+      sandbox.scratch[sandbox.array[i]] = i;
+   for (i = 0; i < buffer->used; i++) {
+      struct poliqarp_match m = buffer->match[i];
+      buffer->match[i] = buffer->match[sandbox.array[i]];
+
buffer->match[sandbox.array[i]] = m;
+      sandbox.array[sandbox.scratch[i]] = sandbox.array[i]; /* the old match i now sits at array[i] */
+      sandbox.scratch[sandbox.array[i]] = sandbox.scratch[i]; /* keep scratch the inverse of array */
+   };
+err:
+#ifndef POLIQARP_SINGLE_THREADED
+   ; /* just to shut up compilers */
+   pthread_cleanup_pop(1); /* non-zero argument: run the cleanup handler now */
+   pthread_setcancelstate(cancel_state, NULL);
+#else
+   poliqarp_sort_match_buffer_cleanup(&sandbox);
+#endif
+   return res;
+}
+/*
+ * Copy the match stored at position index of buffer into *match.
+ *
+ * Returns 0 on success.  Unless NDEBUG is defined, -1 is returned (and *match
+ * is left untouched) when index is not below buffer->used; with NDEBUG
+ * defined the index is not checked at all.
+ */
+int poliqarp_get_match(const struct poliqarp_match_buffer *buffer,
+   struct poliqarp_match *match, size_t index)
+{
+#ifndef NDEBUG
+   if (index >= buffer->used)
+      return -1;
+#endif
+   *match = buffer->match[index];
+   return 0;
+}
+/*
+ * Fill *match so that it spans the whole document number id of corpus:
+ * start and focus are set to the document's corpus_low, end to its
+ * corpus_high, and document to id.
+ *
+ * Returns 0 on success, or -1 (leaving *match untouched) when
+ * poliqarp_backend_document_fetch() returns -1.
+ */
+int poliqarp_get_match_for_document(const struct poliqarp_corpus *corpus,
+   size_t id, struct poliqarp_match *match)
+{
+   struct poliqarp_document document;
+   if (poliqarp_backend_document_fetch(&corpus->document, id, &document) == -1)
+      return -1;
+   match->start = match->focus = document.corpus_low;
+   match->end = document.corpus_high;
+   match->document = id;
+   return 0;
+}
+/*
+ * Append a copy of *match to the result buffer while holding this->mutex.
+ * The caller must make sure there is room: a full buffer is only caught by
+ * the assert().
+ */
+void poliqarp_append_query_result(struct poliqarp_match_buffer *this,
+   const struct poliqarp_match *match)
+{
+   pthread_mutex_lock(&this->mutex);
+   assert(this->used < this->capacity);
+   this->match[this->used++] = *match;
+   pthread_mutex_unlock(&this->mutex);
+}
+
+/*
+ * Decode the reverse-index entry of item `what` and mark in bs the chunks in
+ * which a match could start if that item occurs `distance` segments into it.
+ *
+ * The entry is read as a count (decode_gamma) followed by that many values
+ * (decode_golomb): the first value minus one is the first position, every
+ * later value is added to the previous position.  For each position pos the
+ * bits pos - dmin and pos - dmax are set when they are not negative, where
+ * dmin and dmax are distance divided by area->granularity, rounded down and
+ * up respectively.
+ * NOTE(review): positions look like chunk numbers (segment offset divided by
+ * granularity) -- confirm against the index writer.
+ */
+static void poliqarp_unindex_item(bitset bs, struct poliqarp_searchable_area *area,
+   size_t distance, size_t what, struct poliqarp_rindex *index)
+{
+   uint32_t freq, i, b, pos = 0;
+   size_t dmin = distance / area->granularity,
+      dmax = (distance + area->granularity - 1) / area->granularity;
+   poliqarp_rindex_set(index, what);
+   freq = decode_gamma(&index->ibs);
+   b = get_golomb_parameter(area->num_bits, freq);
+   for (i = 0; i < freq; i++) {
+      uint32_t x = decode_golomb(&index->ibs, b);
+      if (i == 0)
+         pos = x - 1; /* first value: position plus one */
+      else
+         pos += x; /* later values: gap to the previous position */
+      if (pos >= dmin)
+         bitset_arena_set(&area->arena, bs, pos - dmin);
+      if (pos >= dmax)
+         bitset_arena_set(&area->arena, bs, pos - dmax);
+   }
+}
+
+static bitset
poliqarp_unindex_value(struct poliqarp_value *value,
+   struct poliqarp_searchable_area *area, size_t distance, int indices,
+   const struct poliqarp_backend_index *index)
+{  /* Collect, in a bitset newly allocated from area->arena, the bits that
+    * poliqarp_unindex_item() sets for every position flagged in value->bits.
+    * Returns NULL (no bitset) when value->num_hits exceeds
+    * POLIQARP_UNINDEX_THRESHOLD or when no reverse index is selected for
+    * value->domain.  The indices parameter is not read here. */
+   struct poliqarp_rindex *my_index;
+   size_t i, pos;
+   bitset res;
+
+   if (value->num_hits > POLIQARP_UNINDEX_THRESHOLD)
+      return NULL;
+   switch (value->domain) {
+      case POLIQARP_DOMAIN_ORTH:
+         my_index = index->orth_index;
+         break;
+      case POLIQARP_DOMAIN_SPACE:
+         my_index = NULL; /* no reverse index is consulted for this domain */
+         break;
+      case POLIQARP_DOMAIN_INTERP__DISAMB:
+         my_index = index->disamb_index;
+         break;
+      case POLIQARP_DOMAIN_INTERP__AMB:
+         my_index = index->amb_index;
+         break;
+      default:
+         abort(); /* Should not happen. */
+   }
+   if (my_index == NULL)
+      return NULL;
+   res = bitset_arena_alloc(&area->arena);
+   i = value->num_hits;
+   pos = 0;
+   while (i) { /* ends after num_hits set bits; assumes value->bits really holds that many -- TODO confirm */
+      if (BIT_ARRAY_GET(value->bits, pos)) {
+         poliqarp_unindex_item(res, area, distance, pos, my_index);
+         i--;
+      }
+      pos++;
+   }
+   return res;
+}
+/*
+ * Compute the bitset restriction implied by expr at the given distance.
+ *
+ * A NULL result means that no restriction could be derived; callers treat it
+ * as "do not narrow the area".  NULL is returned for SYMBOL_DOT, for constant
+ * expressions, for any negated node and for unknown expression types.
+ *
+ * AND: the restrictions of both operands are intersected; if only one
+ *      operand yields a restriction, that one is returned.
+ * OR:  the restrictions are united; if either operand yields none, so does
+ *      the whole expression.
+ *
+ * The result, when not NULL, is a bitset from area->arena.
+ */
+static bitset poliqarp_unindex_expression(struct poliqarp_expression *expr,
+   struct poliqarp_searchable_area *area, size_t distance, int indices,
+   const struct poliqarp_backend_index *index)
+{
+   bitset bs_left, bs_right;
+   if (expr == SYMBOL_DOT)
+      return NULL;
+   switch (expr->type) {
+      case POLIQARP_EXPRESSION_CONSTANT:
+         return NULL;
+      case POLIQARP_EXPRESSION_AND:
+         if (expr->as.expression.negate)
+            return NULL;
+         bs_left = poliqarp_unindex_expression(expr->as.expression.left, area,
+            distance, indices, index);
+         bs_right = poliqarp_unindex_expression(expr->as.expression.right, area,
+            distance, indices, index);
+         if (bs_left == NULL)
+            return bs_right;
+         if (bs_right == NULL)
+            return bs_left;
+         bitset_arena_intersect(&area->arena, bs_left, bs_right);
+         return bs_left;
+      case POLIQARP_EXPRESSION_OR:
+         if (expr->as.expression.negate)
+            return NULL;
+         bs_left = poliqarp_unindex_expression(expr->as.expression.left, area,
+            distance, indices, index);
+         bs_right = poliqarp_unindex_expression(expr->as.expression.right, area,
+
distance, indices, index);
+         if (bs_left == NULL || bs_right == NULL)
+            return NULL;
+         bitset_arena_union(&area->arena, bs_left, bs_right);
+         return bs_left;
+      case POLIQARP_EXPRESSION_VALUE:
+         if (expr->as.value.negate)
+            return NULL;
+         return poliqarp_unindex_value(expr->as.value.value, area, distance,
+            indices, index);
+      default:
+         return NULL;
+   };
+}
+/*
+ * Depth-first walk over the graph reachable from node.
+ *
+ * work is the restriction accumulated on the way to node.  When node is final
+ * or has distance (size_t)-1, work is united into area->area and the walk
+ * stops there.  Otherwise, for every outgoing link, work is intersected with
+ * the restriction of the link's symbol (when there is one) and the walk
+ * continues in link->to.
+ * NOTE(review): bitset_arena_copy_to(bs, work) presumably restores work from
+ * the snapshot bs before each link is tried -- confirm the argument order.
+ */
+void poliqarp_recursive_create_area(struct poliqarp_searchable_area *area,
+   bitset work, struct dfs_node *node, int indices,
+   const struct poliqarp_backend_index *index)
+{
+   bitset bs;
+   struct dfs_link *link;
+
+   if (node->distance == (size_t)-1 || node->flags.is_final) {
+      bitset_arena_union(&area->arena, area->area, work);
+      return;
+   }
+   bs = bitset_arena_copy(&area->arena, work); /* snapshot of work on entry */
+   for (link = node->first_link; link != NULL; link = link->next) {
+      bitset link_bs;
+
+      bitset_arena_copy_to(&area->arena, bs, work);
+      link_bs = poliqarp_unindex_expression(link->symbol, area, node->distance,
+         indices, index);
+      if (link_bs) /* NULL means this link gives no restriction */
+         bitset_arena_intersect(&area->arena, work, link_bs);
+      poliqarp_recursive_create_area(area, work, link->to, indices, index);
+   }
+}
+/* One pending link of the breadth-first walk. */
+struct dfs_queue_item {
+   void* symbol; /* symbol of the link (passed on to poliqarp_unindex_expression) */
+   const struct dfs_node *node; /* node the link leads to */
+   size_t distance; /* number of links walked before this one */
+};
+/* Fixed-capacity circular FIFO of dfs_queue_item. */
+struct dfs_queue {
+   struct dfs_queue_item *items; /* storage for max_length items */
+   size_t start; /* index of the oldest queued item */
+   size_t length; /* number of queued items */
+   size_t max_length; /* capacity of items */
+   size_t max_distance; /* items with a larger distance are rejected */
+   bool overflown; /* once set, every further push is ignored */
+};
+/*
+ * Initialise queue with room for max_length items and the given distance
+ * limit.  Aborts the process when the storage cannot be allocated.
+ */
+static void dfs_queue_create(struct dfs_queue *queue, size_t max_length, size_t max_distance)
+{
+   queue->items = calloc(max_length, sizeof(struct dfs_queue_item));
+   if (queue->items == NULL)
+      abort();
+   queue->start = 0;
+   queue->length = 0;
+   queue->max_length = max_length;
+   queue->max_distance = max_distance;
+   queue->overflown = false;
+}
+/* Release the storage of queue; the queue must not be used afterwards. */
+static void dfs_queue_destroy(struct dfs_queue *queue)
+{
+   free(queue->items);
+   queue->items = NULL;
+}
+/*
+ * Try to append (symbol, node, distance) to queue.
+ *
+ * Nothing is stored when the queue has already overflown, when distance is
+ * above queue->max_distance, or when the queue is full.  Reaching a final
+ * node lowers max_distance to that node's distance.
+ */
+static void dfs_queue_push(struct dfs_queue *queue, void *symbol,
+   const struct dfs_node *node, size_t distance)
+{
+   size_t i;
+   if (queue->overflown)
+
return;
+   if (node->flags.is_final && distance < queue->max_distance)
+      queue->max_distance = distance; /* a final node caps the distance limit */
+   if (distance > queue->max_distance)
+   {
+      queue->overflown = true;
+      return;
+   }
+   if (queue->length >= queue->max_length)
+   {
+      queue->overflown = true;
+      while (queue->length > 0) /* full: drop queued items from the tail whose distance is >= the rejected one */
+      {
+         i = (queue->start + queue->length - 1) % queue->max_length;
+         if (queue->items[i].distance >= distance)
+            queue->length--;
+         else
+            break;
+      }
+      return;
+   }
+   i = (queue->start + queue->length) % queue->max_length; /* slot just past the newest item */
+   queue->items[i].symbol = symbol;
+   queue->items[i].node = node;
+   queue->items[i].distance = distance;
+   queue->length++;
+}
+/*
+ * Remove the oldest item from queue and return a pointer to it, or NULL when
+ * the queue is empty.  The pointer refers to the queue's own storage, so it
+ * is only valid until that slot is reused by a later push.
+ */
+static const struct dfs_queue_item* dfs_queue_pop(struct dfs_queue *queue)
+{
+   size_t i;
+   if (queue->length == 0)
+      return NULL;
+   i = queue->start;
+   queue->start = (queue->start + 1) % queue->max_length;
+   queue->length--;
+   return queue->items + i;
+}
+/*
+ * Breadth-first narrowing of area->area.
+ *
+ * Links are visited level by level, starting with the links of start_node at
+ * distance 0.  The restrictions of all links of one level are united in
+ * work_bs; when the first link of the next level is popped, area->area is
+ * intersected with work_bs.  If any link of a level yields no restriction,
+ * work_bs becomes NULL and that level does not narrow the area.
+ *
+ * The queue holds at most 3 * num_nodes items and rejects distances above
+ * num_nodes.
+ */
+static void poliqarp_bfs_create_area(struct poliqarp_searchable_area *area,
+   struct dfs_node *start_node, size_t num_nodes,
+   int indices, const struct poliqarp_backend_index *index)
+{
+   bitset work_bs;
+   size_t current_distance = 0;
+   const struct dfs_queue_item *queue_item;
+   const struct dfs_link *link;
+   struct dfs_queue queue;
+   dfs_queue_create(&queue, 3 * num_nodes, num_nodes);
+   work_bs = bitset_arena_alloc(&area->arena);
+   for (link = start_node->first_link; link != NULL; link = link->next)
+      dfs_queue_push(&queue, link->symbol, link->to, 0);
+   while (1)
+   {
+      queue_item = dfs_queue_pop(&queue);
+      if (queue_item == NULL)
+         break;
+      if (queue_item->distance > current_distance) /* first link of the next level */
+      {
+         if (work_bs != NULL)
+            bitset_arena_intersect(&area->arena, area->area, work_bs);
+         work_bs = poliqarp_unindex_expression(queue_item->symbol, area,
+            queue_item->distance, indices, index);
+         current_distance++;
+      }
+      else if (work_bs)
+      {
+         bitset tmp_bs = poliqarp_unindex_expression(queue_item->symbol, area,
+            queue_item->distance, indices, index);
+         if (tmp_bs)
+
bitset_arena_union(&area->arena, work_bs, tmp_bs);
+         else
+            work_bs = NULL; /* one unrestricted link makes the whole level unrestricted */
+      }
+      for (link = queue_item->node->first_link; link != NULL; link = link->next)
+         dfs_queue_push(&queue, link->symbol, link->to, current_distance + 1);
+   }
+   if (work_bs != NULL) /* apply the restriction of the last level */
+      bitset_arena_intersect(&area->arena, area->area, work_bs);
+   dfs_queue_destroy(&queue);
+}
+/*
+ * Compute this->area, the part of the corpus the query has to scan.
+ *
+ * The corpus is divided into chunks of area->granularity segments, one bit
+ * per chunk.  Without indices (config.cdf.indices == 0) a single chunk spans
+ * the whole corpus and is always selected.  With indices, the granularity
+ * comes from the corpus configuration, the number of chunks is the corpus
+ * size divided by it and rounded up, and the selected chunks are computed by
+ * the depth-first pass followed by the breadth-first pass.
+ *
+ * area->num_segments is set to the number of segments in the selected chunks;
+ * the last chunk is counted with its real (possibly shorter) length.
+ * NOTE(review): an empty corpus makes the last-chunk arithmetic divide by
+ * zero or index bit (size_t)-1 -- confirm that callers never pass one.
+ */
+void poliqarp_create_search_area(struct poliqarp_query *this)
+{
+   struct poliqarp_searchable_area *area = &this->area;
+   size_t num_segments = poliqarp_backend_corpus_size(&this->corpus->corpus);
+
+   if (this->corpus->config.cdf.indices == 0) {
+      area->num_bits = 1;
+      area->granularity = num_segments;
+   } else {
+      area->granularity = this->corpus->config.cdf.granularity;
+      area->num_bits = num_segments / area->granularity;
+      if (area->num_bits * area->granularity < num_segments)
+         area->num_bits++; /* round up: partial last chunk */
+   }
+
+   bitset_arena_create(&area->arena, area->num_bits, NULL);
+   area->area = bitset_arena_alloc(&area->arena);
+   if (this->corpus->config.cdf.indices == 0) {
+      bitset_arena_set(&area->arena, area->area, 0);
+   } else {
+      /* Try depth-first search to restrict area: */
+      bitset work = bitset_arena_alloc_ones(&area->arena);
+      poliqarp_recursive_create_area(area, work, this->graph.dfs.root,
+         this->corpus->config.cdf.indices, poliqarp_get_const_backend(this->corpus,
+         index));
+      /* Try breadth-first search to restrict area: */
+      poliqarp_bfs_create_area(area,
+         this->graph.dfs.root, this->graph.dfs.num_nodes,
+         this->corpus->config.cdf.indices,
+         poliqarp_get_const_backend(this->corpus, index));
+   }
+   area->num_segments = area->granularity * bitset_count_ones(&area->arena,
+      area->area);
+   if (bitset_arena_get(&area->arena, area->area, area->num_bits - 1)) /* last chunk selected: count its real length */
+      area->num_segments = area->num_segments - area->granularity +
+         (num_segments - 1) % area->granularity + 1;
+}
diff --git a/poliqarp-library/sakura/query.h b/poliqarp-library/sakura/query.h
new file mode 100644
index
0000000000000000000000000000000000000000..65681378b5603deaba89206fa1964be32a82676e --- /dev/null +++ b/poliqarp-library/sakura/query.h @@ -0,0 +1,167 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARP_QUERY_H +#define POLIQARP_QUERY_H + +#include <pthread.h> + +#include <time.h> + +#include <sakura/poliqarp.h> +#include <sakura/corpus.h> +#include <sakura/expression.h> +#include <sakura/meta-value.h> + +#include <sakura/common/bs.h> +#include <sakura/common/graph.h> +#include <sakura/common/hash-table.h> + +/** @defgroup poliqarp_query Query module */ +/** @{ */ +/** @file query.h Query things */ + +/* TODO: Add value-related-headers */ + +struct poliqarp_query; +struct poliqarp_match_buffer; +struct poliqarp_match; + +struct poliqarp_searchable_area { + size_t num_bits; + size_t num_segments; + size_t granularity; + bitset area; + bitset_arena arena; +}; + +#define POLIQARP_QEFLAG_HAS_ALIGN 1 +#define POLIQARP_QEFLAG_HAS_VARIABLES 2 + +struct poliqarp_within { + enum poliqarp_within_type { + WITHIN_SUBDOCUMENT, + WITHIN_PHRASE + } type; + union { + struct poliqarp_subdocument_set *subdocument; + struct poliqarp_expression *phrase; + } as; +}; + +struct poliqarp_within *poliqarp_within_create_subdocument( + struct poliqarp_subdocument_set *set); +struct poliqarp_within *poliqarp_within_create_phrase( + struct poliqarp_expression *phrase); + +struct poliqarp_search_context { + const struct dfs_node *node; + struct dfs_link *link; + size_t index; + size_t phrase; + size_t within_phrase; + size_t m_start; + size_t m_end; + size_t m_focus; + size_t c_focus; + struct poliqarp_document document; + struct poliqarp_subdocument subdocument; +}; + + +/** + * User makes a query, here it is. + */ +struct poliqarp_query { + struct poliqarp_corpus *corpus; /**< Corpus. */ + char *query_text; /**< Original query text. */ + struct graph_env graph; /**< Match graph. */ + struct poliqarp_expression *meta_expression; /**< Meta expression. */ + struct poliqarp_search_context last_context; /**< Last search context. */ + bool have_last_context; /**< Whether we have executed + this query before. */ + struct poliqarp_within *within; /**< Within object. 
*/ + const struct hash_table *aliases; /**< Hash table with aliases. */ + struct poliqarp_searchable_area area; /**< Area of search. */ + int progress; /**< Progress of the query at + the start of searching. */ + size_t max_segment; /**< Offset of the rightmost + segment we have seen. */ + int flags; /**< Flags of query. */ + int eflags; /**< Extended (internal) flags + of query. */ + size_t variable_ranges[POLIQARP_MAX_VARIABLES]; /**< Ranges of variables. */ + void *variable_types[POLIQARP_MAX_VARIABLES]; /**< Types of variables. */ + size_t progress_helper; /**< Field used to keep track + of progress. */ + struct poliqarp_query_rewrite *rewrite; /**< Rules of query rewriting. */ + bool rewrite_in_progress; /**< Query rewrite in progress? */ + struct nfs_graph rewrite_graph; /**< Temporary graph for query rewriting. */ + struct poliqarp_error *error; /**< Compile error. */ + struct poliqarp_random_state *random_state; /**< State of a pseudo-random number generator. */ +}; + +/** + * Set constraining meta-data expression. + * @param this Query object. + * @param exp Expression object. + */ +void poliqarp_query_set_meta_expression(struct poliqarp_query *this, + struct poliqarp_expression *exp); + +/** + * Set within constraint. + * @param this Query object. + * @param within Within constraint. + **/ +void poliqarp_query_set_within(struct poliqarp_query *this, + struct poliqarp_within *within); + +/** Query result structure. */ +struct poliqarp_match_buffer { + struct poliqarp_corpus *corpus; /**< Corpus that the matches are valid + on. */ + struct poliqarp_match *match; /**< Array of matches. */ + size_t capacity; /**< Capacity of the match array. */ + size_t used; /**< Number of used items in the + match array. */ + size_t num_results; + /**< Number of results spotted during query execution. */ + struct marena *sort_arena; /**< Sort scratchpad memory source. */ + pthread_mutex_t mutex; +}; + +/** + * Append match to result buffer. 
+ * It should be EXPLICITLY verified that enough space exists. + * @param this Result object. + * @param match Match object. + */ +void poliqarp_append_query_result(struct poliqarp_match_buffer *this, + const struct poliqarp_match *match); + +void poliqarp_create_search_area(struct poliqarp_query *this); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/random.c b/poliqarp-library/sakura/random.c new file mode 100644 index 0000000000000000000000000000000000000000..5a80100aaa6c5ba331bd876aac2b15a685913aea --- /dev/null +++ b/poliqarp-library/sakura/random.c @@ -0,0 +1,86 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef SAKURA_SINGLE_THREADED +#include <pthread.h> +#else +#include <sys/types.h> +#include <unistd.h> +#endif +#include <sys/time.h> + +#include "random.h" + +/* The following code is based on the small noncryptographic pseudorandom + * number generator code[1] by Bob Jenkins, which is in the public domain. 
+ * [1] http://burtleburtle.net/bob/rand/talksmall.html
+ */
+/* Rotate the 32-bit value x left by k bits.  k must be between 1 and 31: a
+ * k of 0 would shift right by 32, which is undefined.  Callers in this file
+ * pass 27 and 17. */
+static inline uint32_t rot(uint32_t x, uint32_t k)
+{
+   return (x << k) | (x >> (32 - k));
+}
+/* Advance the generator by one step and return the next 32-bit output (the
+ * new value of state->d). */
+uint32_t poliqarp_random(struct poliqarp_random_state *state)
+{
+   uint32_t e = state->a - rot(state->b, 27);
+   state->a = state->b ^ rot(state->c, 17);
+   state->b = state->c + state->d;
+   state->c = state->d + e;
+   state->d = e + state->a;
+   return state->d;
+}
+/* Seed the generator: a is set to a fixed constant, b, c and d to seed, and
+ * the first 20 outputs are discarded.  Equal seeds give equal sequences. */
+void poliqarp_srandom(struct poliqarp_random_state *state, uint32_t seed)
+{
+   uint32_t i;
+   state->a = 0xf1ea5eed;
+   state->b = state->c = state->d = seed;
+   for (i = 0; i < 20; ++i)
+      poliqarp_random(state);
+}
+/* Seed the generator from the current time (seconds plus microseconds) plus
+ * the calling thread's id, or the process id when SAKURA_SINGLE_THREADED is
+ * defined.  The result of gettimeofday() is not checked.
+ * NOTE(review): sakura/query.c in this patch tests POLIQARP_SINGLE_THREADED,
+ * not SAKURA_SINGLE_THREADED -- confirm which macro the build defines. */
+void poliqarp_srandom_time(struct poliqarp_random_state *state)
+{
+   uint32_t seed;
+   struct timeval tv;
+   gettimeofday(&tv, NULL);
+   seed = 0;
+   seed += (uint32_t) tv.tv_sec;
+   seed += (uint32_t) tv.tv_usec;
+#ifndef SAKURA_SINGLE_THREADED
+#ifdef WIN32
+   seed += (uint32_t) pthread_getw32threadhandle_np(pthread_self());
+#else
+   /* FIXME: IEEE Std 1003.1-2001/Cor 2-2004, item XBD/TC2/D6/26 is applied,
+    * adding pthread_t to the list of types that are not required to be
+    * arithmetic types, thus allowing pthread_t to be defined as a structure.
+    * http://www.opengroup.org/onlinepubs/009695399/basedefs/sys/types.h.html#tag_13_67_10
+    */
+   seed += (uint32_t) pthread_self();
+#endif
+#else
+   seed += (uint32_t) getpid();
+#endif
+   poliqarp_srandom(state, seed);
+}
diff --git a/poliqarp-library/sakura/random.h b/poliqarp-library/sakura/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0aaa460f42275f04fc97b9f4641aa765319a64a
--- /dev/null
+++ b/poliqarp-library/sakura/random.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_RANDOM_H +#define POLIQARP_RANDOM_H + +#include <stdint.h> + +struct poliqarp_random_state { + uint32_t a, b, c, d; +}; + +uint32_t poliqarp_random(struct poliqarp_random_state *state); +void poliqarp_srandom(struct poliqarp_random_state *state, uint32_t seed); +void poliqarp_srandom_time(struct poliqarp_random_state *state); + +#endif + diff --git a/poliqarp-library/sakura/regexp.c b/poliqarp-library/sakura/regexp.c new file mode 100644 index 0000000000000000000000000000000000000000..a3bee2505bdd7c1152dc2ff8680b18ffa2ae8e4c --- /dev/null +++ b/poliqarp-library/sakura/regexp.c @@ -0,0 +1,128 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) 
+ *
+ * A commercial license is available from IPI PAN (contact
+ * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
+ * information). Licensees holding a valid commercial license from IPI
+ * PAN may use this file in accordance with that license.
+ *
+ * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
+ * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE.
+ */
+
+#include <errno.h>
+#include <string.h>
+
+#include <sakura/regexp.h>
+
+/* One sample character for each UTF-8 sequence length from 1 to 4 bytes. */
+#define C1 "a"
+#define C2 "\xce\xb1" /* GREEK SMALL LETTER ALPHA */
+#define C3 "\xe0\xa4\x85" /* DEVANAGARI LETTER A */
+#define C4 "\xf4\x80\x80\x80" /* Plane 15 Private Use, First */
+
+/*
+ * Check whether the regex library treats multi-byte UTF-8 sequences as
+ * single characters under the current locale.
+ *
+ * A pattern that puts each multi-byte sample character in its own bracket
+ * expression is matched against the four sample characters.
+ *
+ * Returns 0 when the match succeeds, 1 (errno = EILSEQ) when it does not, and
+ * -1 when the pattern cannot be compiled (errno = ENOMEM for REG_ESPACE,
+ * EINVAL otherwise).
+ */
+int poliqarp_regexp_validate_utf8(void)
+{
+   int rc;
+   regex_t preg;
+   const char *string = C1 C2 C3 C4;
+   const char *regex = "^" C1 "[" C2 "][" C3 "][" C4 "]$";
+   rc = regcomp(&preg, regex, REG_EXTENDED | REG_NOSUB);
+   /* A match is expected for UTF-8 encoding *only*. */
+   if (rc != 0) {
+      errno = (rc == REG_ESPACE) ? ENOMEM : EINVAL;
+      return -1;
+   }
+   rc = regexec(&preg, string, 0, NULL, 0);
+   regfree(&preg);
+   if (rc != 0) {
+      errno = EILSEQ;
+      return 1;
+   }
+   return 0;
+}
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+
+/*
+ * Create a regular expression.
+ *
+ * Unless POLIQARP_REG_NO_ANCHORS is set in xflags, the pattern is compiled as
+ * "^(pattern)$" so that it has to match the whole string.  The result of
+ * regcomp() is stored in this->status and returned; 0 means success.
+ *
+ * If the memory for the anchored pattern cannot be allocated, the structure
+ * is left in a state that is safe to pass to poliqarp_regexp_destroy()
+ * (pattern and message NULL, status REG_ESPACE) and REG_ESPACE is returned.
+ */
+int poliqarp_regexp_create(struct poliqarp_regexp *this, const char *pattern,
+   unsigned int flags, unsigned int xflags)
+{
+   char *use_pattern;
+   bool remade_pattern = false;
+
+   if (xflags & POLIQARP_REG_NO_ANCHORS) {
+      use_pattern = (char *)pattern;
+      this->simple = false;
+   } else {
+      /* 5 extra bytes: "^(" + ")$" + terminating NUL */
+      use_pattern = malloc(strlen(pattern) + 5);
+      if (use_pattern == NULL) {
+         /* the unchecked pointer used to be handed straight to sprintf() */
+         this->simple = false;
+         this->pattern = NULL;
+         this->message = NULL;
+         this->status = REG_ESPACE;
+         return this->status;
+      }
+      sprintf(use_pattern, "^(%s)$", pattern);
+      remade_pattern = true;
+      this->simple = (flags & REG_ICASE) ?
false :
+         (strpbrk(pattern, "|*+?{,}()^$.\\-[]=:\"") == NULL); /* simple = case-sensitive and none of these characters present */
+   }
+   this->pattern = strdup(pattern); /* result not checked; may be NULL */
+   this->status = regcomp(&this->preg, use_pattern, flags | REG_EXTENDED);
+   if (remade_pattern)
+      free(use_pattern);
+   if (this->status) {
+      size_t size;
+      size = regerror(this->status, &this->preg, 0, 0); /* first call only asks for the required buffer size */
+      this->message = malloc(size);
+      if (this->message != NULL)
+         regerror(this->status, &this->preg, this->message, size);
+   } else
+      this->message = NULL;
+   return this->status;
+}
+
+/* free the regular expression */
+void poliqarp_regexp_destroy(struct poliqarp_regexp *this)
+{
+   if (this->status == 0) /* preg is only released when compilation succeeded */
+      regfree(&this->preg);
+   free(this->pattern);
+   free(this->message);
+}
+/*
+ * Apply the flag letters in text to *flags and *xflags, left to right:
+ * 'i' sets and 'I' clears REG_ICASE in *flags; 'x' sets and 'X' clears
+ * POLIQARP_REG_NO_ANCHORS in *xflags.  Any other character is ignored.
+ * Both outputs must be initialised by the caller, because they are updated
+ * rather than overwritten.
+ */
+void poliqarp_parse_regexp_flags(const char *text, unsigned int *flags,
+   unsigned int *xflags)
+{
+   unsigned result = *flags; /* this provides continuity but requires
+                                initialization by the caller */
+   unsigned xresult = *xflags;
+   int c;
+
+   while ((c = *text++))
+      switch (c) {
+         case 'i':
+            result |= REG_ICASE;
+            break;
+         case 'I':
+            result &= ~REG_ICASE;
+            break;
+         case 'x':
+            xresult |= POLIQARP_REG_NO_ANCHORS;
+            break;
+         case 'X':
+            xresult &= ~POLIQARP_REG_NO_ANCHORS;
+            break;
+         default:
+            break;
+      }
+   *flags = result;
+   *xflags = xresult;
+}
diff --git a/poliqarp-library/sakura/regexp.h b/poliqarp-library/sakura/regexp.h
new file mode 100644
index 0000000000000000000000000000000000000000..f41943e585e3af9c37eec013839d8ca0309f3753
--- /dev/null
+++ b/poliqarp-library/sakura/regexp.h
@@ -0,0 +1,141 @@
+/*
+ * This file is part of the Poliqarp suite.
+ *
+ * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
+ * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
+ * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_REGEXP_H +#define POLIQARP_REGEXP_H + +#include <sakura/exception.h> + +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> + +/** Don't transform regular expression to "^(%s)$". */ +#define POLIQARP_REG_NO_ANCHORS 1 + +/* If we are using UTF-8 internally, include the headers of TCL regexp library + * and add some wrappers, otherwise fall back to standard regexp library. */ + +#ifdef USE_TCL_REGEX + +#include <win32/regex/regguts.h> + +extern int regcomp(regex_t * preg, const char *regex, int flags); +extern int regexec(const regex_t * preg, const char *string, + size_t nmatch, regmatch_t pmatch[], int eflags); +#else +#include <regex.h> +#endif /* USE_TCL_REGEX */ + +/** Nifty Poliqarp regular expressions wrapper. */ +struct poliqarp_regexp { + char *pattern; /**< Text of pattern. */ + char *message; /**< Error message, if any. */ + int status; /**< Status code. */ + bool simple; /**< Is strcmp() enough to execute this RE? */ + regex_t preg; /**< Regular expression object. */ +}; + +/** + * Check if current locale settings allow matching against UTF-8 strings. + * + * @note In case of a non-UTF-8 locale <code>errno</code> is set to + * <code>EILSEQ</code>. 
+ * + * @return 0 for UTF-8 locale, 1 for non-UTF-8 locale, -1 on error. + */ +int poliqarp_regexp_validate_utf8(void); + +/** + * Create a regular expression. + * + * @param this pointer to the structure to be initialized. + * @param pattern NULL-terminated string that follows the POSIX extended + * regular expressions syntax + * @param flags POSIX-specific flags: bitwise-inclusive OR of zero or more of + * the following flags: REG_ICASE, REG_NOSUB + * @param xflags Poliqarp-specific flags: bitwise-inclusive OR of zero or more + * of the following flags: POLIQARP_REG_NO_ANCHORS + * + * @return 0 on success, a non-zero code otherwise. + * + * @see poliqarp_regexp_destroy + * */ +int poliqarp_regexp_create(struct poliqarp_regexp *this, const char *pattern, + unsigned int flags, unsigned int xflags); + +/** + * Free the regular expression. + * + * @param this pointer to the structure to be destroyed + * + * @see poliqarp_regexp_create + */ +void poliqarp_regexp_destroy(struct poliqarp_regexp *this); + +/** Parse regular expression flags. */ +void poliqarp_parse_regexp_flags(const char *text, unsigned int *flags, + unsigned int *xflags); + +/** + * Try to match the given string to the pattern and obtain substring match + * addressing information. + * + * Unless <code>REG_NOSUB</code> was set for the compilation of the regular + * expression, it is possible to obtain substring match addressing information. + * + * @param this compiled regular expression + * @param string string to be matched against the regular expression + * @param nmatch number of elements of <code>pmatch</code> + * @param pmatch buffer to store substring match addressing information + * + * @see poliqarp_regexp_match + */ +static inline bool poliqarp_regexp_match_ex(const struct poliqarp_regexp *this, + const char *string, size_t nmatch, regmatch_t pmatch[]) +{ + return regexec(&this->preg, string, nmatch, pmatch, 0) != REG_NOMATCH; +} + +/** + * Try to match the given string to the pattern. 
+ * + * @param this compiled regular expression + * @param string string to be matched against the regular expression + * + * @see poliqarp_regexp_match_ex + */ +static inline bool poliqarp_regexp_match(const struct poliqarp_regexp *this, + const char *string) +{ + return poliqarp_regexp_match_ex(this, string, 0, NULL); +} + +/** + * Extract error message from the unsuccessfully compiled regular expression. + */ +void poliqarp_error_from_regexp(struct poliqarp_error *error, + const struct poliqarp_regexp *this, + const char *fmt, ...); + +#endif diff --git a/poliqarp-library/sakura/value-attr.c b/poliqarp-library/sakura/value-attr.c new file mode 100644 index 0000000000000000000000000000000000000000..a4cba00dfbc451d05bcb9b7708be773fe9927c17 --- /dev/null +++ b/poliqarp-library/sakura/value-attr.c @@ -0,0 +1,137 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <errno.h> + +#include <sakura/value-attr.h> +#include <sakura/regexp.h> + +struct poliqarp_value *poliqarp_value_create_attr(const struct poliqarp_corpus *corpus, + const char *name, const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this; + + struct poliqarp_attr *attr; + + const struct poliqarp_backend_tag *backend_tag = &corpus->tag; + const struct poliqarp_backend_config *backend_config = &corpus->config; + const struct poliqarp_attr_value *value; + const struct entity *entity; + + size_t i; + + unsigned int *bitmap; + + /* find our attribute */ + entity = lookup_const_entity(&backend_config->named_items, name); + if (entity == NULL || + *(enum poliqarp_entity_type *) entity->tag != POLIQARP_ENTITY_ATTR) + { + errno = ENOENT; + return NULL; + } + attr = entity->data; + + /* create bitmap big enough to hold each part of speech */ + bitmap = malloc(BIT_ARRAY_LENGTH_BYTES(bitmap, attr->num_values)); + memset(bitmap, 0, BIT_ARRAY_LENGTH_BYTES(bitmap, attr->num_values)); + + for (value = attr->first_value; value; value = value->next_value) { + if (poliqarp_regexp_match(pattern, value->self->name)) + BIT_ARRAY_SET(bitmap, value->id); + } + + /* now allocate value */ + this = malloc(sizeof *this); + this->num_items = poliqarp_backend_tag_num_items(backend_tag); + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, this->num_items); + this->num_hits = 0; + this->bits = malloc(this->num_bytes); + this->domain = POLIQARP_DOMAIN_TAG; + + /* clear the bit field */ + memset(this->bits, 0, this->num_bytes); + + { + size_t attr_id = attr->id; + const struct poliqarp_attr_value *attr_value; + /* iterate over all tags and find those that match */ + for (i = 0; i < this->num_items; ++i) { + attr_value = poliqarp_backend_parsed_tag_fetch(backend_tag, i)-> + attr_value[attr_id]; + if (attr_value && BIT_ARRAY_GET(bitmap, attr_value->id)) { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + } + + free(bitmap); + return this; +} + +struct 
poliqarp_value **poliqarp_value_create_all_attr(const struct poliqarp_corpus *corpus, + const char *name, size_t *n) +{ + struct poliqarp_value **values, *this; + + const struct poliqarp_backend_tag *backend_tag = &corpus->tag; + const struct poliqarp_backend_config *backend_config = &corpus->config; + const struct poliqarp_attr_value *avalue1, *avalue2; + const struct entity *entity; + struct poliqarp_attr *attr; + size_t n_tags, i; + + entity = lookup_const_entity(&backend_config->named_items, name); + if (entity == NULL || + *(enum poliqarp_entity_type *) entity->tag != POLIQARP_ENTITY_ATTR) + { + errno = ENOENT; + return NULL; + } + attr = entity->data; + + *n = attr->num_values; + values = malloc(*n * sizeof(*this)); + + n_tags = poliqarp_backend_tag_num_items(backend_tag); + for (avalue1 = attr->first_value; avalue1; avalue1 = avalue1->next_value) { + values[avalue1->id] = this = malloc(sizeof(*this)); + this->num_items = n_tags; + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, n_tags); + this->bits = malloc(this->num_bytes); + this->domain = POLIQARP_DOMAIN_TAG; + this->num_hits = 0; + memset(this->bits, 0, this->num_bytes); + for (i = 0; i < n_tags; i++) { + const struct poliqarp_parsed_tag *parsed_tag = + poliqarp_backend_parsed_tag_fetch(backend_tag, i); + avalue2 = parsed_tag->attr_value[attr->id]; + if (avalue2 && (avalue1->id == avalue2->id)) { + BIT_ARRAY_SET(this->bits, i); + this->num_hits++; + } + } + } + return values; +} diff --git a/poliqarp-library/sakura/value-attr.h b/poliqarp-library/sakura/value-attr.h new file mode 100644 index 0000000000000000000000000000000000000000..35b8acb4c32e2e98b74bf2dcd174f80312d81132 --- /dev/null +++ b/poliqarp-library/sakura/value-attr.h @@ -0,0 +1,52 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). 
All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_ATTR_H +#define POLIQARP_VALUE_ATTR_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/exception.h> +#include <sakura/value.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-attr.h Values that match given attribute */ + +/** + * Create an attribute that matches given name. + * @param name Name of the attribute. + * @param pattern Pattern for acceptable values. + */ +struct poliqarp_value *poliqarp_value_create_attr(const struct poliqarp_corpus *corpus, + const char *name, const struct poliqarp_regexp *pattern); + +/** + * Create values for given attribute. + */ +struct poliqarp_value **poliqarp_value_create_all_attr(const struct poliqarp_corpus *corpus, + const char *name, size_t *n); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-base.c b/poliqarp-library/sakura/value-base.c new file mode 100644 index 0000000000000000000000000000000000000000..7cfed618af17b8dccfcb7800a2ac986e5388d9a3 --- /dev/null +++ b/poliqarp-library/sakura/value-base.c @@ -0,0 +1,42 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/value-base.h> + +struct poliqarp_value *poliqarp_value_create_base__disambiguated( + const struct poliqarp_corpus *corpus, const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this = NULL; + this = poliqarp_value_pattern_create(&corpus->base.dict_disamb, pattern); + this->domain = POLIQARP_DOMAIN_BASE; + return this; +} + +struct poliqarp_value *poliqarp_value_create_base__ambiguous( + const struct poliqarp_corpus *corpus, const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this = NULL; + this = poliqarp_value_pattern_create(&corpus->base.dict_amb, pattern); + this->domain = POLIQARP_DOMAIN_BASE; + return this; +} diff --git a/poliqarp-library/sakura/value-base.h b/poliqarp-library/sakura/value-base.h new file mode 100644 index 0000000000000000000000000000000000000000..c4dfb4a5c103aab0554d42b0c94a22374c8b0c4e --- /dev/null +++ b/poliqarp-library/sakura/value-base.h @@ -0,0 +1,52 @@ +/* + * This file is part of the Poliqarp suite. 
+ * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_BASE_H +#define POLIQARP_VALUE_BASE_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/exception.h> +#include <sakura/value-pattern.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-base.h Values from the _BASE domain. **/ + +/** + * Create a value that matches given disambiguated base. + * This constructor should be used for DISAMBIGUATED lookups + */ +struct poliqarp_value *poliqarp_value_create_base__disambiguated( + const struct poliqarp_corpus *corpus, const struct poliqarp_regexp *pattern); + +/** + * Create a value that matches given ambiguous base. 
+ * This constructor should be used for AMBIGUOUS lookups + */ +struct poliqarp_value *poliqarp_value_create_base__ambiguous( + const struct poliqarp_corpus *corpus, const struct poliqarp_regexp *pattern); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-interp.c b/poliqarp-library/sakura/value-interp.c new file mode 100644 index 0000000000000000000000000000000000000000..a3de0ed81169581493e281a5aafcc8becc347094 --- /dev/null +++ b/poliqarp-library/sakura/value-interp.c @@ -0,0 +1,199 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <sakura/value-interp.h> + +static void poliqarp_value_interp_create__disamb(struct poliqarp_value *this, + const struct poliqarp_corpus *corpus, const struct poliqarp_value *value) +{ + size_t i; + unsigned *bits = value->bits; + const struct poliqarp_binary_interp *interp; + const struct poliqarp_binary_interp *interp_end; + + switch (value->domain) { + case POLIQARP_DOMAIN_BASE: + switch (value->strategy) { + case POLIQARP_STRATEGY_ANY: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__disamb(&corpus->interp, + i); + interp_end = interp + + poliqarp_backend_interp_length__disamb(&corpus->interp, i); + if (poliqarp_value_micro_eval_base_any(bits, interp, + interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + case POLIQARP_STRATEGY_ALL: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__disamb(&corpus->interp, + i); + interp_end = interp + + poliqarp_backend_interp_length__disamb(&corpus->interp, i); + if (poliqarp_value_micro_eval_base_all(bits, interp, + interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + } + break; + case POLIQARP_DOMAIN_TAG: + switch (value->strategy) { + case POLIQARP_STRATEGY_ANY: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__disamb(&corpus->interp, + i); + interp_end = interp + + poliqarp_backend_interp_length__disamb(&corpus->interp, i); + if (poliqarp_value_micro_eval_tag_any(bits, interp, interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + case POLIQARP_STRATEGY_ALL: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__disamb(&corpus->interp, + i); + interp_end = interp + + poliqarp_backend_interp_length__disamb(&corpus->interp, i); + if (poliqarp_value_micro_eval_tag_all(bits, interp, interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + } + break; 
+ default: + abort(); /* Should not happen. */ + } +} + +static void poliqarp_value_interp_create__amb(struct poliqarp_value *this, + const struct poliqarp_corpus *corpus, const struct poliqarp_value *value) +{ + size_t i; + unsigned *bits = value->bits; + const struct poliqarp_binary_interp *interp; + const struct poliqarp_binary_interp *interp_end; + + switch (value->domain) { + case POLIQARP_DOMAIN_BASE: + switch (value->strategy) { + case POLIQARP_STRATEGY_ANY: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__amb(&corpus->interp, i); + interp_end = interp + + poliqarp_backend_interp_length__amb(&corpus->interp, i); + if (poliqarp_value_micro_eval_base_any(bits, interp, + interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + case POLIQARP_STRATEGY_ALL: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__amb(&corpus->interp, i); + interp_end = interp + + poliqarp_backend_interp_length__amb(&corpus->interp, i); + if (poliqarp_value_micro_eval_base_all(bits, interp, + interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + } + break; + case POLIQARP_DOMAIN_TAG: + switch (value->strategy) { + case POLIQARP_STRATEGY_ANY: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__amb(&corpus->interp, i); + interp_end = interp + + poliqarp_backend_interp_length__amb(&corpus->interp, i); + if (poliqarp_value_micro_eval_tag_any(bits, interp, interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + case POLIQARP_STRATEGY_ALL: + for (i = 0; i < this->num_items; ++i) { + interp = poliqarp_backend_interp_fetch__amb(&corpus->interp, i); + interp_end = interp + + poliqarp_backend_interp_length__amb(&corpus->interp, i); + if (poliqarp_value_micro_eval_tag_all(bits, interp, interp_end)) + { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + break; + } + break; + default: + abort(); 
/* Should not happen. */ + } +} + +struct poliqarp_value *poliqarp_value_interp_create(const struct poliqarp_corpus + *corpus, const struct poliqarp_value *value) +{ + struct poliqarp_value *this; + + /* do the allocation */ + this = malloc(sizeof *this); + this->num_items = value->use_disamb ? + poliqarp_backend_interp_num_items__disamb(&corpus->interp) : + poliqarp_backend_interp_num_items__amb(&corpus->interp); + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, this->num_items); + this->num_hits = 0; + this->bits = malloc(this->num_bytes); + this->strategy = value->strategy; + this->domain = value-> use_disamb ? POLIQARP_DOMAIN_INTERP__DISAMB : + POLIQARP_DOMAIN_INTERP__AMB; + + /* clear the bit field */ + memset(this->bits, 0, this->num_bytes); + + if (value->use_disamb) + poliqarp_value_interp_create__disamb(this, corpus, value); + else + poliqarp_value_interp_create__amb(this, corpus, value); + return this; +} diff --git a/poliqarp-library/sakura/value-interp.h b/poliqarp-library/sakura/value-interp.h new file mode 100644 index 0000000000000000000000000000000000000000..6c8fac1dc624b5fb1bffb7a8e26743f43c095fde --- /dev/null +++ b/poliqarp-library/sakura/value-interp.h @@ -0,0 +1,41 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_INTERP_H +#define POLIQARP_VALUE_INTERP_H + +#include <sakura/corpus.h> +#include <sakura/exception.h> +#include <sakura/regexp.h> +#include <sakura/value.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ + +/** Create a value that matches given interpretation. */ +struct poliqarp_value *poliqarp_value_interp_create(const struct poliqarp_corpus *corpus, + const struct poliqarp_value *value); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-orth.c b/poliqarp-library/sakura/value-orth.c new file mode 100644 index 0000000000000000000000000000000000000000..8d72c3c0aad0853963c987d17222dbf1db2a7f0c --- /dev/null +++ b/poliqarp-library/sakura/value-orth.c @@ -0,0 +1,35 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. 
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/value-orth.h> + +struct poliqarp_value *poliqarp_value_create_orth(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this = NULL; + + this = poliqarp_value_pattern_create(&corpus->orth.dict, pattern); + this->domain = POLIQARP_DOMAIN_ORTH; + + return this; +} diff --git a/poliqarp-library/sakura/value-orth.h b/poliqarp-library/sakura/value-orth.h new file mode 100644 index 0000000000000000000000000000000000000000..8b42cbad186dc863002619a937cf1b1a39684665 --- /dev/null +++ b/poliqarp-library/sakura/value-orth.h @@ -0,0 +1,44 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef POLIQARP_VALUE_ORTH_H +#define POLIQARP_VALUE_ORTH_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/value-pattern.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-orth.h Values that match given orth */ + +/** + * Create a value that matches in the _ORTH domain to given pattern. + * @param pattern Pattern of acceptable values. + */ +struct poliqarp_value *poliqarp_value_create_orth(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-pattern.c b/poliqarp-library/sakura/value-pattern.c new file mode 100644 index 0000000000000000000000000000000000000000..9c483338eaa80ea0c5b1173faba882488689e417 --- /dev/null +++ b/poliqarp-library/sakura/value-pattern.c @@ -0,0 +1,57 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <sakura/value-pattern.h> + +/* create a new vbase */ +struct poliqarp_value *poliqarp_value_pattern_create(const struct newdict *dict, + const struct poliqarp_regexp *regexp) +{ + struct poliqarp_value *this; + size_t i; + + /* do the allocation */ + this = malloc(sizeof *this); + this->num_items = dict->num_items; + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, this->num_items); + this->num_hits = 0; + this->bits = malloc(this->num_bytes); + + /* clear the bit field */ + memset(this->bits, 0, this->num_bytes); + + if (dict->index != NULL && regexp->simple) { + void *hit = hash_table_get(dict->index, regexp->pattern); + if (hit == NULL) /* no hits */ + return this; + BIT_ARRAY_SET(this->bits, ((intptr_t)hit) - 1); + ++this->num_hits; + return this; + } + for (i = 0; i < this->num_items; ++i) + if (poliqarp_regexp_match(regexp, GET_ITEM(dict, i))) { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + return this; +} diff --git a/poliqarp-library/sakura/value-pattern.h b/poliqarp-library/sakura/value-pattern.h new file mode 100644 index 0000000000000000000000000000000000000000..861f002b628063e120271b5359a7df284dbd9d80 --- /dev/null +++ b/poliqarp-library/sakura/value-pattern.h @@ -0,0 +1,48 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_PATTERN_H +#define POLIQARP_VALUE_PATTERN_H + +#include <sakura/corpus.h> +#include <sakura/exception.h> +#include <sakura/regexp.h> +#include <sakura/value.h> + +#include <sakura/common/newdict.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-pattern.h Create a value that matches any pattern */ + +/** + * Create a value that matches the pattern in the given dictionary. + * @param dict The dictionary. + * @param regexp The pattern. + */ +struct poliqarp_value *poliqarp_value_pattern_create(const struct newdict *dict, + const struct poliqarp_regexp *regexp); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-pos.c b/poliqarp-library/sakura/value-pos.c new file mode 100644 index 0000000000000000000000000000000000000000..d8e92dda06f984a04bf75f748bb8159c1b4fa5d5 --- /dev/null +++ b/poliqarp-library/sakura/value-pos.c @@ -0,0 +1,130 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). 
Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/value-pos.h> + +struct poliqarp_value *poliqarp_value_create_pos(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this; + + const struct poliqarp_backend_tag *backend_tag = &corpus->tag; + const struct poliqarp_backend_config *backend_config = &corpus->config; + const struct poliqarp_parsed_tag *parsed_tag; + const struct entity *entity; + + size_t i; + size_t num_bytes; + size_t num_hits_pos; + + unsigned int *bitmap; + + /* create bitmap big enough to hold each part of speech */ + num_bytes = BIT_ARRAY_LENGTH_BYTES(bitmap, backend_config->num_pos); + bitmap = malloc(num_bytes); + memset(bitmap, 0, num_bytes); + + /* iterate over all entities and skip non part-of-speech ones + * match the pattern and assign result to bitmap */ + num_hits_pos = 0; + for (entity = backend_config->named_items.first_entity; entity != NULL; + entity = entity->next_entity) + { + if (entity->tag == NULL || + *(enum poliqarp_entity_type *) entity->tag != POLIQARP_ENTITY_POS) + { + continue; + } + if (poliqarp_regexp_match(pattern, entity->name)) { + BIT_ARRAY_SET(bitmap, + ((struct poliqarp_part_of_speech *)entity->data)->id); + num_hits_pos++; + } + } + + /* now allocate value */ + this = malloc(sizeof *this); + this->num_items = poliqarp_backend_tag_num_items(backend_tag); + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, this->num_items); + this->num_hits = 0; + this->bits = malloc(this->num_bytes); + this->domain = POLIQARP_DOMAIN_TAG; + + /* clear the bit field */ + memset(this->bits, 0, this->num_bytes); + + if (num_hits_pos > 0) + /* iterate over all tags and find those that match */ + for (i = 0; i < this->num_items; ++i) { 
+ parsed_tag = poliqarp_backend_parsed_tag_fetch(backend_tag, i); + if (BIT_ARRAY_GET(bitmap, parsed_tag->pos->id)) { + BIT_ARRAY_SET(this->bits, i); + ++this->num_hits; + } + } + free(bitmap); + return this; +} + +struct poliqarp_value **poliqarp_value_create_all_pos(const struct poliqarp_corpus *corpus, + size_t *n) +{ + struct poliqarp_value **values, *this; + + const struct poliqarp_backend_tag *backend_tag = &corpus->tag; + const struct poliqarp_backend_config *backend_config = &corpus->config; + const struct poliqarp_parsed_tag *parsed_tag; + const struct entity *entity; + size_t n_tags, i; + + *n = backend_config->num_pos; + values = malloc(*n * sizeof(*this)); + + i = 0; + n_tags = poliqarp_backend_tag_num_items(backend_tag); + for (entity = backend_config->named_items.first_entity; entity != NULL; + entity = entity->next_entity) + { + if (entity->tag == NULL || + *(enum poliqarp_entity_type *) entity->tag != POLIQARP_ENTITY_POS) + { + continue; + } + values[i] = this = malloc(sizeof(*this)); + this->num_items = n_tags; + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, n_tags); + this->bits = malloc(this->num_bytes); + this->domain = POLIQARP_DOMAIN_TAG; + this->num_hits = 0; + memset(this->bits, 0, this->num_bytes); + i++; + this++; + } + for (i = 0; i < n_tags; i++) { + parsed_tag = poliqarp_backend_parsed_tag_fetch(backend_tag, i); + this = values[parsed_tag->pos->id]; + BIT_ARRAY_SET(this->bits, i); + values[parsed_tag->pos->id]->num_hits++; + } + return values; +} diff --git a/poliqarp-library/sakura/value-pos.h b/poliqarp-library/sakura/value-pos.h new file mode 100644 index 0000000000000000000000000000000000000000..153940944d650f5819e62d934ff97da41b9f6a53 --- /dev/null +++ b/poliqarp-library/sakura/value-pos.h @@ -0,0 +1,45 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. 
www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_POS_H +#define POLIQARP_VALUE_POS_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/exception.h> +#include <sakura/value.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ + +/** Create a value that matches given part of speech. */ +struct poliqarp_value *poliqarp_value_create_pos(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern); + +/** For each part of speech, create a value that matches it. */ +struct poliqarp_value **poliqarp_value_create_all_pos(const struct poliqarp_corpus *corpus, + size_t *n); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-space.c b/poliqarp-library/sakura/value-space.c new file mode 100644 index 0000000000000000000000000000000000000000..d5f4d650a29d0ba50af864c84d286e0a12a6e8b1 --- /dev/null +++ b/poliqarp-library/sakura/value-space.c @@ -0,0 +1,40 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. 
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/value-orth.h> +/* Create a one-item SPACE-domain value whose first bit-field word holds bool_value. `corpus` is not used. num_hits is left at 0, and strategy/use_disamb are not set here. */ +struct poliqarp_value *poliqarp_value_create_space(const struct poliqarp_corpus *corpus, + bool bool_value) +{ + struct poliqarp_value *this = NULL; + /* NOTE(review): neither malloc() result below is checked -- confirm allocation failure is handled elsewhere */ + this = malloc(sizeof *this); + this->num_items = 1; + this->num_bytes = BIT_ARRAY_LENGTH_BYTES(this->bits, this->num_items); + this->num_hits = 0; + this->bits = malloc(this->num_bytes); + this->bits[0] = bool_value; + this->domain = POLIQARP_DOMAIN_SPACE; + + return this; +} diff --git a/poliqarp-library/sakura/value-space.h b/poliqarp-library/sakura/value-space.h new file mode 100644 index 0000000000000000000000000000000000000000..af7d9eb77d3c6c6253c79ae5a75acd29944ee727 --- /dev/null +++ b/poliqarp-library/sakura/value-space.h @@ -0,0 +1,40 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved.
+ * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_SPACE_H +#define POLIQARP_VALUE_SPACE_H + +#include <sakura/corpus.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-space.h Values that match given space */ + +/** + * Create a value that matches in the _SPACE domain to the given boolean value. + */ +struct poliqarp_value *poliqarp_value_create_space(const struct poliqarp_corpus *corpus, + bool bool_value); + +/** @} */ +#endif diff --git a/poliqarp-library/sakura/value-tag.c b/poliqarp-library/sakura/value-tag.c new file mode 100644 index 0000000000000000000000000000000000000000..696e944d1a84cdfdf818414a1d17b94161cbed8d --- /dev/null +++ b/poliqarp-library/sakura/value-tag.c @@ -0,0 +1,33 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. 
(See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#include <sakura/value-tag.h> +/* Create a pattern value over the corpus tag dictionary and mark it as TAG domain. NOTE(review): the result of poliqarp_value_pattern_create() is dereferenced unchecked -- confirm it cannot return NULL. */ +struct poliqarp_value *poliqarp_value_create_tag(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this = NULL; + this = poliqarp_value_pattern_create(&corpus->tag.dict, pattern); + this->domain = POLIQARP_DOMAIN_TAG; + return this; +} diff --git a/poliqarp-library/sakura/value-tag.h b/poliqarp-library/sakura/value-tag.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad019dd0554bfa57c093a7044276c8cfd9bf733 --- /dev/null +++ b/poliqarp-library/sakura/value-tag.h @@ -0,0 +1,41 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license.
+ * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_TAG_H +#define POLIQARP_VALUE_TAG_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/exception.h> +#include <sakura/value-pattern.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ + +/** Create a value that matches given tag. */ +struct poliqarp_value *poliqarp_value_create_tag(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value-type.c b/poliqarp-library/sakura/value-type.c new file mode 100644 index 0000000000000000000000000000000000000000..dfc084df82771b395b749f3044285ce85ca4b7f9 --- /dev/null +++ b/poliqarp-library/sakura/value-type.c @@ -0,0 +1,40 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <errno.h> + +#include <sakura/value-type.h> +/* Create a pattern value over the syntax type dictionary and mark it as TYPE domain. Returns NULL with errno set to EINVAL when corpus->syntax.syntax is not set. */ +struct poliqarp_value *poliqarp_value_create_type( + const struct poliqarp_corpus *corpus, const struct poliqarp_regexp *pattern) +{ + struct poliqarp_value *this = NULL; + /* NOTE(review): the poliqarp_value_pattern_create() result below is dereferenced unchecked -- confirm it cannot return NULL */ + if (!corpus->syntax.syntax) { + errno = EINVAL; + return NULL; + } + this = poliqarp_value_pattern_create(&corpus->syntax.dict_types, pattern); + this->domain = POLIQARP_DOMAIN_TYPE; + return this; +} diff --git a/poliqarp-library/sakura/value-type.h b/poliqarp-library/sakura/value-type.h new file mode 100644 index 0000000000000000000000000000000000000000..347196fb1a997521c1371ca14919a15703b38ace --- /dev/null +++ b/poliqarp-library/sakura/value-type.h @@ -0,0 +1,40 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE.
+ */ + +#ifndef POLIQARP_VALUE_TYPE_H +#define POLIQARP_VALUE_TYPE_H + +#include <sakura/corpus.h> +#include <sakura/regexp.h> +#include <sakura/value-pattern.h> + +/** @addtogroup poliqarp_value */ +/** @{ */ +/** @file value-type.h Values that match given type */ + +struct poliqarp_value *poliqarp_value_create_type(const struct poliqarp_corpus *corpus, + const struct poliqarp_regexp *pattern); + +/** @} */ + +#endif diff --git a/poliqarp-library/sakura/value.c b/poliqarp-library/sakura/value.c new file mode 100644 index 0000000000000000000000000000000000000000..ec3ea647424c535eb7f7a899fa9575a0945a9e29 --- /dev/null +++ b/poliqarp-library/sakura/value.c @@ -0,0 +1,62 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <sakura/value.h> +/* Optimisation of value pairs is not implemented yet: always reports false. */ +bool poliqarp_value_can_be_optimized(const struct poliqarp_value *this, + const struct poliqarp_value *that, enum poliqarp_logic_operator oper) +{ + /* TODO */ + return false; +} +/* Not implemented: aborts the process if it is ever called. */ +struct poliqarp_value *poliqarp_value_optimize(const struct poliqarp_value *this, + const struct poliqarp_value *that, enum poliqarp_logic_operator oper) +{ + abort(); /* See the TODO above. */ +} +/* Returns 0 iff domain, strategy, hit count, byte length and bit contents all match. Any metadata mismatch yields -1, so a non-zero result only means "different". The byte lengths are compared first so that memcmp() never reads past the end of that->bits. */ +int poliqarp_value_compare(const struct poliqarp_value *this, + const struct poliqarp_value *that) +{ + return this->domain == that->domain && this->strategy == that->strategy && + this->num_hits == that->num_hits && this->num_bytes == that->num_bytes ? + memcmp(this->bits, that->bits, this->num_bytes) : -1; +} +/* Frees the bit field and then the value itself; `this` must not be NULL. */ +void poliqarp_value_destroy(struct poliqarp_value *this) +{ + free(this->bits); + free(this); +} +/* Stores the match strategy in the value. */ +void poliqarp_value_use_strategy(struct poliqarp_value *this, + enum poliqarp_match_strategy strategy) +{ + this->strategy = strategy; +} +/* Stores the disambiguation flag in the value. */ +void poliqarp_value_use_disamb(struct poliqarp_value *this, bool use_disamb) +{ + this->use_disamb = use_disamb; +} diff --git a/poliqarp-library/sakura/value.h b/poliqarp-library/sakura/value.h new file mode 100644 index 0000000000000000000000000000000000000000..7775847b377af3350c45e23944b8162f4d400526 --- /dev/null +++ b/poliqarp-library/sakura/value.h @@ -0,0 +1,254 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.)
+ * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +#ifndef POLIQARP_VALUE_H +#define POLIQARP_VALUE_H + +#include <stdlib.h> + +#include <sakura/common/bit-routines.h> +#include <sakura/corpus.h> +#include <sakura/abi.h> + +/** @defgroup poliqarp_value Expression Value Representation */ +/** @{ */ +/** @file value.h Expression value representation */ + +/** + * Match domain. + * + * Some values match only in a specific domain. + * For example the expression: + * base='foo' + * will create an expression that will match in the POLIQARP_DOMAIN_BASE domain + * to all instances of the word 'foo'. + * This is used to know which dictionary to use. + **/ +enum poliqarp_match_domain { + POLIQARP_DOMAIN_ORTH, /**< Orth domain. */ + POLIQARP_DOMAIN_SPACE, /**< Space domain. */ + POLIQARP_DOMAIN_BASE, /**< Base domain. */ + POLIQARP_DOMAIN_TAG, /**< Tag domain. */ + POLIQARP_DOMAIN_INTERP__DISAMB, /**< Disambiguated interpretation domain. */ + POLIQARP_DOMAIN_INTERP__AMB, /**< Ambiguous interpretation domain. */ + POLIQARP_DOMAIN_TYPE /**< Phrasal type domain. */ +}; + +/** + * Match strategy. Determines how the value behaves when attempting to match + * given position. Directly resembles differences between the '=' and '~' + * operators. + */ +enum poliqarp_match_strategy { + POLIQARP_STRATEGY_ANY, /**< Match succeeds when any interpretation is + matched. */ + POLIQARP_STRATEGY_ALL /**< Match succeeds when all interpretations are + matched. */ +}; + +/** Logic operator, used to create complex expressions. */ +enum poliqarp_logic_operator { + POLIQARP_OPERATOR_AND, /**< And operator.
*/ + POLIQARP_OPERATOR_OR /**< Or operator. */ +}; + +/** + * Value structure. + * + * Values can be evaluated on given corpus and segment. + * The result of such evaluation can be one of: 'matches' or 'does not match'. + * Values can be combined to build expressions. + */ +struct poliqarp_value { + unsigned *bits; /**< Bit field indicating 'match' or 'no match'. */ + size_t num_bytes; /**< Length of that field in bytes. */ + size_t num_items; /**< Number of bits used in that field. */ + size_t num_hits; /**< Number of ones in the used part of that field. */ + enum poliqarp_match_domain domain; /**< Operator domain. */ + enum poliqarp_match_strategy strategy; + /**< Operator match strategy. */ + bool use_disamb; /**< Process only disambiguated interpretations. */ +}; + +/** + * Check if an expression made of two values and operator can be optimized. + * @param this Left value. + * @param that Right value. + * @param oper Operator. + */ +bool poliqarp_value_can_be_optimized(const struct poliqarp_value *this, + const struct poliqarp_value *that, enum poliqarp_logic_operator oper); + +/** + * Optimize an expression made of two values and an operator. + * This function makes sure that this and that are taken care of. + * Caller is responsible for freeing the result. NOTE(review): value.c currently aborts here (unimplemented). + * @param this Left value. + * @param that Right value. + * @param oper Operator. + */ +struct poliqarp_value *poliqarp_value_optimize(const struct poliqarp_value *this, + const struct poliqarp_value *that, enum poliqarp_logic_operator oper); + +/** + * Comparison operator. + * This is used to enforce some ordering of values so that they can be + * sorted and looked up quickly. NOTE(review): value.c returns -1 for any metadata mismatch, so only zero vs. non-zero is reliable -- confirm callers. + **/ +int poliqarp_value_compare(const struct poliqarp_value *this, + const struct poliqarp_value *that); + +/** Value destructor. */ +void poliqarp_value_destroy(struct poliqarp_value *this); + +/** Modify the value to use given match strategy.
*/ +void poliqarp_value_use_strategy(struct poliqarp_value *this, + enum poliqarp_match_strategy strategy); + +/** Modify the value to use given disambiguation settings. */ +void poliqarp_value_use_disamb(struct poliqarp_value *this, bool use_disamb); + +/** + * MATCH_ANY strategy, TAG domain: true if any interpretation in + * [interp, interp_end) has its tag_id bit set in bits; false if none or empty. + */ +static inline bool poliqarp_value_micro_eval_tag_any(unsigned *bits, + const struct poliqarp_binary_interp *interp, + const struct poliqarp_binary_interp *interp_end) +{ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.tag_id)) + return true; + } + return false; +} + +/** + * MATCH_ALL strategy, TAG domain: true only if [interp, interp_end) is + * non-empty and every interpretation has its tag_id bit set in bits. + */ +static inline bool poliqarp_value_micro_eval_tag_all(unsigned *bits, + const struct poliqarp_binary_interp *interp, + const struct poliqarp_binary_interp *interp_end) +{ + /* first element only: a clear bit fails at once; an empty range falls through to `return false` */ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.tag_id) == false) + return false; + else + goto found_true; + } + return false; +found_true: + /* the jump above skips ++interp, so the first element is tested again + * here; any clear bit fails the match */ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.tag_id) == false) + return false; + } + return true; +} + +/** + * MATCH_ANY strategy, BASE domain: true if any interpretation in
+ * [interp, interp_end) has its base_id bit set in bits; false if none or empty. + */ +static inline bool poliqarp_value_micro_eval_base_any(unsigned *bits, + const struct poliqarp_binary_interp *interp, + const struct poliqarp_binary_interp *interp_end) +{ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.base_id)) + return true; + } + return false; +} + +/** + * MATCH_ALL strategy, BASE domain: true only if [interp, interp_end) is + * non-empty and every interpretation has its base_id bit set in bits. + */ +static inline bool poliqarp_value_micro_eval_base_all(unsigned *bits, + const struct poliqarp_binary_interp *interp, + const struct poliqarp_binary_interp *interp_end) +{ + /* first element only: a clear bit fails at once; an empty range falls through to `return false` */ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.base_id) == false) + return false; + else + goto found_true; + } + return false; +found_true: + /* the jump above skips ++interp, so the first element is tested again + * here; any clear bit fails the match */ + for (; interp < interp_end; ++interp) { + struct poliqarp_binary_interp terp = *interp; + POLIQARP_INTERP_LE_TO_HE(terp); + if (BIT_ARRAY_GET(bits, terp.base_id) == false) + return false; + } + return true; +} + +/** + * Evaluate value and return the result. + * @param this Value object. + * @param corpus Corpus object. + * @param pos Position for which the value is calculated.
+ */ +static inline bool poliqarp_value_eval(const struct poliqarp_value *this, + const struct poliqarp_corpus *corpus, + const struct poliqarp_binary_segment *pos) +{ + switch (this->domain) { + case POLIQARP_DOMAIN_ORTH: /* the orth index is orth_space_id without its low bit */ + return BIT_ARRAY_GET(this->bits, pos->orth_space_id >> 1); + case POLIQARP_DOMAIN_SPACE: /* the low bit of orth_space_id is compared with bit 0 of the value */ + return BIT_ARRAY_GET(this->bits, 0) == (pos->orth_space_id & 1); + case POLIQARP_DOMAIN_INTERP__DISAMB: + return BIT_ARRAY_GET(this->bits, pos->interp_disamb_id); + case POLIQARP_DOMAIN_INTERP__AMB: + return BIT_ARRAY_GET(this->bits, pos->interp_amb_id); + case POLIQARP_DOMAIN_TYPE: /* TYPE values never match here */ + return false; + default: /* BASE and TAG have no case in this switch and end up here */ + abort(); /* Should not happen. */ + } +} + +/** @} */ + +#endif /* POLIQARP_VALUE_H */ + diff --git a/poliqarp-library/unibits/strcoll.c b/poliqarp-library/unibits/strcoll.c new file mode 100644 index 0000000000000000000000000000000000000000..a29d3979e841c26a71be799b2e9d964bbd53b52f --- /dev/null +++ b/poliqarp-library/unibits/strcoll.c @@ -0,0 +1,160 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE.
+ */ + +#include <errno.h> +#include <limits.h> +#include <stdbool.h> +#include <stdlib.h> +#include <wchar.h> +#include <wctype.h> +#include <string.h> + +#ifdef _WIN32 +#include <windows.h> +#endif +/* Convert the multibyte string src to wide characters like mbstowcs(); on Windows the input is always decoded as UTF-8. Returns (size_t)-1 on failure. */ +static inline size_t unibits_mbstowcs(wchar_t *dest, const char *src, size_t n) +{ +#ifdef _WIN32 + int rc; + if (dest == NULL) + n = 0; + else if (n == 0) + return 0; + rc = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, dest, n); + if (rc == 0) { + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) + errno = ENOMEM; + else + errno = EILSEQ; + } + return (size_t)(rc - 1); +#else + return mbstowcs(dest, src, n); +#endif +} +/* Probe whether 1-, 2- and 3-byte UTF-8 sequences each decode to exactly one wide character. Returns 0 if they do, 1 if decoding failed with EILSEQ, -1 otherwise. */ +int unibits_validate_utf8(void) +{ + errno = 0; /* a successful conversion need not touch errno, so clear any stale value before it is inspected below */ + if ( + unibits_mbstowcs(NULL, "a", 0) == 1 && + unibits_mbstowcs(NULL, "\xce\xb1" /* GREEK SMALL LETTER ALPHA */, 0) == 1 && + unibits_mbstowcs(NULL, "\xe0\xa4\x85" /* DEVANAGARI LETTER A */, 0) == 1 + /* No check for anything beyond BMP, as Windows does not support that. */ + ) + /* The condition is expected to be true for UTF-8 encoding *only*.
*/ + return 0; + if (errno == EILSEQ) + return 1; + return -1; +} +/* Reverse the wide string w in place and return w (the start of the string) on every platform. */ +static inline wchar_t* unibits_wcsrev(wchar_t *w) +{ +#ifdef _WIN32 + return wcsrev(w); +#else + wchar_t *head = w, *end = wcsrchr(w, L'\0'); + if (end == w) + return w; + end--; + while (head < end) { + wchar_t tmp = *head; + *head = *end; + *end = tmp; + head++; + end--; + } + return w; +#endif +} +/* Case-insensitive collation; the non-Windows path lower-cases both arguments in place before calling wcscoll(). */ +static inline int unibits_wcsicoll(wchar_t *w1, wchar_t *w2) +{ +#ifdef _WIN32 + return wcsicoll(w1, w2); +#else + size_t i; + for (i = 0; w1[i] != L'\0'; i++) + w1[i] = towlower(w1[i]); + for (i = 0; w2[i] != L'\0'; i++) + w2[i] = towlower(w2[i]); + return wcscoll(w1, w2); +#endif +} +/* Decode both strings into temporary wide copies, optionally reverse them (atergo) and collate them. Returns INT_MIN if decoding or allocation fails. */ +static inline int unibits_strcoll(const char *s1, const char *s2, bool atergo, bool case_sensitive) +{ + wchar_t *w1 = NULL, *w2 = NULL; + size_t n1 = unibits_mbstowcs(NULL, s1, 0) + 1; + size_t n2 = unibits_mbstowcs(NULL, s2, 0) + 1; + if (n1 == 0 || n2 == 0) + goto error; + w1 = malloc(n1 * sizeof (wchar_t)); + if (w1 == NULL) + goto error; + w2 = malloc(n2 * sizeof (wchar_t)); + if (w2 == NULL) + goto error; + if (unibits_mbstowcs(w1, s1, n1) + 1 != n1) + goto error; + if (unibits_mbstowcs(w2, s2, n2) + 1 != n2) + goto error; + if (atergo) { + unibits_wcsrev(w1); + unibits_wcsrev(w2); + } + int rc; + if (case_sensitive) + rc = wcscoll(w1, w2); + else + rc = unibits_wcsicoll(w1, w2); + free(w1); + free(w2); + return rc; +error: + free(w1); + free(w2); + return INT_MIN; +} + +int unibits_afronte_strcoll(const char *s1, const char *s2) +{ + return unibits_strcoll(s1, s2, false, true); +} + +int unibits_afronte_stricoll(const char *s1, const char *s2) +{ + return unibits_strcoll(s1, s2, false, false); +} + +int unibits_atergo_strcoll(const char *s1, const char *s2) +{ + return unibits_strcoll(s1, s2, true, true); +} + +int unibits_atergo_stricoll(const char *s1, const char *s2) +{ + return unibits_strcoll(s1, s2, true, false); +} diff --git a/poliqarp-library/unibits/strcoll.h b/poliqarp-library/unibits/strcoll.h new file mode
100644 index 0000000000000000000000000000000000000000..ac9a752adbcec6bd53235d5d0ad2d76e44d1da5b --- /dev/null +++ b/poliqarp-library/unibits/strcoll.h @@ -0,0 +1,96 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. + */ + +/** + * Check if the current locale settings are appropriate for + * unibits_afronte_strcoll and its counterparts. + * + * @note In case of an unsupported locale, <code>errno</code> is set to + * <code>EILSEQ</code>. + * + * @return 0 for a supported locale, 1 for an unsupported one, -1 on error. + * + * @see unibits_afronte_strcoll + * @see unibits_afronte_stricoll + * @see unibits_atergo_strcoll + * @see unibits_atergo_stricoll + */ +int unibits_validate_utf8(void); + + +/** + * Compare two UTF-8 strings. Return an integer less than, equal to, or greater + * than zero if s1 is found, respectively, to be less than, to match, or be + * greater than s2. + * + * @note The comparison is based on strings interpreted as appropriate for the + * current locale for category LC_COLLATE.
+ * + * @note For Unix systems, the character encoding used in the current locale is + * expected to be UTF-8. For Windows system, this function is not dependent + * upon the locale character set. + */ +int unibits_afronte_strcoll(const char *s1, const char *s2); + +/** + * Compare two UTF-8 encoded strings, ignoring the case of the characters. + * Return an integer less than, equal to, or greater than zero if s1 is found, + * respectively, to be less than, to match, or be greater than s2. + * + * @note The comparison is based on strings interpreted as appropriate for the + * current locale for category LC_COLLATE. + * + * @note For Unix systems, the character encoding used in the current locale is + * expected to be UTF-8. For Windows systems, this function is not dependent + * upon the locale character set. + */ +int unibits_afronte_stricoll(const char *s1, const char *s2); + +/** + * Compare two UTF-8 encoded strings, using reverse order of characters. + * Return an integer less than, equal to, or greater than zero if s1 is found, + * respectively, to be less than, to match, or be greater than s2. + * + * @note The comparison is based on strings interpreted as appropriate for the + * current locale for category LC_COLLATE. + * + * @note For Unix systems, the character encoding used in the current locale is + * expected to be UTF-8. For Windows systems, this function is not dependent + * upon the locale character set. + */ +int unibits_atergo_strcoll(const char *s1, const char *s2); + +/** + * Compare two UTF-8 encoded strings, using reverse order of characters, + * ignoring the case of the characters. Return an integer less than, equal to, + * or greater than zero if s1 is found, respectively, to be less than, to + * match, or be greater than s2. + * + * @note The comparison is based on strings interpreted as appropriate for the + * current locale for category LC_COLLATE. 
+ * + * @note For Unix systems, the character encoding used in the current locale is + * expected to be UTF-8. For Windows systems, this function is not dependent + * upon the locale character set. + */ +int unibits_atergo_stricoll(const char *s1, const char *s2); diff --git a/poliqarp-library/unibits/tclUniData.h b/poliqarp-library/unibits/tclUniData.h new file mode 100644 index 0000000000000000000000000000000000000000..c2a688b55a6c2d10b193f6fbbb6b7eb0e045ca6a --- /dev/null +++ b/poliqarp-library/unibits/tclUniData.h @@ -0,0 +1,905 @@ +/* + * tclUniData.c -- + * + * Declarations of Unicode character information tables. This file is + * automatically generated by the tools/uniParse.tcl script. Do not + * modify this file by hand. + * + * Copyright (c) 1998 by Scriptics Corporation. + * All rights reserved. + * + * See the file ``doc/COPYING.Tcl`` for information on usage and redistribution + * of this file. + */ + +/* + * A 16-bit Unicode character is split into two parts in order to index + * into the following tables. The lower OFFSET_BITS comprise an offset + * into a page of characters. The upper bits comprise the page number. + */ + +#define OFFSET_BITS 5 + +/* + * The pageMap is indexed by page number and returns an alternate page number + * that identifies a unique page of characters. Many Unicode characters map + * to the same alternate page number. 
+ */ + +static unsigned char pageMap[] = { + 0, 1, 2, 3, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 7, 15, 16, 17, + 18, 19, 20, 21, 22, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 7, 32, + 7, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 47, + 48, 49, 50, 51, 52, 35, 47, 53, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 58, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 80, 81, + 84, 85, 80, 86, 87, 88, 89, 90, 91, 92, 35, 93, 94, 95, 35, 96, 97, + 98, 99, 100, 101, 102, 35, 47, 103, 104, 35, 35, 105, 106, 107, 47, + 47, 108, 47, 47, 109, 47, 110, 111, 47, 112, 47, 113, 114, 115, 116, + 114, 47, 117, 118, 35, 47, 47, 119, 90, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 120, 121, 47, 47, 122, + 35, 35, 35, 35, 47, 123, 124, 125, 126, 47, 127, 128, 47, 129, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 7, 7, 7, 7, 130, 7, 7, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, + 149, 150, 151, 152, 153, 154, 155, 156, 156, 156, 156, 156, 156, 156, + 157, 158, 159, 160, 161, 162, 35, 35, 35, 160, 163, 164, 165, 166, + 167, 168, 169, 160, 160, 160, 160, 170, 171, 172, 173, 174, 160, 160, + 175, 35, 35, 35, 35, 176, 177, 178, 179, 180, 181, 35, 35, 160, 160, + 160, 160, 160, 160, 160, 160, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 182, 160, 160, 155, 160, 160, 160, 160, 160, 160, 170, 183, 184, 185, + 90, 47, 186, 90, 47, 187, 188, 189, 47, 47, 190, 128, 35, 35, 191, + 192, 193, 194, 192, 195, 196, 197, 160, 160, 160, 198, 160, 160, 199, + 197, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 
47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 200, 35, 35, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 
47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 201, 35, 35, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 202, 203, 204, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 
47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 205, 35, 35, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, + 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 
207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 47, 47, 47, 47, 47, 47, 47, 47, 47, 208, 35, 35, 35, 35, + 35, 35, 209, 210, 211, 47, 47, 212, 213, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 214, 215, 47, 216, 47, 217, 218, 35, 219, 220, 221, 47, + 47, 47, 222, 223, 2, 224, 225, 226, 227, 228, 229, 230, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 231, 35, 232, 233, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 208, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 47, 234, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 
35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 235, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 
35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 207, 207, 207, 236, 207, 207, 207, 207, 207, 207, 207, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 
35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 
35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35 +}; + +/* + * The groupMap is indexed by combining the alternate page number with + * the page offset and returns a group number that identifies a unique + * set of character attributes. 
+ */ + +static unsigned char groupMap[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 4, 3, 3, 3, 5, 6, 3, 7, 3, 8, + 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 7, 7, 7, 3, 3, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 5, 3, 6, 11, 12, 11, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 5, 7, 6, 7, 1, 2, 3, 4, 4, 4, 4, 14, 14, 11, 14, 15, 16, + 7, 8, 14, 11, 14, 7, 17, 17, 11, 18, 14, 3, 11, 17, 15, 19, 17, 17, + 17, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 7, 10, 10, 10, 10, 10, 10, 10, 15, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 7, 13, 13, 13, 13, 13, 13, 13, 20, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 23, 24, 21, 22, 21, + 22, 21, 22, 15, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, + 22, 21, 22, 15, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 25, + 21, 22, 21, 22, 21, 22, 26, 15, 27, 21, 22, 21, 22, 28, 21, 22, 29, + 29, 21, 22, 15, 30, 31, 32, 21, 22, 29, 33, 34, 35, 36, 21, 22, 15, + 15, 35, 37, 15, 38, 21, 22, 21, 22, 21, 22, 39, 21, 22, 39, 15, 15, + 21, 22, 39, 21, 22, 40, 40, 21, 22, 21, 22, 41, 21, 22, 15, 42, 21, + 22, 15, 43, 42, 42, 42, 42, 44, 45, 46, 44, 45, 46, 44, 45, 46, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 47, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, + 15, 44, 45, 46, 21, 22, 48, 49, 21, 22, 21, 22, 21, 22, 21, 22, 0, + 0, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 
22, + 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 15, 50, 51, 15, 52, 52, 15, 53, 15, + 54, 15, 15, 15, 15, 52, 15, 15, 55, 15, 15, 15, 15, 56, 57, 15, 15, + 15, 15, 15, 57, 15, 15, 58, 15, 15, 59, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 60, 15, 15, 60, 15, 15, 15, 15, 60, 15, 61, 61, 15, 15, + 15, 15, 15, 15, 62, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 0, 0, 63, + 63, 63, 63, 63, 63, 63, 63, 63, 11, 11, 63, 63, 63, 63, 63, 63, 63, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 63, 63, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 63, 63, 63, 63, + 63, 11, 11, 11, 11, 11, 11, 11, 11, 11, 63, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 11, + 0, 0, 0, 0, 63, 0, 0, 0, 3, 0, 0, 0, 0, 0, 11, 11, 66, 3, 67, 67, 67, + 0, 68, 0, 69, 69, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 70, 71, + 71, 71, 15, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 72, 13, 13, 13, 13, 13, 13, 13, 13, 13, 73, 74, 74, 0, + 75, 76, 77, 77, 77, 78, 79, 15, 0, 0, 21, 22, 21, 22, 21, 22, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 80, 81, 47, + 15, 82, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 81, 81, 
81, 81, 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 21, 22, 14, 64, 64, 64, 64, 0, 85, 85, 0, 0, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, + 22, 77, 21, 22, 21, 22, 0, 0, 21, 22, 0, 0, 21, 22, 0, 0, 0, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, + 21, 22, 0, 0, 21, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86, 86, 86, 86, + 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, + 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, + 0, 0, 63, 3, 3, 3, 3, 3, 3, 0, 87, 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 15, 0, 3, 8, 0, 0, + 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 64, 64, 64, 3, 64, 3, 64, + 64, 3, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 0, 0, 0, 0, 0, 42, 42, 42, 3, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 0, 0, 0, 0, 0, 63, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 0, 0, 64, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 
42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 3, 42, 64, + 64, 64, 64, 64, 64, 64, 85, 85, 64, 64, 64, 64, 64, 64, 63, 63, 64, + 64, 14, 64, 64, 64, 64, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 42, 42, + 42, 14, 14, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 88, 42, + 64, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, + 64, 89, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 0, 0, 64, 42, 89, 89, 89, 64, 64, 64, 64, 64, 64, + 64, 64, 89, 89, 89, 89, 64, 0, 0, 42, 64, 64, 64, 64, 0, 0, 0, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 64, 64, 3, 3, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, + 89, 89, 0, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 42, 42, 0, 0, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 0, 42, 0, 0, 0, 42, + 42, 42, 42, 0, 0, 64, 0, 89, 89, 89, 64, 64, 64, 64, 0, 0, 89, 89, + 0, 0, 89, 89, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89, 0, 0, 0, 0, 42, 42, + 0, 42, 42, 42, 64, 64, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 42, 42, + 4, 4, 17, 17, 17, 17, 17, 17, 14, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 42, + 42, 42, 42, 42, 42, 0, 0, 0, 0, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, + 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 0, 42, 42, 0, 42, 42, 0, 0, + 64, 0, 89, 89, 89, 64, 64, 0, 0, 0, 0, 64, 64, 0, 0, 64, 64, 64, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 
42, 42, 42, 0, 42, 0, 0, 0, 0, 0, + 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 64, 64, 42, 42, 42, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 89, 0, 42, 42, 42, 42, 42, 42, 42, + 0, 42, 0, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, + 42, 42, 0, 42, 42, 0, 42, 42, 42, 42, 42, 0, 0, 64, 42, 89, 89, 89, + 64, 64, 64, 64, 64, 0, 64, 64, 89, 0, 89, 89, 64, 0, 0, 42, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 0, 42, + 42, 0, 0, 42, 42, 42, 42, 0, 0, 64, 42, 89, 64, 89, 64, 64, 64, 0, + 0, 0, 89, 89, 0, 0, 89, 89, 64, 0, 0, 0, 0, 0, 0, 0, 0, 64, 89, 0, + 0, 0, 0, 42, 42, 0, 42, 42, 42, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 89, + 0, 42, 42, 42, 42, 42, 42, 0, 0, 0, 42, 42, 42, 0, 42, 42, 42, 42, + 0, 0, 0, 42, 42, 0, 42, 0, 42, 42, 0, 0, 0, 42, 42, 0, 0, 0, 42, 42, + 42, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 0, 0, 0, + 0, 89, 89, 64, 89, 89, 0, 0, 0, 89, 89, 89, 0, 89, 89, 89, 64, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 89, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 17, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 89, 89, 89, 0, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, + 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 0, 42, 42, 42, 42, 42, 0, 0, 0, 0, 64, 64, 64, 89, 89, + 89, 89, 0, 64, 64, 64, 0, 64, 64, 64, 64, 0, 0, 0, 0, 0, 0, 0, 64, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89, + 89, 0, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 0, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 
+ 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, + 42, 42, 0, 0, 0, 0, 89, 64, 89, 89, 89, 89, 89, 0, 64, 89, 89, 0, 89, + 89, 64, 64, 0, 0, 0, 0, 0, 0, 0, 89, 89, 0, 0, 0, 0, 0, 0, 0, 42, 0, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 89, 89, 89, 64, 64, + 64, 0, 0, 89, 89, 89, 0, 89, 89, 89, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 89, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89, 89, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 0, 0, + 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 64, 0, 0, 0, 0, 89, 89, 89, 64, + 64, 64, 0, 64, 0, 89, 89, 89, 89, 89, 89, 89, 89, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 89, 89, 3, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 64, 42, 42, 64, 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 4, 42, 42, + 42, 42, 42, 42, 63, 64, 64, 64, 64, 64, 64, 64, 64, 3, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 3, 3, 0, 0, 0, 0, 0, 42, 42, 0, 42, 0, 0, 42, 42, + 0, 42, 0, 0, 42, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 0, 42, 42, 42, 42, + 42, 42, 42, 0, 42, 42, 42, 0, 42, 0, 42, 0, 0, 42, 42, 0, 42, 42, 42, + 42, 64, 42, 42, 64, 64, 64, 64, 64, 64, 0, 64, 64, 42, 0, 0, 42, 42, + 42, 42, 42, 0, 63, 0, 64, 64, 64, 64, 64, 64, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 0, 0, 42, 42, 0, 0, 42, 14, 14, 14, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 14, 14, 14, 14, 14, 64, 64, 14, 14, 14, + 14, 14, 14, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 14, 64, 14, 64, 14, 64, 5, 6, 5, 6, 89, 89, 42, 42, 42, + 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 
42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 89, 64, 64, 64, 64, 64, 3, 64, 64, 42, + 42, 42, 42, 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, 64, 0, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 0, 14, 14, 14, 14, 14, 14, 14, 14, 64, 14, 14, 14, 14, 14, 14, 0, 0, + 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 0, 42, + 42, 42, 42, 42, 0, 42, 42, 0, 89, 64, 64, 64, 64, 89, 64, 0, 0, 0, + 64, 64, 89, 64, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, + 3, 3, 3, 3, 3, 42, 42, 42, 42, 42, 42, 89, 89, 64, 64, 0, 0, 0, 0, + 0, 0, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, + 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, + 77, 77, 77, 77, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 0, 0, 0, 0, 3, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, + 0, 0, 0, 0, 42, 42, 42, 42, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 42, 42, 42, + 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 0, 42, 0, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 42, + 0, 42, 0, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 42, 0, 42, + 0, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 0, 42, 0, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 
+ 42, 0, 42, 0, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 42, 0, + 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, + 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 3, 3, 42, 42, 42, 42, 42, + 42, 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 5, 6, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 3, 3, 3, 90, 90, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 89, 89, 89, 64, 64, 64, 64, 64, 64, 64, 89, 89, 89, 89, 89, + 89, 89, 89, 64, 89, 89, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 3, 3, 3, 3, 3, 3, 3, 4, 3, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, + 3, 3, 3, 3, 8, 3, 3, 3, 3, 88, 88, 88, 88, 0, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 0, 0, 0, 0, 0, 0, 42, 42, 42, 63, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 0, + 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 64, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 22, 21, 22, 21, 22, 21, + 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, 15, 15, + 15, 15, 15, 91, 0, 0, 0, 0, 21, 22, 21, 22, 21, 22, 21, 22, 21, 22, + 21, 22, 21, 22, 21, 22, 21, 22, 
21, 22, 21, 22, 21, 22, 21, 22, 0, + 0, 0, 0, 0, 0, 92, 92, 92, 92, 92, 92, 92, 92, 93, 93, 93, 93, 93, + 93, 93, 93, 92, 92, 92, 92, 92, 92, 0, 0, 93, 93, 93, 93, 93, 93, 0, + 0, 92, 92, 92, 92, 92, 92, 92, 92, 93, 93, 93, 93, 93, 93, 93, 93, + 92, 92, 92, 92, 92, 92, 92, 92, 93, 93, 93, 93, 93, 93, 93, 93, 92, + 92, 92, 92, 92, 92, 0, 0, 93, 93, 93, 93, 93, 93, 0, 0, 15, 92, 15, + 92, 15, 92, 15, 92, 0, 93, 0, 93, 0, 93, 0, 93, 92, 92, 92, 92, 92, + 92, 92, 92, 93, 93, 93, 93, 93, 93, 93, 93, 94, 94, 95, 95, 95, 95, + 96, 96, 97, 97, 98, 98, 99, 99, 0, 0, 92, 92, 92, 92, 92, 92, 92, 92, + 100, 100, 100, 100, 100, 100, 100, 100, 92, 92, 92, 92, 92, 92, 92, + 92, 100, 100, 100, 100, 100, 100, 100, 100, 92, 92, 92, 92, 92, 92, + 92, 92, 100, 100, 100, 100, 100, 100, 100, 100, 92, 92, 15, 101, 15, + 0, 15, 15, 93, 93, 102, 102, 103, 11, 104, 11, 11, 11, 15, 101, 15, + 0, 15, 15, 105, 105, 105, 105, 103, 11, 11, 11, 92, 92, 15, 15, 0, + 0, 15, 15, 93, 93, 106, 106, 0, 11, 11, 11, 92, 92, 15, 15, 15, 107, + 15, 15, 93, 93, 108, 108, 109, 11, 11, 11, 0, 0, 15, 101, 15, 0, 15, + 15, 110, 110, 111, 111, 103, 11, 11, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 88, 88, 88, 88, 8, 8, 8, 8, 8, 8, 3, 3, 16, 19, 5, 16, 16, + 19, 5, 16, 3, 3, 3, 3, 3, 3, 3, 3, 112, 113, 88, 88, 88, 88, 88, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 16, 19, 3, 3, 3, 3, 12, 12, 3, 3, 3, 7, + 5, 6, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 88, 88, 88, 88, 17, + 0, 0, 0, 17, 17, 17, 17, 17, 17, 7, 7, 7, 5, 6, 15, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 7, 7, 7, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 85, 85, 85, 85, 64, 85, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 
77, + 14, 14, 14, 14, 77, 14, 14, 15, 77, 77, 77, 15, 15, 77, 77, 77, 15, + 14, 77, 14, 14, 14, 77, 77, 77, 77, 77, 14, 14, 14, 14, 14, 14, 77, + 14, 114, 14, 77, 14, 115, 116, 77, 77, 14, 15, 77, 77, 14, 77, 15, + 42, 42, 42, 42, 15, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 118, 118, 118, 118, 118, 118, 118, 118, 118, + 118, 118, 118, 118, 118, 118, 118, 90, 90, 90, 90, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 14, 14, 14, 14, 14, 7, 7, 14, 14, + 14, 14, 7, 14, 14, 7, 14, 14, 7, 14, 14, 14, 14, 14, 14, 14, 7, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 7, 7, 14, 14, 7, + 14, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 7, 7, 7, 7, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 7, 7, 14, 14, 14, 14, 14, 14, 14, 5, 6, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, + 14, 14, 14, 14, 14, 14, 14, 
14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 119, 119, 119, 119, 119, 119, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 7, 14, 14, 14, 14, 14, 14, 14, 14, 14, 7, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 7, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, + 14, 14, 14, 0, 14, 14, 14, 14, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 0, 14, 0, 14, 14, 14, 14, 0, 0, 0, 14, 0, 14, 14, + 14, 14, 14, 14, 14, 0, 0, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 
17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 14, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 0, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, + 0, 0, 0, 2, 3, 3, 3, 14, 63, 42, 90, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, + 14, 14, 5, 6, 5, 6, 5, 6, 5, 6, 8, 5, 6, 6, 14, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 64, 64, 64, 64, 64, 64, 8, 63, 63, 63, 63, 63, 14, + 14, 90, 90, 90, 0, 0, 0, 14, 14, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 64, 64, + 11, 11, 63, 63, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 12, 63, + 63, 63, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 14, 14, 17, 17, 17, + 17, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 17, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 14, 14, 14, 0, 14, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 42, 42, 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 122, 122, 122, + 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, + 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, + 122, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 15, 15, 15, + 15, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 15, 15, 15, 15, 0, + 0, 0, 0, 0, 42, 64, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 7, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 42, 42, 42, 42, + 42, 0, 42, 0, 42, 42, 0, 42, 42, 0, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 
42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, 0, 64, 64, 64, 64, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 8, 8, 12, 12, 5, 6, 5, 6, 5, + 6, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 0, 0, 0, 0, 3, 3, 3, 3, 12, 12, 12, + 3, 3, 3, 0, 3, 3, 3, 3, 8, 5, 6, 5, 6, 5, 6, 3, 3, 3, 7, 8, 7, 7, 7, + 0, 3, 4, 3, 3, 0, 0, 0, 0, 42, 42, 42, 0, 42, 0, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 0, 0, 88, 0, 3, 3, 3, 4, 3, 3, 3, 5, 6, 3, 7, 3, 8, 3, 3, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 3, 3, 7, 7, 7, 3, 11, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 5, 7, 6, 7, 0, 0, 3, 5, 6, 3, 12, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 63, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 63, + 63, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 0, 0, 0, + 42, 42, 42, 42, 42, 42, 0, 0, 42, 42, 42, 42, 42, 42, 0, 0, 42, 42, + 42, 42, 
42, 42, 0, 0, 42, 42, 42, 0, 0, 0, 4, 4, 7, 11, 14, 4, 4, 0, + 14, 7, 7, 7, 7, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 88, 88, 14, + 14, 42, 17, 42, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123, 123, 123, + 126, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 89, 64, 14, 14, 14, + 14, 14, 0, 0, 77, 77, 15, 15, 77, 15, 15, 77, 77, 15, 77, 77, 15, 77, + 77, 15, 15, 77, 15, 15, 77, 77, 15, 77, 77, 15, 77, 77, 15, 15, 77, + 15, 15, 77, 77, 15, 77, 77, 15, 77, 77, 15, 15, 77, 77, 15, 15, 77, + 15, 15, 77, 77, 15, 15, 77, 15, 15, 77, 77, 15, 15, 9, 9, 9, 42, 42, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 88, 0, 88, 88, 88, 88, 88, 88, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122, 122, + 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, + 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, + 122 +}; + +/* + * Each group represents a unique set of character attributes. The attributes + * are encoded into a 32-bit value as follows: + * + * Bits 0-4 Character category: see the constants listed below. + * + * Bits 5-7 Case delta type: 000 = identity + * 010 = add delta for lower + * 011 = add delta for lower, add 1 for title + * 100 = subtract delta for title/upper + * 101 = sub delta for upper, sub 1 for title + * 110 = sub delta for upper, add delta for lower + * + * Bits 8-21 Reserved for future use. + * + * Bits 22-31 Case delta: delta for case conversions. This should be the + * highest field so we can easily sign extend.
+ */ + +static int groups[] = { /* indexed by the values stored in groupMap[]; see GetUniCharInfo() */ + 0, 15, 12, 25, 27, 21, 22, 26, 20, 9, 134217793, 28, 19, 134217858, + 29, 2, 23, 11, 1178599554, 24, -507510654, 4194369, 4194434, -834666431, + 973078658, -507510719, 1258291330, 880803905, 864026689, 859832385, + 331350081, 847249473, 851443777, 868220993, -406847358, 884998209, + 876609601, 893386817, 897581121, 914358337, 910164033, 918552641, + 5, -234880894, 8388705, 4194499, 8388770, 331350146, -406847423, + -234880959, 880803970, 864026754, 859832450, 847249538, 851443842, + 868221058, 876609666, 884998274, 893386882, 897581186, 914358402, + 910164098, 918552706, 4, 6, -352321402, 159383617, 155189313, + 268435521, 264241217, 159383682, 155189378, 130023554, 268435586, + 264241282, 260046978, 239075458, 1, 197132418, 226492546, 360710274, + 335544450, -251658175, 402653314, 335544385, 7, 201326657, 201326722, + 16, 8, 10, 247464066, -33554302, -33554367, -310378366, -360710014, + -419430270, -536870782, -469761918, -528482174, -33554365, -37748606, + -310378431, -37748669, 155189378, -360710079, -419430335, -29359998, + -469761983, -29360063, -536870847, -528482239, 13, 14, -1463812031, + -801111999, -293601215, 67108938, 67109002, 109051997, 109052061, + 18, 17, 8388673, 12582977, 8388738, 12583042 +}; + +/* + * The following constants are used to determine the category of a + * Unicode character.
+ */ + +#define UNICODE_CATEGORY_MASK 0X1F + +enum { /* category codes held in bits 0-4 of a groups[] entry, numbered from 0 */ + UNASSIGNED, + UPPERCASE_LETTER, + LOWERCASE_LETTER, + TITLECASE_LETTER, + MODIFIER_LETTER, + OTHER_LETTER, + NON_SPACING_MARK, + ENCLOSING_MARK, + COMBINING_SPACING_MARK, + DECIMAL_DIGIT_NUMBER, + LETTER_NUMBER, + OTHER_NUMBER, + SPACE_SEPARATOR, + LINE_SEPARATOR, + PARAGRAPH_SEPARATOR, + CONTROL, + FORMAT, + PRIVATE_USE, + SURROGATE, + CONNECTOR_PUNCTUATION, + DASH_PUNCTUATION, + OPEN_PUNCTUATION, + CLOSE_PUNCTUATION, + INITIAL_QUOTE_PUNCTUATION, + FINAL_QUOTE_PUNCTUATION, + OTHER_PUNCTUATION, + MATH_SYMBOL, + CURRENCY_SYMBOL, + MODIFIER_SYMBOL, + OTHER_SYMBOL +}; + +/* + * The following macros extract the fields of the character info. The + * GetDelta() macro is complicated because we can't rely on the C compiler + * to do sign extension on right shifts. + */ + +#define GetCaseType(info) (((info) & 0xE0) >> 5) +#define GetCategory(info) ((info) & 0x1F) +#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) + +/* + * This macro extracts the information about a character from the + * Unicode character tables. Only the low 16 bits of ch are used for the pageMap lookup; the macro evaluates ch twice. + */ + +#define GetUniCharInfo(ch) (groups[groupMap[(pageMap[(((int)(ch)) & 0xffff) >> OFFSET_BITS] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))]]) + diff --git a/poliqarp-library/unibits/tclUtf.c b/poliqarp-library/unibits/tclUtf.c new file mode 100644 index 0000000000000000000000000000000000000000..e2dc3104896270cfbe83c78560252c571939119a --- /dev/null +++ b/poliqarp-library/unibits/tclUtf.c @@ -0,0 +1,1914 @@ +/* + * tclUtf.c -- + * + * Routines for manipulating UTF-8 strings. + * + * Copyright (c) 1997-1998 Sun Microsystems, Inc. + * + * See the file ``doc/COPYING.Tcl`` for information on usage and redistribution + * of this file. + */ + +#include "unibits.h" + +#include <poliqarp-config.h> + +/* + * Include the static character classification tables and macros. + */ + +#include "tclUniData.h" + +/* + * The following macros are used for fast character category tests.
The x_BITS + * values are shifted right by the category value to determine whether the + * given category is included in the set. Each macro is a bitmask with bit N set for category N. + */ + +#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ + | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) + +#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) /* LETTER_NUMBER and OTHER_NUMBER are not included */ + +#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ + | (1 << PARAGRAPH_SEPARATOR)) + +#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) + +#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ + (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ + (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ + (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ + (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ + (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ + (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ + (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ + (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) /* every category except UNASSIGNED, CONTROL, FORMAT, PRIVATE_USE and SURROGATE */ + +#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ + (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ + (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ + (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) + +/* + * Unicode characters less than this value are represented by themselves in + * UTF-8 strings. + */ + +#define UNICODE_SELF 0x80 + +/* + * The following structures are used when mapping between Unicode (UCS-2) and + * UTF-8.
+ */ + +CONST unsigned char Tcl_Utf8TotalBytes[256] = { /* total sequence length in bytes, indexed by the sequence's first byte */ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, +#if TCL_UTF_MAX > 3 + 4,4,4,4,4,4,4,4, +#else + 1,1,1,1,1,1,1,1, +#endif +#if TCL_UTF_MAX > 4 + 5,5,5,5, +#else + 1,1,1,1, +#endif +#if TCL_UTF_MAX > 5 + 6,6,6,6 +#else + 1,1,1,1 +#endif +}; + +#define totalBytes Tcl_Utf8TotalBytes + +/* + * Functions used only in this module. + */ + +static int UtfCount(int ch); + +/* + *--------------------------------------------------------------------------- + * + * UtfCount -- + * + * Find the number of bytes in the Utf character "ch". + * + * Results: + * The return value is the number of bytes in the Utf character "ch". + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +INLINE static int +UtfCount( + int ch) /* The Tcl_UniChar whose size is returned. */ +{ + if ((ch > 0) && (ch < UNICODE_SELF)) { + return 1; + } + if (ch <= 0x7FF) { /* also taken when ch <= 0, so 0 and negative values report 2 */ + return 2; + } + if (ch <= 0xFFFF) { + return 3; + } +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { + return 4; + } + if (ch <= 0x3FFFFFF) { + return 5; + } + if (ch <= 0x7FFFFFFF) { + return 6; + } +#endif + return 3; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtf -- + * + * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the + * provided buffer. Equivalent to Plan 9 runetochar(). + * + * Results: + * The return value is the number of bytes in the buffer that were + * consumed.
+ * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +INLINE static int +Tcl_UniCharToUtf( + int ch, /* The Tcl_UniChar to be stored in the + * buffer. */ + char *buf) /* Buffer in which the UTF-8 representation of + * the Tcl_UniChar is stored. Buffer must be + * large enough to hold the UTF-8 character + * (at most TCL_UTF_MAX bytes). */ +{ + if ((ch > 0) && (ch < UNICODE_SELF)) { + buf[0] = (char) ch; + return 1; + } + if (ch >= 0) { + if (ch <= 0x7FF) { /* includes ch == 0, which is stored as the two bytes 0xC0 0x80 */ + buf[1] = (char) ((ch | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 6) | 0xC0); + return 2; + } + if (ch <= 0xFFFF) { + three: /* also the target of the fallback jump at the end of the function */ + buf[2] = (char) ((ch | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 12) | 0xE0); + return 3; + } + +#if TCL_UTF_MAX > 3 + if (ch <= 0x1FFFFF) { + buf[3] = (char) ((ch | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 18) | 0xF0); + return 4; + } + if (ch <= 0x3FFFFFF) { + buf[4] = (char) ((ch | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 24) | 0xF8); + return 5; + } + if (ch <= 0x7FFFFFFF) { + buf[5] = (char) ((ch | 0x80) & 0xBF); + buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); + buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); + buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); + buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); + buf[0] = (char) ((ch >> 30) | 0xFC); + return 6; + } +#endif + } + + ch = 0xFFFD; /* ch was negative or above every range handled above: store U+FFFD in its place */ + goto three; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharToUtfDString -- + * + * Convert the given Unicode string to UTF-8. + * + * Results: + * The return value is a pointer to the UTF-8 representation of the + * Unicode string. Storage for the return value is appended to the end of + * dsPtr.
+ * + * Side effects: + * dsPtr is resized to hold the appended bytes. + * + *--------------------------------------------------------------------------- + */ + +char * +Tcl_UniCharToUtfDString( + CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ + int uniLength, /* Length of Unicode string in Tcl_UniChars + * (must be >= 0). */ + Tcl_DString *dsPtr) /* UTF-8 representation of string is appended + * to this previously initialized DString. */ +{ + CONST Tcl_UniChar *w, *wEnd; + char *p, *string; + int oldLength; + + /* + * UTF-8 string length in bytes will be <= Unicode string length * + * TCL_UTF_MAX. + */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); /* reserve worst-case room before encoding */ + string = Tcl_DStringValue(dsPtr) + oldLength; /* start writing after the existing contents */ + + p = string; + wEnd = uniStr + uniLength; + for (w = uniStr; w < wEnd; ) { + p += Tcl_UniCharToUtf(*w, p); + w++; + } + Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); /* shrink to the bytes actually written */ + + return string; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfToUniChar -- + * + * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 + * sequences are converted to valid Tcl_UniChars and processing + * continues. Equivalent to Plan 9 chartorune(). + * + * The caller must ensure that the source buffer is long enough that this + * routine does not run off the end and dereference non-existent memory + * looking for trail bytes. If the source buffer is known to be '\0' + * terminated, this cannot happen. Otherwise, the caller should call + * Tcl_UtfCharComplete() before calling this routine to ensure that + * enough bytes remain in the string. + * + * Results: + * *chPtr is filled with the Tcl_UniChar, and the return value is the + * number of bytes from the UTF-8 string that were consumed. + * + * Side effects: + * None.
+ * + *--------------------------------------------------------------------------- + */ + +int +Tcl_UtfToUniChar( + register CONST char *src, /* The UTF-8 string. */ + register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by + * the UTF-8 string. */ +{ + register int byte; + + /* + * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. + */ + + byte = *((unsigned char *) src); + if (byte < 0xC0) { + /* + * Handles properly formed UTF-8 characters between 0x01 and 0x7F. + * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * characters representing themselves. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xE0) { + if ((src[1] & 0xC0) == 0x80) { + /* + * Two-byte-character lead-byte followed by a trail-byte. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); + return 2; + } + + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } else if (byte < 0xF0) { + if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { + /* + * Three-byte-character lead byte followed by two trail bytes. + */ + + *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) + | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); + return 3; + } + + /* + * A three-byte-character lead-byte not followed by two trail-bytes + * represents itself. 
+ */ + + *chPtr = (Tcl_UniChar) byte; + return 1; + } +#if TCL_UTF_MAX > 3 + { + int ch, total, trail; + + total = totalBytes[byte]; + trail = total - 1; + if (trail > 0) { + ch = byte & (0x3F >> trail); + do { + src++; + if ((*src & 0xC0) != 0x80) { + *chPtr = byte; + return 1; + } + ch <<= 6; + ch |= (*src & 0x3F); + trail--; + } while (trail > 0); + *chPtr = ch; + return total; + } + } +#endif + + *chPtr = (Tcl_UniChar) byte; + return 1; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfToUniCharDString -- + * + * Convert the UTF-8 string to Unicode. + * + * Results: + * The return value is a pointer to the Unicode representation of the + * UTF-8 string. Storage for the return value is appended to the end of + * dsPtr. The Unicode string is terminated with a Unicode NULL character. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +Tcl_UniChar * +Tcl_UtfToUniCharDString( + CONST char *src, /* UTF-8 string to convert to Unicode. */ + int length, /* Length of UTF-8 string in bytes, or -1 for + * strlen(). */ + Tcl_DString *dsPtr) /* Unicode representation of string is + * appended to this previously initialized + * DString. */ +{ + Tcl_UniChar *w, *wString; + CONST char *p, *end; + int oldLength; + + if (length < 0) { + length = strlen(src); + } + + /* + * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in + * bytes. 
+ */ + + oldLength = Tcl_DStringLength(dsPtr); + Tcl_DStringSetLength(dsPtr, + (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); + wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); + + w = wString; + end = src + length; + for (p = src; p < end; ) { + p += TclUtfToUniChar(p, w); + w++; + } + *w = '\0'; + Tcl_DStringSetLength(dsPtr, + (oldLength + ((char *) w - (char *) wString))); + + return wString; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfCharComplete -- + * + * Determine if the UTF-8 string of the given length is long enough to be + * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 + * string is properly formed. Equivalent to Plan 9 fullrune(). + * + * Results: + * The return value is 0 if the string is not long enough, non-zero + * otherwise. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_UtfCharComplete( + CONST char *src, /* String to check if first few bytes contain + * a complete UTF-8 character. */ + int length) /* Length of above string in bytes. */ +{ + int ch; + + ch = *((unsigned char *) src); + return length >= totalBytes[ch]; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_NumUtfChars -- + * + * Returns the number of characters (not bytes) in the UTF-8 string, not + * including the terminating NULL byte. This is equivalent to Plan 9 + * utflen() and utfnlen(). + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +int +Tcl_NumUtfChars( + register CONST char *src, /* The UTF-8 string to measure. */ + int length) /* The length of the string in bytes, or -1 + * for strlen(string). */ +{ + Tcl_UniChar ch; + register Tcl_UniChar *chPtr = &ch; + register int i; + + /* + * The separate implementations are faster. 
+ * + * Since this is a time-sensitive function, we also do the check for the + * single-byte char case specially. + */ + + i = 0; + if (length < 0) { + while (*src != '\0') { + src += TclUtfToUniChar(src, chPtr); + i++; + } + } else { + register int n; + + while (length > 0) { + if (UCHAR(*src) < 0xC0) { + length--; + src++; + } else { + n = Tcl_UtfToUniChar(src, chPtr); + length -= n; + src += n; + } + i++; + } + } + return i; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfFindFirst -- + * + * Returns a pointer to the first occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered + * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). + * + * Results: + * As above. If the Tcl_UniChar does not exist in the given string, the + * return value is NULL. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +CONST char * +Tcl_UtfFindFirst( + CONST char *src, /* The UTF-8 string to be searched. */ + int ch) /* The Tcl_UniChar to search for. */ +{ + int len; + Tcl_UniChar find; + + while (1) { + len = TclUtfToUniChar(src, &find); + if (find == ch) { + return src; + } + if (*src == '\0') { + return NULL; + } + src += len; + } +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfFindLast -- + * + * Returns a pointer to the last occurance of the given Tcl_UniChar in + * the NULL-terminated UTF-8 string. The NULL terminator is considered + * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). + * + * Results: + * As above. If the Tcl_UniChar does not exist in the given string, the + * return value is NULL. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +CONST char * +Tcl_UtfFindLast( + CONST char *src, /* The UTF-8 string to be searched. 
*/ + int ch) /* The Tcl_UniChar to search for. */ +{ + int len; + Tcl_UniChar find; + CONST char *last; + + last = NULL; + while (1) { + len = TclUtfToUniChar(src, &find); + if (find == ch) { + last = src; + } + if (*src == '\0') { + break; + } + src += len; + } + return last; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfNext -- + * + * Given a pointer to some current location in a UTF-8 string, move + * forward one character. The caller must ensure that they are not asking + * for the next character after the last character in the string. + * + * Results: + * The return value is the pointer to the next character in the UTF-8 + * string. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +CONST char * +Tcl_UtfNext( + CONST char *src) /* The current location in the string. */ +{ + Tcl_UniChar ch; + + return src + TclUtfToUniChar(src, &ch); +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfPrev -- + * + * Given a pointer to some current location in a UTF-8 string, move + * backwards one character. This works correctly when the pointer is in + * the middle of a UTF-8 character. + * + * Results: + * The return value is a pointer to the previous character in the UTF-8 + * string. If the current location was already at the beginning of the + * string, the return value will also be a pointer to the beginning of + * the string. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +CONST char * +Tcl_UtfPrev( + CONST char *src, /* The current location in the string. */ + CONST char *start) /* Pointer to the beginning of the string, to + * avoid going backwards too far. 
*/ +{ + CONST char *look; + int i, byte; + + src--; + look = src; + for (i = 0; i < TCL_UTF_MAX; i++) { + if (look < start) { + if (src < start) { + src = start; + } + break; + } + byte = *((unsigned char *) look); + if (byte < 0x80) { + break; + } + if (byte >= 0xC0) { + return look; + } + look--; + } + return src; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UniCharAtIndex -- + * + * Returns the Unicode character represented at the specified character + * (not byte) position in the UTF-8 string. + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharAtIndex( + register CONST char *src, /* The UTF-8 string to dereference. */ + register int index) /* The position of the desired character. */ +{ + Tcl_UniChar ch; + + while (index >= 0) { + index--; + src += TclUtfToUniChar(src, &ch); + } + return ch; +} + +/* + *--------------------------------------------------------------------------- + * + * Tcl_UtfAtIndex -- + * + * Returns a pointer to the specified character (not byte) position in + * the UTF-8 string. + * + * Results: + * As above. + * + * Side effects: + * None. + * + *--------------------------------------------------------------------------- + */ + +CONST char * +Tcl_UtfAtIndex( + register CONST char *src, /* The UTF-8 string. */ + register int index) /* The position of the desired character. */ +{ + Tcl_UniChar ch; + + while (index > 0) { + index--; + src += TclUtfToUniChar(src, &ch); + } + return src; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToUpper -- + * + * Convert lowercase characters to uppercase characters in a UTF string + * in place. The conversion may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string excluding the + * trailing null. 
+ * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToUpper( + char *str) /* String to convert in place. */ +{ + Tcl_UniChar ch, upChar; + char *src, *dst; + int bytes; + + /* + * Iterate over the string until we hit the terminating null. + */ + + src = dst = str; + while (*src) { + bytes = TclUtfToUniChar(src, &ch); + upChar = Tcl_UniCharToUpper(ch); + + /* + * To keep badly formed Utf strings from getting inflated by the + * conversion (thereby causing a segfault), only copy the upper case + * char to dst if its size is <= the original char. + */ + + if (bytes < UtfCount(upChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; + } else { + dst += Tcl_UniCharToUtf(upChar, dst); + } + src += bytes; + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToLower -- + * + * Convert uppercase characters to lowercase characters in a UTF string + * in place. The conversion may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string excluding the + * trailing null. + * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToLower( + char *str) /* String to convert in place. */ +{ + Tcl_UniChar ch, lowChar; + char *src, *dst; + int bytes; + + /* + * Iterate over the string until we hit the terminating null. + */ + + src = dst = str; + while (*src) { + bytes = TclUtfToUniChar(src, &ch); + lowChar = Tcl_UniCharToLower(ch); + + /* + * To keep badly formed Utf strings from getting inflated by the + * conversion (thereby causing a segfault), only copy the lower case + * char to dst if its size is <= the original char. 
+ */ + + if (bytes < UtfCount(lowChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; + } else { + dst += Tcl_UniCharToUtf(lowChar, dst); + } + src += bytes; + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfToTitle -- + * + * Changes the first character of a UTF string to title case or uppercase + * and the rest of the string to lowercase. The conversion happens in + * place and may shrink the UTF string. + * + * Results: + * Returns the number of bytes in the resulting string excluding the + * trailing null. + * + * Side effects: + * Writes a terminating null after the last converted character. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfToTitle( + char *str) /* String to convert in place. */ +{ + Tcl_UniChar ch, titleChar, lowChar; + char *src, *dst; + int bytes; + + /* + * Capitalize the first character and then lowercase the rest of the + * characters until we get to a null. + */ + + src = dst = str; + + if (*src) { + bytes = TclUtfToUniChar(src, &ch); + titleChar = Tcl_UniCharToTitle(ch); + + if (bytes < UtfCount(titleChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; + } else { + dst += Tcl_UniCharToUtf(titleChar, dst); + } + src += bytes; + } + while (*src) { + bytes = TclUtfToUniChar(src, &ch); + lowChar = Tcl_UniCharToLower(ch); + + if (bytes < UtfCount(lowChar)) { + memcpy(dst, src, (size_t) bytes); + dst += bytes; + } else { + dst += Tcl_UniCharToUtf(lowChar, dst); + } + src += bytes; + } + *dst = '\0'; + return (dst - str); +} + +/* + *---------------------------------------------------------------------- + * + * TclpUtfNcmp2 -- + * + * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and + * ct are assumed to be at least numBytes bytes long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +int +TclpUtfNcmp2( + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numBytes) /* Number of *bytes* to compare. */ +{ + /* + * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to + * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes + * fine in the strcmp manner. + */ + + register int result = 0; + + for ( ; numBytes != 0; numBytes--, cs++, ct++) { + if (*cs != *ct) { + result = UCHAR(*cs) - UCHAR(*ct); + break; + } + } + if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { + unsigned char c1, c2; + + c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); + c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); + result = (c1 - c2); + } + return result; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfNcmp -- + * + * Compare at most numChars UTF chars of string cs to string ct. Both cs + * and ct are assumed to be at least numChars UTF chars long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfNcmp( + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ +{ + Tcl_UniChar ch1, ch2; + + /* + * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the + * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 + * (the byte 0x01.) + */ + + while (numChars-- > 0) { + /* + * n must be interpreted as chars, not bytes. 
This should be called + * only when both strings are of at least n chars long (no need for \0 + * check) + */ + + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + return (ch1 - ch2); + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UtfNcasecmp -- + * + * Compare at most numChars UTF chars of string cs to string ct case + * insensitive. Both cs and ct are assumed to be at least numChars UTF + * chars long. + * + * Results: + * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UtfNcasecmp( + CONST char *cs, /* UTF string to compare to ct. */ + CONST char *ct, /* UTF string cs is compared to. */ + unsigned long numChars) /* Number of UTF chars to compare. */ +{ + Tcl_UniChar ch1, ch2; + while (numChars-- > 0) { + /* + * n must be interpreted as chars, not bytes. + * This should be called only when both strings are of + * at least n chars long (no need for \0 check) + */ + cs += TclUtfToUniChar(cs, &ch1); + ct += TclUtfToUniChar(ct, &ch2); + if (ch1 != ch2) { + ch1 = Tcl_UniCharToLower(ch1); + ch2 = Tcl_UniCharToLower(ch2); + if (ch1 != ch2) { + return (ch1 - ch2); + } + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToUpper -- + * + * Compute the uppercase equivalent of the given Unicode character. + * + * Results: + * Returns the uppercase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToUpper( + int ch) /* Unicode character to convert. 
*/ +{ + int info = GetUniCharInfo(ch); + + if (GetCaseType(info) & 0x04) { + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToLower -- + * + * Compute the lowercase equivalent of the given Unicode character. + * + * Results: + * Returns the lowercase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToLower( + int ch) /* Unicode character to convert. */ +{ + int info = GetUniCharInfo(ch); + + if (GetCaseType(info) & 0x02) { + return (Tcl_UniChar) (ch + GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharToTitle -- + * + * Compute the titlecase equivalent of the given Unicode character. + * + * Results: + * Returns the titlecase Unicode character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar +Tcl_UniCharToTitle( + int ch) /* Unicode character to convert. */ +{ + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); + + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ + + return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); + } else if (mode == 0x4) { + return (Tcl_UniChar) (ch - GetDelta(info)); + } else { + return ch; + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharLen -- + * + * Find the length of a UniChar string. The str input must be null + * terminated. + * + * Results: + * Returns the length of str in UniChars (not bytes). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharLen( + CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. 
*/ +{ + int len = 0; + + while (*uniStr != '\0') { + len++; + uniStr++; + } + return len; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcmp -- + * + * Compare at most numChars unichars of string ucs to string uct. + * Both ucs and uct are assumed to be at least numChars unichars long. + * + * Results: + * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcmp( + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. */ +{ +#ifdef WORDS_BIGENDIAN + /* + * We are definitely on a big-endian machine; memcmp() is safe + */ + + return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); + +#else /* !WORDS_BIGENDIAN */ + /* + * We can't simply call memcmp() because that is not lexically correct. + */ + + for ( ; numChars != 0; ucs++, uct++, numChars--) { + if (*ucs != *uct) { + return (*ucs - *uct); + } + } + return 0; +#endif /* WORDS_BIGENDIAN */ +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharNcasecmp -- + * + * Compare at most numChars unichars of string ucs to string uct case + * insensitive. Both ucs and uct are assumed to be at least numChars + * unichars long. + * + * Results: + * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharNcasecmp( + CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ + CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ + unsigned long numChars) /* Number of unichars to compare. 
*/ +{ + for ( ; numChars != 0; numChars--, ucs++, uct++) { + if (*ucs != *uct) { + Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); + Tcl_UniChar lct = Tcl_UniCharToLower(*uct); + + if (lcs != lct) { + return (lcs - lct); + } + } + } + return 0; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsAlnum -- + * + * Test if a character is an alphanumeric Unicode character. + * + * Results: + * Returns 1 if character is alphanumeric. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsAlnum( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsAlpha -- + * + * Test if a character is an alphabetic Unicode character. + * + * Results: + * Returns 1 if character is alphabetic. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsAlpha( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((ALPHA_BITS >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsControl -- + * + * Test if a character is a Unicode control character. + * + * Results: + * Returns non-zero if character is a control. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsControl( + int ch) /* Unicode character to test. 
*/ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsDigit -- + * + * Test if a character is a numeric Unicode character. + * + * Results: + * Returns non-zero if character is a digit. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsDigit( + int ch) /* Unicode character to test. */ +{ + return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsGraph -- + * + * Test if a character is any Unicode print character except space. + * + * Results: + * Returns non-zero if character is printable, but not space. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsGraph( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsLower -- + * + * Test if a character is a lowercase Unicode character. + * + * Results: + * Returns non-zero if character is lowercase. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsLower( + int ch) /* Unicode character to test. */ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsPrint -- + * + * Test if a character is a Unicode print character. + * + * Results: + * Returns non-zero if character is printable. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsPrint( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((PRINT_BITS >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsPunct -- + * + * Test if a character is a Unicode punctuation character. + * + * Results: + * Returns non-zero if character is punct. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsPunct( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((PUNCT_BITS >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsSpace -- + * + * Test if a character is a whitespace Unicode character. + * + * Results: + * Returns non-zero if character is a space. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsSpace( + int ch) /* Unicode character to test. */ +{ + register int category; + + /* + * If the character is within the first 127 characters, just use the + * standard C function, otherwise consult the Unicode table. + */ + + if (ch < 0x80) { + return isspace(UCHAR(ch)); /* INTL: ISO space */ + } else { + category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + return ((SPACE_BITS >> category) & 1); + } +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsUpper -- + * + * Test if a character is a uppercase Unicode character. + * + * Results: + * Returns non-zero if character is uppercase. + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUpper( + int ch) /* Unicode character to test. */ +{ + return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsWordChar -- + * + * Test if a character is alphanumeric or a connector punctuation mark. + * + * Results: + * Returns 1 if character is a word character. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsWordChar( + int ch) /* Unicode character to test. */ +{ + register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); + + return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharCaseMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the char* + * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. + * This has no provision for counted UniChar strings, thus should not be + * used where NULLs are expected in the UniChar string. Use + * TclUniCharMatch where possible. + * + * Results: + * The return value is 1 if string matches pattern, and 0 otherwise. The + * matching operation permits the following special characters in the + * pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharCaseMatch( + CONST Tcl_UniChar *uniStr, /* Unicode String. */ + CONST Tcl_UniChar *uniPattern, + /* Pattern, which may contain special + * characters. 
*/ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + Tcl_UniChar ch1, p; + + while (1) { + p = *uniPattern; + + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (p == 0) { + return (*uniStr == 0); + } + if ((*uniStr == 0) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern + */ + + while (*(++uniPattern) == '*') { + /* empty body */ + } + p = *uniPattern; + if (p == 0) { + return 1; + } + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character + */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while (*uniStr && (p != *uniStr) + && (p != Tcl_UniCharToLower(*uniStr))) { + uniStr++; + } + } else { + while (*uniStr && (p != *uniStr)) { + uniStr++; + } + } + } + if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { + return 1; + } + if (*uniStr == 0) { + return 0; + } + uniStr++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + uniPattern++; + uniStr++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar startChar, endChar; + + uniPattern++; + ch1 = (nocase ? 
Tcl_UniCharToLower(*uniStr) : *uniStr); + uniStr++; + while (1) { + if ((*uniPattern == ']') || (*uniPattern == 0)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (*uniPattern == '-') { + uniPattern++; + if (*uniPattern == 0) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) + : *uniPattern); + uniPattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*uniPattern != ']') { + if (*uniPattern == 0) { + uniPattern--; + break; + } + uniPattern++; + } + uniPattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ + + if (p == '\\') { + if (*(++uniPattern) == '\0') { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. + */ + + if (nocase) { + if (Tcl_UniCharToLower(*uniStr) != + Tcl_UniCharToLower(*uniPattern)) { + return 0; + } + } else if (*uniStr != *uniPattern) { + return 0; + } + uniStr++; + uniPattern++; + } +} + +/* + *---------------------------------------------------------------------- + * + * TclUniCharMatch -- + * + * See if a particular Unicode string matches a particular pattern. + * Allows case insensitivity. This is the Unicode equivalent of the char* + * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted + * Strings, so embedded NULLs are allowed. + * + * Results: + * The return value is 1 if string matches pattern, and 0 otherwise. The + * matching operation permits the following special characters in the + * pattern: *?\[] (see the manual entry for details on what these mean). + * + * Side effects: + * None. 
+ * + *---------------------------------------------------------------------- + */ + +int +TclUniCharMatch( + CONST Tcl_UniChar *string, /* Unicode String. */ + int strLen, /* Length of String */ + CONST Tcl_UniChar *pattern, /* Pattern, which may contain special + * characters. */ + int ptnLen, /* Length of Pattern */ + int nocase) /* 0 for case sensitive, 1 for insensitive */ +{ + CONST Tcl_UniChar *stringEnd, *patternEnd; + Tcl_UniChar p; + + stringEnd = string + strLen; + patternEnd = pattern + ptnLen; + + while (1) { + /* + * See if we're at the end of both the pattern and the string. If so, + * we succeeded. If we're at the end of the pattern but not at the end + * of the string, we failed. + */ + + if (pattern == patternEnd) { + return (string == stringEnd); + } + p = *pattern; + if ((string == stringEnd) && (p != '*')) { + return 0; + } + + /* + * Check for a "*" as the next pattern character. It matches any + * substring. We handle this by skipping all the characters up to the + * next matching one in the pattern, and then calling ourselves + * recursively for each postfix of string, until either we match or we + * reach the end of the string. + */ + + if (p == '*') { + /* + * Skip all successive *'s in the pattern. + */ + + while (*(++pattern) == '*') { + /* empty body */ + } + if (pattern == patternEnd) { + return 1; + } + p = *pattern; + if (nocase) { + p = Tcl_UniCharToLower(p); + } + while (1) { + /* + * Optimization for matching - cruise through the string + * quickly if the next char in the pattern isn't a special + * character. 
+ */ + + if ((p != '[') && (p != '?') && (p != '\\')) { + if (nocase) { + while ((string < stringEnd) && (p != *string) + && (p != Tcl_UniCharToLower(*string))) { + string++; + } + } else { + while ((string < stringEnd) && (p != *string)) { + string++; + } + } + } + if (TclUniCharMatch(string, stringEnd - string, + pattern, patternEnd - pattern, nocase)) { + return 1; + } + if (string == stringEnd) { + return 0; + } + string++; + } + } + + /* + * Check for a "?" as the next pattern character. It matches any + * single character. + */ + + if (p == '?') { + pattern++; + string++; + continue; + } + + /* + * Check for a "[" as the next pattern character. It is followed by a + * list of characters that are acceptable, or by a range (two + * characters separated by "-"). + */ + + if (p == '[') { + Tcl_UniChar ch1, startChar, endChar; + + pattern++; + ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); + string++; + while (1) { + if ((*pattern == ']') || (pattern == patternEnd)) { + return 0; + } + startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); + pattern++; + if (*pattern == '-') { + pattern++; + if (pattern == patternEnd) { + return 0; + } + endChar = (nocase ? Tcl_UniCharToLower(*pattern) + : *pattern); + pattern++; + if (((startChar <= ch1) && (ch1 <= endChar)) + || ((endChar <= ch1) && (ch1 <= startChar))) { + /* + * Matches ranges of form [a-z] or [z-a]. + */ + break; + } + } else if (startChar == ch1) { + break; + } + } + while (*pattern != ']') { + if (pattern == patternEnd) { + pattern--; + break; + } + pattern++; + } + pattern++; + continue; + } + + /* + * If the next pattern character is '\', just strip off the '\' so we + * do exact matching on the character that follows. + */ + + if (p == '\\') { + if (++pattern == patternEnd) { + return 0; + } + } + + /* + * There's no special character. Just make sure that the next bytes of + * each string match. 
+ */ + + if (nocase) { + if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { + return 0; + } + } else if (*string != *pattern) { + return 0; + } + string++; + pattern++; + } +} diff --git a/poliqarp-library/unibits/tclUtils.c b/poliqarp-library/unibits/tclUtils.c new file mode 100644 index 0000000000000000000000000000000000000000..0bffdf8bf75aab74313cbedf799a55fc226c6af1 --- /dev/null +++ b/poliqarp-library/unibits/tclUtils.c @@ -0,0 +1,138 @@ +/* + * tclUtil.c -- + * + * This file contains utility functions that are used by many Tcl + * commands. + * + * Copyright (c) 1987-1993 The Regents of the University of California. + * Copyright (c) 1994-1998 Sun Microsystems, Inc. + * Copyright (c) 2001 by Kevin B. Kenny. All rights reserved. + * + * See the file ``doc/COPYING.Tcl`` for information on usage and redistribution + * of this file. + */ + +#include <poliqarp-config.h> + +#include "unibits.h" + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringInit -- + * + * Initializes a dynamic string, discarding any previous contents of the + * string (Tcl_DStringFree should have been called already if the dynamic + * string was previously in use). + * + * Results: + * None. + * + * Side effects: + * The dynamic string is initialized to be empty. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringInit( + Tcl_DString *dsPtr) /* Pointer to structure for dynamic string. */ +{ + dsPtr->string = dsPtr->staticSpace; + dsPtr->length = 0; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + dsPtr->staticSpace[0] = '\0'; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringFree -- + * + * Frees up any memory allocated for the dynamic string and reinitializes + * the string to an empty state. + * + * Results: + * None. 
+ * + * Side effects: + * The previous contents of the dynamic string are lost, and the new + * value is an empty string. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringFree( + Tcl_DString *dsPtr) /* Structure describing dynamic string. */ +{ + if (dsPtr->string != dsPtr->staticSpace) { + ckfree(dsPtr->string); + } + dsPtr->string = dsPtr->staticSpace; + dsPtr->length = 0; + dsPtr->spaceAvl = TCL_DSTRING_STATIC_SIZE; + dsPtr->staticSpace[0] = '\0'; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_DStringSetLength -- + * + * Change the length of a dynamic string. This can cause the string to + * either grow or shrink, depending on the value of length. + * + * Results: + * None. + * + * Side effects: + * The length of dsPtr is changed to length and a null byte is stored at + * that position in the string. If length is larger than the space + * allocated for dsPtr, then a panic occurs. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_DStringSetLength( + Tcl_DString *dsPtr, /* Structure describing dynamic string. */ + int length) /* New length for dynamic string. */ +{ + int newsize; + + if (length < 0) { + length = 0; + } + if (length >= dsPtr->spaceAvl) { + /* + * There are two interesting cases here. In the first case, the user + * may be trying to allocate a large buffer of a specific size. It + * would be wasteful to overallocate that buffer, so we just allocate + * enough for the requested size plus the trailing null byte. In the + * second case, we are growing the buffer incrementally, so we need + * behavior similar to Tcl_DStringAppend. The requested length will + * usually be a small delta above the current spaceAvl, so we'll end + * up doubling the old size. This won't grow the buffer quite as + * quickly, but it should be close enough. 
+ */ + + newsize = dsPtr->spaceAvl * 2; + if (length < newsize) { + dsPtr->spaceAvl = newsize; + } else { + dsPtr->spaceAvl = length + 1; + } + if (dsPtr->string == dsPtr->staticSpace) { + char *newString = ckalloc((unsigned) dsPtr->spaceAvl); + + memcpy(newString, dsPtr->string, (size_t) dsPtr->length); + dsPtr->string = newString; + } else { + dsPtr->string = (char *) ckrealloc((void *) dsPtr->string, + (size_t) dsPtr->spaceAvl); + } + } + dsPtr->length = length; + dsPtr->string[length] = 0; +} diff --git a/poliqarp-library/unibits/unibits-extra.h b/poliqarp-library/unibits/unibits-extra.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4be8a7521c33a9e445e6da7e156994d2f7f2ed --- /dev/null +++ b/poliqarp-library/unibits/unibits-extra.h @@ -0,0 +1,46 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#ifndef UNIBITS_UNIBITS_EXTRA_H +#define UNIBITS_UNIBITS_EXTRA_H + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#define ckalloc malloc +#define ckfree free +#define ckrealloc realloc + +#define EXTERN extern +#define CONST const +#define VOID void +#define INLINE inline +#define UCHAR(c) ((unsigned char) c) + +#define TCL_UTF_MAX 4 +typedef unsigned int Tcl_UniChar; + +extern const unsigned char Tcl_Utf8TotalBytes[256]; + +#endif diff --git a/poliqarp-library/unibits/unibits.h b/poliqarp-library/unibits/unibits.h new file mode 100644 index 0000000000000000000000000000000000000000..18c090cae9fbb834e3cc1671a4d62c3cf48dc1c6 --- /dev/null +++ b/poliqarp-library/unibits/unibits.h @@ -0,0 +1,82 @@ +/* + * This header file combines bits of Tcl's ``tcl.h``, ``tclInt.h`` and + * ``tclDecl.h`` files. + * + * Copyright (c) 1987-1994 The Regents of the University of California. + * Copyright (c) 1993-1997 Lucent Technologies. + * Copyright (c) 1994-1998 Sun Microsystems, Inc. + * Copyright (c) 1998-2000 by Scriptics Corporation. + * Copyright (c) 2001-2002 by Kevin B. Kenny. All rights reserved. + * Copyright (c) 2007 Daniel A. Steffen <das@users.sourceforge.net> + * + * See the file ``doc/COPYING.Tcl`` for information on usage and + * redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. + */ + +#ifndef UNIBITS_UNIBITS_H +#define UNIBITS_UNIBITS_H + +#include <unibits/unibits-extra.h> + +/* + * The structure defined below is used to hold dynamic strings. The only + * fields that clients should use are string and length, accessible via the + * macros Tcl_DStringValue and Tcl_DStringLength. + */ + +#define TCL_DSTRING_STATIC_SIZE 200 +typedef struct Tcl_DString { + char *string; /* Points to beginning of string: either + * staticSpace below or a malloced array. */ + int length; /* Number of non-NULL characters in the + * string. */ + int spaceAvl; /* Total number of bytes available for the + * string and its terminating NULL char. 
*/ + char staticSpace[TCL_DSTRING_STATIC_SIZE]; + /* Space to use in common case where string is + * small. */ +} Tcl_DString; + +#define Tcl_DStringLength(dsPtr) ((dsPtr)->length) +#define Tcl_DStringValue(dsPtr) ((dsPtr)->string) + +/* + *---------------------------------------------------------------- + * Macro used by the Tcl core get a unicode char from a utf string. It checks + * to see if we have a one-byte utf char before calling the real + * Tcl_UtfToUniChar, as this will save a lot of time for primarily ascii + * string handling. The macro's expression result is 1 for the 1-byte case or + * the result of Tcl_UtfToUniChar. The ANSI C "prototype" for this macro is: + * + * MODULE_SCOPE int TclUtfToUniChar(const char *string, Tcl_UniChar *ch); + *---------------------------------------------------------------- + */ + +#define TclUtfToUniChar(str, chPtr) \ + ((((unsigned char) *(str)) < 0xC0) ? \ + ((*(chPtr) = (Tcl_UniChar) *(str)), 1) \ + : Tcl_UtfToUniChar(str, chPtr)) + + +EXTERN int Tcl_UniCharIsAlnum (int ch); +EXTERN int Tcl_UniCharIsAlpha (int ch); +EXTERN int Tcl_UniCharIsDigit (int ch); +EXTERN int Tcl_UniCharIsLower (int ch); +EXTERN int Tcl_UniCharIsSpace (int ch); +EXTERN int Tcl_UniCharIsUpper (int ch); +EXTERN Tcl_UniChar Tcl_UniCharToLower (int ch); +EXTERN Tcl_UniChar Tcl_UniCharToTitle (int ch); +EXTERN Tcl_UniChar Tcl_UniCharToUpper (int ch); + +EXTERN char * Tcl_UniCharToUtfDString (CONST Tcl_UniChar * uniStr, + int uniLength, Tcl_DString * dsPtr); +EXTERN Tcl_UniChar * Tcl_UtfToUniCharDString (CONST char * src, + int length, Tcl_DString * dsPtr); + +EXTERN void Tcl_DStringFree (Tcl_DString * dsPtr); +EXTERN void Tcl_DStringInit (Tcl_DString * dsPtr); +EXTERN void Tcl_DStringSetLength (Tcl_DString * dsPtr, + int length); +EXTERN int Tcl_UtfToLower (char * src); + +#endif diff --git a/poliqarp-library/utils/poliqarpc.c b/poliqarp-library/utils/poliqarpc.c new file mode 100644 index 
0000000000000000000000000000000000000000..7da78055ea3b62ad1b53cfdaa020925d2804611e --- /dev/null +++ b/poliqarp-library/utils/poliqarpc.c @@ -0,0 +1,284 @@ +/* + * This file is part of the Poliqarp suite. + * + * Copyright (C) 2008-2009 by Instytut Podstaw Informatyki Polskiej + * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish + * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. + * + * This file may be distributed and/or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation and appearing in the file gpl.txt included in the packaging + * of this file. (See http://www.gnu.org/licenses/translations.html for + * unofficial translations.) + * + * A commercial license is available from IPI PAN (contact + * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more + * information). Licensees holding a valid commercial license from IPI + * PAN may use this file in accordance with that license. + * + * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING + * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE. 
+ */ + +#include <poliqarp-config.h> + +#include <sakura/poliqarp.h> +#include <sakura/random.h> +#include <stdio.h> +#include <getopt.h> +#include <errno.h> + +#define DEFAULT_CONTEXT_WIDTH 5 +#define DEFAULT_LIMIT 1000 + +bool quiet = false; +bool random_sample = false; +char *corpusname = NULL; +char **query_strings = NULL; +bool tags_context = false; +bool tags_match = true; +size_t context = DEFAULT_CONTEXT_WIDTH; +size_t limit = DEFAULT_LIMIT; +char *query_rewrite = "default"; + +static void display_usage(const char *progname) +{ + printf( + "Usage: %s [OPTIONS] CORPUS QUERY\n" + "Execute a Poliqarp query on a corpus and print its results to\n" + "standard output in the CSV format.\n\n" + " -h, --help display this help and exit\n" + " -v, --version display version information and exit\n" + " -q, --quiet suppress progress messages printed\n" + " to standard error stream\n" + " -c, --context=WIDTH set context width, in segments\n" + " -m, --no-tags-match don't display base forms and tags in matches\n" + " -t, --tags-context display base forms and tags in contexts\n" + " -Q, --query-rewrite=QR set rewriting rules for implicit queries\n" + " -L, --limit=N produce at most N results\n" + " -A, --all produce all results\n" + " -R, --random produce a random sample\n" + "\n" + "Report bugs to <" PACKAGE_BUGREPORT ">.\n", + progname); + exit(EXIT_SUCCESS); +} + +static void display_version() +{ + printf("Poliqarp command-line client, version %d.%d.%d\n", + poliqarp_major_version, poliqarp_minor_version, + poliqarp_revision_number); + exit(EXIT_SUCCESS); +} + +static void cmdline_process(int argc, char *argv[]) +{ + const struct option options[] = { + { "help", 0, NULL, 'h' }, + { "version", 0, NULL, 'v' }, + { "quiet", 0, NULL, 'q' }, + { "context", 1, NULL, 'c' }, + { "no-tags-match", 0, NULL, 'm' }, + { "tags-context", 0, NULL, 't' }, + { "query-rewrite", 1, NULL, 'Q' }, + { "limit" , 1, NULL, 'L' }, + { "all", 0, NULL, 'A' }, + { "random", 0, NULL, 'R' }, + { 0, 0, 
0, 0 } + }; + int opt; + + while ((opt = getopt_long(argc, argv, "hvqc:mtQ:L:AR", options, NULL)) != -1) + { + switch (opt) { + case 'h': + display_usage(*argv); + break; + case 'v': + display_version(); + break; + case 'q': + quiet = true; + break; + case 'c': + { + char *endptr; + errno = 0; + unsigned long context_long = strtoul(optarg, &endptr, 10); + context = context_long; + if (*optarg == '\0' || *endptr != '\0' + || errno != 0 || context != context_long) { + fprintf(stderr, "Incorrect context size\n"); + exit(EXIT_FAILURE); + } + } + break; + case 'm': + tags_match = false; + break; + case 't': + tags_context = true; + break; + case 'Q': + query_rewrite = optarg; + break; + case 'L': + { + char *endptr; + errno = 0; + unsigned long limit_long = strtoul(optarg, &endptr, 10); + limit = limit_long; + if (*optarg == '\0' || *endptr != '\0' + || errno != 0 || limit != limit_long + || limit == 0) { + fprintf(stderr, "Incorrect match buffer size\n"); + exit(EXIT_FAILURE); + } + } + break; + case 'A': + limit = (size_t) -1; + break; + case 'R': + random_sample = true; + } + } + if (optind == argc) + display_usage(*argv); + corpusname = argv[optind++]; + if (optind == argc) + display_usage(*argv); + query_strings = argv + optind; +} + +static void quote_comma(const char *s) +{ + while (*s) { + if (*s == '"') + putchar('\\'); + putchar(*s++); + } +} + +static void output_range(struct poliqarp_corpus *corpus, size_t from, size_t to, bool display_tags) +{ + size_t i, j; + putchar('"'); + for (j = from; j < to; j++) { + struct poliqarp_segment segment; + struct poliqarp_segment_info info; + struct poliqarp_interpretation_set set; + struct poliqarp_interpretation_set_info sinfo; + struct poliqarp_interpretation interp; + struct poliqarp_interpretation_info iinfo; + + poliqarp_get_segment(&segment, corpus, j); + poliqarp_get_segment_info(&segment, &info); + poliqarp_get_disambiguated_interpretations(&segment, &set); + poliqarp_get_interpretation_set_info(&set, &sinfo); + 
if (info.space_before) + putchar(' '); + quote_comma(info.text); + if (!display_tags) + continue; + for (i = 0; i < sinfo.size; i++) { + poliqarp_get_interpretation(&set, &interp, i); + poliqarp_get_interpretation_info(&interp, &iinfo); + fputs(" [", stdout); + quote_comma(iinfo.base); + putchar(':'); + quote_comma(iinfo.tag); + putchar(']'); + } + } + putchar('"'); +} + +static struct poliqarp_corpus corpus; +static struct poliqarp_match_buffer buffer; +static size_t result_counter = 0; + +void async_notify_new_results(void *session) +{ + size_t i; + struct poliqarp_match_buffer_info info; + poliqarp_get_match_buffer_info(&buffer, &info); + for (i = 0; i < info.used && result_counter < limit; i++) { + struct poliqarp_match match; + struct poliqarp_match document_range; + size_t lc, rc; + poliqarp_get_match(&buffer, &match, i); + poliqarp_get_match_for_document(&corpus, match.document, &document_range); + lc = match.start < document_range.start + context + ? document_range.start + : match.start - context; + rc = match.end + context > document_range.end + ? 
document_range.end + : match.end + context; + output_range(&corpus, lc, match.start, tags_context); printf(","); + if (match.start < match.focus) { + output_range(&corpus, match.start, match.focus, tags_match); printf(","); + } + output_range(&corpus, match.focus, match.end, tags_match); printf(","); + output_range(&corpus, match.end, rc, tags_context); puts(""); + result_counter++; + } + if (result_counter < limit) + poliqarp_forget(&buffer); + return; +} + +int main(int argc, char *argv[]) +{ + struct poliqarp_query query; + progress_t progress; + struct poliqarp_error error = poliqarp_error_none; + int status = EXIT_SUCCESS; + + cmdline_process(argc, argv); + if (poliqarp_create("" /* use the system locale */, &error) != 0) { + fprintf(stderr, "%s\n", poliqarp_error_message_get(&error)); + poliqarp_error_message_set(&error, NULL); + return EXIT_FAILURE; + } + progress_init(&progress); + + fprintf(stderr, "Opening corpus...\n"); + if (poliqarp_open_corpus(&corpus, corpusname, &progress, &error) == -1) { + fprintf(stderr, "%s.\n", poliqarp_error_message_get(&error)); + return EXIT_FAILURE; + } + poliqarp_create_match_buffer(&buffer, random_sample ? limit : 0x100); + + struct poliqarp_random_state random_state; + poliqarp_srandom_time(&random_state); + while (*query_strings != NULL) { + fprintf(stderr, "Compiling query...\n"); + if (poliqarp_create_query(&query, *query_strings, &corpus, 0, + query_rewrite, random_sample ? &random_state : NULL, &error) == -1) + { + fprintf(stderr, "Unable to compile query, reason: %s.\n", + poliqarp_error_message_get(&error)); + status = EXIT_FAILURE; + goto query_failed; + } + fprintf(stderr, "Executing query...\n"); + poliqarp_produce(&buffer, limit, &query, &progress, NULL, + random_sample ? 
(size_t) -1 : 0x100, -1); + async_notify_new_results(NULL); + fprintf(stderr, "Query successfully executed, %" PRIuSIZE " results found.\n", result_counter); + poliqarp_destroy_query(&query); + query_strings++; + if (*query_strings != NULL) + printf("\n"); + poliqarp_forget(&buffer); + } +query_failed: + poliqarp_error_message_set(&error, NULL); + poliqarp_destroy_match_buffer(&buffer); + poliqarp_close_corpus(&corpus); + poliqarp_destroy(); + return status; +} diff --git a/poliqarp/CMakeLists.txt b/poliqarp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..769cb667a1874ef452048ae8f36dc1be3a3c3730 --- /dev/null +++ b/poliqarp/CMakeLists.txt @@ -0,0 +1,9 @@ +PROJECT(Corpus2Poliqarp) +cmake_minimum_required(VERSION 2.8.0) +include_directories("/usr/local/include/sakura/") +add_library(corpus2poliqarp SHARED pqclient.cpp) + +target_link_libraries(corpus2poliqarp libpoliqarp corpus2) + + + diff --git a/poliqarp/pqclient.cpp b/poliqarp/pqclient.cpp new file mode 100644 index 0000000000000000000000000000000000000000..24d3423cd33bda637fbe5dbf8246c3fecf942fd8 --- /dev/null +++ b/poliqarp/pqclient.cpp @@ -0,0 +1 @@ +#include "pqclient.h" diff --git a/poliqarp/pqclient.h b/poliqarp/pqclient.h new file mode 100644 index 0000000000000000000000000000000000000000..3ee0e1bc53896ec1096951d0942af07ca96e3320 --- /dev/null +++ b/poliqarp/pqclient.h @@ -0,0 +1,26 @@ +#ifndef CORPUS2_POLIQARP_PQCLIENT_H +#define CORPUS2_POLIQARP_PQCLIENT_H + +#include <boost/utility.hpp> + +namespace Corpus2 +{ + +class PoliqarpClient : boost::noncopyable +{ +public: + PoliqarpClient(const std::string path); + ~PoliqarpClient(); + + void reload_corpus(const std::string& path); + void restart(); + int execute_query(); + int compile_query(const std::string& q); + + size_t get_corpus_size() const; +private: +}; + +} + +#endif /* CORPUS2_POLIQARP_PQCLIENT_H */