/* * This file is part of the Poliqarp suite. * * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish * Academy of Sciences; cf. www.ipipan.waw.pl). All rights reserved. * * This file may be distributed and/or modified under the terms of the * GNU General Public License version 2 as published by the Free Software * Foundation and appearing in the file gpl.txt included in the packaging * of this file. (See http://www.gnu.org/licenses/translations.html for * unofficial translations.) * * A commercial license is available from IPI PAN (contact * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more * information). Licensees holding a valid commercial license from IPI * PAN may use this file in accordance with that license. * * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE. */ #include <locale.h> #include <stdio.h> #include <foostring/foostring.h> #include <sakura/poliqarp.h> #include <unibits/strcoll.h> #define POLIQARP_MAJOR_VERSION 1 #define POLIQARP_MINOR_VERSION 3 #define POLIQARP_REVISION_NUMBER 11 #define POLIQARP_LIBRARY_NAME "sakura" const int poliqarp_major_version = POLIQARP_MAJOR_VERSION; const int poliqarp_minor_version = POLIQARP_MINOR_VERSION; const int poliqarp_revision_number = POLIQARP_REVISION_NUMBER; const char poliqarp_library_name[] = POLIQARP_LIBRARY_NAME; int poliqarp_create(const char *locale, struct poliqarp_error *error) { int rc; setlocale(LC_ALL, locale); rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8(); if (rc < 0) goto error; if (rc > 0) { /* Maybe UTF-8 variant of LC_CTYPE is fine? */ const char *locale = setlocale(LC_CTYPE, NULL); if (locale != NULL) { const char *locale_end = locale; while (*locale_end != '\0' && *locale_end != '.') locale_end++; size_t length = locale_end - locale; char *new_locale = malloc(length + 7); if (new_locale == NULL) goto error; new_locale[0] = '\0'; strncat(new_locale, locale, length); strcat(new_locale + length, ".UTF-8"); setlocale(LC_CTYPE, new_locale); free(new_locale); rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8(); if (rc < 0) goto error; } } if (rc > 0) { /* Maybe LC_COLLATE is fine? */ const char *locale = setlocale(LC_COLLATE, NULL); if (locale != NULL) { char *new_locale = malloc(strlen(locale) + 7); if (new_locale == NULL) goto error; strcpy(new_locale, locale); setlocale(LC_CTYPE, new_locale); rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8(); if (rc < 0) goto error; if (rc > 0) { /* Maybe UTF-8 variant of LC_COLLATE is fine? */ char *new_locale_end = new_locale; while (*new_locale_end != '\0' && *new_locale_end != '.') new_locale_end++; strcpy(new_locale_end, ".UTF-8"); setlocale(LC_CTYPE, new_locale); rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8(); if (rc < 0) goto error; } free(new_locale); } } if (rc > 0) { /* Maybe en_US.UTF-8 is available? */ setlocale(LC_CTYPE, "en_US.UTF-8"); rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8(); } if (rc != 0) { poliqarp_error_message_set(error, _("Unable to set a UTF-8 locale")); return -1; } return 0; error: poliqarp_error_from_system(error, _("Unable to initialize the Poliqarp library")); return -1; } int poliqarp_destroy(void) { return 0; } int poliqarp_get_corpus_info(const struct poliqarp_corpus *corpus, struct poliqarp_corpus_info *info) { info->num_segments = poliqarp_backend_corpus_size(&corpus->corpus); info->num_types = poliqarp_backend_orth_num_items( poliqarp_get_const_backend(corpus, orth)); info->num_lemmata = poliqarp_backend_base_num_items__disamb( poliqarp_get_const_backend(corpus, base)); info->num_tags = poliqarp_backend_tag_num_items( poliqarp_get_const_backend(corpus, tag)); return 0; } int poliqarp_get_segment(struct poliqarp_segment *segment, struct poliqarp_corpus *corpus, size_t index) { #ifndef NDEBUG if (index >= poliqarp_backend_corpus_size(&corpus->corpus)) return -1; #endif segment->corpus = corpus; segment->segment = poliqarp_backend_corpus_get(&corpus->corpus, index); return 0; } int poliqarp_get_segment_info(const struct poliqarp_segment *segment, struct poliqarp_segment_info *info) { info->space_before = segment->segment.orth_space_id & 1; info->text = poliqarp_backend_orth_fetch( poliqarp_get_const_backend(segment->corpus, orth), segment->segment.orth_space_id >> 1); return 0; } int poliqarp_get_disambiguated_interpretations( const struct poliqarp_segment *segment, struct poliqarp_interpretation_set *set) { set->corpus = segment->corpus; set->set = segment->segment.interp_disamb_id; set->disamb = true; return 0; } int poliqarp_get_ambiguous_interpretations( const struct poliqarp_segment *segment, struct poliqarp_interpretation_set *set) { set->corpus = segment->corpus; set->set = segment->segment.interp_amb_id; set->disamb = false; return 0; } int poliqarp_get_interpretation_set_info( const struct poliqarp_interpretation_set *set, struct poliqarp_interpretation_set_info *info) { info->size = set->disamb ? poliqarp_backend_interp_length__disamb( poliqarp_get_const_backend(set->corpus, interp), set->set) : poliqarp_backend_interp_length__amb( poliqarp_get_const_backend(set->corpus, interp), set->set); return 0; } int poliqarp_get_interpretation(const struct poliqarp_interpretation_set *set, struct poliqarp_interpretation *interp, size_t index) { const struct poliqarp_binary_interp *binterp; binterp = set->disamb ? poliqarp_backend_interp_fetch__disamb( poliqarp_get_const_backend(set->corpus, interp), set->set) : poliqarp_backend_interp_fetch__amb( poliqarp_get_const_backend(set->corpus, interp), set->set); interp->corpus = set->corpus; interp->disamb = set->disamb; interp->interp = binterp[index]; POLIQARP_INTERP_LE_TO_HE(interp->interp); return 0; } int poliqarp_get_interpretation_info( const struct poliqarp_interpretation *interp, struct poliqarp_interpretation_info *info) { info->base = interp->disamb ? poliqarp_backend_base_fetch__disamb( poliqarp_get_const_backend(interp->corpus, base), interp->interp.base_id) : poliqarp_backend_base_fetch__amb( poliqarp_get_const_backend(interp->corpus, base), interp->interp.base_id); info->tag = poliqarp_backend_tag_fetch( poliqarp_get_const_backend(interp->corpus, tag), interp->interp.tag_id); return 0; } int poliqarp_define_alias(struct poliqarp_corpus *corpus, const char *name, const char *value) { if (hash_table_set(&(poliqarp_get_backend(corpus, config)->aliases), name, strdup(value))) { return -1; } return 0; } int poliqarp_delete_alias(struct poliqarp_corpus *corpus, const char *name) { if (hash_table_unset(&(poliqarp_get_backend(corpus, config)->aliases), name)) return -1; return 0; } static void get_aliases_iterator(const char *key, const void *value, void *env) { struct poliqarp_alias **alias = (struct poliqarp_alias **)env; (*alias)->name = key; (*alias)->value = (const char *)value; (*alias)++; } int poliqarp_get_aliases(const struct poliqarp_corpus *corpus, struct poliqarp_alias_list *aliases) { const struct hash_table *table = &(poliqarp_get_const_backend(corpus, config)->aliases); struct poliqarp_alias *tmp; aliases->num_aliases = hash_table_num_items(table); tmp = aliases->aliases = malloc(aliases->num_aliases * sizeof(struct poliqarp_alias)); hash_table_iterate(table, &tmp, get_aliases_iterator); return 0; } int poliqarp_free_aliases(struct poliqarp_alias_list *aliases) { free(aliases->aliases); return 0; } int poliqarp_get_metadata_set(const struct poliqarp_corpus *corpus, size_t id, struct poliqarp_metadata_set *meta) { struct poliqarp_document document; if (poliqarp_backend_document_fetch(&corpus->document, id, &document) == -1) return -1; meta->corpus = corpus; meta->low = document.meta_low; meta->high = document.meta_high; return 0; } size_t poliqarp_metadata_count(const struct poliqarp_metadata_set *meta) { return meta->high - meta->low; } int poliqarp_get_metadata(const struct poliqarp_metadata_set *set, size_t index, struct poliqarp_metadata *meta) { index += set->low; meta->corpus = set->corpus; meta->meta = poliqarp_backend_meta_fetch(poliqarp_get_const_backend( set->corpus, meta), index); return 0; } int poliqarp_get_metadata_types(struct poliqarp_metadata_types *types, const struct poliqarp_corpus *corpus) { int num = 0; struct poliqarp_meta_type_list *mtypes = corpus->meta.types; while (mtypes) { ++num; mtypes = mtypes->next; } types->types = malloc(num * sizeof *(types->types)); if (types->types == NULL) return -1; types->num_types = num; mtypes = corpus->meta.types; while (num) { num--; types->types[num].key = mtypes->key; types->types[num].type = (mtypes->type == POLIQARP_META_TYPE_STRING) ? POLIQARP_META_TEXT : POLIQARP_META_DATE; mtypes = mtypes->next; } return 0; } int poliqarp_free_metadata_types(struct poliqarp_metadata_types *types) { free(types->types); return 0; } int poliqarp_get_metadata_info(const struct poliqarp_metadata *meta, struct poliqarp_metadata_info *info) { info->key = poliqarp_backend_meta_key_fetch(poliqarp_get_const_backend( meta->corpus, meta), meta->meta.key); switch (meta->meta.type) { case POLIQARP_METADATA_SINGLE: case POLIQARP_METADATA_MULTI: info->type = POLIQARP_META_TEXT; info->value.text = poliqarp_backend_meta_value_fetch( poliqarp_get_const_backend(meta->corpus, meta), meta->meta.value_as.text); break; case POLIQARP_METADATA_DATE: info->type = POLIQARP_META_DATE; info->value.date.year = meta->meta.value_as.date.year; info->value.date.month = meta->meta.value_as.date.month; info->value.date.day = meta->meta.value_as.date.day; break; case POLIQARP_METADATA_UNDEFINED: info->type = POLIQARP_META_UNDEFINED; break; default: abort(); /* Should not happen. */ } return 0; } int poliqarp_get_tagset_info(const struct poliqarp_corpus *corpus, struct poliqarp_tagset_info *info) { const struct poliqarp_backend_config *cfg = poliqarp_get_const_backend(corpus, config); struct entity *entity; struct poliqarp_attr *attr; struct poliqarp_attr_value *aval; struct poliqarp_part_of_speech *pos; struct poliqarp_attr_instance *ainst; string_t s; /* first pass: gather number of classes and categories */ info->num_categories = info->num_classes = 0; for (entity = cfg->named_items.first_entity; entity; entity = entity->next_entity) { switch (*(enum poliqarp_entity_type *)entity->tag) { case POLIQARP_ENTITY_POS: info->num_classes++; break; case POLIQARP_ENTITY_ATTR: info->num_categories++; break; default: break; } } /* allocate memory */ info->classes = malloc(info->num_classes * sizeof(*(info->classes))); info->categories = malloc(info->num_categories * sizeof(*(info->categories))); /* second pass: retrieve the info */ info->num_categories = info->num_classes = 0; for (entity = cfg->named_items.first_entity; entity; entity = entity->next_entity) { switch (*(enum poliqarp_entity_type *)entity->tag) { case POLIQARP_ENTITY_POS: s = string_create(); string_append_str(s, entity->name); pos = (struct poliqarp_part_of_speech *)entity->data; for (ainst = pos->first_instance; ainst; ainst = ainst->next_instance) { string_append_str(s, " "); if (ainst->is_optional) string_append_str(s, "["); string_append_str(s, ainst->attr->self->name); if (ainst->is_optional) string_append_str(s, "]"); } info->classes[info->num_classes++] = string_free_and_get_buffer(s); break; case POLIQARP_ENTITY_ATTR: s = string_create(); string_append_str(s, entity->name); attr = (struct poliqarp_attr *)entity->data; for (aval = attr->first_value; aval; aval = aval->next_value) { string_append_str(s, " "); string_append_str(s, aval->self->name); } info->categories[info->num_categories++] = string_free_and_get_buffer(s); break; default: break; } } return 0; } void poliqarp_free_tagset_info(struct poliqarp_tagset_info *info) { size_t i; for (i = 0; i < info->num_classes; i++) free(info->classes[i]); for (i = 0; i < info->num_categories; i++) free(info->categories[i]); }