Skip to content
Snippets Groups Projects
Select Git revision
  • 85ec635b4bd6980ccfd6580ead46041751e22251
  • master default protected
  • fix-words-ann
  • wccl-rules-migration
  • develop
5 results

tagaction.cpp

  • poliqarp.c 13.43 KiB
    /*
     * This file is part of the Poliqarp suite.
     * 
     * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
     * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
     * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
     * 
     * This file may be distributed and/or modified under the terms of the
     * GNU General Public License version 2 as published by the Free Software
     * Foundation and appearing in the file gpl.txt included in the packaging
     * of this file.  (See http://www.gnu.org/licenses/translations.html for
     * unofficial translations.)
     * 
     * A commercial license is available from IPI PAN (contact
     * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
     * information).  Licensees holding a valid commercial license from IPI
     * PAN may use this file in accordance with that license.
     * 
     * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
     * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     * PURPOSE.
     */
    
    #include <locale.h>
    #include <stdio.h>
    
    #include <foostring/foostring.h>
    #include <sakura/poliqarp.h>
    
    #include <unibits/strcoll.h>
    
    #define POLIQARP_MAJOR_VERSION 1
    #define POLIQARP_MINOR_VERSION 3
    #define POLIQARP_REVISION_NUMBER 12
    #define POLIQARP_LIBRARY_NAME    "sakura"
    
    const int poliqarp_major_version = POLIQARP_MAJOR_VERSION;
    const int poliqarp_minor_version = POLIQARP_MINOR_VERSION;
    const int poliqarp_revision_number = POLIQARP_REVISION_NUMBER;
    const char poliqarp_library_name[] = POLIQARP_LIBRARY_NAME;
    
    int poliqarp_create(const char *locale, struct poliqarp_error *error)
    {
       int rc;
       setlocale(LC_ALL, locale);
       rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
       if (rc < 0)
          goto error;
       if (rc > 0) {
          /* Maybe UTF-8 variant of LC_CTYPE is fine? */
          const char *locale = setlocale(LC_CTYPE, NULL);
          if (locale != NULL) {
             const char *locale_end = locale;
             while (*locale_end != '\0' && *locale_end != '.')
                locale_end++;
             size_t length = locale_end - locale;
             char *new_locale = malloc(length + 7);
             if (new_locale == NULL)
                goto error;
             new_locale[0] = '\0';
             strncat(new_locale, locale, length);
             strcat(new_locale + length, ".UTF-8");
             setlocale(LC_CTYPE, new_locale);
             free(new_locale);
             rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
             if (rc < 0)
                goto error;
          }
       }
       if (rc > 0) {
          /* Maybe LC_COLLATE is fine? */
          const char *locale = setlocale(LC_COLLATE, NULL);
          if (locale != NULL) {
             char *new_locale = malloc(strlen(locale) + 7);
             if (new_locale == NULL)
                goto error;
             strcpy(new_locale, locale);
             setlocale(LC_CTYPE, new_locale);
             rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
             if (rc < 0)
                goto error;
             if (rc > 0) {
                /* Maybe UTF-8 variant of LC_COLLATE is fine? */
                char *new_locale_end = new_locale;
                while (*new_locale_end != '\0' && *new_locale_end != '.')
                   new_locale_end++;
                strcpy(new_locale_end, ".UTF-8");
                setlocale(LC_CTYPE, new_locale);
                rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
                if (rc < 0)
                   goto error;
             }
             free(new_locale);
          }
       }
       if (rc > 0) {
          /* Maybe en_US.UTF-8 is available? */
          setlocale(LC_CTYPE, "en_US.UTF-8");
          rc = poliqarp_regexp_validate_utf8() || unibits_validate_utf8();
       }
       if (rc != 0) {
          poliqarp_error_message_set(error, _("Unable to set a UTF-8 locale"));
          return -1;
       }
       return 0;
    error:
       poliqarp_error_from_system(error, _("Unable to initialize the Poliqarp library"));
       return -1;
    }
    
    int poliqarp_destroy(void)
    {
       return 0;
    }
    
    int poliqarp_get_corpus_info(const struct poliqarp_corpus *corpus,
       struct poliqarp_corpus_info *info)
    {
       info->num_segments = poliqarp_backend_corpus_size(&corpus->corpus);
       info->num_types = poliqarp_backend_orth_num_items(
          poliqarp_get_const_backend(corpus, orth));
       info->num_lemmata = poliqarp_backend_base_num_items__disamb(
          poliqarp_get_const_backend(corpus, base));
       info->num_tags = poliqarp_backend_tag_num_items(
          poliqarp_get_const_backend(corpus, tag));
       return 0;
    }
    
    int poliqarp_get_segment(struct poliqarp_segment *segment, 
       struct poliqarp_corpus *corpus, size_t index)
    {
    #ifndef NDEBUG
       if (index >= poliqarp_backend_corpus_size(&corpus->corpus))
          return -1;
    #endif  
    
       segment->corpus = corpus;
       segment->segment = poliqarp_backend_corpus_get(&corpus->corpus, index);
    
       return 0;
    }
    
    int poliqarp_get_segment_info(const struct poliqarp_segment *segment,
       struct poliqarp_segment_info *info)
    {
       info->space_before = segment->segment.orth_space_id & 1;
       info->text = poliqarp_backend_orth_fetch(
          poliqarp_get_const_backend(segment->corpus, orth), 
          segment->segment.orth_space_id >> 1);
       return 0;
    }
    
    int poliqarp_get_disambiguated_interpretations(
       const struct poliqarp_segment *segment,
       struct poliqarp_interpretation_set *set)
    {
       set->corpus = segment->corpus;
       set->set = segment->segment.interp_disamb_id;
       set->disamb = true;
       return 0;
    }
    
    int poliqarp_get_ambiguous_interpretations(
       const struct poliqarp_segment *segment,
       struct poliqarp_interpretation_set *set)
    {
       set->corpus = segment->corpus;
       set->set = segment->segment.interp_amb_id;
       set->disamb = false;
       return 0;
    }
    
    int poliqarp_get_interpretation_set_info(
       const struct poliqarp_interpretation_set *set,
       struct poliqarp_interpretation_set_info *info)
    {
       info->size = set->disamb ? 
          poliqarp_backend_interp_length__disamb(
             poliqarp_get_const_backend(set->corpus, interp), set->set) :
          poliqarp_backend_interp_length__amb(
             poliqarp_get_const_backend(set->corpus, interp), set->set);
       return 0;
    }
    
    int poliqarp_get_interpretation(const struct poliqarp_interpretation_set *set,
       struct poliqarp_interpretation *interp, size_t index)
    {
       const struct poliqarp_binary_interp *binterp;
       binterp = set->disamb ?
          poliqarp_backend_interp_fetch__disamb(
             poliqarp_get_const_backend(set->corpus, interp), set->set) :
          poliqarp_backend_interp_fetch__amb(
             poliqarp_get_const_backend(set->corpus, interp), set->set);
       interp->corpus = set->corpus;
       interp->disamb = set->disamb;
       interp->interp = binterp[index];
       POLIQARP_INTERP_LE_TO_HE(interp->interp);
       return 0;
    }
    
    int poliqarp_get_interpretation_info(
       const struct poliqarp_interpretation *interp,
       struct poliqarp_interpretation_info *info)
    {
       info->base = interp->disamb ?
          poliqarp_backend_base_fetch__disamb(
             poliqarp_get_const_backend(interp->corpus, base), 
             interp->interp.base_id) :
          poliqarp_backend_base_fetch__amb(
             poliqarp_get_const_backend(interp->corpus, base), 
             interp->interp.base_id);
       info->tag = poliqarp_backend_tag_fetch(
          poliqarp_get_const_backend(interp->corpus, tag), interp->interp.tag_id);
       return 0;
    }
    
    int poliqarp_define_alias(struct poliqarp_corpus *corpus, const char *name,
       const char *value)
    {
       if (hash_table_set(&(poliqarp_get_backend(corpus, config)->aliases), name, 
          strdup(value)))
       {
          return -1;
       }
       return 0;
    }
    
    int poliqarp_delete_alias(struct poliqarp_corpus *corpus, const char *name)
    {
       if (hash_table_unset(&(poliqarp_get_backend(corpus, config)->aliases), name))
          return -1;
       return 0;
    }
    
    static void get_aliases_iterator(const char *key, const void *value,
       void *env)
    {
       struct poliqarp_alias **alias = (struct poliqarp_alias **)env;
       (*alias)->name = key;
       (*alias)->value = (const char *)value;
       (*alias)++;
    }
    
    int poliqarp_get_aliases(const struct poliqarp_corpus *corpus,
       struct poliqarp_alias_list *aliases)
    {
       const struct hash_table *table = 
          &(poliqarp_get_const_backend(corpus, config)->aliases);
       struct poliqarp_alias *tmp;
       aliases->num_aliases = hash_table_num_items(table);
       tmp = aliases->aliases = malloc(aliases->num_aliases * 
          sizeof(struct poliqarp_alias));
       hash_table_iterate(table, &tmp, get_aliases_iterator);
       return 0;
    }
    
    int poliqarp_free_aliases(struct poliqarp_alias_list *aliases)
    {
       free(aliases->aliases);
       return 0;
    }
    
    int poliqarp_get_metadata_set(const struct poliqarp_corpus *corpus,
       size_t id, struct poliqarp_metadata_set *meta)
    {
       struct poliqarp_document document;
       if (poliqarp_backend_document_fetch(&corpus->document, id, &document) == -1)
          return -1;
       meta->corpus = corpus;
       meta->low = document.meta_low;
       meta->high = document.meta_high;
       return 0;
    }
    
    size_t poliqarp_metadata_count(const struct poliqarp_metadata_set *meta)
    {
       return meta->high - meta->low;
    }
    
    int poliqarp_get_metadata(const struct poliqarp_metadata_set *set, 
       size_t index, struct poliqarp_metadata *meta)
    {
       index += set->low;
       meta->corpus = set->corpus;   
       meta->meta = poliqarp_backend_meta_fetch(poliqarp_get_const_backend(
          set->corpus, meta), index);
       return 0;
    }
    
    int poliqarp_get_metadata_types(struct poliqarp_metadata_types *types,
       const struct poliqarp_corpus *corpus)
    {
       int num = 0;
       struct poliqarp_meta_type_list *mtypes = corpus->meta.types;
       
       while (mtypes) {
          ++num;
          mtypes = mtypes->next;
       }
       types->types = malloc(num * sizeof *(types->types));
       if (types->types == NULL)
          return -1;
       types->num_types = num;
       
       mtypes = corpus->meta.types;
       while (num) {
          num--;
          types->types[num].key = mtypes->key;
          types->types[num].type = 
             (mtypes->type == POLIQARP_META_TYPE_STRING) ? POLIQARP_META_TEXT :
             POLIQARP_META_DATE;
          mtypes = mtypes->next;
       }
       return 0;
    }
    
    int poliqarp_free_metadata_types(struct poliqarp_metadata_types *types)
    {
       free(types->types);
       return 0;
    }
    
    int poliqarp_get_metadata_info(const struct poliqarp_metadata *meta,
       struct poliqarp_metadata_info *info)
    {
       info->key = poliqarp_backend_meta_key_fetch(poliqarp_get_const_backend(
          meta->corpus, meta), meta->meta.key);
       switch (meta->meta.type) {
          case POLIQARP_METADATA_SINGLE:
          case POLIQARP_METADATA_MULTI:
             info->type = POLIQARP_META_TEXT;
             info->value.text = poliqarp_backend_meta_value_fetch(
                poliqarp_get_const_backend(meta->corpus, meta), 
                meta->meta.value_as.text);
             break;
          case POLIQARP_METADATA_DATE:
             info->type = POLIQARP_META_DATE;
             info->value.date.year = meta->meta.value_as.date.year;
             info->value.date.month = meta->meta.value_as.date.month;
             info->value.date.day = meta->meta.value_as.date.day;
             break;
          case POLIQARP_METADATA_UNDEFINED:
             info->type = POLIQARP_META_UNDEFINED;
             break;
          default:
             abort(); /* Should not happen. */
       }
       return 0;
    }
    
    int poliqarp_get_tagset_info(const struct poliqarp_corpus *corpus,
       struct poliqarp_tagset_info *info)
    {
       const struct poliqarp_backend_config *cfg = 
          poliqarp_get_const_backend(corpus, config);
       struct entity *entity;
       struct poliqarp_attr *attr;
       struct poliqarp_attr_value *aval;
       struct poliqarp_part_of_speech *pos;
       struct poliqarp_attr_instance *ainst;
       string_t s;
    
       /* first pass: gather number of classes and categories */
       info->num_categories = info->num_classes = 0;
       for (entity = cfg->named_items.first_entity; entity; 
          entity = entity->next_entity) 
       {
          switch (*(enum poliqarp_entity_type *)entity->tag) {
             case POLIQARP_ENTITY_POS: info->num_classes++; break;
             case POLIQARP_ENTITY_ATTR: info->num_categories++; break;
             default: break;
          }
       }
    
       /* allocate memory */
       info->classes = malloc(info->num_classes * sizeof(*(info->classes)));
       info->categories = malloc(info->num_categories * sizeof(*(info->categories)));
    
       /* second pass: retrieve the info */
       info->num_categories = info->num_classes = 0;
       for (entity = cfg->named_items.first_entity; entity;
          entity = entity->next_entity)
       {
          switch (*(enum poliqarp_entity_type *)entity->tag) {
             case POLIQARP_ENTITY_POS:
                s = string_create();
                string_append_str(s, entity->name);
                pos = (struct poliqarp_part_of_speech *)entity->data;
                for (ainst = pos->first_instance; ainst; 
                   ainst = ainst->next_instance)
                {
                   string_append_str(s, " ");
                   if (ainst->is_optional)
                      string_append_str(s, "[");
                   string_append_str(s, ainst->attr->self->name);
                   if (ainst->is_optional)
                      string_append_str(s, "]");
                }
                info->classes[info->num_classes++] = string_free_and_get_buffer(s);
                break;
             case POLIQARP_ENTITY_ATTR: 
                s = string_create();
                string_append_str(s, entity->name);
                attr = (struct poliqarp_attr *)entity->data;
                for (aval = attr->first_value; aval; aval = aval->next_value) {
                   string_append_str(s, " ");
                   string_append_str(s, aval->self->name);
                }
                info->categories[info->num_categories++] = string_free_and_get_buffer(s);
                break;
             default:
                break;
          }
       }
       return 0;
    }
    
    void poliqarp_free_tagset_info(struct poliqarp_tagset_info *info)
    {
       size_t i;
       for (i = 0; i < info->num_classes; i++)
          free(info->classes[i]);
       for (i = 0; i < info->num_categories; i++)
          free(info->categories[i]);
    }