/*
 * This file is part of the Poliqarp suite.
 *
 * Copyright (C) 2004-2009 by Instytut Podstaw Informatyki Polskiej
 * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
 * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
 *
 * This file may be distributed and/or modified under the terms of the
 * GNU General Public License version 2 as published by the Free Software
 * Foundation and appearing in the file gpl.txt included in the packaging
 * of this file.  (See http://www.gnu.org/licenses/translations.html for
 * unofficial translations.)
 *
 * A commercial license is available from IPI PAN (contact
 * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
 * information).  Licensees holding a valid commercial license from IPI
 * PAN may use this file in accordance with that license.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
 * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */

/**
 * @file poliqarp.h
 * @brief Public API of libpoliqarp.
 *
 * This file defines a set of low-level corpus structures and functions.
 * Any library that implements them can be linked with the Poliqarp daemon and
 * used with Poliqarp clients. The default implementation is codenamed
 * `sakura'.
 */

#ifndef POLIQARP_H
#define POLIQARP_H

/*
 * NOTE(review): this header uses `bool` (struct poliqarp_sort_info,
 * struct poliqarp_segment_info) but does not include <stdbool.h> itself;
 * presumably <poliqarp-config.h> provides it -- confirm.
 */
#include <poliqarp-config.h>

#include <stdlib.h>

#include <progress/progress.h>

/** Version number and name of the library. */
extern const int poliqarp_major_version;
extern const int poliqarp_minor_version;
extern const int poliqarp_revision_number;
extern const char poliqarp_library_name[];

/* Declaration of private structures. */

/**
 * Logically, a corpus is just a sequence of segments. However, this file does
 * not tell how it is precisely defined -- that is the job of
 * poliqarp-private.h, included below. Instead, it declares functions that
 * return a particular segment in the corpus, or number of segments contained
 * therein, etc.
 */
struct poliqarp_corpus;

/**
 * A parsed query can have just about any internal structure.
 */
struct poliqarp_query;

/**
 * An error message wrapper.
 */
struct poliqarp_error;

/**
 * A match buffer is an array holding results of a query. It generally
 * has a fixed size, but can be enlarged or shrunk as needed.
 */
struct poliqarp_match_buffer;

/**
 * A segment is a triple <O, D, A>, where O is a string (denoting orthographic
 * form of the segment -- the form as it appears in the text) and D and A are
 * sets of interpretations (see below), corresponding to disambiguated and
 * ambiguous interpretations, respectively.
 */
struct poliqarp_segment;

/**
 * Straightforwardly enough, an interpretation set is a collection of
 * interpretations (which see below).
 */
struct poliqarp_interpretation_set;

/**
 * An interpretation is a pair <b, t> of strings, representing the base form
 * and tag of a segment, respectively. As far as this API is concerned, tags
 * do not have internal structure; they are just strings.
 */
struct poliqarp_interpretation;

/**
 * This is a collection of pieces of metadata that can be specified for a
 * document.
 */
struct poliqarp_metadata_set;

/**
 * A single element of the metadata set.
 */
struct poliqarp_metadata;

/**
 * State of a pseudo-random number generator.
 */
struct poliqarp_random_state;

/* General library routines. */

/**
 * Initializes the library. Must be used once before any other library
 * functions are called.
 * @param locale Name to be passed to setlocale().
 * @param error Error message wrapper; presumably filled in when the
 * initialization fails -- TODO confirm against the implementation.
 * @return 0 if the initialization was successful, -1 otherwise.
 */
int poliqarp_create(const char *locale, struct poliqarp_error *error);

/**
 * Frees all resources used internally by the library.
 * @return 0 if the cleanup was successful, -1 otherwise.
 */
int poliqarp_destroy(void);

/**
 * Initializes per-thread private data of the library. Must be called once
 * before any other library functions are called by every thread that uses
 * them.
 * @param data Presumably receives the per-thread private data that is later
 * handed to poliqarp_thread_done() -- TODO confirm.
 * @return 0 if the initialization was successful, -1 otherwise.
 */
int poliqarp_thread_init(void **data);

/**
 * Destructive counterpart of poliqarp_thread_init().
 * @param data Presumably the per-thread data obtained from
 * poliqarp_thread_init() -- TODO confirm.
 * @return 0 if the cleanup was successful, -1 otherwise.
 */
int poliqarp_thread_done(void *data);

/* Corpus-related routines. */

/**
 * Public corpus information.
 */
struct poliqarp_corpus_info {
   size_t num_segments; /**< Number of segments in this corpus. */
   size_t num_types;    /**< Number of types of segments in this corpus. */
   size_t num_lemmata;  /**< Number of types of lemmata in this corpus. */
   size_t num_tags;     /**< Number of types of tags in this corpus. */
};

/**
 * Public tagset information.
 */
struct poliqarp_tagset_info {
   size_t num_categories; /**< Number of grammatical categories. */
   char **categories;     /**< Array of category descriptions. */
   size_t num_classes;    /**< Number of grammatical classes. */
   char **classes;        /**< Array of class descriptions. */
};

/**
 * Opens a corpus.
 * @param corpus The structure to be initialized.
 * @param name Some kind of name for the corpus. It could be a file name, a
 * common prefix of several file names (as is the case with sakura), etc.
 * @param progress Indicator of the progress of this operation.
 * @param error Error message wrapper; presumably filled in when opening
 * fails -- TODO confirm against the implementation.
 * @return 0 if the corpus was opened successfully, -1 in case of an error.
 * @note It is safe to call this function in a thread that can be cancelled
 * with a deferred cancellation request.
 */
int poliqarp_open_corpus(struct poliqarp_corpus *corpus, const char *name,
   progress_t *progress, struct poliqarp_error *error);

/**
 * Closes a corpus and frees all resources associated with it.
 * @param corpus The corpus to be closed.
 * @return 0 if the corpus was closed successfully, -1 in case of an error.
 * @note Think of this function as a hint to the library that a particular
 * corpus should be closed. In particular, the library might as well leave
 * it open if it so desires, or close it after a certain amount of time
 * (for instance if many users seem to be using this corpus intensively).
 */
int poliqarp_close_corpus(struct poliqarp_corpus *corpus);

/**
 * Retrieves information about the corpus.
 * @param corpus The corpus to extract information from.
 * @param info The destination structure.
 * @return 0 upon successful completion, -1 upon error.
 */
int poliqarp_get_corpus_info(const struct poliqarp_corpus *corpus,
   struct poliqarp_corpus_info *info);

/**
 * Retrieves the tagset used by the corpus.
 * @param corpus The corpus to extract tagset information from.
 * @param info The destination structure. Release it with
 * poliqarp_free_tagset_info() when no longer needed.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_get_tagset_info(const struct poliqarp_corpus *corpus,
   struct poliqarp_tagset_info *info);

/**
 * Frees all resources allocated by poliqarp_get_tagset_info().
 * @param info The structure previously filled in by
 * poliqarp_get_tagset_info().
 */
void poliqarp_free_tagset_info(struct poliqarp_tagset_info *info);

/* Query-related routines. */

/**
 * Query flags.
 */
#define POLIQARP_QFLAG_QUERY_I 1 /**< Query case-insensitive */
#define POLIQARP_QFLAG_QUERY_X 2 /**< Query: not whole words */
#define POLIQARP_QFLAG_META_I  4 /**< Metadata case-insensitive */
#define POLIQARP_QFLAG_META_X  8 /**< Metadata: not whole words */

/**
 * Analyzes query text and initializes a query structure.
 * @param query The query structure to be initialized.
 * @param text Text of the query.
 * @param corpus The corpus that this query will be run on.
 * @param flags Query flags, a combination of POLIQARP_QFLAG_*.
 * @param rewrite Name of query rewriting rules or NULL.
 * @param random_state State of a pseudo-random number generator. How the
 * query uses it is not visible from this header -- TODO confirm.
 * @param error Error message wrapper; presumably filled in on failure
 * (e.g. a parse error) -- TODO confirm against the implementation.
 * @return 0 upon successful completion, -1 otherwise (e.g.
 * in case of parse error).
 */
int poliqarp_create_query(struct poliqarp_query *query, const char *text,
   struct poliqarp_corpus *corpus, int flags, const char *rewrite,
   struct poliqarp_random_state *random_state, struct poliqarp_error *error);

/**
 * Destroys a query object.
 * @param query The query to be destroyed.
 * @return 0 upon successful destroy, -1 if an error occurred.
 */
int poliqarp_destroy_query(struct poliqarp_query *query);

/**
 * Sends a message to the client, notifying it that several new
 * results of a query have been found. This function does not belong to
 * the corpus library, but is implemented in the daemon and declared here
 * as a means for the search routine to communicate with the outside world.
 * @param session The session parameter passed to poliqarp_produce().
 */
extern void async_notify_new_results(void *session);

/**
 * Executes a query on the corpus, producing at most a given number of
 * results. This function can be called multiple times to increasingly
 * produce more results.
 * @param buffer The match buffer to store results in.
 * @param count Maximum number of results to be produced.
 * @param query The query to be executed.
 * @param progress Structure indicating progress of the operation.
 * @param session The session to which messages about new results are sent.
 * @param notify_step If this is non-zero, asynchronous messages are sent
 * to the session given by @p session each time this many new results are
 * found. Otherwise, no messages are sent.
 * @param max_match_length Maximum permissible match length, in segments.
 * @return 0 upon successful completion, -1 on error.
 * @note It is safe to call this function in a thread that can be cancelled
 * with a deferred cancellation request.
 */
int poliqarp_produce(struct poliqarp_match_buffer *buffer, size_t count,
   struct poliqarp_query *query, progress_t *progress, void *session,
   size_t notify_step, size_t max_match_length);

/* Match buffer operations. */

/**
 * Match column. Designates one of the columns available in the match for
 * sorting.
 */
enum poliqarp_column {
   POLIQARP_COLUMN_LEFT_CONTEXT, /**< Selects left context for sorting. */
   POLIQARP_COLUMN_LEFT_MATCH,   /**< Selects left match for sorting. */
   POLIQARP_COLUMN_MATCH,        /**< Selects entire match for sorting. */
   POLIQARP_COLUMN_RIGHT_MATCH,  /**< Selects right match for sorting. */
   POLIQARP_COLUMN_RIGHT_CONTEXT /**< Selects right context for sorting. */
};

/**
 * Sorting criteria. Required by sorting routine.
 */
struct poliqarp_sort_info {
   enum poliqarp_column column; /**< What to sort by. */
   bool ascending;              /**< True iff sorting in ascending order. */
   bool atergo;                 /**< True iff sorting a tergo. */
   size_t context;              /**< Width of match context, in segments.
                                     Used when sorting by context. */
};

/**
 * Public information about the match buffer.
 */
struct poliqarp_match_buffer_info {
   size_t capacity;    /**< Size of buffer. */
   size_t used;        /**< Number of stored results. */
   size_t num_results; /**< Number of results spotted during query
                            execution. */
};

/**
 * Creates a match buffer.
 * @param buffer The buffer structure to be initialized.
 * @param size Size of buffer.
 * @return 0 upon successful creation, -1 otherwise.
 */
int poliqarp_create_match_buffer(struct poliqarp_match_buffer *buffer,
   size_t size);

/**
 * Destroys a match buffer.
 * @param buffer The buffer to be destroyed.
 * @return 0 upon successful destruction, -1 otherwise.
 */
int poliqarp_destroy_match_buffer(struct poliqarp_match_buffer *buffer);

/**
 * Retrieves information about match buffer.
 * @param buffer The buffer to extract information from.
 * @param info Structure that will hold the result.
 * @return Status code; presumably 0 on success and -1 on error, like the
 * other routines declared in this header -- TODO confirm.
 */
int poliqarp_get_match_buffer_info(struct poliqarp_match_buffer *buffer,
   struct poliqarp_match_buffer_info *info);

/**
 * Sorts match buffer according to the given criteria.
 * @param buffer The buffer to sort results in.
 * @param criteria Criteria of sorting.
 * @param progress Structure indicating progress of the operation.
 * @return 0 if sorting succeeded, -1 if it failed.
 * @note This routine performs a stable sort, which makes it possible to sort
 * the buffer using multiple criteria. To do that, it suffices to call this
 * function several times, starting with least significant criteria and
 * finishing with most significant ones.
 * @note It is safe to call this function in a thread that can be cancelled
 * with a deferred cancellation request.
 */
int poliqarp_sort_match_buffer(struct poliqarp_match_buffer *buffer,
   const struct poliqarp_sort_info *criteria, progress_t *progress);

/**
 * Removes all matches from a match buffer. The size of the buffer remains
 * unchanged.
 * @param buffer The buffer to be cleared.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_forget(struct poliqarp_match_buffer *buffer);

/**
 * Resizes a match buffer, possibly dropping several matches.
 * When the buffer gets enlarged, empty slots are added at the end.
 * When the buffer gets shrunk, the effect is twofold: first, empty
 * elements (if any) are removed from the end of the buffer; if that is not
 * sufficient, oldest matches are dropped to match the new size.
 * @param buffer The buffer to be resized.
 * @param size New size of the buffer.
 * @return 0 if the buffer got successfully resized, -1 on failure.
 */
int poliqarp_resize_match_buffer(struct poliqarp_match_buffer *buffer,
   size_t size);

/* Match operations. */

/**
 * Structure of a match is invariant to Poliqarp's design, so here it is --
 * basically a range with a point between its borders.
 */
struct poliqarp_match {
   size_t start;    /**< Offset of the first segment that belongs to this
                         match. */
   size_t end;      /**< Offset of one-past-end segment in this match. */
   size_t focus;    /**< Offset (relative to start of corpus) of focus
                         point. */
   size_t document; /**< Document identifier associated with this match. */
};

/**
 * Retrieves a match from a match buffer.
 * @param buffer The buffer to retrieve the match from.
 * @param match The match to be retrieved.
 * @param index Index of the match in buffer.
 * @return 0 upon successful retrieval, -1 on error.
 * @note Does not do range checking if NDEBUG is defined.
 */
int poliqarp_get_match(const struct poliqarp_match_buffer *buffer,
   struct poliqarp_match *match, size_t index);

/**
 * Creates a match for the whole document.
 * @param corpus Corpus that contains the document.
 * @param document Index of the document (valid values for this argument are
 * values of 'document' field from struct poliqarp_match).
 * @param match The match to be created.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_get_match_for_document(const struct poliqarp_corpus *corpus,
   size_t document, struct poliqarp_match *match);

/* Segment operations. */

/**
 * Public segment information.
 */
struct poliqarp_segment_info {
   const char *text;  /**< Orthographic form of a segment. */
   bool space_before; /**< Is there a space before this segment? */
};

/**
 * Retrieves segment by index from a corpus.
 * @param segment Where to store the retrieved segment.
 * @param corpus The corpus to retrieve the segment from.
 * @param index Index of the segment to be retrieved.
 * @return 0 upon successful retrieval, -1 on error.
 * @note Does not do range checking if NDEBUG is defined.
 */
int poliqarp_get_segment(struct poliqarp_segment *segment,
   struct poliqarp_corpus *corpus, size_t index);

/**
 * Retrieves information about a segment.
 * @param segment Segment to extract information from.
 * @param info The destination structure.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_segment_info(const struct poliqarp_segment *segment,
   struct poliqarp_segment_info *info);

/* Interpretation set operations. */

/**
 * Public information about interpretation set.
 */
struct poliqarp_interpretation_set_info {
   size_t size; /**< Number of interpretations. */
};

/**
 * Retrieves set of disambiguated interpretations of a segment.
 * @param segment The segment to extract the set from.
 * @param set The destination structure.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_disambiguated_interpretations(
   const struct poliqarp_segment *segment,
   struct poliqarp_interpretation_set *set);

/**
 * Retrieves set of ambiguous interpretations of a segment.
 * @param segment The segment to extract the set from.
 * @param set The destination structure.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_ambiguous_interpretations(
   const struct poliqarp_segment *segment,
   struct poliqarp_interpretation_set *set);

/**
 * Retrieves information about a set of interpretations.
 * @param set The set to extract the information from.
 * @param info The destination structure.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_interpretation_set_info(
   const struct poliqarp_interpretation_set *set,
   struct poliqarp_interpretation_set_info *info);

/* Interpretation operations. */

/**
 * Public information about an interpretation.
 */
struct poliqarp_interpretation_info {
   const char *base; /**< Base form of the segment. */
   const char *tag;  /**< Unparsed tag. */
};

/**
 * Retrieves an interpretation from a set.
 * @param set The set to extract the interpretation from.
 * @param interp The destination structure.
 * @param index Index of the interpretation in a set.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_interpretation(const struct poliqarp_interpretation_set *set,
   struct poliqarp_interpretation *interp, size_t index);

/**
 * Retrieves information about an interpretation.
 * @param interp The interpretation to extract information from.
 * @param info The destination structure.
 * @return 0 upon successful retrieval, -1 on error.
 */
int poliqarp_get_interpretation_info(
   const struct poliqarp_interpretation *interp,
   struct poliqarp_interpretation_info *info);

/* Alias operations. */

/**
 * Public alias information.
 */
struct poliqarp_alias {
   const char *name;  /**< Name of the alias. */
   const char *value; /**< Value of the alias. */
};

/**
 * Public alias list information.
 */
struct poliqarp_alias_list {
   struct poliqarp_alias *aliases; /**< The aliases (presumably an array of
                                        num_aliases entries -- confirm). */
   size_t num_aliases;             /**< Number of aliases. */
};

/**
 * Defines an alias for an open corpus. From now on, all occurrences of
 * `name' when querying this corpus will be replaced by `value'.
 * @param corpus Corpus to define alias for.
 * @param name Name of the alias.
 * @param value Value of the alias.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_define_alias(struct poliqarp_corpus *corpus, const char *name,
   const char *value);

/**
 * Deletes an alias that is currently defined for this corpus.
 * @param corpus Corpus to delete the alias from.
 * @param name Name of the alias that is being deleted.
 * @return 0 upon successful completion, -1 on error (e.g. no such alias
 * exists).
 */
int poliqarp_delete_alias(struct poliqarp_corpus *corpus, const char *name);

/**
 * Retrieves the list of aliases available for an open corpus.
 * @param corpus Corpus to retrieve aliases for.
 * @param aliases Pointer to a structure that will contain the aliases.
 * Release it with poliqarp_free_aliases() when no longer needed.
 * @return 0 on successful completion, -1 on error.
 */
int poliqarp_get_aliases(const struct poliqarp_corpus *corpus,
   struct poliqarp_alias_list *aliases);

/**
 * Frees the memory allocated for the alias list by poliqarp_get_aliases().
 * @param aliases Structure to be released.
 * @return 0 on successful completion, -1 on error.
 */
int poliqarp_free_aliases(struct poliqarp_alias_list *aliases);

/* Metadata operations. */

/**
 * A piece of metadata is a 'key-value' pair. The key can be an arbitrary
 * string, whereas values come in two flavours: textual and date. This
 * enum defines the type of metadata.
 */
enum poliqarp_metadata_type {
   POLIQARP_META_TEXT,     /**< Textual piece of metadata. */
   POLIQARP_META_DATE,     /**< Date-holding piece of metadata. */
   POLIQARP_META_UNDEFINED /**< This information has not been defined for
                                this corpus. */
};

/**
 * One possible value type for a metadata can be a date. This is useful when
 * specifying information such as date of creation, date of first publication,
 * etc.
 */
struct poliqarp_date {
   int year;  /**< Year. */
   int month; /**< Month. */
   int day;   /**< Day. */
};

/**
 * A binding of key name to type of metadata.
 */
struct poliqarp_metadata_type_binding {
   char *key;                        /**< Name of the key. */
   enum poliqarp_metadata_type type; /**< Type of values for this key. */
};

/**
 * The set of metadata types: an array of bindings of metadata keys to
 * the types of their values.
 */
struct poliqarp_metadata_types {
   struct poliqarp_metadata_type_binding *types; /**< The set proper. */
   size_t num_types;                             /**< Number of types. */
};

/**
 * Public information about a piece of metadata.
 */
struct poliqarp_metadata_info {
   enum poliqarp_metadata_type type; /**< Type of this metadata. */
   const char *key;                  /**< Name of key. */
   union {
      const char *text;              /**< Textual value. */
      struct poliqarp_date date;     /**< Date value. */
   } value;                          /**< Value union. */
};

/**
 * Retrieves the set of metadata for a given document.
 * @param corpus Corpus that contains the metadata.
 * @param document Index of the document (valid values for this argument are
 * values of 'document' field from struct poliqarp_match).
 * @param set The destination structure.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_get_metadata_set(const struct poliqarp_corpus *corpus,
   size_t document, struct poliqarp_metadata_set *set);

/**
 * Retrieves the set of metadata types defined for a given corpus.
 * @param types The structure to contain the set. Release it with
 * poliqarp_free_metadata_types() when no longer needed.
 * @param corpus Corpus that contains the metadata.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_get_metadata_types(struct poliqarp_metadata_types *types,
   const struct poliqarp_corpus *corpus);

/**
 * Frees the resources allocated by poliqarp_get_metadata_types().
 * @param types The structure to be freed.
 * @return 0 upon successful completion, -1 on error.
 */
int poliqarp_free_metadata_types(struct poliqarp_metadata_types *types);

/**
 * Returns number of pieces of metadata in a set.
 * @param set The set of metadata.
 * @return Number of pieces of metadata in @p set.
 */
size_t poliqarp_metadata_count(const struct poliqarp_metadata_set *set);

/**
 * Returns a single piece of metadata from a set.
 * @param set The set of metadata.
 * @param index Index of the piece of metadata to retrieve.
 * @param meta The destination structure.
 * @return Status code; presumably 0 on success and -1 on error, like the
 * other routines declared in this header -- TODO confirm.
 */
int poliqarp_get_metadata(const struct poliqarp_metadata_set *set, size_t index,
   struct poliqarp_metadata *meta);

/**
 * Retrieves information about a single piece of metadata.
 * @param meta The metadata to query about.
 * @param info The public information structure.
 * @return 0 on successful retrieval, -1 on failure.
 */
int poliqarp_get_metadata_info(const struct poliqarp_metadata *meta,
   struct poliqarp_metadata_info *info);

/*
 * Finally, include the definitions of private structures so that their sizes
 * are known to the user.
 */
#include <sakura/poliqarp-private.h>

#endif /* POLIQARP_H */