Commit 01556927 authored by Arkadiusz Janz's avatar Arkadiusz Janz

Merge branch 'develop' into 'master'

Develop

See merge request !4
parents ae6014eb 769bca3e
Pipeline #2801 passed with stage
in 4 minutes and 30 seconds
......@@ -11,4 +11,4 @@ build_image:
- docker:18.09.7-dind
script:
- APT_USERNAME=aptuser
- docker build . -t corpus2 --build-arg APT_USERNAME --build-arg APT_PASSWORD
- docker build . -t corpus2mwe --build-arg APT_USERNAME --build-arg APT_PASSWORD
......@@ -48,6 +48,9 @@ endif(PYTHON_VERSION)
# The version reported by CPack follows the corpus2mwe version.
set(CPACK_PACKAGE_VERSION "${CORPUS2MWE_VERSION}")

# Version of the wccl_annotator Python package; it is set before the
# subdirectory is added so that the subdirectory can read it.
set(WCCL_ANNOTATOR_PACKAGE_VERSION "0.1")
add_subdirectory(tools/wccl-annotator)

# Package the project as a Debian archive.
set(CPACK_GENERATOR "DEB")
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "g419")
include(CPack)
......@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y \
# Install wccl from branch with change
WORKDIR /home/install
RUN git clone --single-branch \
--branch param_ann \
--branch ann_base_bug \
https://gitlab.clarin-pl.eu/analysers/wccl.git
RUN mkdir wccl/src/build && \
cat wccl/src/libmwereader/mwereader.cpp && \
......
#!/bin/bash
# Builds the testing image from scratch and runs all annotation cases inside
# a throw-away container. The console output has to be inspected manually.
set -e  # do not run the cases against a stale image when the build fails

docker build ../../.. -t corpus2mwe_wccl_param_ann -f Dockerfile --no-cache
docker run --rm corpus2mwe_wccl_param_ann bash -c '
cd corpus2mwe/tools
echo "==> (1/6) Annotating without setting annotation name (default annotation and dict):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out.xml
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out.xml
echo "==> (2/6) Annotating with annotation test_ann (defaut dict):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out2.xml \
  -a test_ann
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out2.xml
echo "==> (3/6) Annotating with annotation test_ann and wccl dict testdata/test_wccl_1.xml:"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out3.xml \
  -a test_ann \
  -d ../cclmwe/tests/custom_annotations/testdata/test_wccl_1.xml
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out3.xml
echo "==> (4/6) Annotating with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out4.xml \
  -D ../cclmwe/tests/custom_annotations/testdata/wccl_list.tsv
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out4.xml
echo "==> (5/6) Annotating two files with wccl dict testdata/test_wccl_1.xml (test_ann):"
python mwe_converter.py -c ../cclmwe/tests/custom_annotations/testdata/ccl_list.txt \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated.txt \
  -d ../cclmwe/tests/custom_annotations/testdata/test_wccl_1.xml \
  -a test_ann \
  --batch
echo "==> Generated files:"
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated.txt
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe (none annotated)":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe
echo "==> (6/6) Annotating two files with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
python mwe_converter.py -c ../cclmwe/tests/custom_annotations/testdata/ccl_list.txt \
  -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated2.txt \
  -D ../cclmwe/tests/custom_annotations/testdata/wccl_list.tsv \
  --batch
echo "==> Generated files:"
# Case 6 stores its file list in ccl_annotated2.txt (ccl_annotated.txt belongs to case 5)
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated2.txt
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe
'
# Execute this script to call the test cases inside the testing container.
# Add --no-cache to the build command below to force a clean image rebuild.
set -e  # do not run the tests against a stale image when the build fails

docker build ../../.. -t corpus2mwe_wccl_param_ann -f Dockerfile

# The test script is mounted from the current directory, so this script has
# to be started from the directory that contains test_container.sh.
# The volume argument is quoted so that paths with spaces keep working.
docker run --rm \
  -v "$(pwd)/test_container.sh:/home/install/test_container.sh" \
  corpus2mwe_wccl_param_ann \
  bash -c '/home/install/test_container.sh'
#!/bin/bash
# Script called inside the container.
# Runs wccl_annotator for a series of cases and prints the produced files;
# the console output has to be inspected manually.

# Directories with the input documents / outputs and with the WCCL dictionaries.
TESTDATA=/home/install/corpus2mwe/cclmwe/tests/testdata
CUSTOM_TESTDATA=/home/install/corpus2mwe/cclmwe/tests/custom_annotations/testdata

echo "==> (1/7) Annotating without setting annotation name (default annotation and dict):"
wccl_annotator -c "$TESTDATA/ccl.xml" \
  -o "$TESTDATA/ccl_out.xml"
cat "$TESTDATA/ccl_out.xml"

echo "==> (2/7) Annotating with annotation test_ann (defaut dict):"
wccl_annotator -c "$TESTDATA/ccl.xml" \
  -o "$TESTDATA/ccl_out2.xml" \
  -a test_ann
cat "$TESTDATA/ccl_out2.xml"

echo "==> (3/7) Annotating with annotation test_ann and wccl dict testdata/test_wccl_1.xml:"
wccl_annotator -c "$TESTDATA/ccl.xml" \
  -o "$TESTDATA/ccl_out3.xml" \
  -a test_ann \
  -d "$CUSTOM_TESTDATA/test_wccl_1.xml"
cat "$TESTDATA/ccl_out3.xml"

echo "==> (4/7) Annotating with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
wccl_annotator -c "$TESTDATA/ccl.xml" \
  -o "$TESTDATA/ccl_out4.xml" \
  -D "$CUSTOM_TESTDATA/wccl_list.tsv"
cat "$TESTDATA/ccl_out4.xml"

echo "==> (5/7) Annotating two files with wccl dict testdata/test_wccl_1.xml (test_ann):"
wccl_annotator -c "$CUSTOM_TESTDATA/ccl_list.txt" \
  -o "$TESTDATA/ccl_annotated.txt" \
  -d "$CUSTOM_TESTDATA/test_wccl_1.xml" \
  -a test_ann \
  --batch
echo "==> Generated files:"
cat "$TESTDATA/ccl_annotated.txt"
echo "==> $TESTDATA/ccl.xml.mwe:"
cat "$TESTDATA/ccl.xml.mwe"
echo "==> $TESTDATA/ccl2.xml.mwe (none annotated):"
cat "$TESTDATA/ccl2.xml.mwe"

echo "==> (6/7) Annotating two files with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
wccl_annotator -c "$CUSTOM_TESTDATA/ccl_list.txt" \
  -o "$TESTDATA/ccl_annotated2.txt" \
  -D "$CUSTOM_TESTDATA/wccl_list.tsv" \
  --batch
echo "==> Generated files:"
# This case stores its file list in ccl_annotated2.txt; ccl_annotated.txt is
# the list written by case 5.
cat "$TESTDATA/ccl_annotated2.txt"
echo "==> $TESTDATA/ccl.xml.mwe:"
cat "$TESTDATA/ccl.xml.mwe"
echo "==> $TESTDATA/ccl2.xml.mwe:"
cat "$TESTDATA/ccl2.xml.mwe"

echo "==> (7/7) Annotating with two wccl dicts - calling as python module:"
# The snippet is single-quoted, so shell variables are not expanded inside it
# and the paths are spelled out in full.
python -c '
from wccl_annotator.wccl_annotator import WcclAnnotator
WcclAnnotator().process("/home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml", \
"/home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out7.xml", \
ann_2_wccl_dict="/home/install/corpus2mwe/cclmwe/tests/custom_annotations/testdata/wccl_list.tsv")
'
cat "$TESTDATA/ccl_out7.xml"

echo "==> Testing against bug with base correctness (test_base_ann_bug1.xml):"
wccl_annotator -c "$CUSTOM_TESTDATA/test_base_ann_bug1.xml" \
  -o "$TESTDATA/ccl_ann_bug1_out.xml" \
  -d "$CUSTOM_TESTDATA/test_wccl_longest_first_3.xml"
cat "$TESTDATA/ccl_ann_bug1_out.xml"
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
<chunk id="ch1">
<sentence id="s1">
<tok>
<orth>plac</orth>
<lex disamb="1"><base>plac</base><ctag>subst:sg:nom:m3</ctag></lex>
</tok>
<tok>
<orth>zabaw</orth>
<lex disamb="1"><base>zabawa</base><ctag>subst:pl:gen:f</ctag></lex>
</tok>
<tok>
<orth>i</orth>
<lex disamb="1"><base>i</base><ctag>conj</ctag></lex>
</tok>
<tok>
<orth>tenis</orth>
<lex disamb="1"><base>tenis</base><ctag>subst:sg:acc:m3</ctag></lex>
</tok>
<tok>
<orth>stołowy</orth>
<lex disamb="1"><base>stołowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex>
</tok>
<ns/>
<tok>
<orth>.</orth>
<lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
</tok>
</sentence>
</chunk>
<chunk id="ch2">
<sentence id="s2">
<tok>
<orth>plac</orth>
<lex disamb="1"><base>plac</base><ctag>subst:sg:nom:m3</ctag></lex>
</tok>
<tok>
<orth>zabaw</orth>
<lex disamb="1"><base>zabawa</base><ctag>subst:pl:gen:f</ctag></lex>
</tok>
<tok>
<orth>i</orth>
<lex disamb="1"><base>i</base><ctag>conj</ctag></lex>
</tok>
<tok>
<orth>tenis</orth>
<lex disamb="1"><base>tenis</base><ctag>subst:sg:acc:m3</ctag></lex>
</tok>
<tok>
<orth>stołowy</orth>
<lex disamb="1"><base>stołowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex>
</tok>
</sentence>
</chunk>
<chunk id="ch3">
<sentence id="s3">
<tok>
<orth>plac</orth>
<lex disamb="1"><base>plac</base><ctag>subst:sg:nom:m3</ctag></lex>
</tok>
<tok>
<orth>zabaw</orth>
<lex disamb="1"><base>zabawa</base><ctag>subst:pl:gen:f</ctag></lex>
</tok>
<tok>
<orth>i</orth>
<lex disamb="1"><base>i</base><ctag>conj</ctag></lex>
</tok>
<tok>
<orth>tenis</orth>
<lex disamb="1"><base>tenis</base><ctag>subst:sg:acc:m3</ctag></lex>
</tok>
<tok>
<orth>stołowy</orth>
<lex disamb="1"><base>stołowy</base><ctag>adj:sg:acc:m3:pos</ctag></lex>
</tok>
<tok>
<orth>oraz</orth>
<lex disamb="1"><base>oraz</base><ctag>conj</ctag></lex>
</tok>
<tok>
<orth>inne</orth>
<lex disamb="1"><base>inny</base><ctag>adj:pl:nom:m3:pos</ctag></lex>
</tok>
</sentence>
</chunk>
</chunkList>
<units_description tagset="nkjp">
<mwegroup class="neo" id="SubstAdj" name="SubstAdj" type="fix">
<condition>
and(
equal(base[0],$s:A),
equal(base[1],$s:B),
inter(base[0],$s:A), inter(class[0],{subst}),
inter(base[1],$s:B), inter(class[1],{adj}),
setvar($Pos0, 0),
setvar($Pos1, 1),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="tenis stołowy">
<var name="A">tenis</var>
<var name="B">stołowy</var>
</MWE>
</instances>
</mwegroup>
<mwegroup class="neo" id="SubstSubst" name="SubstSubst" type="fix">
<condition>
and(
equal(base[0],$s:A),
equal(base[1],$s:B),
inter(base[0],$s:A), inter(class[0],{subst}),
inter(base[1],$s:B), inter(class[1],{subst}),
setvar($Pos0, 0),
setvar($Pos1, 1),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="plac zabaw">
<var name="A">plac</var>
<var name="B">zabawa</var>
</MWE>
</instances>
</mwegroup>
<mwegroup class="neo" id="Subst" name="Subst" type="fix">
<condition>
and(
equal(base[0],$s:A),
inter(base[0],$s:A), inter(class[0],{subst}),
setvar($Pos0, 0),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="tenis">
<var name="A">tenis</var>
</MWE>
<MWE base="plac">
<var name="A">plac</var>
</MWE>
</instances>
</mwegroup>
</units_description>
# Builds and installs the wccl_annotator Python package with setuptools.
# The package is skipped (with a warning) when no "python" program is found.
find_program(PYTHON "python")

if(PYTHON)
  set(SETUP_PY_IN "${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in")
  set(SETUP_PY "${CMAKE_CURRENT_BINARY_DIR}/setup.py")
  set(WCCL_ANNOTATOR_BUILD_STAMP "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp")

  # setup.py.in uses ${VAR} placeholders, so @ONLY must not be passed here.
  configure_file("${SETUP_PY_IN}" "${SETUP_PY}")

  # The stamp file records a successful "setup.py build". Depending on the
  # generated setup.py re-runs the build whenever it is regenerated.
  add_custom_command(
    OUTPUT "${WCCL_ANNOTATOR_BUILD_STAMP}"
    COMMAND "${PYTHON}" "${SETUP_PY}" build
    COMMAND "${CMAKE_COMMAND}" -E touch "${WCCL_ANNOTATOR_BUILD_STAMP}"
    DEPENDS "${SETUP_PY}"
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    COMMENT "Building wccl_annotator Python package"
    VERBATIM
  )
  # NOTE(review): "target" is a very generic target name and will clash with
  # any other directory using the same name; kept unchanged for compatibility.
  add_custom_target(target ALL DEPENDS "${WCCL_ANNOTATOR_BUILD_STAMP}")

  # Runs at install time. A failing "setup.py install" is reported as a
  # warning instead of passing unnoticed.
  install(CODE "
    execute_process(
      COMMAND \"${PYTHON}\" \"${SETUP_PY}\" install
      RESULT_VARIABLE wccl_annotator_install_result)
    if(NOT wccl_annotator_install_result EQUAL 0)
      message(WARNING \"wccl_annotator: setup.py install exited with \${wccl_annotator_install_result}\")
    endif()
  ")
else()
  message(WARNING "python not found: the wccl_annotator package will not be built or installed")
endif()
# Overview
A package that facilitates using ```corpus2mwe``` to annotate a CCL
document with a WCCL dictionary. It supports running with multiple WCCL
dictionaries and/or multiple CCL documents.
It can be used as a standalone tool (```wccl_annotator```) or as a Python module.
# Usage
## As command line tool
```
usage: wccl_annotator [-h] -c CCL_FILE [-o OUT_FILE] [-m] [-t TAGSET]
[-a ANNOTATION] [-d WCCL_DICT] [-D WCCL_DICTS_LIST] [-b]
[-v] [-s SEPARATOR]
optional arguments:
-h, --help show this help message and exit
-c CCL_FILE, --ccl CCL_FILE
CCL file or text file with list of paths to CCL files
(for batch mode)
-o OUT_FILE, --output OUT_FILE
Required when processing single document. If used with
'--batch-mode', then list of output files will be
stored under given path.
-m, --mwe_merged
-t TAGSET, --tagset TAGSET
-a ANNOTATION, --annotation ANNOTATION
Name of annotation to set
-d WCCL_DICT, --dict WCCL_DICT
WCCL dict with terms to annotate
-D WCCL_DICTS_LIST, --dicts-list WCCL_DICTS_LIST
Tabular file with annotations and paths to related
WCCL dicts to use. Use "--separator" to specify
separator in this file.
-b, --batch-mode If enabled, then input file is treated as list of ccl
files. If output path is present, then list of created
files will be stored there. Processed files will have
'.mwe' suffix added.
-v, --verbose
-s SEPARATOR, --separator SEPARATOR
Only applicable, when using "--dicts-list". Specifies
separator.
```
## As python module
```python
from wccl_annotator.wccl_annotator import WcclAnnotator
annotator = WcclAnnotator()
...
annotator.process(input_file, output_file, ann_2_wccl_dict=selected_dicts_set)
```
# Installation
This package is installed together with ```corpus2mwe```; no additional action
is required.
# Tests
You can run manual tests (the console output has to be verified by hand)
by calling the script corpus2mwe/src/cclmwe/tests/custom_annotations/test.sh,
which uses a prepared container to run the tool for different cases.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from setuptools import setup
# Distribution metadata for wccl_annotator. This file is a template: the
# ${...} placeholders below are expected to be substituted before it is run.
_PACKAGE_ROOT = '${CMAKE_CURRENT_SOURCE_DIR}'

# Exposes wccl_annotator.wccl_annotator:main as the `wccl_annotator` command.
_ENTRY_POINTS = {
    'console_scripts': [
        'wccl_annotator = wccl_annotator.wccl_annotator:main',
    ],
}

setup(
    name='wccl_annotator',
    version='${WCCL_ANNOTATOR_PACKAGE_VERSION}',
    author='Arkadiusz Janz, Grzegorz Kostkowski',
    description='''Tool for annotating terms from WCCL dictionary in CCL
documents.''',
    license='',
    packages=['wccl_annotator'],
    package_dir={'': _PACKAGE_ROOT},
    zip_safe=False,
    entry_points=_ENTRY_POINTS,
)
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import argparse
import tempfile
from corpus_ccl import cclutils as ccl
import corpus2mwe as mwe
import argparse
def parser():
def parse_args(args=None):
aparser = argparse.ArgumentParser()
aparser.add_argument('-c', '--ccl', dest='ccl_file',
help='CCL file or text file with list of paths to CCL files (for batch mode)',
......@@ -45,105 +41,9 @@ def parser():
aparser.add_argument('-s', '--separator',
default='\t',
help='Only applicable, when using "--dicts-list". Specifies separator.')
return aparser
def _get_new_tmpfile_path():
    """
    Generates path for new temporary file without creating it.

    Only public standard-library APIs are used: the directory comes from
    ``tempfile.gettempdir()`` and the file name is a random UUID, so the
    helper does not depend on private ``tempfile`` internals.

    Returns:
        str: path of a not-yet-existing file inside the default temporary
        directory.
    """
    import uuid  # function-scope import keeps this helper self-contained

    return os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
def _read_dict(wccl_dicts_list, separator):
    """
    Reads a two-column tabular file into a dict: the first column becomes the
    key (annotation name) and the second the value (WCCL dict).

    Args:
        wccl_dicts_list: path of the tabular file to read.
        separator: string separating the two columns in every line.

    Returns:
        dict: mapping of first-column values to second-column values.

    Raises:
        ValueError: when a line cannot be split into exactly two columns.
    """
    ann_2_wccl_dict = {}
    with open(wccl_dicts_list, 'r') as ifile:
        for line in ifile:
            try:
                ann, wccl_dict = line.rstrip().split(separator)
            except Exception as exc:
                # The message names the function argument; the previous code
                # referenced an undefined `args` here and died with NameError.
                raise ValueError(
                    f"Cannot parse file: {wccl_dicts_list}") from exc
            ann_2_wccl_dict[ann] = wccl_dict
    return ann_2_wccl_dict
class MWEConverter(object):
    """
    Annotates CCL documents with a chain of corpus2mwe readers, one reader
    per (annotation, WCCL dict) entry of the mapping given to the constructor.
    """

    def __init__(self, annotation_2_wccl_dict, tagset='nkjp'):
        """
        Args:
            annotation_2_wccl_dict: mapping whose keys are passed to the
                readers as annotation and whose values are passed as the
                WCCL dict.
            tagset: tagset name handed to ``ccl.get_tagset``; a falsy value
                selects 'nkjp'.
        """
        self.readers = []
        self.annotation_2_wccl_dict = annotation_2_wccl_dict
        self.tagset = ccl.get_tagset(tagset or 'nkjp')

    def convert(self, ccl_file, out_mwe_file, annots_used=False):
        """
        Creates readers for every dict passed in constructor (once, before
        processing first file) and annotates given ccl file using them: passes
        result (temporary file) of previous reader into the next reader. The
        ultimate resulted CCL file is stored under given path.

        Args:
            ccl_file: path of the CCL document to annotate.
            out_mwe_file: path under which the final document is written.
            annots_used: forwarded to the readers when they are created, so
                it only takes effect on the first call.

        Raises:
            ValueError: when the mapping given to the constructor is empty
                and there is no reader to annotate with.
        """
        if not self.readers:
            self._load_readers(ccl_file, annots_used)
        if not self.readers:
            # Explicit error in place of the NameError on `out_doc` that an
            # empty mapping used to end in.
            raise ValueError("No WCCL dicts given - nothing to annotate with")

        curr_ccl_file = ccl_file
        tmp_file = _get_new_tmpfile_path()
        try:
            for r in self.readers:
                r.set_files(curr_ccl_file)
                out_doc = r.read()
                ccl.write_ccl(out_doc, tmp_file)
                # The next reader consumes the output of this one.
                curr_ccl_file = tmp_file
            ccl.write_ccl(out_doc, out_mwe_file)
        finally:
            # Remove the intermediate file even when a reader or writer
            # raised; it may not exist if the very first step failed.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    def _load_readers(self, ccl_file, annots_used):
        """
        Creates one ``mwe.CclMWEReader`` per entry of the annotation mapping
        and stores it in ``self.readers``.
        """
        for ann, d in self.annotation_2_wccl_dict.items():
            r = mwe.CclMWEReader(ccl_file, self.tagset, d, ann)
            r.use_annotations(annots_used)
            self.readers.append(r)
def run_batch(ccls_list_path, converter, mwe_merged, out_list=None, verbose=False):
    """
    Converts every CCL file listed in a text file (one path per line).

    Each input ``<path>`` is converted into ``<path>.mwe``. A file that cannot
    be processed is reported on stdout and skipped; the remaining files are
    still processed.

    Args:
        ccls_list_path: path of the text file with the list of CCL files.
        converter: object providing ``convert(ccl_file, out_file, flag)``.
        mwe_merged: value passed as the third argument of ``convert``.
        out_list: optional path; when given, the paths of the successfully
            created files are written there, one per line.
        verbose: when true, prints a progress line for every file.
    """
    out_files = []
    with open(ccls_list_path, 'r') as ifile:
        for line in ifile:
            ccl_file = line.rstrip()
            if not ccl_file:
                # Blank lines (e.g. a trailing newline) are not file paths.
                continue
            out_file = f"{ccl_file}.mwe"
            if verbose:
                print(f"Processing file '{ccl_file}' into '{out_file}' ...")
            try:
                converter.convert(ccl_file, out_file, mwe_merged)
            except Exception as e:
                print(f"Cannot process file '{ccl_file}':")
                print(e)
                print(f"Skipping file '{ccl_file}'")
            else:
                # Only files that were really created belong on the list.
                out_files.append(out_file)
    if out_list:
        with open(out_list, 'w') as ofile:
            for f in out_files:
                ofile.write('{}\n'.format(f))
def main(argv=None):
aparser = parser()
args = aparser.parse_args(argv)
if args.wccl_dicts_list:
assert not args.wccl_dict
ann_2_wccl_dict = _read_dict(args.wccl_dicts_list, args.separator)
if args: # given args from outside
return aparser.parse_args(args)
else:
ann_2_wccl_dict = {args.annotation: args.wccl_dict}
converter = MWEConverter(ann_2_wccl_dict, tagset=args.tagset)
if args.batch_mode:
run_batch(args.ccl_file, converter, args.mwe_merged, out_list=args.out_file,
verbose=args.verbose)
else:
assert args.out_file
converter.convert(args.ccl_file, args.out_file, args.mwe_merged)
return aparser.parse_args()
if __name__ == "__main__":
main()
import tempfile
def get_new_tmpfile_path():
    """
    Generates path for new temporary file without creating it.

    Only public standard-library APIs are used: the directory comes from
    ``tempfile.gettempdir()`` and the file name is a random UUID, so the
    helper does not depend on private ``tempfile`` internals.

    Returns:
        str: path of a not-yet-existing file inside the default temporary
        directory.
    """
    # Function-scope imports: this module only imports `tempfile` at the top.
    import os
    import uuid

    return os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)
def read_dict(tabular_file_path, separator):
    """
    Reads tabular file (e.g. TSV) with two columns into dict (first column
    as key and second as value)

    Args:
        tabular_file_path: path of the file to read.
        separator: string separating the two columns in every line.

    Returns:
        dict: mapping of first-column values to second-column values; for a
        repeated key the last line wins.

    Raises:
        ValueError: when a line cannot be split into exactly two columns.
            The original error is attached as ``__cause__``.
    """
    d = {}
    with open(tabular_file_path, 'r') as ifile:
        for line in ifile:
            try:
                k, v = line.rstrip().split(separator)
            except Exception as exc:
                # `Exception` instead of a bare except, so that
                # KeyboardInterrupt / SystemExit are not turned into
                # parse errors.
                raise ValueError(
                    f"Cannot parse file: {tabular_file_path}") from exc
            d[k] = v
    return d
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from corpus_ccl import cclutils as ccl
import corpus2mwe as mwe
from wccl_annotator.argsparser import parse_args
from wccl_annotator.utils import read_dict, get_new_tmpfile_path
class MWEConverter(object):
"""
Class interacting with corpus2mwe package
"""
def __init__(self, annotation_2_wccl_dict, tagset='nkjp'):
"""