Commit ae6014eb authored by Arkadiusz Janz's avatar Arkadiusz Janz

Merge branch 'develop' into 'master'

develop to master

See merge request !2
parents 91bc7855 3bbabc37
Pipeline #2542 passed with stage
in 3 minutes and 40 seconds
......@@ -11,17 +11,20 @@ typedef boost::shared_ptr<MWEReader> MWEReaderPtr;
/**
 * Construct a reader for a single document path (no separate relations file).
 *
 * @param doc_path       path passed to make_readers() and, as both arguments,
 *                       to BaseRelReader::make_id_doc()
 * @param tagset         tagset passed to make_readers()
 * @param mwe_file       MWE dictionary handed to init_mwes(); when empty, the
 *                       value returned by get_mwe_dict("mwe") is used instead
 * @param ann_chan_name  when non-empty, forwarded to set_annotation_channel();
 *                       when empty, the channel is left untouched
 */
CclMWEReader::CclMWEReader(const std::string &doc_path, const Tagset &tagset,
		std::string mwe_file, std::string ann_chan_name)
	: BaseRelReader("document"), annotate(true)
{
	make_readers(tagset, doc_path);
	BaseRelReader::make_id_doc(doc_path, doc_path);

	// Fall back to the default dictionary when the caller did not name one.
	if (mwe_file.empty())
		mwe_file = get_mwe_dict("mwe");
	init_mwes(mwe_file);

	// The channel is only overridden on explicit request.
	if (!ann_chan_name.empty())
		set_annotation_channel(ann_chan_name);
}
CclMWEReader::CclMWEReader(const std::string &doc_path, const std::string &rel_path,
const Tagset &tagset, std::string mwe_file)
const Tagset &tagset, std::string mwe_file, std::string ann_chan_name)
: BaseRelReader("document"), annotate(true)
{
make_readers(tagset, doc_path, rel_path);
......@@ -29,6 +32,8 @@ CclMWEReader::CclMWEReader(const std::string &doc_path, const std::string &rel_p
if (mwe_file == "")
mwe_file = get_mwe_dict("mwe");
init_mwes(mwe_file);
if (ann_chan_name != "")
set_annotation_channel(ann_chan_name);
}
void CclMWEReader::use_annotations(bool val) {
......@@ -61,6 +66,11 @@ void CclMWEReader::set_files(const std::string &doc_path,
BaseRelReader::make_id_doc(doc_path, rel_path);
}
void CclMWEReader::set_annotation_channel(const std::string &chan_name)
{
boost::static_pointer_cast<MWEReader>(reader_)->set_annotation_channel(chan_name);
}
void CclMWEReader::init_mwes(const std::string &mwe_file) {
if (reader_) {
reader_->set_option("inner:ccl");
......
......@@ -19,18 +19,20 @@ namespace Corpus2MWE {
typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
/**
 * Create a reader for a document given by a single path.
 * An empty mwe_file selects the default dictionary; an empty
 * ann_chan_name leaves the annotation channel unchanged.
 */
CclMWEReader(const std::string &doc_path, const Tagset &tagset,
		std::string mwe_file="", std::string ann_chan_name="");

/**
 * Create a reader for a document given by separate document and
 * relations paths. mwe_file and ann_chan_name as above.
 */
CclMWEReader(const std::string &doc_path, const std::string &rel_path,
		const Tagset &tagset, std::string mwe_file="",
		std::string ann_chan_name="");

/** set new file to read */
void set_files(const std::string &doc_path);
void set_files(const std::string &doc_path, const std::string &rel_path);

void use_annotations(bool annots_used);

/// set name of annotation to add for found mwe / terms
void set_annotation_channel(const std::string &chan_name);

/** get the reader object */
TokenReaderPtr reader();
private:
/**
......
# Test image: corpus2 from the CLARIN-PL apt repository, wccl built from the
# param_ann branch, and corpus2mwe built from the Docker build context.
FROM clarinpl/python:3.6

# Toolchain and libraries needed to build wccl and corpus2mwe.
RUN apt-get update && apt-get install -y \
    libxml++2.6-dev \
    libloki-dev \
    libboost-all-dev \
    libicu-dev \
    libffi-dev \
    libssl-dev \
    libxml2-utils \
    cmake \
    swig \
    pwrutils \
    gdebi-core \
    antlr \
    libantlr-dev \
    default-jdk \
    git

RUN mkdir -p /home/install
WORKDIR /home/install

# Register the CLARIN-PL apt repository. The key is fetched over HTTPS from the
# same host as the repository; pipefail makes a failed download fail the build.
RUN bash -c "set -o pipefail; wget -q -O - https://apt.clarin-pl.eu/KEY.gpg | apt-key add -"
RUN bash -c "echo 'deb https://apt.clarin-pl.eu/ /' > /etc/apt/sources.list.d/clarin.list"

# Install corpus2
RUN apt-get update && apt-get install -y \
    corpus2-python3.6

# Install wccl from branch with change
RUN git clone --single-branch \
    --branch param_ann \
    https://gitlab.clarin-pl.eu/analysers/wccl.git
RUN mkdir wccl/src/build && \
    cd wccl/src/build && \
    cmake .. && \
    make -j 8 && \
    make install && \
    ldconfig

# install corpus2mwe from this repository (branch param_ann)
COPY ./ ./corpus2mwe
RUN mkdir corpus2mwe/build && \
    cd corpus2mwe/build && \
    cmake .. && \
    make -j 8 && \
    make install && \
    ldconfig

RUN pip install --no-cache-dir corpus_ccl
#!/bin/bash
# Builds the test image from the repository root and runs six mwe_converter.py
# scenarios inside it, printing every produced file for manual inspection.
# Stop at the first failing command so a broken build never reaches "docker run".
set -euo pipefail

docker build ../../.. -t corpus2mwe_wccl_param_ann -f Dockerfile --no-cache

# The scenario list below is handed to bash in the container as ONE
# single-quoted string, so it must not contain any single quote characters.
docker run --rm corpus2mwe_wccl_param_ann bash -c '
set -e
cd corpus2mwe/tools

echo "==> (1/6) Annotating without setting annotation name (default annotation and dict):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out.xml
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out.xml

echo "==> (2/6) Annotating with annotation test_ann (default dict):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out2.xml \
    -a test_ann
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out2.xml

echo "==> (3/6) Annotating with annotation test_ann and wccl dict testdata/test_wccl_1.xml:"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out3.xml \
    -a test_ann \
    -d ../cclmwe/tests/custom_annotations/testdata/test_wccl_1.xml
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out3.xml

echo "==> (4/6) Annotating with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
python mwe_converter.py -c /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out4.xml \
    -D ../cclmwe/tests/custom_annotations/testdata/wccl_list.tsv
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_out4.xml

echo "==> (5/6) Annotating two files with wccl dict testdata/test_wccl_1.xml (test_ann):"
python mwe_converter.py -c ../cclmwe/tests/custom_annotations/testdata/ccl_list.txt \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated.txt \
    -d ../cclmwe/tests/custom_annotations/testdata/test_wccl_1.xml \
    -a test_ann \
    --batch
echo "==> Generated files:"
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated.txt
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe (none annotated)":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe

echo "==> (6/6) Annotating two files with two wccl dicts:
testdata/test_wccl_2a.xml (test_ann_a), testdata/test_wccl_2b.xml (test_ann_b):"
python mwe_converter.py -c ../cclmwe/tests/custom_annotations/testdata/ccl_list.txt \
    -o /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated2.txt \
    -D ../cclmwe/tests/custom_annotations/testdata/wccl_list.tsv \
    --batch
echo "==> Generated files:"
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl_annotated2.txt
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml.mwe
echo "==> /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe":
cat /home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml.mwe
'
/home/install/corpus2mwe/cclmwe/tests/testdata/ccl.xml
/home/install/corpus2mwe/cclmwe/tests/testdata/ccl2.xml
\ No newline at end of file
<units_description tagset="nkjp">
<mwegroup class="neo" id="SubstSubstAdj" name="SubstSubstAdj" type="fix">
<condition>
and(
equal(base[0],$s:A),
equal(base[1],$s:B),
equal(base[2],$s:C),
inter(base[0],$s:A), inter(class[0],{subst}),
inter(base[1],$s:B), inter(class[1],{subst}),
inter(base[2],$s:C), inter(class[2],{adj}),
setvar($Pos0, 0),
setvar($Pos1, 1),
setvar($Pos2, 2),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="zakład ubezpieczeń społecznych">
<var name="A">zakład</var>
<var name="B">ubezpieczenie</var>
<var name="C">społeczny</var>
</MWE>
</instances>
</mwegroup>
<mwegroup class="neo" id="Subst" name="Subst" type="fix">
<condition>
and(
equal(base[0],$s:A),
inter(base[0],$s:A), inter(class[0],{subst}),
setvar($Pos0, 0),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="kartka">
<var name="A">kartka</var>
</MWE>
</instances>
</mwegroup>
</units_description>
<units_description tagset="nkjp">
<mwegroup class="neo" id="SubstSubstAdj" name="SubstSubstAdj" type="fix">
<condition>
and(
equal(base[0],$s:A),
equal(base[1],$s:B),
equal(base[2],$s:C),
inter(base[0],$s:A), inter(class[0],{subst}),
inter(base[1],$s:B), inter(class[1],{subst}),
inter(base[2],$s:C), inter(class[2],{adj}),
setvar($Pos0, 0),
setvar($Pos1, 1),
setvar($Pos2, 2),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="zakład ubezpieczeń społecznych">
<var name="A">zakład</var>
<var name="B">ubezpieczenie</var>
<var name="C">społeczny</var>
</MWE>
</instances>
</mwegroup>
</units_description>
<units_description tagset="nkjp">
<mwegroup class="neo" id="AdjSubst" name="AdjSubst" type="fix">
<condition>
and(
equal(base[0],$s:A),
equal(base[1],$s:B),
inter(base[0],$s:A), inter(class[0],{adj}),
inter(base[1],$s:B), inter(class[1],{subst}),
setvar($Pos0, 0),
setvar($Pos1, 1),
setvar($Head, 0)
)
</condition>
<instances>
<MWE base="blady ranek">
<var name="A">blady</var>
<var name="B">ranek</var>
</MWE>
<MWE base="biała księga">
<var name="A">biały</var>
<var name="B">księga</var>
</MWE>
<MWE base="czerwona kartka">
<var name="A">czerwony</var>
<var name="B">kartka</var>
</MWE>
</instances>
</mwegroup>
</units_description>
test_ann_a /home/install/corpus2mwe/cclmwe/tests/custom_annotations/testdata/test_wccl_2a.xml
test_ann_b /home/install/corpus2mwe/cclmwe/tests/custom_annotations/testdata/test_wccl_2b.xml
\ No newline at end of file
......@@ -24,10 +24,11 @@ namespace Corpus2MWE {
typedef boost::shared_ptr<Corpus2::TokenReader> TokenReaderPtr;
// Reader over separate document and relations paths. mwe_file and
// ann_chan_name are optional and default to empty strings.
CclMWEReader(const std::string &doc_path, const std::string &rel_path,
		const Corpus2::Tagset & tagset, std::string mwe_file="",
		std::string ann_chan_name="");

// Reader over a single document path; same optional arguments.
CclMWEReader(const std::string &doc_path, const Corpus2::Tagset & tagset,
		std::string mwe_file="", std::string ann_chan_name="");
boost::shared_ptr<Corpus2::whole::Document> read();
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import argparse
import tempfile
from corpus_ccl import cclutils as ccl
import corpus2mwe as mwe
def parser():
    """Build the command-line parser of the MWE converter.

    Every option is registered exactly once (argparse rejects a second
    registration of the same option string).

    Notable defaults: ``--annotation`` is ``'mwe'``, ``--dict`` is ``''``,
    ``--tagset`` is ``'nkjp'``, ``--separator`` is a tab. ``-m/--mwe_merged``
    is a ``store_false`` flag, i.e. it is ``True`` unless the flag is given.

    :return: the configured ``argparse.ArgumentParser``
    """
    aparser = argparse.ArgumentParser()
    aparser.add_argument('-c', '--ccl', dest='ccl_file',
                         help='CCL file or text file with list of paths to CCL files (for batch mode)',
                         required=True)
    aparser.add_argument('-o', '--output', dest='out_file',
                         required=False,
                         help='''Required when processing single document. If used with
                         '--batch-mode', then list of output files will be stored under given path.''')
    aparser.add_argument('-m', '--mwe_merged', action='store_false')
    aparser.add_argument('-t', '--tagset', default='nkjp')
    aparser.add_argument('-a', '--annotation', default='mwe',
                         help='Name of annotation to set')
    aparser.add_argument(
        '-d',
        '--dict',
        dest='wccl_dict',
        default='',
        help='WCCL dict with terms to annotate')
    aparser.add_argument(
        '-D',
        '--dicts-list',
        dest='wccl_dicts_list',
        help='''Tabular file with annotations and paths to related WCCL dicts
        to use. Use "--separator" to specify separator in this file. ''')
    aparser.add_argument(
        '-b',
        '--batch-mode',
        action='store_true',
        dest='batch_mode',
        help='''If enabled, then input file is treated as list of ccl
        files. If output path is present, then list of created
        files will be stored there. Processed files will have
        '.mwe' suffix added.''')
    aparser.add_argument('-v', '--verbose', action='store_true')
    aparser.add_argument('-s', '--separator',
                         default='\t',
                         help='Only applicable, when using "--dicts-list". Specifies separator.')
    return aparser
def _get_new_tmpfile_path():
"""
Generates path for new temporary file without creating it.
"""
defult_tmp_dir = tempfile._get_default_tempdir()
temp_name = next(tempfile._get_candidate_names())
return f"{defult_tmp_dir}/{temp_name}"
def _read_dict(wccl_dicts_list, separator):
ann_2_wccl_dict = {}
with open(wccl_dicts_list, 'r') as ifile:
for l in ifile:
try:
ann, wccl_dict = l.rstrip().split(separator)
ann_2_wccl_dict[ann] = wccl_dict
except:
raise ValueError(f"Cannot parse file: {args.wccl_dicts_list}")
return ann_2_wccl_dict
class MWEConverter(object):
    """Annotates CCL files using one reader per configured WCCL dictionary.

    :param annotation_2_wccl_dict: mapping of annotation name to WCCL
        dictionary path; one ``mwe.CclMWEReader`` is created per entry
    :param tagset: tagset name resolved with ``ccl.get_tagset``; a falsy
        value selects ``'nkjp'``
    """

    def __init__(self, annotation_2_wccl_dict, tagset='nkjp'):
        # Readers are created lazily, on the first convert() call.
        self.readers = []
        self.annotation_2_wccl_dict = annotation_2_wccl_dict
        self.tagset = ccl.get_tagset(tagset if tagset else 'nkjp')

    def convert(self, ccl_file, out_mwe_file, annots_used=False):
        """
        Creates readers for every dict passed in constructor (once, before
        processing first file) and annotates given ccl file using them: passes
        result (temporary file) of previous reader into the next reader. The
        ultimate resulted CCL file is stored under given path.

        ``annots_used`` is only applied when the readers are created, i.e.
        the value given on the first call stays in effect for later calls.

        :raises ValueError: when no dictionary was configured, so there is
            no reader to produce a document
        """
        if not self.readers:
            self._load_readers(ccl_file, annots_used)
        if not self.readers:
            raise ValueError('No WCCL dictionaries configured, nothing to annotate with')

        curr_ccl_file = ccl_file
        tmp_file = _get_new_tmpfile_path()
        try:
            for r in self.readers:
                r.set_files(curr_ccl_file)
                out_doc = r.read()
                # Intermediate result becomes the input of the next reader.
                ccl.write_ccl(out_doc, tmp_file)
                curr_ccl_file = tmp_file
            ccl.write_ccl(out_doc, out_mwe_file)
        finally:
            # Never leave the intermediate file behind, even on failure.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    def _load_readers(self, ccl_file, annots_used):
        """Create one reader per (annotation, dictionary) pair."""
        for ann, d in self.annotation_2_wccl_dict.items():
            r = mwe.CclMWEReader(ccl_file, self.tagset, d, ann)
            r.use_annotations(annots_used)
            self.readers.append(r)
def run_batch(ccls_list_path, converter, mwe_merged, out_list=None, verbose=False):
    """Convert every CCL file named in a list file.

    Each non-blank line of ``ccls_list_path`` is an input path; its result is
    written next to it with a ``.mwe`` suffix. A file that fails to convert is
    reported on stdout and skipped, and processing continues with the next one.

    :param ccls_list_path: text file with one CCL path per line
    :param converter: object with ``convert(ccl_file, out_file, flag)``
    :param mwe_merged: passed through as the third argument of ``convert``
    :param out_list: optional path; when given, the paths of the successfully
        produced files are written there, one per line
    :param verbose: print a progress line for every file
    """
    out_files = []
    with open(ccls_list_path, 'r') as ifile:
        for ccl_file in ifile:
            ccl_file = ccl_file.rstrip()
            if not ccl_file:
                # Blank lines (e.g. a trailing newline) are not inputs.
                continue
            out_file = f"{ccl_file}.mwe"
            if verbose:
                print(f"Processing file '{ccl_file}' into '{out_file}' ...")
            try:
                converter.convert(ccl_file, out_file, mwe_merged)
            except Exception as e:
                print(f"Cannot process file '{ccl_file}':")
                print(e)
                print(f"Skipping file '{ccl_file}'")
            else:
                # Only list outputs that were actually produced.
                out_files.append(out_file)
    if out_list:
        with open(out_list, 'w') as ofile:
            for f in out_files:
                ofile.write('{}\n'.format(f))
def main(argv=None):
    """Command-line entry point.

    Builds the annotation -> WCCL dictionary mapping either from the
    ``--dicts-list`` file or from the single ``--annotation``/``--dict`` pair,
    then converts one document or, with ``--batch-mode``, a list of documents.

    Invalid option combinations are reported through ``aparser.error`` (usage
    message and exit status 2) rather than ``assert``, so the checks also hold
    when Python runs with assertions disabled.

    :param argv: argument list; ``None`` lets argparse read ``sys.argv``
    """
    aparser = parser()
    args = aparser.parse_args(argv)

    if args.wccl_dicts_list:
        if args.wccl_dict:
            aparser.error('"--dict" cannot be used together with "--dicts-list"')
        ann_2_wccl_dict = _read_dict(args.wccl_dicts_list, args.separator)
    else:
        ann_2_wccl_dict = {args.annotation: args.wccl_dict}

    if not args.batch_mode and not args.out_file:
        aparser.error('"--output" is required when processing a single document')

    converter = MWEConverter(ann_2_wccl_dict, tagset=args.tagset)
    if args.batch_mode:
        run_batch(args.ccl_file, converter, args.mwe_merged, out_list=args.out_file,
                  verbose=args.verbose)
    else:
        converter.convert(args.ccl_file, args.out_file, args.mwe_merged)
if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment