Commit 7d793d41 authored by Łukasz Pszenny

Version update

parent 03bc35b2
1 merge request: !39 Develop
@@ -13,7 +13,7 @@
Clone this repository and install COMBO (we suggest creating a virtualenv/conda environment with Python 3.6+, as a number of required packages will be installed):
```bash
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
Run the following commands in your Python console to make predictions with a pre-trained model:
```python
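# NOTE: the original example is collapsed in this diff view; the lines below
# are a minimal sketch based on the COMBO predictor API, and the model name
# "polish-herbert-base" is illustrative.
from combo.predict import COMBO

nlp = COMBO.from_pretrained("polish-herbert-base")
sentence = nlp("Moje zdanie.")
print(sentence.tokens)
```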
@@ -42,21 +42,24 @@ We encourage you to use the [beginner's tutorial](https://colab.research.google.
## Citing
### Accepted at EMNLP'21 demo session :tada: :fire:
If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://arxiv.org/abs/2109.05361)
If you use COMBO in your research, please cite [COMBO: State-of-the-Art Morphosyntactic Analysis](https://aclanthology.org/2021.emnlp-demo.7)
```bibtex
@misc{klimaszewski2021combo,
title={COMBO: State-of-the-Art Morphosyntactic Analysis},
author={Mateusz Klimaszewski and Alina Wróblewska},
year={2021},
eprint={2109.05361},
archivePrefix={arXiv},
primaryClass={cs.CL}
@inproceedings{klimaszewski-wroblewska-2021-combo-state,
title = "{COMBO}: State-of-the-Art Morphosyntactic Analysis",
author = "Klimaszewski, Mateusz and
Wr{\'o}blewska, Alina",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.emnlp-demo.7",
pages = "50--62",
abstract = "We introduce COMBO {--} a fully neural NLP system for accurate part-of-speech tagging, morphological analysis, lemmatisation, and (enhanced) dependency parsing. It predicts categorical morphosyntactic features whilst also exposes their vector representations, extracted from hidden layers. COMBO is an easy to install Python package with automatically downloadable pre-trained models for over 40 languages. It maintains a balance between efficiency and quality. As it is an end-to-end system and its modules are jointly trained, its training is competitively fast. As its models are optimised for accuracy, they achieve often better prediction quality than SOTA. The COMBO library is available at: https://gitlab.clarin-pl.eu/syntactic-tools/combo.",
}
```
If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16/)
If you use an EUD module in your research, please cite [COMBO: A New Module for EUD Parsing](https://aclanthology.org/2021.iwpt-1.16)
```bibtex
@inproceedings{klimaszewski-wroblewska-2021-combo,
title = "{COMBO}: A New Module for {EUD} Parsing",
...
@@ -77,11 +77,6 @@ local in_features(name) = !(std.length(std.find(name, features)) == 0);
local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
local use_transformer = pretrained_transformer_name != null;
# Transformer encoder options
local use_transformer_encoder = if std.length(std.extVar("use_transformer_encoder")) == "True" then true else false;
local num_layers_transformer_encoder = 6;
local num_attention_heads = 8;
# Verify some configuration requirements
assert in_features("token"): "Key 'token' must be in features!";
assert in_features("char"): "Key 'char' must be in features!";
@@ -257,17 +252,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
},
},
loss_weights: loss_weights,
seq_encoder: if use_transformer_encoder then {
type: "pytorch_transformer",
input_dim: (char_dim + projected_embedding_dim +
(if in_features('xpostag') then xpostag_dim else 0) +
(if in_features('lemma') then lemma_char_dim else 0) +
(if in_features('upostag') then upostag_dim else 0) +
(if in_features('feats') then feats_dim else 0)),
num_layers: num_layers_transformer_encoder,
feedforward_hidden_dim: hidden_size,
num_attention_heads: num_attention_heads,
positional_encoding: "sinusoidal"} else {
seq_encoder: {
type: "combo_encoder",
layer_dropout_probability: 0.33,
stacked_bilstm: {
@@ -281,7 +266,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
num_layers: num_layers,
recurrent_dropout_probability: 0.33,
layer_dropout_probability: 0.33
}
},
},
dependency_relation: {
type: "combo_dependency_parsing_from_vocab",
...
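The hunk above removes the optional transformer sequence encoder, so `seq_encoder` is always the BiLSTM-based `combo_encoder`. Roughly, the two alternatives correspond to the following PyTorch modules (a hedged sketch; the dimensions are illustrative, not the config's computed values):

```python
import torch.nn as nn

# Sketch of what the removed "pytorch_transformer" branch configured
# (sinusoidal positional encodings were added to the inputs separately)...
input_dim, hidden_size = 160, 512  # illustrative; the config summed the feature dims

transformer_encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=input_dim, nhead=8,
                               dim_feedforward=hidden_size),
    num_layers=6,
)

# ...versus the stacked bidirectional LSTM that combo_encoder wraps.
bilstm_encoder = nn.LSTM(input_size=input_dim, hidden_size=hidden_size,
                         num_layers=2, bidirectional=True, dropout=0.33)
```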
@@ -8,7 +8,7 @@ import torch
from allennlp import data as allen_data
from allennlp.common import checks, util
from allennlp.data import fields as allen_fields, vocabulary
from conllu import parser, TokenList
from conllu import parser
from dataclasses import dataclass
from overrides import overrides
@@ -27,7 +27,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
features: List[str] = None,
targets: List[str] = None,
use_sem: bool = False,
max_input_embedder: int = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
@@ -49,7 +48,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
"Remove {} from either features or targets.".format(intersection)
)
self.use_sem = use_sem
self.max_input_embedder = max_input_embedder
# *.conllu readers configuration
fields = list(parser.DEFAULT_FIELDS)
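For context, the reader builds on the `conllu` package's parser; a minimal sketch of how a CoNLL-U file is turned into `TokenList`s (the file name is illustrative):

```python
import conllu

# Parse a CoNLL-U file into TokenLists, roughly as the dataset reader does.
with open("sample.conllu", encoding="utf-8") as f:
    for tree in conllu.parse(f.read()):
        # Multiword tokens carry tuple ids such as (1, "-", 2); keep plain tokens only.
        tokens = [t for t in tree if isinstance(t["id"], int)]
        print(tree.metadata.get("sent_id"), len(tokens))
```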
@@ -90,9 +88,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
@overrides
def text_to_instance(self, tree: conllu.TokenList) -> allen_data.Instance:
if self.max_input_embedder:
tree = TokenList(tokens = tree.tokens[: self.max_input_embedder],
metadata = tree.metadata)
fields_: Dict[str, allen_data.Field] = {}
tree_tokens = [t for t in tree if isinstance(t["id"], int)]
tokens = [_Token(t["token"],
@@ -122,10 +117,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
text_field,
label_namespace="feats_labels")
elif target_name == "head":
if self.max_input_embedder:
target_values = [0 if v == "_" else int(v) for v in target_values]
target_values = [v for v in target_values if v < self.max_input_embedder]
else:
target_values = [0 if v == "_" else int(v) for v in target_values]
fields_[target_name] = allen_fields.SequenceLabelField(target_values, text_field,
label_namespace=target_name + "_labels")
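With the truncation branch gone, head values are converted uniformly: the CoNLL-U placeholder `_` maps to 0 and everything else is parsed as an integer. For example:

```python
# Illustrative input/output of the surviving head conversion.
target_values = ["2", "_", "0", "5"]
assert [0 if v == "_" else int(v) for v in target_values] == [2, 0, 0, 5]
```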
@@ -139,8 +130,6 @@ class UniversalDependenciesDatasetReader(allen_data.DatasetReader):
t_deps = t["deps"]
if t_deps and t_deps != "_":
for rel, head in t_deps:
if int(head) >= self.max_input_embedder:
continue
# EmoryNLP skips the first edge if there are two edges between the same
# nodes. Thanks to that, one edge is in the tree and the other in the graph.
# This snippet follows that approach.
...
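The EmoryNLP convention mentioned in the comment above can be illustrated with a small self-contained sketch (a hypothetical helper, not the repository's actual code):

```python
def skip_duplicate_edges(deps):
    """Keep only the later of two consecutive edges to the same head, so one
    relation can live in the basic tree and the other in the enhanced graph.
    `deps` is a list of (relation, head) pairs; purely illustrative."""
    kept = []
    for rel, head in deps:
        if kept and kept[-1][1] == head:
            kept.pop()  # skip the first of two edges between the same nodes
        kept.append((rel, head))
    return kept

# Two edges to head 3: only the second ("conj") survives.
assert skip_duplicate_edges([("obj", 3), ("conj", 3)]) == [("conj", 3)]
```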
@@ -57,8 +57,6 @@ flags.DEFINE_string(name="serialization_dir", default=None,
help="Model serialization directory (default - system temp dir).")
flags.DEFINE_boolean(name="tensorboard", default=False,
help="When provided model will log tensorboard metrics.")
flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
help="Indicator whether to use transformer encoder or BiLSTM (default)")
# Finetune after training flags
flags.DEFINE_list(name="finetuning_training_data_path", default="",
@@ -199,7 +197,6 @@ def _get_ext_vars(finetuning: bool = False) -> Dict:
"num_epochs": str(FLAGS.num_epochs),
"word_batch_size": str(FLAGS.word_batch_size),
"use_tensorboard": str(FLAGS.tensorboard),
"use_transformer_encoder": str(FLAGS.use_transformer_encoder)
}
...
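Note that `_get_ext_vars` stringifies every value before handing it to Jsonnet, which is why config templates compare against the literal text "True" rather than a boolean. A sketch with the `_jsonnet` bindings (file name illustrative):

```python
import _jsonnet  # Jsonnet's Python bindings

# Every ext var arrives in Jsonnet as a string, so str(FLAGS.tensorboard)
# becomes the text "True"/"False" that the template must test against.
config_json = _jsonnet.evaluate_file(
    "config.template.jsonnet",
    ext_vars={"use_tensorboard": str(True)},
)
```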
@@ -228,8 +228,7 @@ class COMBO(predictor.Predictor):
@classmethod
def from_pretrained(cls, path: str, tokenizer=tokenizers.SpacyTokenizer(),
batch_size: int = 1024,
cuda_device: int = -1,
max_input_embedder: int = None):
cuda_device: int = -1):
util.import_module_and_submodules("combo.commands")
util.import_module_and_submodules("combo.models")
util.import_module_and_submodules("combo.training")
@@ -246,13 +245,6 @@
archive = models.load_archive(model_path, cuda_device=cuda_device)
model = archive.model
dataset_reader = allen_data.DatasetReader.from_params(archive.config["dataset_reader"],
max_input_embedder = max_input_embedder)
logger.info("Using pretrained transformer embedder may require truncating tokenized sentences.")
if max_input_embedder:
logger.info(f"Currently they are truncated to {max_input_embedder} first tokens")
else:
logger.info("Currently they are not truncated")
dataset_reader = allen_data.DatasetReader.from_params(
archive.config["dataset_reader"])
return cls(model, dataset_reader, tokenizer, batch_size)
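With `max_input_embedder` removed, `from_pretrained` takes only a tokenizer, batch size, and device. A minimal usage sketch (model name and sentence are illustrative):

```python
from combo.predict import COMBO

# Truncation is no longer configured here; inputs are passed through as-is.
nlp = COMBO.from_pretrained("polish-herbert-base", batch_size=1024, cuda_device=-1)
prediction = nlp("Ala ma kota.")
```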
@@ -2,7 +2,7 @@
Clone this repository and install COMBO (we suggest using virtualenv/conda with Python 3.6+):
```bash
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
combo --helpfull
```
@@ -11,7 +11,7 @@ combo --helpfull
python -m venv venv
source venv/bin/activate
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
### Conda example:
@@ -19,7 +19,7 @@ pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.4
conda create -n combo python=3.8
conda activate combo
pip install -U pip setuptools wheel
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.1
pip install --index-url https://pypi.clarin-pl.eu/simple combo==1.0.5
```
## Problems & solutions
...
import pathlib
from absl import app
from absl import flags
from scripts import utils
LANG = ["Polish"]
TREEBANKS = {"Polish" : "UD_Polish-PDB"}
FLAGS = flags.FLAGS
flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo",
help="Path to data directory.")
flags.DEFINE_string(name="models_dir", default="/home/pszenny/combo/tmp/",
help="Model serialization dir.")
flags.DEFINE_integer(name="cuda_device", default=-1,
help="Cuda device id (-1 for cpu).")
flags.DEFINE_boolean(name="expect_prefix", default=True,
help="Whether to expect allennlp prefix.")
flags.DEFINE_integer(name="batch_size", default=32,
help="Batch size.")
def run(_):
for encoder in ["BiLSTM","transformer"]:
models_dir = pathlib.Path(FLAGS.models_dir)
for model_dir in models_dir.iterdir():
lang = model_dir.name
if lang not in LANG:
print("Skipping unknown directory: ", lang)
continue
if FLAGS.expect_prefix:
model_dir = pathlib.Path(models_dir) / (lang + "/" + encoder + "/")
model_dir = list(model_dir.iterdir())
                assert len(model_dir) == 1, f"Incorrect number of model directories: {model_dir}"
model_dir = model_dir[0]
data_dir = pathlib.Path(FLAGS.data_dir)
data_dir = data_dir / TREEBANKS[lang]
files = list(data_dir.iterdir())
test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
            assert len(test_file) == 1, "Couldn't find test file."
test_file = test_file[0]
output_pred = data_dir / f'{lang}_pred.conllu'
command = f"""combo --mode predict --model_path {model_dir}
--input_file {test_file}
--output_file {output_pred}
--cuda_device {FLAGS.cuda_device}
--batch_size {FLAGS.batch_size}
--silent
"""
utils.execute_command(command)
def main():
app.run(run)
if __name__ == "__main__":
main()
\ No newline at end of file
"""Script to train Dependency Parsing models based on UD 2.x data."""
import pathlib
from absl import app
from absl import flags
from scripts import utils
# ls -1 | xargs -i echo "\"{}\","
# UD 2.7
TREEBANKS = ["UD_Polish-PDB"]
embedding_model = "allegro/herbert-base-cased"
FLAGS = flags.FLAGS
flags.DEFINE_list(name="treebanks", default=TREEBANKS,
help=f"Treebanks to train. Possible values: {TREEBANKS}.")
flags.DEFINE_string(name="data_dir", default="/home/pszenny/combo/",
help="Path to UD data directory.")
flags.DEFINE_string(name="serialization_dir", default="/home/pszenny/combo/tmp/",
help="Model serialization directory.")
flags.DEFINE_integer(name="cuda_device", default=-1,
help="Cuda device id (-1 for cpu).")
flags.DEFINE_string(name="train_config_path", default="/home/pszenny/combo/combo/config.template.jsonnet",
help="Directory of jsonnet config file")
flags.DEFINE_boolean(name="use_transformer_encoder", default=False,
help="Indicator whether to use transformer encoder or BiLSTM (default)")
def run(_):
treebanks_dir = pathlib.Path(FLAGS.data_dir)
for treebank in FLAGS.treebanks:
assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
treebank_dir = treebanks_dir / treebank
treebank_parts = treebank[3:].split("-")
language = treebank_parts[0]
files = list(treebank_dir.iterdir())
training_file = [f for f in files if "train" in f.name and ".conllu" in f.name]
assert len(training_file) == 1, f"Couldn't find training file."
training_file_path = training_file[0]
valid_file = [f for f in files if "dev" in f.name and ".conllu" in f.name]
assert len(valid_file) == 1, f"Couldn't find validation file."
valid_file_path = valid_file[0]
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/BiLSTM/")
serialization_dir.mkdir(exist_ok=True, parents=True)
word_batch_size = 2500
command = f"""time combo --mode train
--cuda_device {FLAGS.cuda_device}
--training_data_path {training_file_path}
--validation_data_path {valid_file_path}
--pretrained_transformer_name {embedding_model}
--serialization_dir {serialization_dir}
--use_transformer_encoder {FLAGS.use_transformer_encoder}
--config_path {FLAGS.train_config_path}
--notensorboard
--word_batch_size {word_batch_size}
--targets deprel,head,upostag,lemma,feats,xpostag
"""
utils.execute_command(command)
FLAGS.use_transformer_encoder = True
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / (language+"/transformer/")
serialization_dir.mkdir(exist_ok=True, parents=True)
command = f"""time combo --mode train
--cuda_device {FLAGS.cuda_device}
--training_data_path {training_file_path}
--validation_data_path {valid_file_path}
--pretrained_transformer_name {embedding_model}
--serialization_dir {serialization_dir}
--use_transformer_encoder {FLAGS.use_transformer_encoder}
--config_path {FLAGS.train_config_path}
--notensorboard
--word_batch_size {word_batch_size}
--targets deprel,head,upostag,lemma,feats,xpostag
"""
utils.execute_command(command)
def main():
app.run(run)
if __name__ == "__main__":
main()
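Both scripts hand their multi-line command strings to `scripts.utils.execute_command`; a minimal sketch of what such a helper presumably does (assumed behavior, not the repository's exact implementation):

```python
import subprocess

def execute_command(command: str) -> None:
    # Collapse the multi-line f-string into argv tokens and run the process,
    # raising if the command exits with a non-zero status.
    subprocess.run(command.split(), check=True)
```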
@@ -23,7 +23,7 @@ REQUIREMENTS = [
setup(
name='combo',
version='1.0.4',
version='1.0.5',
author='Mateusz Klimaszewski',
author_email='M.Klimaszewski@ii.pw.edu.pl',
install_requires=REQUIREMENTS,
...