Commit 46c68f26 authored and committed by Łukasz Pszenny
adding scripts to train new UD models with instructions, fixing the numpy problem on Mac (M1), fixing the link to download models
parent 41374773
Merge request !44: Switching to UD 2.9
@@ -15,7 +15,7 @@ DATA_TO_PATH = {
"ud25" : "ud_25",
"ud27" : "ud_27",
"ud29" : "ud_29"}
-_URL = "http://s3.clarin-pl.eu/models/combo/{data}/{model}.tar.gz"
+_URL = "http://s3.clarin-pl.eu/dspace/combo/{data}/{model}.tar.gz"
_HOME_DIR = os.getenv("HOME", os.curdir)
_CACHE_DIR = os.getenv("COMBO_DIR", os.path.join(_HOME_DIR, ".combo"))
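For reference, a short sketch of how the download template is filled in; the example model name follows the `<language>-ud29` renaming scheme used by the evaluation script added in this commit, and is illustrative only:

``` python
# Illustrative only: resolving the download URL template defined above.
data = DATA_TO_PATH["ud29"]                        # -> "ud_29"
url = _URL.format(data=data, model="polish-ud29")
# -> "http://s3.clarin-pl.eu/dspace/combo/ud_29/polish-ud29.tar.gz"
```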
%% Cell type:markdown id:8a556c39 tags:
# Notebook used to divide the set of UD treebanks into 3 training batches of approximately equal size
%% Cell type:code id:e2a3aa1d tags:
``` python
import os
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import pandas as pd
def get_dir_size(path='.'):
    # Recursively compute the total size (in bytes) of all files under `path`.
    total = 0
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_file():
                total += entry.stat().st_size
            elif entry.is_dir():
                total += get_dir_size(entry.path)
    return total
```
%% Cell type:code id:c0e73444 tags:
``` python
UD_dir = "/home/pszenny/Downloads/ud2.9"
```
%% Cell type:code id:28bab566 tags:
``` python
list1 = []  # names of UD treebanks to include in the 1st training batch
list2 = []  # names of UD treebanks to include in the 2nd training batch
list3 = []  # names of UD treebanks to include in the 3rd training batch
list1s = 0  # total size (in bytes) of the treebanks in each batch
list2s = 0
list3s = 0
has_no_lemma = []  # treebanks whose training data lacks the given column
has_no_upos = []
has_no_xpos = []
has_no_feats = []
has_no_head = []
has_no_deprel = []
```
%% Cell type:code id:41a56660 tags:
``` python
for file in os.listdir(UD_dir):
    d = os.path.join(UD_dir, file)
    if os.path.isdir(d):
        onlyfiles = [f for f in listdir(d) if isfile(join(d, f))]
        has_dev = any("dev" in f and ".conllu" in f for f in onlyfiles)
        has_test = any("test" in f and ".conllu" in f for f in onlyfiles)
        has_train = any("train" in f and ".conllu" in f for f in onlyfiles)
        if not (has_train and has_dev and has_test):
            continue
        # skip treebanks whose CoNLL-U splits are essentially empty (< 1000 bytes)
        too_small = False
        for f in listdir(d):
            if ".conllu" in f and ("dev" in f or "test" in f or "train" in f):
                tmp_path = os.path.join(d, f)
                if os.path.getsize(tmp_path) < 1000:
                    too_small = True
            if ".conllu" in f and "train" in f:
                train_file = os.path.join(d, f)
        if too_small:
            continue
        size = get_dir_size(d)
        lemmas = []
        upos = []
        xpos = []
        feats = []
        head = []
        deprel = []
        # inspect the first sentence of the training file to see which columns are annotated
        with open(train_file, 'r') as rf:
            for line in rf:
                words = line.split("\t")
                if len(words) == 10:
                    lemmas.append(words[2])
                    upos.append(words[3])
                    xpos.append(words[4])
                    feats.append(words[5])
                    head.append(words[6])
                    deprel.append(words[7])
                if line == "\n":
                    break
        if set(lemmas) == {'_'}:
            has_no_lemma.append(file)
        if set(upos) == {'_'}:
            has_no_upos.append(file)
        if set(xpos) == {'_'}:
            has_no_xpos.append(file)
        if set(feats) == {'_'}:
            has_no_feats.append(file)
        if set(head) == {'_'}:
            has_no_head.append(file)
        if set(deprel) == {'_'}:
            has_no_deprel.append(file)
        # assign the treebank to whichever batch is currently smallest
        if list1s == min(list1s, list2s, list3s):
            list1.append(file)
            list1s += size
            continue
        if list2s == min(list1s, list2s, list3s):
            list2.append(file)
            list2s += size
            continue
        if list3s == min(list1s, list2s, list3s):
            list3.append(file)
            list3s += size
            continue
```
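A quick sanity check that the three batches came out roughly equal in size (uses only the lists and byte counters built above):

``` python
# Summarize the resulting split; sizes are the byte totals accumulated above.
for name, lst, total in [("batch 1", list1, list1s),
                         ("batch 2", list2, list2s),
                         ("batch 3", list3, list3s)]:
    print(f"{name}: {len(lst)} treebanks, {total / 1e6:.1f} MB")
```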
%% Cell type:markdown id:73b7aed2 tags:
Adjusting the train scripts to the annotation columns missing from each training file
%% Cell type:code id:32fa08d0 tags:
``` python
# ADJUST TRAIN FILES
train_file = set(list1)  # change list1 to list2 or list3 to adjust each of the three train scripts
no_lemma_feats_xpos = set(has_no_lemma).intersection(set(has_no_feats), set(has_no_xpos))
no_xpos_lemma = set(has_no_xpos).intersection(set(has_no_lemma)) - no_lemma_feats_xpos
no_xpos_feats = set(has_no_xpos).intersection(set(has_no_feats)) - no_lemma_feats_xpos
no_lemma_feats = set(has_no_lemma).intersection(set(has_no_feats)) - no_lemma_feats_xpos
no_xpos = set(has_no_xpos) - no_xpos_lemma - no_xpos_feats - no_lemma_feats_xpos
no_lemma = set(has_no_lemma) - no_xpos_lemma - no_lemma_feats - no_lemma_feats_xpos
no_feats = set(has_no_feats) - no_xpos_feats - no_lemma_feats - no_lemma_feats_xpos
print("\nTreebanks lacking only XPOS:\n")
print(train_file.intersection(no_xpos))
print("\nTreebanks lacking only FEATS:\n")
print(train_file.intersection(no_feats))
print("\nTreebanks lacking only LEMMA:\n")
print(train_file.intersection(no_lemma))
print("\nTreebanks lacking XPOS and LEMMA:\n")
print(train_file.intersection(no_xpos_lemma))
print("\nTreebanks lacking FEATS and LEMMA:\n")
print(train_file.intersection(no_lemma_feats))
print("\nTreebanks lacking XPOS and FEATS:\n")
print(train_file.intersection(no_xpos_feats))
print("\nTreebanks lacking XPOS, FEATS and LEMMA:\n")
print(train_file.intersection(no_lemma_feats_xpos))
# the output of this cell is used to adjust the --targets overrides in train.py
```
%% Output
Treebanks lacking only XPOS:
{'UD_Turkish-Penn', 'UD_Armenian-ArmTDP', 'UD_Russian-Taiga', 'UD_Portuguese-Bosque', 'UD_Danish-DDT', 'UD_Norwegian-NynorskLIA', 'UD_Hungarian-Szeged', 'UD_French-GSD', 'UD_Basque-BDT', 'UD_Western_Armenian-ArmTDP'}
Treebanks lacking only FEATS:
{'UD_Galician-CTG', 'UD_Korean-GSD', 'UD_Korean-Kaist', 'UD_Italian-ISDT'}
Treebanks lacking only LEMMA:
{'UD_Old_French-SRCMF'}
Treebanks lacking XPOS and LEMMA:
set()
Treebanks lacking FEATS and LEMMA:
{'UD_Swedish_Sign_Language-SSLC', 'UD_Maltese-MUDT', 'UD_English-ESL'}
Treebanks lacking XPOS and FEATS:
set()
Treebanks lacking XPOS, FEATS and LEMMA:
set()
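These sets map directly onto the `--targets` overrides in the train.py diff further down: for each group, the missing columns are dropped from the default target list. A minimal sketch of that mapping (the `ALL_TARGETS` name and the helper are illustrative, not part of the repository; it only uses the sets computed above):

``` python
# Illustrative sketch (not part of the repository): build the --targets value
# for a treebank from the sets computed in the previous cell.
ALL_TARGETS = ["deprel", "head", "upostag", "xpostag", "lemma", "feats"]

def targets_for(treebank):
    targets = list(ALL_TARGETS)
    if treebank in no_xpos | no_xpos_lemma | no_xpos_feats | no_lemma_feats_xpos:
        targets.remove("xpostag")
    if treebank in no_lemma | no_xpos_lemma | no_lemma_feats | no_lemma_feats_xpos:
        targets.remove("lemma")
    if treebank in no_feats | no_xpos_feats | no_lemma_feats | no_lemma_feats_xpos:
        targets.remove("feats")
    return "--targets " + ",".join(targets)

print(targets_for("UD_Maltese-MUDT"))  # -> --targets deprel,head,upostag,xpostag
```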
%% Cell type:markdown id:2d11c2b2 tags:
Additional information about the training files: the cells below build a DataFrame with the number of training sentences per UD treebank.
%% Cell type:code id:aacbff01 tags:
``` python
datasets = []
sent_count = []

def count_sentences(path):
    # Assumed helper (not shown elsewhere in this diff): counts sentences in a
    # CoNLL-U file, where sentences are separated by blank lines.
    with open(path, "r") as rf:
        return sum(1 for line in rf if line == "\n")

for file in tqdm(os.listdir(UD_dir)):
    d = os.path.join(UD_dir, file)
    if os.path.isdir(d):
        onlyfiles = [f for f in listdir(d) if isfile(join(d, f))]
        has_dev = any("dev" in f and ".conllu" in f for f in onlyfiles)
        has_test = any("test" in f and ".conllu" in f for f in onlyfiles)
        has_train = any("train" in f and ".conllu" in f for f in onlyfiles)
        if not (has_train and has_dev and has_test):
            continue
        # skip treebanks whose CoNLL-U splits are essentially empty (< 1000 bytes)
        too_small = False
        for f in listdir(d):
            if ".conllu" in f and ("dev" in f or "test" in f or "train" in f):
                tmp_path = os.path.join(d, f)
                if os.path.getsize(tmp_path) < 1000:
                    too_small = True
            if ".conllu" in f and "train" in f:
                train_file = os.path.join(d, f)
        if too_small:
            continue
        datasets.append(file)
        sent_count.append(count_sentences(train_file))

data_sentences = pd.DataFrame(
    {'set': datasets,
     'sent_count': sent_count})
data_sentences.sort_values(by=['sent_count'], inplace=True)
```
%% Cell type:code id:724e8e37 tags:
``` python
data_sentences.head(10)
```
"""Script to evaluate the trained models on the UD test sets and build the performance tables
(referenced in the README below as create_performance_table.py)."""
from conll18_ud_eval import *
from absl import app
from absl import flags
import pathlib
import csv

# This script requires conll18_ud_eval.py in the same directory; it is available at
# https://universaldependencies.org/conll18/
FLAGS = flags.FLAGS
flags.DEFINE_string(name="pred_dir", default=r"/home/pszenny/Desktop/IPI_PAN/evaluate_UD/predictions_UD_29/pred",
                    help="Path to directory with predictions on the test sets.")
flags.DEFINE_string(name="ud_dir", default=r"/home/pszenny/Desktop/IPI_PAN/evaluate_UD/predictions_UD_29/ud_files",
                    help="Path to directory with the UD datasets (UD_treebank/files).")
flags.DEFINE_string(name="models_dir", default=r"/tmp/lustre_shared/lukasz/models_UD_2.9",
                    help="Path to directory with the trained models (treebank/allennlp_folder/files).")
flags.DEFINE_string(name="UD_version", default="29",
                    help="UD version number.")
flags.DEFINE_string(name="URL_download", default="http://s3.clarin-pl.eu/dspace/combo/ud_29/{model}.tar.gz",
                    help="Template URL for model downloads; {model} is replaced with the model name.")
flags.DEFINE_string(name="URL_licence",
                    default="https://github.com/UniversalDependencies/{treebank}/blob/r2.9/LICENSE.txt",
                    help="Template URL to LICENSE.txt; {treebank} is replaced with the treebank name.")
def evaluate_wrapper(gold_file, system_file):
    # Thin wrapper around conll18_ud_eval: load both CoNLL-U files and evaluate.
    gold_ud = load_conllu_file(gold_file)
    system_ud = load_conllu_file(system_file)
    return evaluate(gold_ud, system_ud)
def run(_):
    path_to_folder_with_predictions = pathlib.Path(FLAGS.pred_dir)
    path_to_folder_with_ud = pathlib.Path(FLAGS.ud_dir)
    path_to_folder_with_models = pathlib.Path(FLAGS.models_dir)
    URL_download = FLAGS.URL_download
    URL_licence = FLAGS.URL_licence

    # Rename each model archive and build a dictionary: treebank name -> model archive name.
    directory = list(path_to_folder_with_models.iterdir())
    treebank_model_name = {}
    for filename in directory:
        allen_folders = list(filename.iterdir())
        assert len(allen_folders) == 1, "Multiple allennlp serialization folders."
        allen_folder = allen_folders[0]
        language = str(filename).split("/")[-1].split("_")[1].split("-")[0]
        if "model.tar.gz" not in [str(files).split("/")[-1] for files in list(allen_folder.iterdir())]:
            continue
        if sum(language in str(s) for s in directory) != 1:
            new_name = str(filename).split("/")[-1].split("_")[1].lower() + f"-ud{FLAGS.UD_version}.tar.gz"
        else:
            new_name = language.lower() + f"-ud{FLAGS.UD_version}.tar.gz"
        model_path = allen_folder / "model.tar.gz"
        model_path.rename(pathlib.Path(allen_folder, new_name))
        treebank_model_name[filename.name] = new_name  # key by treebank directory name

    # Evaluate every prediction file against the corresponding UD test set.
    all_result = [["Treebank", "Model name", "Model link", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS",
                   "CLAS", "MLAS", "BLEX", "LICENSE"]]
    for filename in list(path_to_folder_with_predictions.iterdir()):
        path_to_predictions = path_to_folder_with_predictions / filename
        folder_with_data = str(filename).split("/")[-1].replace("predictions_test.conllu", "")
        ud_folder = path_to_folder_with_ud / folder_with_data
        ud_files = list(ud_folder.iterdir())
        test_file = [f for f in ud_files if "test" in f.name and ".conllu" in f.name]
        assert len(test_file) == 1, "Couldn't find test file."
        test_file_path = test_file[0]
        evaluation = evaluate_wrapper(str(test_file_path), str(path_to_predictions))
        metrics_evaluation = [folder_with_data, treebank_model_name[folder_with_data],
                              URL_download.format(model=treebank_model_name[folder_with_data])]
        for metric in ["UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS",
                       "MLAS", "BLEX"]:
            metrics_evaluation.append(round(100 * evaluation[metric].precision, 2))
        metrics_evaluation.append(URL_licence.format(treebank=folder_with_data))
        all_result.append(metrics_evaluation)

    # Save the Google-sheet performance table.
    with open("google_sheet.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(all_result)

    # Create the GitLab (Markdown) performance table.
    performance_table_gitlab = []
    for row in all_result:
        new_row = "|" + row[0] + "|[" + row[1] + "](" + row[2] + ")|" + "|".join(str(c) for c in row[3:]) + "|"
        performance_table_gitlab.append(new_row)
    with open('performance_git.txt', 'w') as fo:
        fo.write('\n'.join(str(i) for i in performance_table_gitlab))
def main():
    app.run(run)


if __name__ == "__main__":
    main()
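For reference, `evaluate()` from conll18_ud_eval returns a dictionary of score objects with `precision`, `recall` and `f1` attributes; the script above reports the precision values (×100, rounded). A minimal usage sketch with placeholder file names:

``` python
# Minimal usage sketch; the file names are placeholders, not paths from the repository.
scores = evaluate_wrapper("gold-test.conllu", "predictions_test.conllu")
for metric in ["UPOS", "LAS"]:
    print(metric,
          round(100 * scores[metric].precision, 2),
          round(100 * scores[metric].f1, 2))
```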
"""Script to train Dependency Parsing models based on UD 2.x data."""
import pathlib
from absl import app
from absl import flags
from scripts import utils
# Treebank list generated with: ls -1 | xargs -i echo "\"{}\","
# UD 2.9
TREEBANKS = [
"UD_Norwegian-Bokmaal",
"UD_Norwegian-Nynorsk",
"UD_Norwegian-NynorskLIA",
"UD_Persian-PerDT",
"UD_Persian-Seraji",
"UD_Polish-LFG",
"UD_Polish-PDB",
"UD_Portuguese-Bosque",
"UD_Portuguese-GSD",
"UD_Romanian-Nonstandard",
"UD_Romanian-RRT",
"UD_Romanian-SiMoNERo",
"UD_Russian-GSD",
"UD_Russian-SynTagRus",
"UD_Russian-Taiga",
"UD_Serbian-SET",
"UD_Slovak-SNK",
"UD_Slovenian-SSJ",
"UD_Spanish-AnCora",
"UD_Spanish-GSD",
"UD_Swedish-LinES",
"UD_Swedish-Talbanken",
# "UD_Tamil-TTB",
"UD_Telugu-MTG",
"UD_Turkish-Atis",
"UD_Turkish-BOUN",
# "UD_Turkish-FrameNet",
"UD_Turkish-IMST",
"UD_Turkish-Kenet",
"UD_Turkish-Penn",
"UD_Turkish-Tourism",
"UD_Ukrainian-IU",
"UD_Urdu-UDTB",
"UD_Uyghur-UDT",
"UD_Vietnamese-VTB",
"UD_Welsh-CCG",
]
FLAGS = flags.FLAGS
flags.DEFINE_list(name="treebanks", default=TREEBANKS,
                  help="Treebanks to predict.")
flags.DEFINE_string(name="data_dir", default="/tmp/lustre_shared/lukasz/UD_2.9/",
                    help="Path to UD data directory.")
flags.DEFINE_string(name="output_dir", default="/tmp/lustre_shared/lukasz/predictions_UD_29/",
                    help="Output directory for predictions.")
flags.DEFINE_string(name="models_dir", default="/tmp/lustre_shared/lukasz/models_UD_2.9/",
                    help="Directory where models are stored, in subfolders named after the treebank.")
def run(_):
    treebanks_dir = pathlib.Path(FLAGS.data_dir)
    for treebank in FLAGS.treebanks:
        assert treebank in TREEBANKS, f"Unknown treebank {treebank}."
        treebank_dir = treebanks_dir / treebank
        files = list(treebank_dir.iterdir())
        test_file = [f for f in files if "test" in f.name and ".conllu" in f.name]
        assert len(test_file) == 1, "Couldn't find test file."
        test_file_path = test_file[0]
        output_path = pathlib.Path(FLAGS.output_dir) / (treebank + "predictions_test.conllu")
        model_directory = pathlib.Path(FLAGS.models_dir) / treebank
        files = list(model_directory.iterdir())
        assert len(files) == 1, "Couldn't find model directory."
        model_directory = model_directory / files[0]
        command = f"""
        time combo --mode predict --model_path {model_directory}
        --input_file {test_file_path}
        --output_file {output_path}
        --cuda_device 0
        --batch_size 32
        --silent
        """
        utils.execute_command(command)
def main():
    app.run(run)


if __name__ == "__main__":
    main()
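The script can also be driven from Python by overriding the absl flags before calling `main()`; a sketch with placeholder paths (only `--treebanks` and the directory flags defined above are used):

``` python
# Sketch: run predictions for a single treebank by overriding the absl flags.
# The paths are placeholders and must point to the local data, output and model directories.
import sys

sys.argv = ["predict_UD.py",
            "--treebanks=UD_Polish-PDB",
            "--data_dir=/data/UD_2.9/",
            "--output_dir=/data/predictions_UD_29/",
            "--models_dir=/data/models_UD_2.9/"]
main()
```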
# Training models on Universal Dependencies datasets
1. Download the fastText embeddings using:
   - download_fasttext.py
2. Divide the UD treebanks into training batches and adjust the train scripts using:
   - UD division notebook.ipynb
3. Train the models by running the adjusted train.py scripts.
4. Predict on the test sets using:
   - predict_UD.py
5. Compute the metrics for the Google sheet and the GitLab performance table (saved to google_sheet.csv and
   performance_git.txt, respectively). This step requires the official UD evaluation script from
   https://universaldependencies.org/conll18/, placed in the same directory as create_performance_table.py:
   - create_performance_table.py

A minimal scripted sketch of these steps follows below.
\ No newline at end of file
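A rough end-to-end sketch of the five README steps as one Python driver (illustrative only; the script names come from the list above, and each train.py stands for one of the adjusted per-batch train scripts):

``` python
# Illustrative driver for the README steps above; not part of the repository.
import subprocess

steps = [
    "python download_fasttext.py",
    "python train.py",                      # one adjusted train script per batch
    "python predict_UD.py",
    "python create_performance_table.py",
]
for cmd in steps:
    subprocess.run(cmd, shell=True, check=True)
```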
@@ -223,22 +223,33 @@ def run(_):
"""
# Datasets without XPOS
-if treebank in {"UD_Armenian-ArmTDP", "UD_Basque-BDT", "UD_Danish-DDT", "UD_Hungarian-Szeged", "UD_French-GSD",
-"UD_Marathi-UFAL", "UD_Norwegian-Bokmaal"}:
+if treebank in {'UD_Danish-DDT', 'UD_Western_Armenian-ArmTDP', 'UD_Basque-BDT', 'UD_Hungarian-Szeged', 'UD_Russian-Taiga', 'UD_Portuguese-Bosque', 'UD_Norwegian-NynorskLIA', 'UD_Turkish-Penn', 'UD_French-GSD', 'UD_Armenian-ArmTDP'}:
command = command + " --targets deprel,head,upostag,lemma,feats"
# Datasets without FEATS
-if treebank in {"UD_Japanese-GSD", "UD_Korean-Kaist"}:
+if treebank in {'UD_Galician-CTG', 'UD_Italian-ISDT', 'UD_Korean-Kaist', 'UD_Korean-GSD'}:
command = command + " --targets deprel,head,upostag,xpostag,lemma"
# Datasets without LEMMA
if treebank in {'UD_Old_French-SRCMF'}:
command = command + " --targets deprel,head,upostag,xpostag,feats"
# Datasets without XPOS and LEMMA
if treebank in {}:
command = command + " --targets deprel,head,upostag,feats"
# Datasets without LEMMA and FEATS
-if treebank in {"UD_Maltese-MUDT"}:
+if treebank in {'UD_English-ESL', 'UD_Maltese-MUDT', 'UD_Swedish_Sign_Language-SSLC'}:
command = command + " --targets deprel,head,upostag,xpostag"
# Datasets without XPOS and FEATS
-if treebank in {"UD_Telugu-MTG"}:
+if treebank in {}:
command = command + " --targets deprel,head,upostag,lemma"
# Datasets without XPOS, FEATS and LEMMA
if treebank in {}:
command = command + " --targets deprel,head,upostag"
# Reduce word_batch_size
word_batch_size = 2500
if treebank in {"UD_German-HDT", "UD_Marathi-UFAL"}:
@@ -9,7 +9,7 @@ REQUIREMENTS = [
'jsonnet==0.15.0',
'filelock==3.0;python_version>="3.9"',
'numpy==1.19.4;python_version<"3.9"',
'numpy==1.22.0;python_version>="3.9"',
'numpy==1.22.0',
'overrides==3.1.0',
'requests==2.23.0',
'sentencepiece==0.1.83;python_version<"3.8"',
@@ -18,8 +18,7 @@ REQUIREMENTS = [
'scipy<1.6.0;python_version<"3.7"', # SciPy 1.6.0 works for 3.7+
'scipy==1.6.0;python_version>="3.7"',
'spacy==2.3.2',
-'scikit-learn<=0.23.2;python_version<"3.9"',
-'scikit-learn==0.23.2;python_version>="3.9"',
+'scikit-learn==0.23.2',
'torch==1.7.1',
'tqdm==4.43.0',
'transformers==4.0.1',
@@ -38,7 +37,8 @@ setup(
keywords="nlp natural-language-processing dependency-parsing",
setup_requires=['pytest-runner',
'pytest-pylint',
-'numpy==1.22.0;python_version>="3.9"',
+'scikit-learn==0.23.2',
+'numpy==1.22.0',
'scipy==1.6.0;python_version>="3.7"'],
tests_require=['pytest', 'pylint'],
python_requires='>=3.6',