Skip to content
Snippets Groups Projects
Commit d12e5ec7 authored by Mateusz Klimaszewski
Browse files

Add lv local model. Merge raw txt files.

parent d3fe22d2
2 merge requests: !37 "Release 1.0.4." and !36 "Release 1.0.4"
...@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS ...@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()), flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()),
help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.") help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.")
flags.DEFINE_string(name="data_dir", default="", flags.DEFINE_string(name="data_dir", default="",
help="Path to 'iwpt2020stdata' directory.") help="Path to IWPT'21 data directory.")
flags.DEFINE_string(name="serialization_dir", default="/tmp/", flags.DEFINE_string(name="serialization_dir", default="/tmp/",
help="Model serialization dir.") help="Model serialization dir.")
flags.DEFINE_integer(name="cuda_device", default=-1, flags.DEFINE_integer(name="cuda_device", default=-1,
...@@ -68,9 +68,11 @@ def run(_): ...@@ -68,9 +68,11 @@ def run(_):
assert data_dir.is_dir(), f"'{data_dir}' is not a directory!" assert data_dir.is_dir(), f"'{data_dir}' is not a directory!"
treebanks = LANG2TREEBANK[lang] treebanks = LANG2TREEBANK[lang]
full_language = treebanks[0].split("-")[0]
train_paths = [] train_paths = []
dev_paths = [] dev_paths = []
train_raw_paths = []
dev_raw_paths = []
# TODO Uncomment when IWPT'21 Shared Task ends. # TODO Uncomment when IWPT'21 Shared Task ends.
# During shared task duration test data is not available. # During shared task duration test data is not available.
test_paths = [] test_paths = []
...@@ -90,19 +92,33 @@ def run(_): ...@@ -90,19 +92,33 @@ def run(_):
# elif "test" in name: # elif "test" in name:
# collapse_nodes(data_dir, treebank_file, output) # collapse_nodes(data_dir, treebank_file, output)
# test_paths.append(output) # test_paths.append(output)
if ".txt" in name:
if "train" in name:
train_raw_paths.append(path_to_str(treebank_file))
elif "dev" in name:
dev_raw_paths.append(path_to_str(treebank_file))
lang_data_dir = pathlib.Path(data_dir / lang) merged_dataset_name = "IWPT"
lang_data_dir = pathlib.Path(data_dir / f"UD_{full_language}-{merged_dataset_name}")
lang_data_dir.mkdir(exist_ok=True) lang_data_dir.mkdir(exist_ok=True)
train_path = lang_data_dir / "train.conllu" suffix = f"{lang}_{merged_dataset_name}-ud".lower()
dev_path = lang_data_dir / "dev.conllu" train_path = lang_data_dir / f"{suffix}-train.conllu"
# TODO Uncomment dev_path = lang_data_dir / f"{suffix}-dev.conllu"
# test_path = lang_data_dir / "test.conllu" test_path = lang_data_dir / f"{suffix}-test.conllu"
train_raw_path = lang_data_dir / f"{suffix}-train.txt"
dev_raw_path = lang_data_dir / f"{suffix}-dev.txt"
test_raw_path = lang_data_dir / f"{suffix}-test.txt"
merge_files(train_paths, output=train_path) merge_files(train_paths, output=train_path)
merge_files(dev_paths, output=dev_path) merge_files(dev_paths, output=dev_path)
# TODO Uncomment # TODO Change to test_paths instead of dev_paths after IWPT'21
# merge_files(test_paths, output=test_path) merge_files(dev_paths, output=test_path)
merge_files(train_raw_paths, output=train_raw_path)
merge_files(dev_raw_paths, output=dev_raw_path)
# TODO Change to test_raw_paths instead of dev_raw_paths after IWPT'21
merge_files(dev_raw_paths, output=test_raw_path)
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang
serialization_dir.mkdir(exist_ok=True, parents=True) serialization_dir.mkdir(exist_ok=True, parents=True)
......
...@@ -21,6 +21,7 @@ LANG2TRANSFORMER = { ...@@ -21,6 +21,7 @@ LANG2TRANSFORMER = {
"ta": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/", "ta": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/",
"sk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/", "sk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/",
"lt": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/", "lt": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/",
"lv": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lv-cased/",
"cs": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/", "cs": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/",
"et": "/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/", "et": "/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/",
# "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/ # "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment