Skip to content
Snippets Groups Projects
Commit 2c8e3061 authored by Mateusz Klimaszewski
Browse files

Add lv local model. Merge raw txt files.

parent 208e57a4
No related merge requests found
...@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS ...@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS
flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()), flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()),
help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.") help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.")
flags.DEFINE_string(name="data_dir", default="", flags.DEFINE_string(name="data_dir", default="",
help="Path to 'iwpt2020stdata' directory.") help="Path to IWPT'21 data directory.")
flags.DEFINE_string(name="serialization_dir", default="/tmp/", flags.DEFINE_string(name="serialization_dir", default="/tmp/",
help="Model serialization dir.") help="Model serialization dir.")
flags.DEFINE_integer(name="cuda_device", default=-1, flags.DEFINE_integer(name="cuda_device", default=-1,
...@@ -68,9 +68,11 @@ def run(_): ...@@ -68,9 +68,11 @@ def run(_):
assert data_dir.is_dir(), f"'{data_dir}' is not a directory!" assert data_dir.is_dir(), f"'{data_dir}' is not a directory!"
treebanks = LANG2TREEBANK[lang] treebanks = LANG2TREEBANK[lang]
full_language = treebanks[0].split("-")[0]
train_paths = [] train_paths = []
dev_paths = [] dev_paths = []
train_raw_paths = []
dev_raw_paths = []
# TODO Uncomment when IWPT'21 Shared Task ends. # TODO Uncomment when IWPT'21 Shared Task ends.
# During shared task duration test data is not available. # During shared task duration test data is not available.
test_paths = [] test_paths = []
...@@ -90,19 +92,33 @@ def run(_): ...@@ -90,19 +92,33 @@ def run(_):
# elif "test" in name: # elif "test" in name:
# collapse_nodes(data_dir, treebank_file, output) # collapse_nodes(data_dir, treebank_file, output)
# test_paths.append(output) # test_paths.append(output)
if ".txt" in name:
if "train" in name:
train_raw_paths.append(path_to_str(treebank_file))
elif "dev" in name:
dev_raw_paths.append(path_to_str(treebank_file))
lang_data_dir = pathlib.Path(data_dir / lang) merged_dataset_name = "IWPT"
lang_data_dir = pathlib.Path(data_dir / f"UD_{full_language}-{merged_dataset_name}")
lang_data_dir.mkdir(exist_ok=True) lang_data_dir.mkdir(exist_ok=True)
train_path = lang_data_dir / "train.conllu" suffix = f"{lang}_{merged_dataset_name}-ud".lower()
dev_path = lang_data_dir / "dev.conllu" train_path = lang_data_dir / f"{suffix}-train.conllu"
# TODO Uncomment dev_path = lang_data_dir / f"{suffix}-dev.conllu"
# test_path = lang_data_dir / "test.conllu" test_path = lang_data_dir / f"{suffix}-test.conllu"
train_raw_path = lang_data_dir / f"{suffix}-train.txt"
dev_raw_path = lang_data_dir / f"{suffix}-dev.txt"
test_raw_path = lang_data_dir / f"{suffix}-test.txt"
merge_files(train_paths, output=train_path) merge_files(train_paths, output=train_path)
merge_files(dev_paths, output=dev_path) merge_files(dev_paths, output=dev_path)
# TODO Uncomment # TODO Change to test_paths instead of dev_paths after IWPT'21
# merge_files(test_paths, output=test_path) merge_files(dev_paths, output=test_path)
merge_files(train_raw_paths, output=train_raw_path)
merge_files(dev_raw_paths, output=dev_raw_path)
# TODO Change to test_raw_paths instead of dev_raw_paths after IWPT'21
merge_files(dev_raw_paths, output=test_raw_path)
serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang
serialization_dir.mkdir(exist_ok=True, parents=True) serialization_dir.mkdir(exist_ok=True, parents=True)
......
...@@ -21,6 +21,7 @@ LANG2TRANSFORMER = { ...@@ -21,6 +21,7 @@ LANG2TRANSFORMER = {
"ta": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/", "ta": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/",
"sk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/", "sk": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/",
"lt": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/", "lt": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/",
"lv": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lv-cased/",
"cs": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/", "cs": "/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/",
"et": "/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/", "et": "/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/",
# "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/ # "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment