Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Syntactic Tools
combo
Commits
d12e5ec7
Commit
d12e5ec7
authored
May 05, 2021
by
Mateusz Klimaszewski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add lv local model. Merge raw txt files.
parent
d3fe22d2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
9 deletions
+26
-9
scripts/train_iwpt21.py
scripts/train_iwpt21.py
+25
-9
scripts/utils.py
scripts/utils.py
+1
-0
No files found.
scripts/train_iwpt21.py
View file @
d12e5ec7
...
...
@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS
flags
.
DEFINE_list
(
name
=
"lang"
,
default
=
list
(
LANG2TREEBANK
.
keys
()),
help
=
f
"Language of models to train. Possible values:
{
LANG2TREEBANK
.
keys
()
}
."
)
flags
.
DEFINE_string
(
name
=
"data_dir"
,
default
=
""
,
help
=
"Path to
'iwpt2020st
data
'
directory."
)
help
=
"Path to
IWPT'21
data directory."
)
flags
.
DEFINE_string
(
name
=
"serialization_dir"
,
default
=
"/tmp/"
,
help
=
"Model serialization dir."
)
flags
.
DEFINE_integer
(
name
=
"cuda_device"
,
default
=-
1
,
...
...
@@ -68,9 +68,11 @@ def run(_):
assert
data_dir
.
is_dir
(),
f
"'
{
data_dir
}
' is not a directory!"
treebanks
=
LANG2TREEBANK
[
lang
]
full_language
=
treebanks
[
0
].
split
(
"-"
)[
0
]
train_paths
=
[]
dev_paths
=
[]
train_raw_paths
=
[]
dev_raw_paths
=
[]
# TODO Uncomment when IWPT'21 Shared Task ends.
# During the shared task, test data is not available.
test_paths
=
[]
...
...
@@ -90,19 +92,33 @@ def run(_):
# elif "test" in name:
# collapse_nodes(data_dir, treebank_file, output)
# test_paths.append(output)
if
".txt"
in
name
:
if
"train"
in
name
:
train_raw_paths
.
append
(
path_to_str
(
treebank_file
))
elif
"dev"
in
name
:
dev_raw_paths
.
append
(
path_to_str
(
treebank_file
))
lang_data_dir
=
pathlib
.
Path
(
data_dir
/
lang
)
merged_dataset_name
=
"IWPT"
lang_data_dir
=
pathlib
.
Path
(
data_dir
/
f
"UD_
{
full_language
}
-
{
merged_dataset_name
}
"
)
lang_data_dir
.
mkdir
(
exist_ok
=
True
)
train_path
=
lang_data_dir
/
"train.conllu"
dev_path
=
lang_data_dir
/
"dev.conllu"
# TODO Uncomment
# test_path = lang_data_dir / "test.conllu"
suffix
=
f
"
{
lang
}
_
{
merged_dataset_name
}
-ud"
.
lower
()
train_path
=
lang_data_dir
/
f
"
{
suffix
}
-train.conllu"
dev_path
=
lang_data_dir
/
f
"
{
suffix
}
-dev.conllu"
test_path
=
lang_data_dir
/
f
"
{
suffix
}
-test.conllu"
train_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-train.txt"
dev_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-dev.txt"
test_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-test.txt"
merge_files
(
train_paths
,
output
=
train_path
)
merge_files
(
dev_paths
,
output
=
dev_path
)
# TODO Uncomment
# merge_files(test_paths, output=test_path)
# TODO Change to test_paths instead of dev_paths after IWPT'21
merge_files
(
dev_paths
,
output
=
test_path
)
merge_files
(
train_raw_paths
,
output
=
train_raw_path
)
merge_files
(
dev_raw_paths
,
output
=
dev_raw_path
)
# TODO Change to test_raw_paths instead of dev_raw_paths after IWPT'21
merge_files
(
dev_raw_paths
,
output
=
test_raw_path
)
serialization_dir
=
pathlib
.
Path
(
FLAGS
.
serialization_dir
)
/
lang
serialization_dir
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
...
...
scripts/utils.py
View file @
d12e5ec7
...
...
@@ -21,6 +21,7 @@ LANG2TRANSFORMER = {
"ta"
:
"/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/"
,
"sk"
:
"/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/"
,
"lt"
:
"/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/"
,
"lv"
:
"/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lv-cased/"
,
"cs"
:
"/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/"
,
"et"
:
"/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/"
,
# "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment