Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
combo
Manage
Activity
Members
Labels
Plan
Issues
20
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
2
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Syntactic Tools
combo
Commits
2c8e3061
Commit
2c8e3061
authored
3 years ago
by
Mateusz Klimaszewski
Browse files
Options
Downloads
Patches
Plain Diff
Add lv local model. Merge raw txt files.
parent
208e57a4
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
scripts/train_iwpt21.py
+25
-9
25 additions, 9 deletions
scripts/train_iwpt21.py
scripts/utils.py
+1
-0
1 addition, 0 deletions
scripts/utils.py
with
26 additions
and
9 deletions
scripts/train_iwpt21.py
+
25
−
9
View file @
2c8e3061
...
...
@@ -36,7 +36,7 @@ FLAGS = flags.FLAGS
flags
.
DEFINE_list
(
name
=
"
lang
"
,
default
=
list
(
LANG2TREEBANK
.
keys
()),
help
=
f
"
Language of models to train. Possible values:
{
LANG2TREEBANK
.
keys
()
}
.
"
)
flags
.
DEFINE_string
(
name
=
"
data_dir
"
,
default
=
""
,
help
=
"
Path to
'
iwpt2020st
data
'
directory.
"
)
help
=
"
Path to
IWPT
'
21
data directory.
"
)
flags
.
DEFINE_string
(
name
=
"
serialization_dir
"
,
default
=
"
/tmp/
"
,
help
=
"
Model serialization dir.
"
)
flags
.
DEFINE_integer
(
name
=
"
cuda_device
"
,
default
=-
1
,
...
...
@@ -68,9 +68,11 @@ def run(_):
assert
data_dir
.
is_dir
(),
f
"'
{
data_dir
}
'
is not a directory!
"
treebanks
=
LANG2TREEBANK
[
lang
]
full_language
=
treebanks
[
0
].
split
(
"
-
"
)[
0
]
train_paths
=
[]
dev_paths
=
[]
train_raw_paths
=
[]
dev_raw_paths
=
[]
# TODO Uncomment when IWPT'21 Shared Task ends.
# During shared task duration test data is not available.
test_paths
=
[]
...
...
@@ -90,19 +92,33 @@ def run(_):
# elif "test" in name:
# collapse_nodes(data_dir, treebank_file, output)
# test_paths.append(output)
if
"
.txt
"
in
name
:
if
"
train
"
in
name
:
train_raw_paths
.
append
(
path_to_str
(
treebank_file
))
elif
"
dev
"
in
name
:
dev_raw_paths
.
append
(
path_to_str
(
treebank_file
))
lang_data_dir
=
pathlib
.
Path
(
data_dir
/
lang
)
merged_dataset_name
=
"
IWPT
"
lang_data_dir
=
pathlib
.
Path
(
data_dir
/
f
"
UD_
{
full_language
}
-
{
merged_dataset_name
}
"
)
lang_data_dir
.
mkdir
(
exist_ok
=
True
)
train_path
=
lang_data_dir
/
"
train.conllu
"
dev_path
=
lang_data_dir
/
"
dev.conllu
"
# TODO Uncomment
# test_path = lang_data_dir / "test.conllu"
suffix
=
f
"
{
lang
}
_
{
merged_dataset_name
}
-ud
"
.
lower
()
train_path
=
lang_data_dir
/
f
"
{
suffix
}
-train.conllu
"
dev_path
=
lang_data_dir
/
f
"
{
suffix
}
-dev.conllu
"
test_path
=
lang_data_dir
/
f
"
{
suffix
}
-test.conllu
"
train_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-train.txt
"
dev_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-dev.txt
"
test_raw_path
=
lang_data_dir
/
f
"
{
suffix
}
-test.txt
"
merge_files
(
train_paths
,
output
=
train_path
)
merge_files
(
dev_paths
,
output
=
dev_path
)
# TODO Uncomment
# merge_files(test_paths, output=test_path)
# TODO Change to test_paths instead of dev_paths after IWPT'21
merge_files
(
dev_paths
,
output
=
test_path
)
merge_files
(
train_raw_paths
,
output
=
train_raw_path
)
merge_files
(
dev_raw_paths
,
output
=
dev_raw_path
)
# TODO Change to test_raw_paths instead of dev_paths after IWPT'21
merge_files
(
dev_raw_paths
,
output
=
test_raw_path
)
serialization_dir
=
pathlib
.
Path
(
FLAGS
.
serialization_dir
)
/
lang
serialization_dir
.
mkdir
(
exist_ok
=
True
,
parents
=
True
)
...
...
This diff is collapsed.
Click to expand it.
scripts/utils.py
+
1
−
0
View file @
2c8e3061
...
...
@@ -21,6 +21,7 @@ LANG2TRANSFORMER = {
"
ta
"
:
"
/tmp/lustre_shared/mklimasz/transformers/wikibert-base-ta-cased/
"
,
"
sk
"
:
"
/tmp/lustre_shared/mklimasz/transformers/wikibert-base-sk-cased/
"
,
"
lt
"
:
"
/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lt-cased/
"
,
"
lv
"
:
"
/tmp/lustre_shared/mklimasz/transformers/wikibert-base-lv-cased/
"
,
"
cs
"
:
"
/tmp/lustre_shared/mklimasz/transformers/wikibert-base-cs-cased/
"
,
"
et
"
:
"
/tmp/lustre_shared/mklimasz/transformers/etwiki-bert/
"
,
# "uk": http://dl.turkunlp.org/wikibert/wikibert-base-uk-cased/
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment