Syntactic Tools / combo · Merge request !9 (Merged): Enhanced dependency parsing, develop into master
Mateusz Klimaszewski requested to merge develop into master 4 years ago
Commits: 19 · Changes: 29
Commit 426d24f1 · Add script for training enhanced dependency parsing models based on IWPT'20 Shared Task data.
Mateusz Klimaszewski authored 4 years ago

scripts/train_eud.py · new file (0 → 100644) · +138 −0
"""
Script to train Enhanced Dependency Parsing models based on IWPT
'
20 Shared Task data.
Might require:
conda install -c bioconda perl-list-moreutils
conda install -c bioconda perl-namespace-autoclean
conda install -c bioconda perl-moose
conda install -c dan_blanchard perl-moosex-semiaffordanceaccessor
"""
import os
import pathlib
import subprocess
from typing import List

from absl import app
from absl import flags

FLAGS = flags.FLAGS

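# Languages of the IWPT'20 Shared Task and the UD treebanks used for each of them.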
LANG2TREEBANK = {
    "ar": ["Arabic-PADT"],
    "bg": ["Bulgarian-BTB"],
    "cs": ["Czech-FicTree", "Czech-CAC", "Czech-PDT", "Czech-PUD"],
    "nl": ["Dutch-Alpino", "Dutch-LassySmall"],
    "en": ["English-EWT", "English-PUD"],
    "et": ["Estonian-EDT", "Estonian-EWT"],
    "fi": ["Finnish-TDT", "Finnish-PUD"],
    "fr": ["French-Sequoia", "French-FQB"],
    "it": ["Italian-ISDT"],
    "lv": ["Latvian-LVTB"],
    "lt": ["Lithuanian-ALKSNIS"],
    "pl": ["Polish-LFG", "Polish-PDB", "Polish-PUD"],
    "ru": ["Russian-SynTagRus"],
    "sk": ["Slovak-SNK"],
    "sv": ["Swedish-Talbanken", "Swedish-PUD"],
    "ta": ["Tamil-TTB"],
    "uk": ["Ukrainian-IU"],
}

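# Pretrained transformer passed to COMBO via --pretrained_transformer_name. Only "en" and
# "pl" have entries, so other languages from LANG2TREEBANK need one added here before
# training (LANG2TRANSFORMER[lang] would raise a KeyError otherwise).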
LANG2TRANSFORMER = {
    "en": "bert-base-cased",
    "pl": "allegro/herbert-base-cased",
}

flags.DEFINE_list(name="lang", default=list(LANG2TREEBANK.keys()),
                  help=f"Language of models to train. Possible values: {LANG2TREEBANK.keys()}.")
flags.DEFINE_string(name="data_dir", default="",
                    help="Path to 'iwpt2020stdata' directory.")
flags.DEFINE_string(name="serialization_dir", default="/tmp/",
                    help="Model serialization dir.")
flags.DEFINE_integer(name="cuda_device", default=-1,
                     help="Cuda device id (-1 for cpu).")

def path_to_str(path: pathlib.Path) -> str:
    return str(path.resolve())

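# Concatenate per-treebank CoNLL-U files into a single file; skipped when the output
# already exists (e.g. on a re-run).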
def merge_files(files: List[str], output: pathlib.Path):
    if not output.exists():
        os.system(f"cat {' '.join(files)} > {output}")

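# Run a whitespace-separated command string via subprocess; when output_file is given,
# stdout is redirected to that file (used to capture the Perl script's output below).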
def execute_command(command, output_file=None):
    command = [c for c in command.split() if c.strip()]
    if output_file:
        with open(output_file, "w") as f:
            subprocess.run(command, check=True, stdout=f)
    else:
        subprocess.run(command, check=True)

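# Collapse empty nodes in the enhanced dependency graphs using the
# enhanced_collapse_empty_nodes.pl script from the shared task data's tools/ directory
# (this is what the Perl modules listed in the module docstring are needed for),
# writing the result to `output` (a *.fixed.conllu path built in run()).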
def collapse_nodes(data_dir: pathlib.Path, treebank_file: pathlib.Path, output: str):
    output_path = pathlib.Path(output)
    if not output_path.exists():
        execute_command(f"perl {path_to_str(data_dir / 'tools' / 'enhanced_collapse_empty_nodes.pl')} "
                        f"{path_to_str(treebank_file)}", output)

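# For each requested language: collapse empty nodes in every treebank split, merge the
# per-treebank train/dev/test files, and launch COMBO training on the merged data.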
def run(_):
    languages = FLAGS.lang
    for lang in languages:
        assert lang in LANG2TREEBANK, f"'{lang}' must be one of {list(LANG2TREEBANK.keys())}."
        data_dir = pathlib.Path(FLAGS.data_dir)
        assert data_dir.is_dir(), f"'{data_dir}' is not a directory!"

        treebanks = LANG2TREEBANK[lang]
        train_paths = []
        dev_paths = []
        test_paths = []
        for treebank in treebanks:
            treebank_dir = data_dir / f"UD_{treebank}"
            assert treebank_dir.exists() and treebank_dir.is_dir(), f"'{treebank_dir}' directory doesn't exist."
            for treebank_file in treebank_dir.iterdir():
                name = treebank_file.name
                if "conllu" in name and "fixed" not in name:
                    output = path_to_str(treebank_file).replace('.conllu', '.fixed.conllu')
                    if "train" in name:
                        collapse_nodes(data_dir, treebank_file, output)
                        train_paths.append(output)
                    elif "dev" in name:
                        collapse_nodes(data_dir, treebank_file, output)
                        dev_paths.append(output)
                    elif "test" in name:
                        collapse_nodes(data_dir, treebank_file, output)
                        test_paths.append(output)

        lang_data_dir = pathlib.Path(data_dir / lang)
        lang_data_dir.mkdir(exist_ok=True)

        train_path = lang_data_dir / "train.conllu"
        dev_path = lang_data_dir / "dev.conllu"
        test_path = lang_data_dir / "test.conllu"

        merge_files(train_paths, output=train_path)
        merge_files(dev_paths, output=dev_path)
        merge_files(test_paths, output=test_path)

        serialization_dir = pathlib.Path(FLAGS.serialization_dir) / lang
        serialization_dir.mkdir(exist_ok=True)
        execute_command("".join(f"""combo --mode train
        --training_data {train_path}
        --validation_data {dev_path}
        --targets feats,upostag,xpostag,head,deprel,lemma,deps
        --pretrained_transformer_name {LANG2TRANSFORMER[lang]}
        --serialization_dir {serialization_dir}
        --cuda_device {FLAGS.cuda_device}
        --word_batch_size 2500
        --config_path {pathlib.Path.cwd() / 'config.graph.template.jsonnet'}
        --tensorboard
        """.splitlines()))

def main():
    app.run(run)


if __name__ == "__main__":
    main()
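A minimal invocation sketch, assuming COMBO and absl-py are installed, the IWPT'20 shared task data has been unpacked, and the command is run from a directory containing config.graph.template.jsonnet (the script resolves the config path from the current working directory). The data and serialization paths below are illustrative, and the serialization directory must already exist, since only the per-language subdirectory is created:

python scripts/train_eud.py --lang en --data_dir /path/to/iwpt2020stdata --serialization_dir /tmp/ --cuda_device 0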