Syntactic Tools / combo

Commit 426d24f1
Authored 4 years ago by Mateusz Klimaszewski, committed by Mateusz Klimaszewski 4 years ago

Add script for training enhanced dependency parsing models based on IWPT'20 Shared Task data.

Parent: 23e0c9ce
Part of 2 merge requests: !9 Enhanced dependency parsing develop to master, !8 Enhanced dependency parsing

Showing 1 changed file: scripts/train_eud.py (new file, mode 100644), 138 additions and 0 deletions

scripts/train_eud.py
"""
Script to train Enhanced Dependency Parsing models based on IWPT
'
20 Shared Task data.
Might require:
conda install -c bioconda perl-list-moreutils
conda install -c bioconda perl-namespace-autoclean
conda install -c bioconda perl-moose
conda install -c dan_blanchard perl-moosex-semiaffordanceaccessor
"""
import
os
import
pathlib
import
subprocess
from
typing
import
List
from
absl
import
app
from
absl
import
flags
FLAGS
=
flags
.
FLAGS
LANG2TREEBANK
=
{
"
ar
"
:
[
"
Arabic-PADT
"
],
"
bg
"
:
[
"
Bulgarian-BTB
"
],
"
cs
"
:
[
"
Czech-FicTree
"
,
"
Czech-CAC
"
,
"
Czech-PDT
"
,
"
Czech-PUD
"
],
"
nl
"
:
[
"
Dutch-Alpino
"
,
"
Dutch-LassySmall
"
],
"
en
"
:
[
"
English-EWT
"
,
"
English-PUD
"
],
"
et
"
:
[
"
Estonian-EDT
"
,
"
Estonian-EWT
"
],
"
fi
"
:
[
"
Finnish-TDT
"
,
"
Finnish-PUD
"
],
"
fr
"
:
[
"
French-Sequoia
"
,
"
French-FQB
"
],
"
it
"
:
[
"
Italian-ISDT
"
],
"
lv
"
:
[
"
Latvian-LVTB
"
],
"
lt
"
:
[
"
Lithuanian-ALKSNIS
"
],
"
pl
"
:
[
"
Polish-LFG
"
,
"
Polish-PDB
"
,
"
Polish-PUD
"
],
"
ru
"
:
[
"
Russian-SynTagRus
"
],
"
sk
"
:
[
"
Slovak-SNK
"
],
"
sv
"
:
[
"
Swedish-Talbanken
"
,
"
Swedish-PUD
"
],
"
ta
"
:
[
"
Tamil-TTB
"
],
"
uk
"
:
[
"
Ukrainian-IU
"
],
}
LANG2TRANSFORMER
=
{
"
en
"
:
"
bert-base-cased
"
,
"
pl
"
:
"
allegro/herbert-base-cased
"
,
}
flags
.
DEFINE_list
(
name
=
"
lang
"
,
default
=
list
(
LANG2TREEBANK
.
keys
()),
help
=
f
"
Language of models to train. Possible values:
{
LANG2TREEBANK
.
keys
()
}
.
"
)
flags
.
DEFINE_string
(
name
=
"
data_dir
"
,
default
=
""
,
help
=
"
Path to
'
iwpt2020stdata
'
directory.
"
)
flags
.
DEFINE_string
(
name
=
"
serialization_dir
"
,
default
=
"
/tmp/
"
,
help
=
"
Model serialization dir.
"
)
flags
.
DEFINE_integer
(
name
=
"
cuda_device
"
,
default
=-
1
,
help
=
"
Cuda device id (-1 for cpu).
"
)
def
path_to_str
(
path
:
pathlib
.
Path
)
->
str
:
return
str
(
path
.
resolve
())
def
merge_files
(
files
:
List
[
str
],
output
:
pathlib
.
Path
):
if
not
output
.
exists
():
os
.
system
(
f
"
cat
{
'
'
.
join
(
files
)
}
>
{
output
}
"
)
def
execute_command
(
command
,
output_file
=
None
):
command
=
[
c
for
c
in
command
.
split
()
if
c
.
strip
()]
if
output_file
:
with
open
(
output_file
,
"
w
"
)
as
f
:
subprocess
.
run
(
command
,
check
=
True
,
stdout
=
f
)
else
:
subprocess
.
run
(
command
,
check
=
True
)
def
collapse_nodes
(
data_dir
:
pathlib
.
Path
,
treebank_file
:
pathlib
.
Path
,
output
:
str
):
output_path
=
pathlib
.
Path
(
output
)
if
not
output_path
.
exists
():
execute_command
(
f
"
perl
{
path_to_str
(
data_dir
/
'
tools
'
/
'
enhanced_collapse_empty_nodes.pl
'
)
}
"
f
"
{
path_to_str
(
treebank_file
)
}
"
,
output
)
def
run
(
_
):
languages
=
FLAGS
.
lang
for
lang
in
languages
:
assert
lang
in
LANG2TREEBANK
,
f
"'
{
lang
}
'
must be one of
{
list
(
LANG2TREEBANK
.
keys
())
}
.
"
data_dir
=
pathlib
.
Path
(
FLAGS
.
data_dir
)
assert
data_dir
.
is_dir
(),
f
"'
{
data_dir
}
'
is not a directory!
"
treebanks
=
LANG2TREEBANK
[
lang
]
train_paths
=
[]
dev_paths
=
[]
test_paths
=
[]
for
treebank
in
treebanks
:
treebank_dir
=
data_dir
/
f
"
UD_
{
treebank
}
"
assert
treebank_dir
.
exists
()
and
treebank_dir
.
is_dir
(),
f
"'
{
treebank_dir
}
'
directory doesn
'
t exists.
"
for
treebank_file
in
treebank_dir
.
iterdir
():
name
=
treebank_file
.
name
if
"
conllu
"
in
name
and
"
fixed
"
not
in
name
:
output
=
path_to_str
(
treebank_file
).
replace
(
'
.conllu
'
,
'
.fixed.conllu
'
)
if
"
train
"
in
name
:
collapse_nodes
(
data_dir
,
treebank_file
,
output
)
train_paths
.
append
(
output
)
elif
"
dev
"
in
name
:
collapse_nodes
(
data_dir
,
treebank_file
,
output
)
dev_paths
.
append
(
output
)
elif
"
test
"
in
name
:
collapse_nodes
(
data_dir
,
treebank_file
,
output
)
test_paths
.
append
(
output
)
lang_data_dir
=
pathlib
.
Path
(
data_dir
/
lang
)
lang_data_dir
.
mkdir
(
exist_ok
=
True
)
train_path
=
lang_data_dir
/
"
train.conllu
"
dev_path
=
lang_data_dir
/
"
dev.conllu
"
test_path
=
lang_data_dir
/
"
test.conllu
"
merge_files
(
train_paths
,
output
=
train_path
)
merge_files
(
dev_paths
,
output
=
dev_path
)
merge_files
(
test_paths
,
output
=
test_path
)
serialization_dir
=
pathlib
.
Path
(
FLAGS
.
serialization_dir
)
/
lang
serialization_dir
.
mkdir
(
exist_ok
=
True
)
execute_command
(
""
.
join
(
f
"""
combo --mode train
--training_data
{
train_path
}
--validation_data
{
dev_path
}
--targets feats,upostag,xpostag,head,deprel,lemma,deps
--pretrained_transformer_name
{
LANG2TRANSFORMER
[
lang
]
}
--serialization_dir
{
serialization_dir
}
--cuda_device
{
FLAGS
.
cuda_device
}
--word_batch_size 2500
--config_path
{
pathlib
.
Path
.
cwd
()
/
'
config.graph.template.jsonnet
'
}
--tensorboard
"""
.
splitlines
()))
def
main
():
app
.
run
(
run
)
if
__name__
==
"
__main__
"
:
main
()
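For reference, a minimal sketch of how the new script might be invoked once the IWPT'20 'iwpt2020stdata' directory is available. The flag names come from the flags.DEFINE_* calls above; the concrete paths and device id are placeholder values, not taken from the commit:

python scripts/train_eud.py \
    --lang en \
    --data_dir /path/to/iwpt2020stdata \
    --serialization_dir /tmp/ \
    --cuda_device 0

For "en" this would collapse empty nodes in English-EWT and English-PUD, merge them into /path/to/iwpt2020stdata/en/{train,dev,test}.conllu, and then launch roughly the following training command (wrapped here for readability; the script assembles it as a single argument list, and --config_path assumes config.graph.template.jsonnet is present in the current working directory):

combo --mode train \
    --training_data /path/to/iwpt2020stdata/en/train.conllu \
    --validation_data /path/to/iwpt2020stdata/en/dev.conllu \
    --targets feats,upostag,xpostag,head,deprel,lemma,deps \
    --pretrained_transformer_name bert-base-cased \
    --serialization_dir /tmp/en \
    --cuda_device 0 \
    --word_batch_size 2500 \
    --config_path ./config.graph.template.jsonnet \
    --tensorboard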