Skip to content
Snippets Groups Projects
Commit bc8bdc77 authored by Maja Jabłońska
Browse files

Add get_slices_if_not_provided to data/dataset.py

parent 1380bf60
No related branches found
No related tags found
1 merge request: !46 "Merge COMBO 3.0 into master"
import logging import logging
from combo import data
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -7,5 +8,30 @@ logger = logging.getLogger(__name__) ...@@ -7,5 +8,30 @@ logger = logging.getLogger(__name__)
class DatasetReader:
    """Base class for dataset readers.

    Currently an empty placeholder: it defines no attributes or methods.
    """
class UniversalDependenciesDatasetReader(DatasetReader):
    """Dataset reader subclass, presumably for Universal Dependencies data.

    Currently an empty placeholder: it adds nothing on top of ``DatasetReader``.
    """
def get_slices_if_not_provided(vocab: data.Vocabulary):
    """Return the feature-name -> token-indices grouping for ``vocab``.

    The grouping is built at most once per vocabulary: if ``vocab`` already
    has a ``slices`` attribute, that value is returned unchanged.

    Otherwise, when the ``"feats_labels"`` namespace exists:

    1. For every token in the namespace other than ``"_"`` and ``"__PAD__"``,
       a companion ``"<name>=None"`` token is added to the namespace, where
       ``<name>`` is the part of the token before the first ``"="``.
    2. All indices of the namespace are grouped by that ``<name>`` part.
    3. The grouping is stored on ``vocab.slices`` and returned.

    Args:
        vocab: Vocabulary object providing ``get_namespaces``,
            ``get_index_to_token_vocabulary`` and ``add_token_to_namespace``.

    Returns:
        A dict mapping each feature name to the list of its token indices,
        or ``None`` when ``vocab`` has no ``"feats_labels"`` namespace (in
        that case nothing is stored on ``vocab``).
    """
    if hasattr(vocab, "slices"):
        return vocab.slices

    namespace = "feats_labels"
    if namespace not in vocab.get_namespaces():
        return None

    # Iterate over a snapshot of the tokens: new tokens are added to the same
    # namespace inside the loop, which presumably mutates the live mapping.
    known_tokens = list(vocab.get_index_to_token_vocabulary(namespace).values())
    for token in known_tokens:
        if token not in ("_", "__PAD__"):
            vocab.add_token_to_namespace(token.split("=")[0] + "=None", namespace)

    # There are two kinds of feature tokens: with an assignment (Case=Acc)
    # or without one (None). Group their indices by the feature name, i.e.
    # the text before the assignment sign.
    slices = {}
    for idx, token in vocab.get_index_to_token_vocabulary(namespace).items():
        slices.setdefault(token.split("=")[0], []).append(idx)

    vocab.slices = slices
    return vocab.slices
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment