Commit 02d737a1 authored by piotrmp

Bugfix.
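The fix makes LAMBO's subword splitter optional: Lambo.from_path gains a with_splitter flag, the segmenter only calls the splitter when one was actually loaded, and the CoNLL-U printer reads token.subwords instead of token.words. The UD_Arabic-NYUAD entry is also commented out of the treebank list.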

parent 470059c6
1 merge request: !2 "Multiword generation"
@@ -3,7 +3,7 @@ UD_Afrikaans-AfriBooms af Afrikaans
 UD_Ancient_Greek-PROIEL ? Ancient_Greek *
 UD_Ancient_Greek-Perseus ? Ancient_Greek
 UD_Ancient_Hebrew-PTNK ? Ancient_Hebrew
-UD_Arabic-NYUAD ar Arabic
+#UD_Arabic-NYUAD ar Arabic
 UD_Arabic-PADT ar Arabic *
 UD_Armenian-ArmTDP hy Armenian *
 UD_Armenian-BSUT hy Armenian
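Assuming a leading '#' marks a comment in this treebank list, the change disables the UD_Arabic-NYUAD entry rather than deleting it, so it can be restored later.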
@@ -68,7 +68,7 @@ class Lambo():
         return model_name
 
     @classmethod
-    def from_path(cls, model_path, model_name):
+    def from_path(cls, model_path, model_name, with_splitter = True):
         """
         Obtain a LAMBO segmenter by reading a model from a given path.
@@ -79,7 +79,7 @@ class Lambo():
         model = torch.load(model_path / (model_name + '.pth'), map_location=torch.device('cpu'))
         dict = Lambo.read_dict(model_path / (model_name + '.dict'))
         splitter=None
-        if (model_path / (model_name + '_subwords.pth')).exists():
+        if with_splitter and (model_path / (model_name + '_subwords.pth')).exists():
             from lambo.subwords.splitter import LamboSplitter
             splitter = LamboSplitter.from_path(model_path, model_name)
         return cls(model, dict, splitter)
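The new flag allows loading a model without its subword splitter. A minimal usage sketch, assuming the Lambo class is importable as below and that 'en' is a placeholder model name with en.pth and en.dict present in model_dir:

    from pathlib import Path

    from lambo.segmenter.lambo import Lambo  # assumed import path for the class above

    model_dir = Path('models')  # placeholder directory with <name>.pth and <name>.dict

    # Default: the splitter is loaded whenever <name>_subwords.pth exists.
    segmenter = Lambo.from_path(model_dir, 'en')

    # New in this commit: skip the splitter even if its model file is present.
    # segmenter_no_mw.splitter stays None and multi-word tokens are left unsplit.
    segmenter_no_mw = Lambo.from_path(model_dir, 'en', with_splitter=False)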
@@ -229,7 +229,7 @@ class Lambo():
                 if token_end:
                     # End of token
                     token = Token(turn_offset + token_begin, turn_offset + i + 1, text[token_begin:(i + 1)], mwtoken_end)
-                    if mwtoken_end:
+                    if mwtoken_end and self.splitter:
                         subwords=self.splitter.split(token.text)
                         if len(subwords)==1:
                             token.is_multi_word=False
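This guard matters when a model was loaded with with_splitter=False: self.splitter is then None, and the old unconditional branch would raise AttributeError on self.splitter.split(...). A minimal sketch of the guarded logic, with a hypothetical splitter output for German 'zum':

    def resolve_multi_word(text, mwtoken_end, splitter):
        # Mirrors the branch above: consult the splitter only when a
        # multi-word token just ended AND a splitter is available.
        is_multi_word = mwtoken_end
        if mwtoken_end and splitter:
            subwords = splitter.split(text)  # e.g. 'zum' -> ['zu', 'dem']
            if len(subwords) == 1:
                is_multi_word = False  # splitter could not split it; plain token
        return is_multi_word

    # With no splitter the token is kept whole instead of crashing:
    resolve_multi_word('zum', True, None)  # True; raised AttributeError before the fix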
@@ -48,11 +48,11 @@ def print_document_to_conll(document, path):
            token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
            if token_text == '':
                continue
-           if token.is_multi_word and len(token.words) > 1:
+           if token.is_multi_word and len(token.subwords) > 1:
                file1.write(str(token_id))
-               file1.write('-' + str(token_id + len(token.words) - 1))
+               file1.write('-' + str(token_id + len(token.subwords) - 1))
                file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-               for word in token.words:
+               for word in token.subwords:
                    file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
                    token_id += 1
            else:
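For reference, the branch above produces the standard CoNLL-U multi-word token layout: an ID-range line for the surface form with all remaining fields empty, then one line per subword whose HEAD is the placeholder token_id - 1. A hypothetical fragment for 'zum' split into 'zu' + 'dem' at token_id = 3 (fields are tab-separated in the actual file; aligned here for readability):

    3-4  zum  _  _  _  _  _  _  _  _
    3    zu   _  _  _  _  2  _  _  _
    4    dem  _  _  _  _  3  _  _  _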
......