diff --git a/src/lambo/resources/languages.txt b/src/lambo/resources/languages.txt
index 9e213ff758500adbb9cc64f1850feed0dc739dc6..9fbf954031526e7508b3edf029f2eacaac64791e 100644
--- a/src/lambo/resources/languages.txt
+++ b/src/lambo/resources/languages.txt
@@ -3,7 +3,7 @@ UD_Afrikaans-AfriBooms af Afrikaans
 UD_Ancient_Greek-PROIEL ? Ancient_Greek *
 UD_Ancient_Greek-Perseus ? Ancient_Greek
 UD_Ancient_Hebrew-PTNK ? Ancient_Hebrew
-UD_Arabic-NYUAD ar Arabic
+#UD_Arabic-NYUAD ar Arabic
 UD_Arabic-PADT ar Arabic *
 UD_Armenian-ArmTDP hy Armenian *
 UD_Armenian-BSUT hy Armenian
diff --git a/src/lambo/segmenter/lambo.py b/src/lambo/segmenter/lambo.py
index bd949b800838ea1b6fef07f2d4012a51f4fb83d6..1057237824acd8f49979745a232b36b9b087e4fe 100644
--- a/src/lambo/segmenter/lambo.py
+++ b/src/lambo/segmenter/lambo.py
@@ -68,7 +68,7 @@ class Lambo():
         return model_name
 
     @classmethod
-    def from_path(cls, model_path, model_name):
+    def from_path(cls, model_path, model_name, with_splitter = True):
         """
         Obtain a LAMBO segmenter by reading a model from a given path.
 
@@ -79,7 +79,7 @@ class Lambo():
         model = torch.load(model_path / (model_name + '.pth'), map_location=torch.device('cpu'))
         dict = Lambo.read_dict(model_path / (model_name + '.dict'))
         splitter=None
-        if (model_path / (model_name + '_subwords.pth')).exists():
+        if with_splitter and (model_path / (model_name + '_subwords.pth')).exists():
             from lambo.subwords.splitter import LamboSplitter
             splitter = LamboSplitter.from_path(model_path, model_name)
         return cls(model, dict, splitter)
@@ -229,7 +229,7 @@ class Lambo():
            if token_end:
                # End of token
                token = Token(turn_offset + token_begin, turn_offset + i + 1, text[token_begin:(i + 1)], mwtoken_end)
-                if mwtoken_end:
+                if mwtoken_end and self.splitter:
                    subwords=self.splitter.split(token.text)
                    if len(subwords)==1:
                        token.is_multi_word=False
diff --git a/src/lambo/utils/printer.py b/src/lambo/utils/printer.py
index a55ecd980e3f95a7fe53fbbfcb3d75a3a746f856..08001e896bffef8bacbf52ad2bceabeae4f180f7 100644
--- a/src/lambo/utils/printer.py
+++ b/src/lambo/utils/printer.py
@@ -48,11 +48,11 @@ def print_document_to_conll(document, path):
                token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
                if token_text == '':
                    continue
-                if token.is_multi_word and len(token.words) > 1:
+                if token.is_multi_word and len(token.subwords) > 1:
                    file1.write(str(token_id))
-                    file1.write('-' + str(token_id + len(token.words) - 1))
+                    file1.write('-' + str(token_id + len(token.subwords) - 1))
                    file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                    for word in token.words:
+                    for word in token.subwords:
                        file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
                        token_id += 1
                else:
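
Rough usage sketch of the new with_splitter flag introduced above. The model directory, model name, and the segment() call are illustrative assumptions, not taken from this diff:

    from pathlib import Path
    from lambo.segmenter.lambo import Lambo

    # Hypothetical local directory containing en.pth / en.dict (and possibly en_subwords.pth).
    model_path = Path('models')

    # Skip loading the subword splitter even if en_subwords.pth exists; splitter stays None,
    # so the "mwtoken_end and self.splitter" branch above is never entered and multi-word
    # tokens are left unsplit.
    segmenter = Lambo.from_path(model_path, 'en', with_splitter=False)

    # Assumed segmentation call; returns a document whose tokens then carry no subwords.
    document = segmenter.segment('A simple sentence.')

With with_splitter=False the printer's token.subwords branch is also skipped, since no token is marked as multi-word with more than one subword.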