From a843ed90c9b5b5724e0b8074c195c85529a13550 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martyna=20Wi=C4=85cek?= <martyna.wiacek@ipipan.waw.pl> Date: Thu, 20 Apr 2023 15:05:44 +0200 Subject: [PATCH] unescape ascii characters --- combo/data/api.py | 5 +++-- combo/predict.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/combo/data/api.py b/combo/data/api.py index f25d035..8a9208f 100644 --- a/combo/data/api.py +++ b/combo/data/api.py @@ -1,5 +1,6 @@ import collections import dataclasses +import html import json import os import string @@ -55,12 +56,12 @@ class Sentence: # numpy.savez(hash + '_trimmed.npz', self.relation_distribution[1:, 1:]) return json.dumps({ - "tokens": [(t.token, t.lemma, t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens], + "tokens": [(str(html.unescape(t.token)), str(html.unescape(t.lemma)), t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens], # "sentence_embedding": self.sentence_embedding, "head": [t.head for t in self.tokens], "relation_distribution_hash": hash, "path_file": str(os.path.join(save_relation_distribution_path, hash + '.npz')) - }, cls=NumpyArrayEncoder) + }, cls=NumpyArrayEncoder, ensure_ascii=False) def __len__(self): return len(self.tokens) diff --git a/combo/predict.py b/combo/predict.py index 90915a8..356cafa 100644 --- a/combo/predict.py +++ b/combo/predict.py @@ -58,6 +58,7 @@ class COMBO(predictor.Predictor): sys.exit(1) def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): + sentence = sentence.replace(".", " .") if isinstance(sentence, str): if isinstance(self._tokenizer,lambo.LamboTokenizer): segmented = self._tokenizer.segment(sentence) @@ -247,7 +248,7 @@ class COMBO(predictor.Predictor): @classmethod def with_lambo_tokenizer(cls, model: models.Model, - dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'en'): + dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'de'): return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name)) @classmethod -- GitLab