diff --git a/combo/data/api.py b/combo/data/api.py
index f25d035d5f428314055185449799e2ca5ff8987c..8a9208f1039cd4831cb7368a0f4b7c5292032e61 100644
--- a/combo/data/api.py
+++ b/combo/data/api.py
@@ -1,5 +1,6 @@
 import collections
 import dataclasses
+import html
 import json
 import os
 import string
@@ -55,12 +56,12 @@ class Sentence:
         # numpy.savez(hash + '_trimmed.npz', self.relation_distribution[1:, 1:])

         return json.dumps({
-            "tokens": [(t.token, t.lemma, t.upostag, t.xpostag, t.feats, t.head, t.deprel) for t in self.tokens],
+            "tokens": [(str(html.unescape(t.token)), str(html.unescape(t.lemma)), t.upostag, t.xpostag, t.feats, t.head, t.deprel) for t in self.tokens],
             # "sentence_embedding": self.sentence_embedding,
             "head": [t.head for t in self.tokens],
             "relation_distribution_hash": hash,
             "path_file": str(os.path.join(save_relation_distribution_path, hash + '.npz'))
-        }, cls=NumpyArrayEncoder)
+        }, cls=NumpyArrayEncoder, ensure_ascii=False)

     def __len__(self):
         return len(self.tokens)
diff --git a/combo/predict.py b/combo/predict.py
index 90915a89bc9ab5101bef1bac4f249f061098dba1..356cafae1851a757168e8ff94b7182afdd203192 100644
--- a/combo/predict.py
+++ b/combo/predict.py
@@ -58,6 +58,7 @@ class COMBO(predictor.Predictor):
             sys.exit(1)

     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
+            sentence = sentence.replace(".", " .")  # separate periods from the preceding word so they tokenize on their own
             if isinstance(self._tokenizer,lambo.LamboTokenizer):
                 segmented = self._tokenizer.segment(sentence)
@@ -247,7 +248,7 @@

     @classmethod
     def with_lambo_tokenizer(cls, model: models.Model,
-                             dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'en'):
+                             dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'de'):
         return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name))

     @classmethod
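
Not part of the patch itself, but for reviewers: a minimal, self-contained sketch of what the api.py change does to the serialized output. The token values below are made up for illustration; the point is that html.unescape decodes HTML entities in token and lemma strings, while ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them.

import html
import json

# Hypothetical token fields (illustrative only): an HTML entity in the surface
# form and a lemma containing a non-ASCII character.
token, lemma = "Kowalski&amp;Co", "świat"

# Previous behaviour: the entity is kept verbatim and "ś" is escaped as \u015b.
print(json.dumps({"tokens": [(token, lemma)]}))

# New behaviour: the entity is decoded and the JSON keeps readable UTF-8 text.
print(json.dumps({"tokens": [(html.unescape(token), html.unescape(lemma))]},
                 ensure_ascii=False))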