Unescape HTML entities in tokens/lemmas and keep non-ASCII characters unescaped in JSON output

a843ed90 · Martyna Wiącek · 2e2d1673 · a843ed90 · a843ed90
Commit a843ed90 authored 2 years ago by Martyna Wiącek
--- a/combo/data/api.py
+++ b/combo/data/api.py
 import collections
 import dataclasses
+import html
 import json
 import os
 import string
@@ -55,12 +56,12 @@ class Sentence:
        # numpy.savez(hash + '_trimmed.npz', self.relation_distribution[1:, 1:])
        return json.dumps({
-            "tokens": [(t.token, t.lemma, t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens],
+            "tokens": [(str(html.unescape(t.token)), str(html.unescape(t.lemma)), t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens],
            # "sentence_embedding": self.sentence_embedding,
            "head": [t.head for t in self.tokens],
            "relation_distribution_hash": hash,
            "path_file": str(os.path.join(save_relation_distribution_path, hash + '.npz'))
-        }, cls=NumpyArrayEncoder)
+        }, cls=NumpyArrayEncoder, ensure_ascii=False)
    def __len__(self):
        return len(self.tokens)

--- a/combo/predict.py
+++ b/combo/predict.py
@@ -58,6 +58,7 @@ class COMBO(predictor.Predictor):
            sys.exit(1)
    def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
+        sentence = sentence.replace(".", " .")
        if isinstance(sentence, str):
            if isinstance(self._tokenizer,lambo.LamboTokenizer):
                segmented = self._tokenizer.segment(sentence)
@@ -247,7 +248,7 @@ class COMBO(predictor.Predictor):
    @classmethod
    def with_lambo_tokenizer(cls, model: models.Model,
-                             dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'en'):
+                             dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'de'):
        return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name))
    @classmethod