Skip to content
Snippets Groups Projects
Commit a843ed90 authored by Martyna Wiącek
Browse files

unescape ascii characters

parent 2e2d1673
Branches
No related merge requests found
Pipeline #9714 failed in 1 minute and 42 seconds
import collections import collections
import dataclasses import dataclasses
import html
import json import json
import os import os
import string import string
...@@ -55,12 +56,12 @@ class Sentence: ...@@ -55,12 +56,12 @@ class Sentence:
# numpy.savez(hash + '_trimmed.npz', self.relation_distribution[1:, 1:]) # numpy.savez(hash + '_trimmed.npz', self.relation_distribution[1:, 1:])
return json.dumps({ return json.dumps({
"tokens": [(t.token, t.lemma, t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens], "tokens": [(str(html.unescape(t.token)), str(html.unescape(t.lemma)), t.upostag, t.xpostag, t. feats, t.head, t.deprel) for t in self.tokens],
# "sentence_embedding": self.sentence_embedding, # "sentence_embedding": self.sentence_embedding,
"head": [t.head for t in self.tokens], "head": [t.head for t in self.tokens],
"relation_distribution_hash": hash, "relation_distribution_hash": hash,
"path_file": str(os.path.join(save_relation_distribution_path, hash + '.npz')) "path_file": str(os.path.join(save_relation_distribution_path, hash + '.npz'))
}, cls=NumpyArrayEncoder) }, cls=NumpyArrayEncoder, ensure_ascii=False)
def __len__(self): def __len__(self):
return len(self.tokens) return len(self.tokens)
......
...@@ -58,6 +58,7 @@ class COMBO(predictor.Predictor): ...@@ -58,6 +58,7 @@ class COMBO(predictor.Predictor):
sys.exit(1) sys.exit(1)
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
sentence = sentence.replace(".", " .")
if isinstance(sentence, str): if isinstance(sentence, str):
if isinstance(self._tokenizer,lambo.LamboTokenizer): if isinstance(self._tokenizer,lambo.LamboTokenizer):
segmented = self._tokenizer.segment(sentence) segmented = self._tokenizer.segment(sentence)
...@@ -247,7 +248,7 @@ class COMBO(predictor.Predictor): ...@@ -247,7 +248,7 @@ class COMBO(predictor.Predictor):
@classmethod @classmethod
def with_lambo_tokenizer(cls, model: models.Model, def with_lambo_tokenizer(cls, model: models.Model,
dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'en'): dataset_reader: allen_data.DatasetReader, lambo_model_name : str = 'de'):
return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name)) return cls(model, dataset_reader, lambo.LamboTokenizer(lambo_model_name))
@classmethod @classmethod
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment