diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py index f37b07640e8cf7f22add0cc56ba2a9d049c3d201..c5f4451034ec614cacfc08429be9bdbd644b7229 100644 --- a/combo/data/tokenizers/lambo_tokenizer.py +++ b/combo/data/tokenizers/lambo_tokenizer.py @@ -49,4 +49,3 @@ class LamboTokenizer(Tokenizer): sentences.append([t.text for t in sentence.tokens]) return sentences - diff --git a/combo/data/tokenizers/tokenizer.py b/combo/data/tokenizers/tokenizer.py index 7b9269c4021d4bb658c1e04dbc190c2755e63762..163036e32747d9390b31dc713bc5a7177c15ebfc 100644 --- a/combo/data/tokenizers/tokenizer.py +++ b/combo/data/tokenizers/tokenizer.py @@ -72,3 +72,11 @@ class Tokenizer(FromParameters): Returns the number of special tokens added for a pair of sequences. """ return 0 + + def segment(self, text: str) -> List[List[str]]: + """ + Full segmentation - segment into sentences + :param text: + :return: + """ + return [[]] diff --git a/combo/main.py b/combo/main.py index dd4bdd292626789424fb7216d5b3a22f5dd1b4ba..a76db04fe90caa40020ddc616282a10a2c2038f6 100755 --- a/combo/main.py +++ b/combo/main.py @@ -27,6 +27,8 @@ from combo.modules.model import Model from combo.utils import ConfigurationError from combo.utils.matrices import extract_combo_matrices +import codecs + logging.setLoggerClass(ComboLogger) logger = logging.getLogger(__name__) _FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"] @@ -383,9 +385,13 @@ def run(_): if FLAGS.input_file == '-': print("Interactive mode.") sentence = input("Sentence: ") - prediction = predictor(sentence) + prediction = [p.tokens for p in predictor(sentence)] + # Flatten the prediction + flattened_prediction = [] + for p in prediction: + flattened_prediction.extend(p) print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL')) - for token in prediction.tokens: + for token in flattened_prediction: print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head, token.deprel)) elif FLAGS.output_file: @@ -410,14 +416,13 @@ def run(_): else: tokenizer = LamboTokenizer(tokenizer_language) - with open(FLAGS.input_file, "r") as file: + with open(FLAGS.input_file, "r", encoding='utf-8') as file: input_sentences = tokenizer.segment(file.read()) + predictions = predictor.predict(input_sentences) with open(FLAGS.output_file, "w") as file: - for sentence in tqdm(input_sentences): - prediction = predictor.predict(' '.join(sentence)) + for prediction in tqdm(predictions): file.writelines(api.sentence2conllu(prediction, keep_semrel=dataset_reader.use_sem).serialize()) - predictions.append(prediction) if FLAGS.save_matrices: logger.info("Saving matrices", prefix=prefix) diff --git a/combo/predict.py b/combo/predict.py index 4450d2f33a8881156cb60ccac2bde96903dd477b..8363e50e74638cad4e3751a49baebc03cfb91a9c 100644 --- a/combo/predict.py +++ b/combo/predict.py @@ -74,8 +74,9 @@ class COMBO(PredictorModule): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): if isinstance(sentence, str): - return self.predict_json({"sentence": sentence}) - elif isinstance(sentence, list): + sentence = self.dataset_reader.tokenizer.segment(sentence) + + if isinstance(sentence, list): if len(sentence) == 0: return [] example = sentence[0]