Commit fed57c54 authored by Maja Jablonska

Correct LAMBO segmentation

parent 17cda2e9
1 merge request: !46 Merge COMBO 3.0 into master
@@ -49,4 +49,3 @@ class LamboTokenizer(Tokenizer):
             sentences.append([t.text for t in sentence.tokens])
         return sentences
@@ -72,3 +72,11 @@ class Tokenizer(FromParameters):
         Returns the number of special tokens added for a pair of sequences.
         """
         return 0
+
+    def segment(self, text: str) -> List[List[str]]:
+        """
+        Full segmentation - segment text into sentences.
+        :param text: text to segment
+        :return: a list of sentences, each a list of token strings
+        """
+        return [[]]
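
Tokenizer.segment is only a stub here; concrete tokenizers such as LamboTokenizer are expected to override it. A minimal sketch of the expected return shape, using a hypothetical whitespace segmenter rather than the real LamboTokenizer:

from typing import List

class WhitespaceSegmenter:
    """Hypothetical stand-in for a Tokenizer subclass; not part of COMBO."""

    def segment(self, text: str) -> List[List[str]]:
        # Split into sentences on full stops, then into tokens on whitespace,
        # returning the same nested shape as Tokenizer.segment.
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        return [sentence.split() for sentence in sentences]

segmenter = WhitespaceSegmenter()
print(segmenter.segment("Ala ma kota. Kot ma Ale."))
# [['Ala', 'ma', 'kota'], ['Kot', 'ma', 'Ale']]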
@@ -27,6 +27,8 @@ from combo.modules.model import Model
 from combo.utils import ConfigurationError
 from combo.utils.matrices import extract_combo_matrices
+import codecs
+
 logging.setLoggerClass(ComboLogger)
 logger = logging.getLogger(__name__)
 
 _FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"]
@@ -383,9 +385,13 @@ def run(_):
     if FLAGS.input_file == '-':
         print("Interactive mode.")
         sentence = input("Sentence: ")
-        prediction = predictor(sentence)
+        prediction = [p.tokens for p in predictor(sentence)]
+        # Flatten the prediction
+        flattened_prediction = []
+        for p in prediction:
+            flattened_prediction.extend(p)
         print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
-        for token in prediction.tokens:
+        for token in flattened_prediction:
             print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head,
                                                          token.deprel))
     elif FLAGS.output_file:
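
The interactive branch now flattens the per-sentence token lists before printing. A self-contained sketch of that flattening step, with a hypothetical Token tuple standing in for the predictor's token objects:

from collections import namedtuple

# Hypothetical token type; the real objects come from the COMBO predictor.
Token = namedtuple("Token", ["text", "lemma", "upostag", "head", "deprel"])

prediction = [  # one inner list of tokens per predicted sentence
    [Token("Ala", "Ala", "PROPN", 2, "nsubj"), Token("ma", "mieć", "VERB", 0, "root")],
    [Token("Kot", "kot", "NOUN", 2, "nsubj")],
]

flattened_prediction = []
for p in prediction:
    flattened_prediction.extend(p)

print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
for token in flattened_prediction:
    print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag,
                                                 token.head, token.deprel))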
@@ -410,14 +416,13 @@ def run(_):
         else:
             tokenizer = LamboTokenizer(tokenizer_language)
-        with open(FLAGS.input_file, "r") as file:
+        with open(FLAGS.input_file, "r", encoding='utf-8') as file:
             input_sentences = tokenizer.segment(file.read())
+        predictions = predictor.predict(input_sentences)
         with open(FLAGS.output_file, "w") as file:
-            for sentence in tqdm(input_sentences):
-                prediction = predictor.predict(' '.join(sentence))
+            for prediction in tqdm(predictions):
                 file.writelines(api.sentence2conllu(prediction,
                                                     keep_semrel=dataset_reader.use_sem).serialize())
-                predictions.append(prediction)
 
     if FLAGS.save_matrices:
         logger.info("Saving matrices", prefix=prefix)
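
The batch branch now reads the input with an explicit utf-8 encoding, segments the whole file once, and makes a single batched predict call instead of one call per sentence. A rough sketch of that control flow, with toy segment/predict stand-ins in place of COMBO's real tokenizer and predictor:

from typing import List

# Hypothetical stand-ins mirroring only the shapes used in run(); not COMBO's real API.
def segment(text: str) -> List[List[str]]:
    return [s.split() for s in text.split(".") if s.strip()]

def predict(sentences: List[List[str]]) -> List[str]:
    return [" ".join(tokens) for tokens in sentences]

with open("input.txt", "w", encoding="utf-8") as file:
    file.write("Ala ma kota. Zażółć gęślą jaźń.")

# Read with an explicit utf-8 encoding (as in the fixed open() call),
# segment once, and run a single batched prediction over all sentences.
with open("input.txt", "r", encoding="utf-8") as file:
    input_sentences = segment(file.read())

predictions = predict(input_sentences)

with open("output.txt", "w", encoding="utf-8") as file:
    for prediction in predictions:
        file.write(prediction + "\n")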
@@ -74,8 +74,9 @@ class COMBO(PredictorModule):
     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
-            return self.predict_json({"sentence": sentence})
-        elif isinstance(sentence, list):
+            sentence = self.dataset_reader.tokenizer.segment(sentence)
+
+        if isinstance(sentence, list):
             if len(sentence) == 0:
                 return []
             example = sentence[0]
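
With this change, a raw string passed to predict is first segmented by the dataset reader's tokenizer, and an empty list short-circuits to an empty result. A minimal sketch of that dispatch, using a hypothetical TinyPredictor rather than the real COMBO class:

from typing import List, Union

class TinyPredictor:
    """Hypothetical sketch of the dispatch in COMBO.predict; not the real class."""

    def __init__(self, segment):
        self.segment = segment  # callable: str -> List[List[str]]

    def predict(self, sentence: Union[str, List[List[str]]]):
        if isinstance(sentence, str):
            # Raw text is segmented first, so the list branch below sees one shape.
            sentence = self.segment(sentence)
        if isinstance(sentence, list):
            if len(sentence) == 0:
                return []
            return [{"tokens": tokens} for tokens in sentence]
        raise ValueError("Input must be a string or a list of sentences")

predictor = TinyPredictor(lambda text: [s.split() for s in text.split(".") if s.strip()])
print(predictor.predict("Ala ma kota. Kot ma Ale."))
print(predictor.predict([]))  # empty input short-circuits to []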