Skip to content
Snippets Groups Projects
Commit fed57c54 authored by Maja Jablonska
Browse files

Correct LAMBO segmentation

parent 17cda2e9
No related branches found
No related tags found
1 merge request: !46 "Merge COMBO 3.0 into master"
...@@ -49,4 +49,3 @@ class LamboTokenizer(Tokenizer): ...@@ -49,4 +49,3 @@ class LamboTokenizer(Tokenizer):
sentences.append([t.text for t in sentence.tokens]) sentences.append([t.text for t in sentence.tokens])
return sentences return sentences
...@@ -72,3 +72,11 @@ class Tokenizer(FromParameters): ...@@ -72,3 +72,11 @@ class Tokenizer(FromParameters):
Returns the number of special tokens added for a pair of sequences. Returns the number of special tokens added for a pair of sequences.
""" """
return 0 return 0
def segment(self, text: str) -> List[List[str]]:
    """
    Full segmentation: turn raw text into sentences, each a list of token strings.

    This default implementation does not inspect ``text`` at all; it always
    yields a single sentence with no tokens.
    NOTE(review): presumably concrete tokenizers override this -- confirm.

    :param text: raw input text (unused by this default implementation)
    :return: a freshly built list containing one empty token list, ``[[]]``
    """
    # Build a new inner list on every call so callers never share state.
    no_tokens: List[str] = []
    return [no_tokens]
...@@ -27,6 +27,8 @@ from combo.modules.model import Model ...@@ -27,6 +27,8 @@ from combo.modules.model import Model
from combo.utils import ConfigurationError from combo.utils import ConfigurationError
from combo.utils.matrices import extract_combo_matrices from combo.utils.matrices import extract_combo_matrices
import codecs
logging.setLoggerClass(ComboLogger) logging.setLoggerClass(ComboLogger)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"] _FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"]
...@@ -383,9 +385,13 @@ def run(_): ...@@ -383,9 +385,13 @@ def run(_):
if FLAGS.input_file == '-': if FLAGS.input_file == '-':
print("Interactive mode.") print("Interactive mode.")
sentence = input("Sentence: ") sentence = input("Sentence: ")
prediction = predictor(sentence) prediction = [p.tokens for p in predictor(sentence)]
# Flatten the prediction
flattened_prediction = []
for p in prediction:
flattened_prediction.extend(p)
print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL')) print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
for token in prediction.tokens: for token in flattened_prediction:
print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head, print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head,
token.deprel)) token.deprel))
elif FLAGS.output_file: elif FLAGS.output_file:
...@@ -410,14 +416,13 @@ def run(_): ...@@ -410,14 +416,13 @@ def run(_):
else: else:
tokenizer = LamboTokenizer(tokenizer_language) tokenizer = LamboTokenizer(tokenizer_language)
with open(FLAGS.input_file, "r") as file: with open(FLAGS.input_file, "r", encoding='utf-8') as file:
input_sentences = tokenizer.segment(file.read()) input_sentences = tokenizer.segment(file.read())
predictions = predictor.predict(input_sentences)
with open(FLAGS.output_file, "w") as file: with open(FLAGS.output_file, "w") as file:
for sentence in tqdm(input_sentences): for prediction in tqdm(predictions):
prediction = predictor.predict(' '.join(sentence))
file.writelines(api.sentence2conllu(prediction, file.writelines(api.sentence2conllu(prediction,
keep_semrel=dataset_reader.use_sem).serialize()) keep_semrel=dataset_reader.use_sem).serialize())
predictions.append(prediction)
if FLAGS.save_matrices: if FLAGS.save_matrices:
logger.info("Saving matrices", prefix=prefix) logger.info("Saving matrices", prefix=prefix)
......
...@@ -74,8 +74,9 @@ class COMBO(PredictorModule): ...@@ -74,8 +74,9 @@ class COMBO(PredictorModule):
def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]): def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
if isinstance(sentence, str): if isinstance(sentence, str):
return self.predict_json({"sentence": sentence}) sentence = self.dataset_reader.tokenizer.segment(sentence)
elif isinstance(sentence, list):
if isinstance(sentence, list):
if len(sentence) == 0: if len(sentence) == 0:
return [] return []
example = sentence[0] example = sentence[0]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment