Commit fed57c54 authored by Maja Jablonska

Correct LAMBO segmentation

parent 17cda2e9
1 merge request: !46 Merge COMBO 3.0 into master
@@ -49,4 +49,3 @@ class LamboTokenizer(Tokenizer):
             sentences.append([t.text for t in sentence.tokens])
         return sentences
@@ -72,3 +72,11 @@ class Tokenizer(FromParameters):
         Returns the number of special tokens added for a pair of sequences.
         """
         return 0
+
+    def segment(self, text: str) -> List[List[str]]:
+        """
+        Full segmentation - segment text into sentences.
+        :param text: text to segment
+        :return: a list of sentences, each a list of token strings
+        """
+        return [[]]
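
Tokenizer.segment is only a stub here; concrete tokenizers such as LamboTokenizer are expected to override it. A minimal sketch of the expected return shape, using a hypothetical whitespace segmenter rather than the real LamboTokenizer:

from typing import List

class WhitespaceSegmenter:
    """Hypothetical stand-in for a Tokenizer subclass; not part of COMBO."""

    def segment(self, text: str) -> List[List[str]]:
        # Split into sentences on full stops, then into tokens on whitespace,
        # returning the same nested shape as Tokenizer.segment.
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        return [sentence.split() for sentence in sentences]

segmenter = WhitespaceSegmenter()
print(segmenter.segment("Ala ma kota. Kot ma Ale."))
# [['Ala', 'ma', 'kota'], ['Kot', 'ma', 'Ale']]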
@@ -27,6 +27,8 @@ from combo.modules.model import Model
 from combo.utils import ConfigurationError
 from combo.utils.matrices import extract_combo_matrices
+import codecs
+
 logging.setLoggerClass(ComboLogger)
 logger = logging.getLogger(__name__)
 
 _FEATURES = ["token", "char", "upostag", "xpostag", "lemma", "feats"]
@@ -383,9 +385,13 @@ def run(_):
     if FLAGS.input_file == '-':
         print("Interactive mode.")
         sentence = input("Sentence: ")
-        prediction = predictor(sentence)
+        prediction = [p.tokens for p in predictor(sentence)]
+        # Flatten the prediction
+        flattened_prediction = []
+        for p in prediction:
+            flattened_prediction.extend(p)
         print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
-        for token in prediction.tokens:
+        for token in flattened_prediction:
             print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag, token.head,
                                                          token.deprel))
     elif FLAGS.output_file:
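
The interactive branch now flattens the per-sentence token lists before printing. A self-contained sketch of that flattening step, with a hypothetical Token tuple standing in for the predictor's token objects:

from collections import namedtuple

# Hypothetical token type; the real objects come from the COMBO predictor.
Token = namedtuple("Token", ["text", "lemma", "upostag", "head", "deprel"])

prediction = [  # one inner list of tokens per predicted sentence
    [Token("Ala", "Ala", "PROPN", 2, "nsubj"), Token("ma", "mieć", "VERB", 0, "root")],
    [Token("Kot", "kot", "NOUN", 2, "nsubj")],
]

flattened_prediction = []
for p in prediction:
    flattened_prediction.extend(p)

print("{:15} {:15} {:10} {:10} {:10}".format('TOKEN', 'LEMMA', 'UPOS', 'HEAD', 'DEPREL'))
for token in flattened_prediction:
    print("{:15} {:15} {:10} {:10} {:10}".format(token.text, token.lemma, token.upostag,
                                                 token.head, token.deprel))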
@@ -410,14 +416,13 @@ def run(_):
         else:
             tokenizer = LamboTokenizer(tokenizer_language)
-        with open(FLAGS.input_file, "r") as file:
+        with open(FLAGS.input_file, "r", encoding='utf-8') as file:
             input_sentences = tokenizer.segment(file.read())
+        predictions = predictor.predict(input_sentences)
         with open(FLAGS.output_file, "w") as file:
-            for sentence in tqdm(input_sentences):
-                prediction = predictor.predict(' '.join(sentence))
+            for prediction in tqdm(predictions):
                 file.writelines(api.sentence2conllu(prediction,
                                                     keep_semrel=dataset_reader.use_sem).serialize())
-                predictions.append(prediction)
 
     if FLAGS.save_matrices:
         logger.info("Saving matrices", prefix=prefix)
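
The batch branch now reads the input with an explicit utf-8 encoding, segments the whole file once, and makes a single batched predict call instead of one call per sentence. A rough sketch of that control flow, with toy segment/predict stand-ins in place of COMBO's real tokenizer and predictor:

from typing import List

# Hypothetical stand-ins mirroring only the shapes used in run(); not COMBO's real API.
def segment(text: str) -> List[List[str]]:
    return [s.split() for s in text.split(".") if s.strip()]

def predict(sentences: List[List[str]]) -> List[str]:
    return [" ".join(tokens) for tokens in sentences]

with open("input.txt", "w", encoding="utf-8") as file:
    file.write("Ala ma kota. Zażółć gęślą jaźń.")

# Read with an explicit utf-8 encoding (as in the fixed open() call),
# segment once, and run a single batched prediction over all sentences.
with open("input.txt", "r", encoding="utf-8") as file:
    input_sentences = segment(file.read())

predictions = predict(input_sentences)

with open("output.txt", "w", encoding="utf-8") as file:
    for prediction in predictions:
        file.write(prediction + "\n")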
@@ -74,8 +74,9 @@ class COMBO(PredictorModule):
     def predict(self, sentence: Union[str, List[str], List[List[str]], List[data.Sentence]]):
         if isinstance(sentence, str):
-            return self.predict_json({"sentence": sentence})
-        elif isinstance(sentence, list):
+            sentence = self.dataset_reader.tokenizer.segment(sentence)
+
+        if isinstance(sentence, list):
             if len(sentence) == 0:
                 return []
             example = sentence[0]
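
With this change, a raw string passed to predict is first segmented by the dataset reader's tokenizer, and an empty list short-circuits to an empty result. A minimal sketch of that dispatch, using a hypothetical TinyPredictor rather than the real COMBO class:

from typing import List, Union

class TinyPredictor:
    """Hypothetical sketch of the dispatch in COMBO.predict; not the real class."""

    def __init__(self, segment):
        self.segment = segment  # callable: str -> List[List[str]]

    def predict(self, sentence: Union[str, List[List[str]]]):
        if isinstance(sentence, str):
            # Raw text is segmented first, so the list branch below sees one shape.
            sentence = self.segment(sentence)
        if isinstance(sentence, list):
            if len(sentence) == 0:
                return []
            return [{"tokens": tokens} for tokens in sentence]
        raise ValueError("Input must be a string or a list of sentences")

predictor = TinyPredictor(lambda text: [s.split() for s in text.split(".") if s.strip()])
print(predictor.predict("Ala ma kota. Kot ma Ale."))
print(predictor.predict([]))  # empty input short-circuits to []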