Skip to content
Snippets Groups Projects
Commit 25ccba60 authored by Martyna Wiącek's avatar Martyna Wiącek
Browse files

Fixed multiword prediction + bug that made the code write empty predictions

parent 7a95b22c
No related branches found
No related tags found
1 merge request: !47 "Fixed multiword prediction + bug that made the code write empty predictions"
...@@ -28,7 +28,7 @@ def _sentence_tokens(token: Token, ...@@ -28,7 +28,7 @@ def _sentence_tokens(token: Token,
split_subwords: Optional[bool] = None) -> List[Token]: split_subwords: Optional[bool] = None) -> List[Token]:
if split_subwords and len(token.subwords) > 0: if split_subwords and len(token.subwords) > 0:
subword_idxs = [next(_token_idx()) for _ in range(len(token.subwords))] subword_idxs = [next(_token_idx()) for _ in range(len(token.subwords))]
multiword = (token.text, (subword_idxs[0], subword_idxs[1])) multiword = (token.text, (subword_idxs[0], subword_idxs[-1]))
tokens = [Token(idx=s_idx, text=subword, multiword=multiword) for (s_idx, subword) tokens = [Token(idx=s_idx, text=subword, multiword=multiword) for (s_idx, subword)
in zip(subword_idxs, token.subwords)] in zip(subword_idxs, token.subwords)]
return tokens return tokens
...@@ -74,12 +74,14 @@ class LamboTokenizer(Tokenizer): ...@@ -74,12 +74,14 @@ class LamboTokenizer(Tokenizer):
for turn in document.turns: for turn in document.turns:
sentence_tokens = [] sentence_tokens = []
for sentence in turn.sentences: for sentence in turn.sentences:
_reset_idx()
for token in sentence.tokens: for token in sentence.tokens:
sentence_tokens.extend(_sentence_tokens(token, split_subwords)) sentence_tokens.extend(_sentence_tokens(token, split_subwords))
tokens.append(sentence_tokens) tokens.append(sentence_tokens)
elif split_level.upper() == "SENTENCE": elif split_level.upper() == "SENTENCE":
for turn in document.turns: for turn in document.turns:
for sentence in turn.sentences: for sentence in turn.sentences:
_reset_idx()
sentence_tokens = [] sentence_tokens = []
for token in sentence.tokens: for token in sentence.tokens:
sentence_tokens.extend(_sentence_tokens(token, split_subwords)) sentence_tokens.extend(_sentence_tokens(token, split_subwords))
...@@ -87,6 +89,7 @@ class LamboTokenizer(Tokenizer): ...@@ -87,6 +89,7 @@ class LamboTokenizer(Tokenizer):
else: else:
for turn in document.turns: for turn in document.turns:
for sentence in turn.sentences: for sentence in turn.sentences:
_reset_idx()
for token in sentence.tokens: for token in sentence.tokens:
tokens.extend(_sentence_tokens(token, split_subwords)) tokens.extend(_sentence_tokens(token, split_subwords))
tokens = [tokens] tokens = [tokens]
...@@ -116,13 +119,32 @@ class LamboTokenizer(Tokenizer): ...@@ -116,13 +119,32 @@ class LamboTokenizer(Tokenizer):
if turns: if turns:
sentence_tokens = [] sentence_tokens = []
for sentence in turn.sentences: for sentence in turn.sentences:
_reset_idx()
if not turns: if not turns:
sentence_tokens = [] sentence_tokens = []
for token in sentence.tokens: for token in sentence.tokens:
if len(token.subwords) > 0 and split_subwords: if len(token.subwords) > 0 and split_subwords:
sentence_tokens.extend([s for s in token.subwords]) # @TODO this is a very dirty fix for Lambo model's shortcomings
# I noticed that for longer words with multiwords it tends to remove the last letter in the last multiword
# so this is a quick workaround to fix it
# check if subwords in token.subwords are consistent with token.text
if "".join(token.subwords) != token.text:
fixed_subwords = []
text_it = 0
for i, subword in enumerate(token.subwords):
if token.text[text_it:text_it + len(subword)] == subword:
if i == len(token.subwords) - 1 and (text_it + len(subword) < len(token.text)):
subword = token.text[text_it:]
fixed_subwords.append(subword)
text_it += len(subword)
else: else:
sentence_tokens.append(token.text) fixed_subwords.append(token.text[text_it:text_it + len(subword)])
text_it += len(subword)
token.subwords = fixed_subwords
# sentence_tokens.extend(_sentence_tokens(token, split_subwords))
# else:
sentence_tokens.extend(_sentence_tokens(token, split_subwords))
if not turns: if not turns:
sentences.append(sentence_tokens) sentences.append(sentence_tokens)
if turns: if turns:
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment