Skip to content
Snippets Groups Projects
Commit 055715ab authored by piotrmp
Browse files

Initial structure for subwords.

parent 519a4b36
Branches
Tags
1 merge request: !2 "Multiword generation"
...@@ -19,7 +19,7 @@ class Token: ...@@ -19,7 +19,7 @@ class Token:
self.end = end self.end = end
self.text = text self.text = text
self.is_multi_word = is_multi_word self.is_multi_word = is_multi_word
self.words = [] self.subwords = []
def addWord(self,word): def addSubword(self, subword):
self.words.append(word) self.subwords.append(subword)
...@@ -162,7 +162,7 @@ def read_document(file_path, random_separators): ...@@ -162,7 +162,7 @@ def read_document(file_path, random_separators):
if len(numbers) == 1: if len(numbers) == 1:
if word_range[0] <= numbers[0] <= word_range[1]: if word_range[0] <= numbers[0] <= word_range[1]:
# Individual word within multi-word token # Individual word within multi-word token
lastToken.addWord(form) lastToken.addSubword(form)
else: else:
# Individual word not covered # Individual word not covered
token = Token(current_offset, current_offset + len(form), form, False) token = Token(current_offset, current_offset + len(form), form, False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment