diff --git a/src/lambo/data/token.py b/src/lambo/data/token.py index 737890e6a1618b8cc26a246c60f6ed0414bae765..71cbbe6a97c0ae151ee4f3888f6b256d1027c7e7 100644 --- a/src/lambo/data/token.py +++ b/src/lambo/data/token.py @@ -19,7 +19,7 @@ class Token: self.end = end self.text = text self.is_multi_word = is_multi_word - self.words = [] + self.subwords = [] - def addWord(self,word): - self.words.append(word) + def addSubword(self, subword): + self.subwords.append(subword) diff --git a/src/lambo/subwords/__init__.py b/src/lambo/subwords/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/lambo/subwords/model.py b/src/lambo/subwords/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/lambo/subwords/subber.py b/src/lambo/subwords/subber.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/lambo/subwords/train.py b/src/lambo/subwords/train.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/lambo/utils/ud_reader.py b/src/lambo/utils/ud_reader.py index 9f9febf139bdb5aeccb7128da05b9bb8ea4563d7..99fdbada4f20302500590a8d19389900696d6bba 100644 --- a/src/lambo/utils/ud_reader.py +++ b/src/lambo/utils/ud_reader.py @@ -162,7 +162,7 @@ def read_document(file_path, random_separators): if len(numbers) == 1: if word_range[0] <= numbers[0] <= word_range[1]: # Individual word within multi-word token - lastToken.addWord(form) + lastToken.addSubword(form) else: # Individual word not covered token = Token(current_offset, current_offset + len(form), form, False)