From 055715ab1cca19a8868942f1f10a002ce14f3126 Mon Sep 17 00:00:00 2001 From: piotrmp <piotr.m.przybyla@gmail.com> Date: Thu, 1 Dec 2022 16:58:04 +0100 Subject: [PATCH] Initial structure for subwords. --- src/lambo/data/token.py | 6 +++--- src/lambo/subwords/__init__.py | 0 src/lambo/subwords/model.py | 0 src/lambo/subwords/subber.py | 0 src/lambo/subwords/train.py | 0 src/lambo/utils/ud_reader.py | 2 +- 6 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 src/lambo/subwords/__init__.py create mode 100644 src/lambo/subwords/model.py create mode 100644 src/lambo/subwords/subber.py create mode 100644 src/lambo/subwords/train.py diff --git a/src/lambo/data/token.py b/src/lambo/data/token.py index 737890e..71cbbe6 100644 --- a/src/lambo/data/token.py +++ b/src/lambo/data/token.py @@ -19,7 +19,7 @@ class Token: self.end = end self.text = text self.is_multi_word = is_multi_word - self.words = [] + self.subwords = [] - def addWord(self,word): - self.words.append(word) + def addSubword(self, subword): + self.subwords.append(subword) diff --git a/src/lambo/subwords/__init__.py b/src/lambo/subwords/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lambo/subwords/model.py b/src/lambo/subwords/model.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lambo/subwords/subber.py b/src/lambo/subwords/subber.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lambo/subwords/train.py b/src/lambo/subwords/train.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lambo/utils/ud_reader.py b/src/lambo/utils/ud_reader.py index 9f9febf..99fdbad 100644 --- a/src/lambo/utils/ud_reader.py +++ b/src/lambo/utils/ud_reader.py @@ -162,7 +162,7 @@ def read_document(file_path, random_separators): if len(numbers) == 1: if word_range[0] <= numbers[0] <= word_range[1]: # Individual word within multi-word token - lastToken.addWord(form) + lastToken.addSubword(form) else: # Individual word not covered token = Token(current_offset, current_offset + len(form), form, False) -- GitLab