Commit 9113ee76 authored by piotrmp

Subwords support in reading, writing and printing.

parent 7f1ed3e2
Merge request: !2 Multiword generation
@@ -13,8 +13,13 @@ class Token:
         respect to the document's text
         :param text: text covered by the token
         :param is_multi_word: is this a multi-word token
+        :param words: the words inside (for multi-word tokens)
         """
         self.begin = begin
         self.end = end
         self.text = text
         self.is_multi_word = is_multi_word
+        self.words = []
+
+    def addWord(self, word):
+        self.words.append(word)
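The new words list lets a multi-word token carry its component words in order. A minimal usage sketch (the Spanish contraction "del" = "de" + "el" is a hypothetical example; the constructor call matches the Token(...) usage in the reader below):

token = Token(0, 3, 'del', True)  # multi-word token covering character offsets 0-3
token.addWord('de')               # component words, added in surface order
token.addWord('el')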
@@ -20,7 +20,7 @@ def print_document_to_screen(document):
         formatted = ''
         for token in sentence.tokens:
             if token.is_multi_word:
-                formatted += '((' + token.text + '))'
+                formatted += '(' + token.text + '=(' + '-'.join(token.words) + '))'
             else:
                 formatted += '(' + token.text + ')'
         print('TOKENS: ' + formatted)
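With this change, a multi-word token is printed together with its subwords. For the hypothetical "del" token above, the output line changes from

TOKENS: ((del))

to

TOKENS: (del=(de-el))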
@@ -48,13 +48,13 @@ def print_document_to_conll(document, path):
             token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
             if token_text == '':
                 continue
-            if token.is_multi_word:
+            if token.is_multi_word and len(token.words) > 1:
                 file1.write(str(token_id))
-                file1.write('-' + str(token_id + 1))
+                file1.write('-' + str(token_id + len(token.words) - 1))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                token_id += 2
-                file1.write(str(token_id - 2) + '\t_\t_\t_\t_\t_\t' + str(token_id - 3) + '\t_\t_\t_\n')
-                file1.write(str(token_id - 1) + '\t_\t_\t_\t_\t_\t' + str(token_id - 2) + '\t_\t_\t_\n')
+                for word in token.words:
+                    file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
+                    token_id += 1
             else:
                 file1.write(str(token_id))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
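Instead of always emitting exactly two placeholder word lines, the writer now produces a CoNLL-U range line followed by one line per subword. A sketch of the output for the hypothetical "del" token, assuming token_id is 1 when it is reached (ten tab-separated columns; the code fills only ID, FORM and HEAD):

1-2	del	_	_	_	_	_	_	_	_
1	de	_	_	_	_	0	_	_	_
2	el	_	_	_	_	1	_	_	_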
@@ -125,9 +125,10 @@ def read_document(file_path, random_separators):
     turn_text = ""
     sentence = Sentence()
     sentence_text = ""
-    banned_range = [0, 0]
+    word_range = [0, 0]
     current_offset = 0
     separator = ''
+    lastToken = None
     for line in file_path.read_text().split('\n'):
         if line.startswith('#'):
             # Comment, ignore
@@ -149,7 +150,7 @@ def read_document(file_path, random_separators):
             turn.add_sentence(sentence)
             sentence = Sentence()
             sentence_text = ""
-            banned_range = [0, 0]
+            word_range = [0, 0]
         else:
             parts = line.split('\t')
             is_copy = any(x.startswith('CopyOf=') for x in parts[-1].split('|')) or ('.' in parts[0])
@@ -159,9 +160,9 @@ def read_document(file_path, random_separators):
             form = parts[1]
             space_after_no = ('SpaceAfter=No' in parts[-1].split('|'))
             if len(numbers) == 1:
-                if banned_range[0] <= numbers[0] <= banned_range[1]:
+                if word_range[0] <= numbers[0] <= word_range[1]:
                     # Individual word within multi-word token
-                    pass
+                    lastToken.addWord(form)
                 else:
                     # Individual word not covered
                     token = Token(current_offset, current_offset + len(form), form, False)
@@ -186,7 +187,8 @@ def read_document(file_path, random_separators):
                     sentence_text += separator
                     current_offset += len(separator)
                 sentence.add_token(token)
-                banned_range = numbers
+                word_range = numbers
+                lastToken = token
     turn.set_text(turn_text)
     document = Document()
     document.set_text(turn_text)
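Reading mirrors writing: a range line such as 3-4 creates the multi-word Token, records its span in word_range and remembers it as lastToken; each following line whose single ID falls inside that range is attached via lastToken.addWord(form). A hypothetical input fragment that yields a 'del' token with words == ['de', 'el']:

3-4	del	_	_	_	_	_	_	_	_
3	de	_	_	_	_	_	_	_	_
4	el	_	_	_	_	_	_	_	_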