From 9113ee76126e18cdf87176cd0cc6d7ad81ab85c4 Mon Sep 17 00:00:00 2001
From: piotrmp <piotr.m.przybyla@gmail.com>
Date: Thu, 1 Dec 2022 16:33:13 +0100
Subject: [PATCH] Subword support in reading, writing and printing.

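Each Token now keeps the list of its component words, populated via
add_word(). The screen printer shows these words next to the token text,
the CoNLL-U printer derives the id range of a multi-word token from the
actual number of words and emits one line per word, and the UD reader
attaches every word it reads to the covering multi-word token.

Illustrative output for a multi-word token (e.g. Italian "della" =
"di" + "la"; ids depend on the token's position, and columns are
tab-separated in the actual file):

    TOKENS: ...(della=(di-la))...

    2-3  della  _  _  _  _  _  _  _  _
    2    di     _  _  _  _  1  _  _  _
    3    la     _  _  _  _  2  _  _  _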
---
 src/lambo/data/token.py      |  6 ++++++
 src/lambo/utils/printer.py   | 14 ++++++++------
 src/lambo/utils/ud_reader.py | 12 +++++++-----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/lambo/data/token.py b/src/lambo/data/token.py
index d9304b5..737890e 100644
--- a/src/lambo/data/token.py
+++ b/src/lambo/data/token.py
@@ -13,8 +13,14 @@ class Token:
         respect to the document's text
         :param text: text covered by the token
         :param is_multi_word: is this a multi-word token
+        Multi-word tokens also carry a ``words`` list, filled via add_word().
         """
         self.begin = begin
         self.end = end
         self.text = text
         self.is_multi_word = is_multi_word
+        self.words = []
+
+    def add_word(self, word):
+        """Append a component word to this multi-word token."""
+        self.words.append(word)
diff --git a/src/lambo/utils/printer.py b/src/lambo/utils/printer.py
index 25ccf39..3de74e7 100644
--- a/src/lambo/utils/printer.py
+++ b/src/lambo/utils/printer.py
@@ -20,7 +20,7 @@ def print_document_to_screen(document):
             formatted = ''
             for token in sentence.tokens:
                 if token.is_multi_word:
-                    formatted += '((' + token.text + '))'
+                    formatted += '(' + token.text + '=(' + '-'.join(token.words) + '))'
                 else:
                     formatted += '(' + token.text + ')'
             print('TOKENS: ' + formatted)
@@ -48,13 +48,15 @@ def print_document_to_conll(document, path):
                     token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
                     if token_text == '':
                         continue
-                    if token.is_multi_word:
+                    if token.is_multi_word and len(token.words) > 1:
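+                        # CoNLL-U range line: "first-last" ids covering the words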
                         file1.write(str(token_id))
-                        file1.write('-' + str(token_id + 1))
+                        file1.write('-' + str(token_id + len(token.words) - 1))
                         file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                        token_id += 2
-                        file1.write(str(token_id - 2) + '\t_\t_\t_\t_\t_\t' + str(token_id - 3) + '\t_\t_\t_\n')
-                        file1.write(str(token_id - 1) + '\t_\t_\t_\t_\t_\t' + str(token_id - 2) + '\t_\t_\t_\n')
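+                        # then one line per component word (placeholder columns)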
+                        for word in token.words:
+                            file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
+                            token_id += 1
                     else:
                         file1.write(str(token_id))
                         file1.write('\t' + token_text + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
diff --git a/src/lambo/utils/ud_reader.py b/src/lambo/utils/ud_reader.py
index 1fecc8e..9f9febf 100644
--- a/src/lambo/utils/ud_reader.py
+++ b/src/lambo/utils/ud_reader.py
@@ -125,9 +125,10 @@ def read_document(file_path, random_separators):
     turn_text = ""
     sentence = Sentence()
     sentence_text = ""
-    banned_range = [0, 0]
+    word_range = [0, 0]
     current_offset = 0
     separator = ''
+    last_token = None
     for line in file_path.read_text().split('\n'):
         if line.startswith('#'):
             # Comment, ignore
@@ -149,7 +150,7 @@ def read_document(file_path, random_separators):
             turn.add_sentence(sentence)
             sentence = Sentence()
             sentence_text = ""
-            banned_range = [0, 0]
+            word_range = [0, 0]
         else:
             parts = line.split('\t')
             is_copy = any(x.startswith('CopyOf=') for x in parts[-1].split('|')) or ('.' in parts[0])
@@ -159,9 +160,9 @@ def read_document(file_path, random_separators):
             form = parts[1]
             space_after_no = ('SpaceAfter=No' in parts[-1].split('|'))
             if len(numbers) == 1:
-                if banned_range[0] <= numbers[0] <= banned_range[1]:
+                if word_range[0] <= numbers[0] <= word_range[1]:
                     # Individual word within multi-word token
-                    pass
+                    last_token.add_word(form)
                 else:
                     # Individual word not covered
                     token = Token(current_offset, current_offset + len(form), form, False)
@@ -186,7 +187,8 @@ def read_document(file_path, random_separators):
                 sentence_text += separator
                 current_offset += len(separator)
                 sentence.add_token(token)
-                banned_range = numbers
+                word_range = numbers
+                last_token = token
     turn.set_text(turn_text)
     document = Document()
     document.set_text(turn_text)
-- 
GitLab