diff --git a/src/lambo/data/token.py b/src/lambo/data/token.py
index d9304b5e275ead2b4d1653a54284f7e8751d8ec6..737890e6a1618b8cc26a246c60f6ed0414bae765 100644
--- a/src/lambo/data/token.py
+++ b/src/lambo/data/token.py
@@ -13,8 +13,13 @@ class Token:
         respect to the document's text
         :param text: text covered by the token
         :param is_multi_word: is this a multi-word token
+        :param words: the words inside (for multi-word tokens)
         """
         self.begin = begin
         self.end = end
         self.text = text
         self.is_multi_word = is_multi_word
+        self.words = []
+
+    def addWord(self, word):
+        self.words.append(word)
diff --git a/src/lambo/utils/printer.py b/src/lambo/utils/printer.py
index 25ccf393e927380f694a37c73179bfeaa6cbf5a7..3de74e70fd900f11af97bd7f143d997332605c1e 100644
--- a/src/lambo/utils/printer.py
+++ b/src/lambo/utils/printer.py
@@ -20,7 +20,7 @@ def print_document_to_screen(document):
         formatted = ''
         for token in sentence.tokens:
             if token.is_multi_word:
-                formatted += '((' + token.text + '))'
+                formatted += '(' + token.text + '=(' + '-'.join(token.words) + '))'
             else:
                 formatted += '(' + token.text + ')'
         print('TOKENS: ' + formatted)
@@ -48,13 +48,13 @@
             token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
             if token_text == '':
                 continue
-            if token.is_multi_word:
+            if token.is_multi_word and len(token.words) > 1:
                 file1.write(str(token_id))
-                file1.write('-' + str(token_id + 1))
+                file1.write('-' + str(token_id + len(token.words) - 1))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                token_id += 2
-                file1.write(str(token_id - 2) + '\t_\t_\t_\t_\t_\t' + str(token_id - 3) + '\t_\t_\t_\n')
-                file1.write(str(token_id - 1) + '\t_\t_\t_\t_\t_\t' + str(token_id - 2) + '\t_\t_\t_\n')
+                for word in token.words:
+                    file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
+                    token_id += 1
             else:
                 file1.write(str(token_id))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
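Note on the printer change: a multi-word token is now written to CoNLL-U as one range line spanning all of its words, followed by one placeholder line per word, instead of the previous hard-coded two-word pair. A minimal sketch of the expected output, assuming a hypothetical German token "im" read with words ['in', 'dem'] that happens to receive token_id 3 (columns are tab-separated):

    3-4	im	_	_	_	_	_	_	_	_
    3	in	_	_	_	_	2	_	_	_
    4	dem	_	_	_	_	3	_	_	_

As in the single-word branch, the HEAD column (token_id - 1) is only a placeholder chain, not a real dependency analysis; the range end is token_id + len(token.words) - 1, so the loop generalizes to any number of component words.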
diff --git a/src/lambo/utils/ud_reader.py b/src/lambo/utils/ud_reader.py
index 1fecc8ed116f44b5b17aa5cb907f5af032d68d00..9f9febf139bdb5aeccb7128da05b9bb8ea4563d7 100644
--- a/src/lambo/utils/ud_reader.py
+++ b/src/lambo/utils/ud_reader.py
@@ -125,9 +125,10 @@ def read_document(file_path, random_separators):
     turn_text = ""
     sentence = Sentence()
     sentence_text = ""
-    banned_range = [0, 0]
+    word_range = [0, 0]
     current_offset = 0
     separator = ''
+    lastToken = None
     for line in file_path.read_text().split('\n'):
         if line.startswith('#'):
             # Comment, ignore
@@ -149,7 +150,7 @@
                 turn.add_sentence(sentence)
                 sentence = Sentence()
                 sentence_text = ""
-                banned_range = [0, 0]
+                word_range = [0, 0]
         else:
             parts = line.split('\t')
             is_copy = any(x.startswith('CopyOf=') for x in parts[-1].split('|')) or ('.' in parts[0])
@@ -159,9 +160,9 @@
             form = parts[1]
             space_after_no = ('SpaceAfter=No' in parts[-1].split('|'))
             if len(numbers) == 1:
-                if banned_range[0] <= numbers[0] <= banned_range[1]:
+                if word_range[0] <= numbers[0] <= word_range[1]:
                     # Individual word within multi-word token
-                    pass
+                    lastToken.addWord(form)
                 else:
                     # Individual word not covered
                     token = Token(current_offset, current_offset + len(form), form, False)
@@ -186,7 +187,8 @@
                     sentence_text += separator
                     current_offset += len(separator)
                 sentence.add_token(token)
-                banned_range = numbers
+                word_range = numbers
+                lastToken = token
     turn.set_text(turn_text)
     document = Document()
     document.set_text(turn_text)
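For reference, how the reader-side pieces fit together, as a minimal sketch in plain Python (hypothetical offsets and forms; Token as defined above). A CoNLL-U range line such as "3-4<TAB>im<TAB>..." creates the multi-word token and sets word_range to [3, 4]; the word lines "3<TAB>in<TAB>..." and "4<TAB>dem<TAB>..." that follow fall inside word_range, so they are attached to lastToken via addWord instead of being dropped by the old pass:

    token = Token(0, 2, 'im', True)  # range line: multi-word token covering 'im'
    token.addWord('in')              # word line with ID 3, inside word_range
    token.addWord('dem')             # word line with ID 4, inside word_range
    assert token.words == ['in', 'dem']

Together with the printer change this round-trips: reading a treebank's multi-word tokens and printing them back reproduces one range line plus one line per component word.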