Commit 9113ee76 authored by piotrmp

Subwords support in reading, writing and printing.

parent 7f1ed3e2
Merge request: !2 Multiword generation
@@ -13,8 +13,13 @@ class Token:
         respect to the document's text
         :param text: text covered by the token
         :param is_multi_word: is this a multi-word token
+        :param words: the words inside (for multi-word tokens)
         """
         self.begin = begin
         self.end = end
         self.text = text
         self.is_multi_word = is_multi_word
+        self.words = []
+
+    def addWord(self, word):
+        self.words.append(word)
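The new words list lets a multi-word token carry its component words in order. A minimal usage sketch (the Spanish contraction "del" = "de" + "el" is a hypothetical example; the constructor call matches the Token(...) usage in the reader below):

token = Token(0, 3, 'del', True)  # multi-word token covering character offsets 0-3
token.addWord('de')               # component words, added in surface order
token.addWord('el')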
@@ -20,7 +20,7 @@ def print_document_to_screen(document):
         formatted = ''
         for token in sentence.tokens:
             if token.is_multi_word:
-                formatted += '((' + token.text + '))'
+                formatted += '(' + token.text + '=(' + '-'.join(token.words) + '))'
             else:
                 formatted += '(' + token.text + ')'
         print('TOKENS: ' + formatted)
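With this change, a multi-word token is printed together with its subwords. For the hypothetical "del" token above, the output line changes from

TOKENS: ((del))

to

TOKENS: (del=(de-el))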
@@ -48,13 +48,13 @@ def print_document_to_conll(document, path):
             token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
             if token_text == '':
                 continue
-            if token.is_multi_word:
+            if token.is_multi_word and len(token.words) > 1:
                 file1.write(str(token_id))
-                file1.write('-' + str(token_id + 1))
+                file1.write('-' + str(token_id + len(token.words) - 1))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                token_id += 2
-                file1.write(str(token_id - 2) + '\t_\t_\t_\t_\t_\t' + str(token_id - 3) + '\t_\t_\t_\n')
-                file1.write(str(token_id - 1) + '\t_\t_\t_\t_\t_\t' + str(token_id - 2) + '\t_\t_\t_\n')
+                for word in token.words:
+                    file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
+                    token_id += 1
             else:
                 file1.write(str(token_id))
                 file1.write('\t' + token_text + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
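Instead of always emitting exactly two placeholder word lines, the writer now produces a CoNLL-U range line followed by one line per subword. A sketch of the output for the hypothetical "del" token, assuming token_id is 1 when it is reached (ten tab-separated columns; the code fills only ID, FORM and HEAD):

1-2	del	_	_	_	_	_	_	_	_
1	de	_	_	_	_	0	_	_	_
2	el	_	_	_	_	1	_	_	_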
@@ -125,9 +125,10 @@ def read_document(file_path, random_separators):
     turn_text = ""
     sentence = Sentence()
     sentence_text = ""
-    banned_range = [0, 0]
+    word_range = [0, 0]
     current_offset = 0
     separator = ''
+    lastToken = None
     for line in file_path.read_text().split('\n'):
         if line.startswith('#'):
             # Comment, ignore
@@ -149,7 +150,7 @@ def read_document(file_path, random_separators):
             turn.add_sentence(sentence)
             sentence = Sentence()
             sentence_text = ""
-            banned_range = [0, 0]
+            word_range = [0, 0]
         else:
             parts = line.split('\t')
             is_copy = any(x.startswith('CopyOf=') for x in parts[-1].split('|')) or ('.' in parts[0])
@@ -159,9 +160,9 @@ def read_document(file_path, random_separators):
             form = parts[1]
             space_after_no = ('SpaceAfter=No' in parts[-1].split('|'))
             if len(numbers) == 1:
-                if banned_range[0] <= numbers[0] <= banned_range[1]:
+                if word_range[0] <= numbers[0] <= word_range[1]:
                     # Individual word within multi-word token
-                    pass
+                    lastToken.addWord(form)
                 else:
                     # Individual word not covered
                     token = Token(current_offset, current_offset + len(form), form, False)
@@ -186,7 +187,8 @@ def read_document(file_path, random_separators):
                     sentence_text += separator
                     current_offset += len(separator)
                 sentence.add_token(token)
-                banned_range = numbers
+                word_range = numbers
+                lastToken = token
     turn.set_text(turn_text)
     document = Document()
     document.set_text(turn_text)
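Reading mirrors writing: a range line such as 3-4 creates the multi-word Token, records its span in word_range and remembers it as lastToken; each following line whose single ID falls inside that range is attached via lastToken.addWord(form). A hypothetical input fragment that yields a 'del' token with words == ['de', 'el']:

3-4	del	_	_	_	_	_	_	_	_
3	de	_	_	_	_	_	_	_	_
4	el	_	_	_	_	_	_	_	_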