Commit 9113ee76 authored by piotrmp

Subwords support in reading, writing and printing.

parent 7f1ed3e2
1 merge request: !2 Multiword generation
@@ -13,8 +13,13 @@ class Token:
         respect to the document's text
         :param text: text covered by the token
         :param is_multi_word: is this a multi-word token
+        :param words: the words inside (for multi-word tokens)
         """
         self.begin = begin
         self.end = end
         self.text = text
         self.is_multi_word = is_multi_word
+        self.words = []
+
+    def addWord(self,word):
+        self.words.append(word)
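
A minimal usage sketch of the extended Token API (the constructor arguments follow the signature used elsewhere in this commit; the surface form 'des' and its parts 'de'/'les' are made-up example values):

    # hypothetical multi-word token: surface "des" made of the words "de" + "les"
    token = Token(0, 3, 'des', True)
    token.addWord('de')
    token.addWord('les')
    print(token.words)  # ['de', 'les']
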
@@ -20,7 +20,7 @@ def print_document_to_screen(document):
             formatted = ''
             for token in sentence.tokens:
                 if token.is_multi_word:
-                    formatted += '((' + token.text + '))'
+                    formatted += '(' + token.text + '=(' + '-'.join(token.words) + '))'
                 else:
                     formatted += '(' + token.text + ')'
             print('TOKENS: ' + formatted)
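
With this change a multi-word token is printed with its parts next to the surface form. For the hypothetical token above, the screen output would contain roughly:

    TOKENS: ...(des=(de-les))...

Single-word tokens keep the old (text) form.
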
@@ -48,13 +48,13 @@ def print_document_to_conll(document, path):
                 token_text = token_text_with_whitespace_for_conllu(token, document, turn, sentence).strip()
                 if token_text == '':
                     continue
-                if token.is_multi_word:
+                if token.is_multi_word and len(token.words) > 1:
                     file1.write(str(token_id))
-                    file1.write('-' + str(token_id + 1))
+                    file1.write('-' + str(token_id + len(token.words) - 1))
                     file1.write('\t' + token_text + '\t_\t_\t_\t_\t_\t_\t_\t_\n')
-                    token_id += 2
-                    file1.write(str(token_id - 2) + '\t_\t_\t_\t_\t_\t' + str(token_id - 3) + '\t_\t_\t_\n')
-                    file1.write(str(token_id - 1) + '\t_\t_\t_\t_\t_\t' + str(token_id - 2) + '\t_\t_\t_\n')
+                    for word in token.words:
+                        file1.write(str(token_id) + '\t' + word + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
+                        token_id += 1
                 else:
                     file1.write(str(token_id))
                     file1.write('\t' + token_text + '\t_\t_\t_\t_\t' + str(token_id - 1) + '\t_\t_\t_\n')
......
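
For the same hypothetical token, the CoNLL-U writer now emits one range line for the surface form followed by one line per word (tab-separated; columns this code does not fill stay as underscores, and HEAD is simply the previous id, as in the code above). Assuming the token happens to start at id 3:

    3-4   des   _   _   _   _   _   _   _   _
    3     de    _   _   _   _   2   _   _   _
    4     les   _   _   _   _   3   _   _   _
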
@@ -125,9 +125,10 @@ def read_document(file_path, random_separators):
     turn_text = ""
     sentence = Sentence()
     sentence_text = ""
-    banned_range = [0, 0]
+    word_range = [0, 0]
     current_offset = 0
     separator = ''
+    lastToken = None
     for line in file_path.read_text().split('\n'):
         if line.startswith('#'):
             # Comment, ignore
@@ -149,7 +150,7 @@ def read_document(file_path, random_separators):
                 turn.add_sentence(sentence)
                 sentence = Sentence()
                 sentence_text = ""
-                banned_range = [0, 0]
+                word_range = [0, 0]
         else:
             parts = line.split('\t')
             is_copy = any(x.startswith('CopyOf=') for x in parts[-1].split('|')) or ('.' in parts[0])
@@ -159,9 +160,9 @@ def read_document(file_path, random_separators):
             form = parts[1]
             space_after_no = ('SpaceAfter=No' in parts[-1].split('|'))
             if len(numbers) == 1:
-                if banned_range[0] <= numbers[0] <= banned_range[1]:
+                if word_range[0] <= numbers[0] <= word_range[1]:
                     # Individual word within multi-word token
-                    pass
+                    lastToken.addWord(form)
                 else:
                     # Individual word not covered
                     token = Token(current_offset, current_offset + len(form), form, False)
@@ -186,7 +187,8 @@ def read_document(file_path, random_separators):
                 sentence_text += separator
                 current_offset += len(separator)
             sentence.add_token(token)
-            banned_range = numbers
+            word_range = numbers
+            lastToken = token
     turn.set_text(turn_text)
     document = Document()
     document.set_text(turn_text)
......
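
On the reading side, the renamed word_range plus the new lastToken reference mean that a fragment like the one below (a made-up input, tab-separated) is collapsed into a single multi-word Token, as far as the visible hunks show: the 3-4 range line produces the token itself and updates word_range and lastToken, while the following lines with ids 3 and 4 fall inside word_range, so their forms are attached via lastToken.addWord() instead of becoming separate tokens.

    3-4   des   _   _   _   _   _   _   _   _
    3     de    _   _   _   _   _   2   _   _
    4     les   _   _   _   _   _   2   _   _
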