Skip to content
Snippets Groups Projects
Commit 3b8104e8 authored by Michał Pogoda
Browse files

PEP8

parent e62e9d41
Branches
No related tags found
No related merge requests found
Pipeline #3775 failed
......@@ -3,31 +3,27 @@ import os
class WordSplit(object):
def __init__(self):
    """Create the SymSpell instance and load the word-frequency dictionary.

    The dictionary is read from ``vocab.txt`` (resolved against the current
    working directory), taking the term from column 0 and its count from
    column 1.
    """
    # Edit distance 0: the dictionary is used for exact-match lookups only.
    self.sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
    # NOTE(review): the result of load_dictionary is not checked here;
    # presumably it reports whether the file was found -- confirm and
    # consider failing loudly when the vocabulary is missing.
    self.sym_spell.load_dictionary("vocab.txt", term_index=0, count_index=1)
def load_data(self, path):
    """Read the UTF-8 text file at ``path`` and return its lines.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(path, "r", encoding="utf8") as f:
        return f.readlines()
def save_data(self, path, text):
    """Write ``text`` to the file at ``path`` as UTF-8, replacing its content.

    Args:
        path: Location of the file to create or overwrite.
        text: Iterable of strings written back to back; no line terminators
            are added, so each item must carry its own.

    Returns:
        None (the result of ``writelines``).
    """
    with open(path, "w", encoding="utf8") as f:
        return f.writelines(text)
def handle_word_transfer(self, text):
    """Re-join words that were hyphenated across a line break.

    A line qualifies when it is longer than two characters, its
    second-to-last character is ``-`` (i.e. a hyphen right before the line
    terminator) and it is not the final line.  Its last space-separated
    token is glued to the first token of the following line, with newlines
    and hyphens stripped from the joined word, and that first token is
    removed from the following line.

    Args:
        text: Sequence of lines, typically as returned by ``readlines``.
            The sequence is not modified.

    Returns:
        A new list of lines.  A line that received a re-joined word no
        longer ends with a newline; other lines are passed through (minus
        any leading token that was moved up to the previous line).
    """
    # Work on a copy so the caller's list is left untouched.
    lines = list(text)
    last_index = len(lines) - 1
    clean_text = []
    for counter, line in enumerate(lines):
        if len(line) > 2 and line[-2] == "-" and counter != last_index:
            following = lines[counter + 1].split(" ")
            # rpartition isolates only the trailing token, so an identical
            # token earlier in the line (or an empty trailing token) is not
            # rewritten as str.replace would do.
            head, sep, tail = line.rpartition(" ")
            word = (tail + following[0]).replace("\n", "").replace("-", "")
            clean_text.append(head + sep + word)
            # The first token has been consumed by the line above.
            lines[counter + 1] = " ".join(following[1:])
        else:
            clean_text.append(line)
    return clean_text
......@@ -39,17 +35,21 @@ class WordSplit(object):
os.makedirs(output_file, exist_ok=True)
for file in folder:
text = []
data = self.load_data(input_file+'/'+file)
data = self.load_data(input_file + "/" + file)
for input_term in self.handle_word_transfer(data):
if input_term:
result = self.sym_spell.word_segmentation(input_term, max_segmentation_word_length=32)
text.append(result.corrected_string+'\n')
self.save_data(f'{output_file}/{file}', text)
result = self.sym_spell.word_segmentation(
input_term, max_segmentation_word_length=32
)
text.append(result.corrected_string + "\n")
self.save_data(f"{output_file}/{file}", text)
else:
text = []
data = self.load_data(input_file)
for input_term in self.handle_word_transfer(data):
if input_term:
result = self.sym_spell.word_segmentation(input_term, max_segmentation_word_length=32)
text.append(result.corrected_string + '\n')
result = self.sym_spell.word_segmentation(
input_term, max_segmentation_word_length=32
)
text.append(result.corrected_string + "\n")
self.save_data(output_file, text)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment