Skip to content
Snippets Groups Projects
Commit 369e1b6c authored by Mateusz Gniewkowski
Browse files

second optimization

parent 15b207b4
No related branches found
No related tags found
No related merge requests found
...@@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list ...@@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list
COPY resp-cmd.ini resp-cmd.ini COPY resp-cmd.ini resp-cmd.ini
COPY p4.ccl p4.ccl COPY p4.ccl p4.ccl
COPY start.sh start.sh COPY start.sh start.sh
COPY text.ccl text.ccl
...@@ -77,7 +77,7 @@ filename = resp.tsv ...@@ -77,7 +77,7 @@ filename = resp.tsv
[resp.documents] [resp.documents]
extractors = sin_ext extractors = sin_ext
documents = p4.ccl documents = text.ccl
tagset = nkjp tagset = nkjp
document_id_generator = docid document_id_generator = docid
......
...@@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division ...@@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division
import logging import logging
from collections import defaultdict
from ...base import DataStructureProcessorBase from ...base import DataStructureProcessorBase
from ....utils.config import fields as conff from ....utils.config import fields as conff
...@@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase): ...@@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase):
# storage is not used here # storage is not used here
removed_idxs = set() removed_idxs = set()
index = defaultdict(list)
for idx, rec in enumerate(data_structure):
lemmas = rec.lemma.split()
for lemma in lemmas:
index[lemma].append((idx, rec))
for lemma, recs in index.iteritems():
if len(recs) > 10:
continue
for idx1, rec1 in recs:
if idx1 in removed_idxs:
continue
next_idx = idx1 + 1
gval1 = (rec1[self.group_by]
if self.group_by is not None
else None)
for idx2, rec2 in recs:
if idx2 in removed_idxs:
continue
gval2 = (rec2[self.group_by]
if self.group_by is not None
else None)
# Do not compare substrings from different groups (if grouping
# is active)
if gval1 != gval2:
continue
for kname in self.key_columns:
kval1 = rec1[kname]
kval2 = rec2[kname]
# Do not remove complete duplicates
if kval1 == kval2:
continue
if kval1 in kval2:
_log.debug(
'Removing “%s” ∈ “%s”',
UStr(kval1),
UStr(kval2),
)
removed_idxs.add(idx1)
break
elif kval2 in kval1:
_log.debug(
'Removing “%s” ∈ “%s”',
UStr(kval2),
UStr(kval1),
)
removed_idxs.add(idx2)
break
data_structure.remove_rows(*removed_idxs)
_log.debug('Done removing sub-strings')
def process_old(self, data_structure, storage):
_log.debug('Removing sub-strings on key columns %r', self.key_columns)
# storage is not used here
removed_idxs = set()
for idx1, rec1 in enumerate(data_structure): for idx1, rec1 in enumerate(data_structure):
if idx1 in removed_idxs: if idx1 in removed_idxs:
continue continue
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment