From 369e1b6c8d17c9cc5c203eb6e28d66b688520b69 Mon Sep 17 00:00:00 2001 From: Mateusz Gniewkowski <mateusz.gniewkowski@pwr.edu.pl> Date: Wed, 2 Sep 2020 13:23:20 +0200 Subject: [PATCH] second optimization --- Dockerfile | 3 +- resp-cmd.ini | 2 +- .../table/processors/remove_substrings.py | 65 +++++++++++++++++++ 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 65b248d..60a83f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list COPY resp-cmd.ini resp-cmd.ini COPY p4.ccl p4.ccl COPY start.sh start.sh - - +COPY text.ccl text.ccl diff --git a/resp-cmd.ini b/resp-cmd.ini index b37f215..35c4b08 100644 --- a/resp-cmd.ini +++ b/resp-cmd.ini @@ -77,7 +77,7 @@ filename = resp.tsv [resp.documents] extractors = sin_ext -documents = p4.ccl +documents = text.ccl tagset = nkjp document_id_generator = docid diff --git a/resp/resp/data_structures/table/processors/remove_substrings.py b/resp/resp/data_structures/table/processors/remove_substrings.py index b203630..0ecd176 100644 --- a/resp/resp/data_structures/table/processors/remove_substrings.py +++ b/resp/resp/data_structures/table/processors/remove_substrings.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division import logging +from collections import defaultdict from ...base import DataStructureProcessorBase from ....utils.config import fields as conff @@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase): # storage is not used here removed_idxs = set() + index = defaultdict(list) + for idx, rec in enumerate(data_structure): + lemmas = rec.lemma.split() + for lemma in lemmas: + index[lemma].append((idx, rec)) + + for lemma, recs in index.iteritems(): + if len(recs) > 10: + continue + for idx1, rec1 in recs: + if idx1 in removed_idxs: + continue + + next_idx = idx1 + 1 + gval1 = (rec1[self.group_by] + if self.group_by is not None + else None) + + for idx2, rec2 in recs: + if idx2 in removed_idxs: + continue + + gval2 = (rec2[self.group_by] + if self.group_by is not None + else None) + + # Do not compare substrings from different groups (if grouping + # is active) + if gval1 != gval2: + continue + + for kname in self.key_columns: + kval1 = rec1[kname] + kval2 = rec2[kname] + + # Do not remove complete duplicates + if kval1 == kval2: + continue + + if kval1 in kval2: + _log.debug( + 'Removing “%s” ∈ “%s”', + UStr(kval1), + UStr(kval2), + ) + removed_idxs.add(idx1) + break + elif kval2 in kval1: + _log.debug( + 'Removing “%s” ∈ “%s”', + UStr(kval2), + UStr(kval1), + ) + removed_idxs.add(idx2) + break + + data_structure.remove_rows(*removed_idxs) + _log.debug('Done removing sub-strings') + + def process_old(self, data_structure, storage): + _log.debug('Removing sub-strings on key columns %r', self.key_columns) + # storage is not used here + removed_idxs = set() + for idx1, rec1 in enumerate(data_structure): if idx1 in removed_idxs: continue -- GitLab