second optimization

369e1b6c · Mateusz Gniewkowski · 15b207b4
Commit 369e1b6c authored Sep 2, 2020 by Mateusz Gniewkowski
--- a/Dockerfile
+++ b/Dockerfile
@@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list
 COPY resp-cmd.ini resp-cmd.ini
 COPY p4.ccl p4.ccl
 COPY start.sh start.sh
+COPY text.ccl text.ccl
--- a/resp-cmd.ini
+++ b/resp-cmd.ini
@@ -77,7 +77,7 @@ filename = resp.tsv
 [resp.documents]
 extractors = sin_ext
-documents = p4.ccl
+documents = text.ccl
 tagset = nkjp
 document_id_generator = docid

--- a/resp/resp/data_structures/table/processors/remove_substrings.py
+++ b/resp/resp/data_structures/table/processors/remove_substrings.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division
 import logging
+from collections import defaultdict
 from ...base import DataStructureProcessorBase
 from ....utils.config import fields as conff
@@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase):
        # storage is not used here
        removed_idxs = set()
+	index = defaultdict(list)
+	for idx, rec in enumerate(data_structure):			
+	    lemmas = rec.lemma.split()
+	    for lemma in lemmas:
+		index[lemma].append((idx, rec))
+	for lemma, recs in index.iteritems():
+	    if len(recs) > 10:
+		continue
+	    for idx1, rec1 in recs:
+		if idx1 in removed_idxs:
+		    continue
+		next_idx = idx1 + 1
+		gval1 = (rec1[self.group_by]
+			 if self.group_by is not None
+			 else None)
+		for idx2, rec2 in recs:
+		    if idx2 in removed_idxs:
+			continue
+		    gval2 = (rec2[self.group_by]
+			     if self.group_by is not None
+			     else None)
+		    # Do not compare substrings from different groups (if grouping
+		    # is active)
+		    if gval1 != gval2:
+			continue
+		    for kname in self.key_columns:
+			kval1 = rec1[kname]
+			kval2 = rec2[kname]
+			# Do not remove complete duplicates
+			if kval1 == kval2:
+			    continue
+			if kval1 in kval2:
+			    _log.debug(
+				'Removing “%s” ∈ “%s”',
+				UStr(kval1),
+				UStr(kval2),
+			    )
+			    removed_idxs.add(idx1)
+			    break
+			elif kval2 in kval1:
+			    _log.debug(
+				'Removing “%s” ∈ “%s”',
+				UStr(kval2),
+				UStr(kval1),
+			    )
+			    removed_idxs.add(idx2)
+			    break
+        data_structure.remove_rows(*removed_idxs)
+        _log.debug('Done removing sub-strings')
+    def process_old(self, data_structure, storage):
+        _log.debug('Removing sub-strings on key columns %r', self.key_columns)
+        # storage is not used here
+        removed_idxs = set()
        for idx1, rec1 in enumerate(data_structure):
            if idx1 in removed_idxs:
                continue