From 369e1b6c8d17c9cc5c203eb6e28d66b688520b69 Mon Sep 17 00:00:00 2001
From: Mateusz Gniewkowski <mateusz.gniewkowski@pwr.edu.pl>
Date: Wed, 2 Sep 2020 13:23:20 +0200
Subject: [PATCH] second optimization: index records by lemma when removing sub-strings

---
 Dockerfile                                    |  3 +-
 resp-cmd.ini                                  |  2 +-
 .../table/processors/remove_substrings.py     | 65 +++++++++++++++++++
 3 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 65b248d..60a83f1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list
 COPY resp-cmd.ini resp-cmd.ini
 COPY p4.ccl p4.ccl
 COPY start.sh start.sh
-
-
+COPY text.ccl text.ccl
diff --git a/resp-cmd.ini b/resp-cmd.ini
index b37f215..35c4b08 100644
--- a/resp-cmd.ini
+++ b/resp-cmd.ini
@@ -77,7 +77,7 @@ filename = resp.tsv
 
 [resp.documents]
 extractors = sin_ext
-documents = p4.ccl
+documents = text.ccl
 tagset = nkjp
 document_id_generator = docid
 
diff --git a/resp/resp/data_structures/table/processors/remove_substrings.py b/resp/resp/data_structures/table/processors/remove_substrings.py
index b203630..0ecd176 100644
--- a/resp/resp/data_structures/table/processors/remove_substrings.py
+++ b/resp/resp/data_structures/table/processors/remove_substrings.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division
 
 
 import logging
+from collections import defaultdict
 
 from ...base import DataStructureProcessorBase
 from ....utils.config import fields as conff
@@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase):
         # storage is not used here
         removed_idxs = set()
         
+	index = defaultdict(list)
+	for idx, rec in enumerate(data_structure):			
+	    lemmas = rec.lemma.split()
+	    for lemma in lemmas:
+		index[lemma].append((idx, rec))
+
+	for lemma, recs in index.iteritems():
+	    if len(recs) > 10:
+		continue
+	    for idx1, rec1 in recs:
+		if idx1 in removed_idxs:
+		    continue
+
+		next_idx = idx1 + 1
+		gval1 = (rec1[self.group_by]
+			 if self.group_by is not None
+			 else None)
+
+		for idx2, rec2 in recs:
+		    if idx2 in removed_idxs:
+			continue
+
+		    gval2 = (rec2[self.group_by]
+			     if self.group_by is not None
+			     else None)
+
+		    # Do not compare substrings from different groups (if grouping
+		    # is active)
+		    if gval1 != gval2:
+			continue
+
+		    for kname in self.key_columns:
+			kval1 = rec1[kname]
+			kval2 = rec2[kname]
+
+			# Do not remove complete duplicates
+			if kval1 == kval2:
+			    continue
+
+			if kval1 in kval2:
+			    _log.debug(
+				'Removing “%s” ∈ “%s”',
+				UStr(kval1),
+				UStr(kval2),
+			    )
+			    removed_idxs.add(idx1)
+			    break
+			elif kval2 in kval1:
+			    _log.debug(
+				'Removing “%s” ∈ “%s”',
+				UStr(kval2),
+				UStr(kval1),
+			    )
+			    removed_idxs.add(idx2)
+			    break
+
+        data_structure.remove_rows(*removed_idxs)
+        _log.debug('Done removing sub-strings')
+
+    def process_old(self, data_structure, storage):
+        _log.debug('Removing sub-strings on key columns %r', self.key_columns)
+        # storage is not used here
+        removed_idxs = set()
+        
         for idx1, rec1 in enumerate(data_structure):
             if idx1 in removed_idxs:
                 continue
-- 
GitLab