diff --git a/Dockerfile b/Dockerfile index 65b248deda192b5688ffe24476602661759be09c..60a83f1fc9cc028bfb036042b46b078ed75f07a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -103,5 +103,4 @@ COPY stopwords.list stopwords.list COPY resp-cmd.ini resp-cmd.ini COPY p4.ccl p4.ccl COPY start.sh start.sh - - +COPY text.ccl text.ccl diff --git a/resp-cmd.ini b/resp-cmd.ini index b37f215dd1e28d67e74bf6855eae30cd9400cad8..35c4b08e115a7c888189df043313c1106dc4fe6d 100644 --- a/resp-cmd.ini +++ b/resp-cmd.ini @@ -77,7 +77,7 @@ filename = resp.tsv [resp.documents] extractors = sin_ext -documents = p4.ccl +documents = text.ccl tagset = nkjp document_id_generator = docid diff --git a/resp/resp/data_structures/table/processors/remove_substrings.py b/resp/resp/data_structures/table/processors/remove_substrings.py index b203630b9296a9eb0fedf853b6d894b3eb93ab7c..0ecd176b8dd237ef557662c1ebec2d3bd8fb8215 100644 --- a/resp/resp/data_structures/table/processors/remove_substrings.py +++ b/resp/resp/data_structures/table/processors/remove_substrings.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, unicode_literals, division import logging +from collections import defaultdict from ...base import DataStructureProcessorBase from ....utils.config import fields as conff @@ -65,6 +66,70 @@ class TableRemoveSubStrings(DataStructureProcessorBase): # storage is not used here removed_idxs = set() + index = defaultdict(list) + for idx, rec in enumerate(data_structure): + lemmas = rec.lemma.split() + for lemma in lemmas: + index[lemma].append((idx, rec)) + + for lemma, recs in index.iteritems(): + if len(recs) > 10: + continue + for idx1, rec1 in recs: + if idx1 in removed_idxs: + continue + + next_idx = idx1 + 1 + gval1 = (rec1[self.group_by] + if self.group_by is not None + else None) + + for idx2, rec2 in recs: + if idx2 in removed_idxs: + continue + + gval2 = (rec2[self.group_by] + if self.group_by is not None + else None) + + # Do not compare substrings from different groups (if grouping + # is active) + if gval1 != gval2: + continue + + for kname in self.key_columns: + kval1 = rec1[kname] + kval2 = rec2[kname] + + # Do not remove complete duplicates + if kval1 == kval2: + continue + + if kval1 in kval2: + _log.debug( + 'Removing “%s” ∈ “%s”', + UStr(kval1), + UStr(kval2), + ) + removed_idxs.add(idx1) + break + elif kval2 in kval1: + _log.debug( + 'Removing “%s” ∈ “%s”', + UStr(kval2), + UStr(kval1), + ) + removed_idxs.add(idx2) + break + + data_structure.remove_rows(*removed_idxs) + _log.debug('Done removing sub-strings') + + def process_old(self, data_structure, storage): + _log.debug('Removing sub-strings on key columns %r', self.key_columns) + # storage is not used here + removed_idxs = set() + for idx1, rec1 in enumerate(data_structure): if idx1 in removed_idxs: continue