From 76cba42e3a3b8add03d1777fd80033d6d565fdaf Mon Sep 17 00:00:00 2001 From: Bartlomiej <bartlomiej.piotr.bojanowski@gmail.com> Date: Wed, 18 Oct 2023 10:55:38 +0200 Subject: [PATCH] Add clarin_json --- src/pipeline/sequential_jsonl.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/pipeline/sequential_jsonl.py b/src/pipeline/sequential_jsonl.py index dc76e97..8418949 100644 --- a/src/pipeline/sequential_jsonl.py +++ b/src/pipeline/sequential_jsonl.py @@ -7,7 +7,7 @@ from src.input_parsers.interface import InputParser from src.pipeline.interface import Pipeline from src.replacers.interface import ReplacerInterface from src.suppressors.interface import Suppressor - +import clarin_json class SequentialJSONLPipeline(Pipeline): """Pipeline that runs the whole anonymization process on jsonl-splitted input. @@ -55,12 +55,10 @@ class SequentialJSONLPipeline(Pipeline): """ result = [] - with open(input_path, "r") as f: - for line in f.readlines(): - if line.strip() == "": - continue - parsed_input = self._input_parser.parse(line) + with clarin_json.open(input_path, 'r') as f: + for line in f: + parsed_input = self._input_parser.parse(line) detected_entities = [] for detector_name, detector in self._detectors.items(): detected_entities += detector.detect( -- GitLab