Commit 0a3c8dd6 authored by Paweł Walkowiak

Add token reduction

parent 2c2ffad8
@@ -2,3 +2,4 @@
/20_news
/poleval
/wiki_pl
/ag_news
@@ -2,3 +2,4 @@
/20_news
/wiki_pl
/poleval
/ag_news
/wiki_pl
/enron_spam
dvc.lock
@@ -36,17 +36,17 @@ stages:
md5: 3e16b22f59532c66beeadea958e0579a.dir
size: 18505614
nfiles: 6
- path: data/preprocessed/enron_spam/
md5: 99d604f84516cee94948054a97ffec5e.dir
size: 71403809
nfiles: 3
- path: data/reduced/enron_spam/
md5: ee6f2c141cd68b86e620f022f0ca0b5a.dir
size: 12933383
nfiles: 1
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
md5: 8c4dc8293bc7d7f8b87b4788cea1b81e
size: 1176
outs:
- path: data/classification/enron_spam
md5: 5de1a2fcbae0de94f5fbfd2bb747d919.dir
size: 14585920
md5: 7e0017fe7f10a3a8bbd2c3dcf355cb34.dir
size: 12968818
nfiles: 2
explain@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
@@ -133,17 +133,17 @@ stages:
md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
- path: data/preprocessed/wiki_pl/
md5: 066634606f832b6c9d1db95293de7e04.dir
size: 77818549
nfiles: 3
- path: data/reduced/wiki_pl/
md5: 30359a1d253a3c1cee7affa7ae365ef3.dir
size: 31644651
nfiles: 1
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
md5: 8c4dc8293bc7d7f8b87b4788cea1b81e
size: 1176
outs:
- path: data/classification/wiki_pl
md5: 88c3cea96b2cb3ddda1a82037bf6130a.dir
size: 44196727
md5: 8455064b5b3e39ffc35d3ac712b41c2d.dir
size: 31721772
nfiles: 2
preprocess_dataset@20_news:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name 20_news
@@ -876,3 +876,43 @@ stages:
md5: db1b512415d278115f76a74112f31c53.dir
size: 57649801
nfiles: 2
reduce@wiki_pl:
cmd: PYTHONPATH=. python experiments/scripts/token_reduce.py --dataset_name wiki_pl
--output_dir data/reduced/wiki_pl
deps:
- path: data/models/wiki_pl/
md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
- path: data/preprocessed/wiki_pl/
md5: 066634606f832b6c9d1db95293de7e04.dir
size: 77818549
nfiles: 3
- path: experiments/scripts/token_reduce.py
md5: aa1fed575c1a14835e55d8fb8bb7f14f
size: 4920
outs:
- path: data/reduced/wiki_pl
md5: 30359a1d253a3c1cee7affa7ae365ef3.dir
size: 31644651
nfiles: 1
reduce@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/token_reduce.py --dataset_name enron_spam
--output_dir data/reduced/enron_spam
deps:
- path: data/models/enron_spam/
md5: 3e16b22f59532c66beeadea958e0579a.dir
size: 18505614
nfiles: 6
- path: data/preprocessed/enron_spam/
md5: 99d604f84516cee94948054a97ffec5e.dir
size: 71403809
nfiles: 3
- path: experiments/scripts/token_reduce.py
md5: aa1fed575c1a14835e55d8fb8bb7f14f
size: 4920
outs:
- path: data/reduced/enron_spam
md5: ee6f2c141cd68b86e620f022f0ca0b5a.dir
size: 12933383
nfiles: 1
dvc.yaml
@@ -44,6 +44,24 @@ stages:
- data/preprocessed/${item}
outs:
- data/models/${item}/
reduce:
foreach:
- enron_spam
- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
PYTHONPATH=. python experiments/scripts/token_reduce.py
--dataset_name ${item}
--output_dir data/reduced/${item}
deps:
- experiments/scripts/token_reduce.py
- data/models/${item}/
- data/preprocessed/${item}/
outs:
- data/reduced/${item}
classify:
foreach:
- enron_spam
@@ -59,7 +77,7 @@ stages:
deps:
- experiments/scripts/classify.py
- data/models/${item}/
- data/preprocessed/${item}/
- data/reduced/${item}/
outs:
- data/classification/${item}
explain:
experiments/scripts/classify.py
@@ -30,7 +30,7 @@ def main(
dataset_name=dataset_name,
device="cuda" if torch.cuda.is_available() else "cpu"
)
test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
test = pd.read_json(f"data/reduced/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist()
test_y = test["label"]
pred_y = classify(test_x)
experiments/scripts/token_reduce.py (new file)
"""Reduce sample size to 512 tokens"""
import json
import os
import shutil
import uuid
from pathlib import Path

import click
import pandas as pd
import spacy
from tqdm import tqdm
from lpmn_client_biz import Connection, IOType, Task, download, upload

from text_attacks.utils import get_model_and_tokenizer

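
# Field names: TOKENS, ORTH, LEXEMES, LEMMA and MSTAG come from the postagger JSON
# output; TEXT, LEMMAS, TAGS, ORTHS and NER are columns of the dataset JSONL files.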
TOKENS = "tokens"
ORTH = "orth"
LEXEMES = "lexemes"
LEMMA = "lemma"
MSTAG = "mstag"
TEXT = "text"
LEMMAS = "lemmas"
TAGS = "tags"
ORTHS = "orths"
NER = "ner"


def tag_sentences(sentences, lang: str):
    """Tag the given sentences with the CLARIN-PL postagger service.

    Returns a dict mapping sentence index to its lemmas, tags and orths.
    """
    results = {}
    connection = Connection(config_file="experiments/configs/config.yml")
    lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
    # Write each sentence to its own file so the whole batch can be uploaded
    # and processed as a single zipped task.
    input_dir = str(uuid.uuid4())
    os.makedirs(input_dir)
    for idx, sentence in sentences.items():
        with open(f'{input_dir}/file_{idx}', 'w', encoding='utf8') as fout:
            fout.write(sentence)
    uploaded = upload(connection, input_dir)
    task = Task(lpmn, connection)
    result = task.run(uploaded, IOType.FILE, verbose=True)
    archive_path = download(
        connection,
        result,
        IOType.FILE,
        filename=f'{uuid.uuid4()}.zip'
    )
    output_path = archive_path.replace('.zip', '')
    shutil.unpack_archive(archive_path, output_path)
    # Output files keep the "file_<idx>" naming, so sort them by index.
    files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
    for filename in files:
        with open(f'{output_path}/{filename}', 'r') as file:
            lines = [json.loads(line) for line in file.readlines()]
        lemmas, tags, orths = [], [], []
        for line in lines:
            for token in line[TOKENS]:
                lexeme = token[LEXEMES][0]
                lemmas.append(lexeme[LEMMA])
                tags.append(lexeme[MSTAG])
                orths.append(token[ORTH])
        results[int(filename.split('_')[1])] = {
            LEMMAS: lemmas,
            TAGS: tags,
            ORTHS: orths
        }
    shutil.rmtree(input_dir)
    os.remove(archive_path)
    shutil.rmtree(output_path)
    return results
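

# Illustrative (hypothetical) result shape for a single Polish sentence, assuming a
# valid experiments/configs/config.yml for the CLARIN-PL postagger service:
#   tag_sentences({0: "Ala ma kota."}, "pl")
#   -> {0: {"lemmas": ["Ala", "mieć", "kot", "."],
#           "tags": [<NKJP-style morphosyntactic tags>],
#           "orths": ["Ala", "ma", "kota", "."]}}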


def add_ner(sentences, language):
    """Run spaCy NER over the sentences and collect the entity spans."""
    model = "en_core_web_trf" if language == "en" else "pl_core_news_lg"
    nlp = spacy.load(model)
    ner_data = {}
    for idx, text in tqdm(sentences.items()):
        doc = nlp(text)
        doc_ner = list()
        for ent in doc.ents:
            doc_ner.append({
                "text": ent.text,
                "start_char": ent.start_char,
                "end_char": ent.end_char,
                "label": ent.label_,
            })
        ner_data[idx] = doc_ner
    return ner_data
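

# Illustrative (hypothetical) output of add_ner({0: "Barack Obama visited Warsaw."}, "en"):
#   {0: [{"text": "Barack Obama", "start_char": 0, "end_char": 12, "label": "PERSON"},
#        {"text": "Warsaw", "start_char": 21, "end_char": 27, "label": "GPE"}]}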


@click.command()
@click.option(
    "--dataset_name",
    help="Dataset name",
    type=str,
)
@click.option(
    "--output_dir",
    help="Path to output directory",
    type=click.Path(path_type=Path),
)
def main(
    dataset_name: str,
    output_dir: Path,
):
    lang = {
        "enron_spam": "en",
        "poleval": "pl",
        "20_news": "en",
        "wiki_pl": "pl",
    }[dataset_name]
    output_dir.mkdir(parents=True, exist_ok=True)
    # The model itself is not used here; only its tokenizer is needed to
    # measure texts against the 512-token input limit.
    model, tokenizer = get_model_and_tokenizer(
        dataset_name=dataset_name
    )
    model.to("cpu")
    model.eval()
    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
    texts = test["text"].tolist()
    texts_reduced = {}
    for i, sentence in test["text"].items():
        # Encode with the model's 512-token limit, decode back, and cut the
        # original string at the last word that survived truncation.
        encoded = tokenizer.encode(
            sentence, add_special_tokens=True, max_length=512, truncation=True
        )
        decod_res = tokenizer.decode(
            encoded, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        last_word = decod_res.split(" ")[-1]
        # Search only within the first 512 whitespace-separated words so a
        # frequent last word is not matched too far into the document.
        max_len = len(" ".join(sentence.split(" ")[:512]))
        idx = sentence.rfind(last_word, 0, max_len)
        if idx + len(last_word) < len(sentence) and idx > 0:
            texts_reduced[i] = sentence[:idx + len(last_word)]
    print("To reduce", len(texts_reduced), "of", len(texts))
    if len(texts_reduced) > 0:
        # Re-tag and re-run NER on the shortened texts so the lemmas, tags,
        # orths and NER columns stay aligned with the new text.
        tagged_reduced = tag_sentences(texts_reduced, lang)
        ner_reduced = add_ner(texts_reduced, lang)
        for idx, sentence in texts_reduced.items():
            test.loc[idx, TEXT] = sentence
            test.at[idx, LEMMAS] = tagged_reduced[idx][LEMMAS]
            test.at[idx, TAGS] = tagged_reduced[idx][TAGS]
            test.at[idx, ORTHS] = tagged_reduced[idx][ORTHS]
            test.at[idx, NER] = ner_reduced[idx]
    test.to_json(output_dir / "test.jsonl", orient="records", lines=True)


if __name__ == "__main__":
    main()
\ No newline at end of file
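
For context, the core trick in token_reduce.py is tokenizer-driven truncation: each text is encoded with the classifier's 512-token limit, decoded back, and the original string is cut at the last word that survived truncation, so the re-tagged and re-NER'd columns describe exactly the text the model will see. Below is a minimal standalone sketch of that idea; the helper name reduce_to_max_tokens and the bert-base-cased checkpoint are only illustrative, not what the repository uses, and a Hugging Face tokenizer is assumed.

from transformers import AutoTokenizer  # assumption: a Hugging Face tokenizer is used


def reduce_to_max_tokens(text: str, tokenizer, max_tokens: int = 512) -> str:
    """Cut `text` so it fits into `max_tokens` tokens, ending on a word boundary."""
    ids = tokenizer.encode(
        text, add_special_tokens=True, max_length=max_tokens, truncation=True
    )
    decoded = tokenizer.decode(
        ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    last_word = decoded.split(" ")[-1]
    # Same heuristic as token_reduce.py: look for the last surviving word only
    # within the first `max_tokens` whitespace-separated words of the original.
    search_limit = len(" ".join(text.split(" ")[:max_tokens]))
    cut = text.rfind(last_word, 0, search_limit)
    if cut > 0 and cut + len(last_word) < len(text):
        return text[:cut + len(last_word)]
    return text  # already short enough, or no clean boundary found


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint only
print(len(reduce_to_max_tokens("word " * 2000, tokenizer).split()))

Within the pipeline itself, the new stages would typically be reproduced per dataset with DVC, for example dvc repro reduce@enron_spam; classify@enron_spam then picks up data/reduced/enron_spam as its dependency, as wired in the dvc.yaml change above.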