Commit 0a3c8dd6 authored by Paweł Walkowiak

Add token reduction

parent 2c2ffad8
@@ -2,3 +2,4 @@
/20_news
/poleval
/wiki_pl
/ag_news
@@ -2,3 +2,4 @@
/20_news
/wiki_pl
/poleval
/ag_news
/wiki_pl
/enron_spam
dvc.lock
@@ -36,17 +36,17 @@ stages:
md5: 3e16b22f59532c66beeadea958e0579a.dir
size: 18505614
nfiles: 6
- path: data/preprocessed/enron_spam/
md5: 99d604f84516cee94948054a97ffec5e.dir
size: 71403809
nfiles: 3
- path: data/reduced/enron_spam/
md5: ee6f2c141cd68b86e620f022f0ca0b5a.dir
size: 12933383
nfiles: 1
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
md5: 8c4dc8293bc7d7f8b87b4788cea1b81e
size: 1176
outs:
- path: data/classification/enron_spam
md5: 5de1a2fcbae0de94f5fbfd2bb747d919.dir
size: 14585920
md5: 7e0017fe7f10a3a8bbd2c3dcf355cb34.dir
size: 12968818
nfiles: 2
explain@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
@@ -133,17 +133,17 @@ stages:
md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
- path: data/preprocessed/wiki_pl/
md5: 066634606f832b6c9d1db95293de7e04.dir
size: 77818549
nfiles: 3
- path: data/reduced/wiki_pl/
md5: 30359a1d253a3c1cee7affa7ae365ef3.dir
size: 31644651
nfiles: 1
- path: experiments/scripts/classify.py
md5: 6fc1a6a0a11ba6cd99a8b6625a96d9f5
size: 1181
md5: 8c4dc8293bc7d7f8b87b4788cea1b81e
size: 1176
outs:
- path: data/classification/wiki_pl
md5: 88c3cea96b2cb3ddda1a82037bf6130a.dir
size: 44196727
md5: 8455064b5b3e39ffc35d3ac712b41c2d.dir
size: 31721772
nfiles: 2
preprocess_dataset@20_news:
cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name 20_news
@@ -876,3 +876,43 @@ stages:
md5: db1b512415d278115f76a74112f31c53.dir
size: 57649801
nfiles: 2
reduce@wiki_pl:
cmd: PYTHONPATH=. python experiments/scripts/token_reduce.py --dataset_name wiki_pl
--output_dir data/reduced/wiki_pl
deps:
- path: data/models/wiki_pl/
md5: fd453042628fb09c080ef05d34a32cce.dir
size: 501711136
nfiles: 7
- path: data/preprocessed/wiki_pl/
md5: 066634606f832b6c9d1db95293de7e04.dir
size: 77818549
nfiles: 3
- path: experiments/scripts/token_reduce.py
md5: aa1fed575c1a14835e55d8fb8bb7f14f
size: 4920
outs:
- path: data/reduced/wiki_pl
md5: 30359a1d253a3c1cee7affa7ae365ef3.dir
size: 31644651
nfiles: 1
reduce@enron_spam:
cmd: PYTHONPATH=. python experiments/scripts/token_reduce.py --dataset_name enron_spam
--output_dir data/reduced/enron_spam
deps:
- path: data/models/enron_spam/
md5: 3e16b22f59532c66beeadea958e0579a.dir
size: 18505614
nfiles: 6
- path: data/preprocessed/enron_spam/
md5: 99d604f84516cee94948054a97ffec5e.dir
size: 71403809
nfiles: 3
- path: experiments/scripts/token_reduce.py
md5: aa1fed575c1a14835e55d8fb8bb7f14f
size: 4920
outs:
- path: data/reduced/enron_spam
md5: ee6f2c141cd68b86e620f022f0ca0b5a.dir
size: 12933383
nfiles: 1
dvc.yaml
@@ -44,6 +44,24 @@ stages:
- data/preprocessed/${item}
outs:
- data/models/${item}/
reduce:
foreach:
- enron_spam
- poleval
- 20_news
- wiki_pl
do:
wdir: .
cmd: >-
PYTHONPATH=. python experiments/scripts/token_reduce.py
--dataset_name ${item}
--output_dir data/reduced/${item}
deps:
- experiments/scripts/token_reduce.py
- data/models/${item}/
- data/preprocessed/${item}/
outs:
- data/reduced/${item}
classify:
foreach:
- enron_spam
@@ -59,7 +77,7 @@ stages:
deps:
- experiments/scripts/classify.py
- data/models/${item}/
- data/preprocessed/${item}/
- data/reduced/${item}/
outs:
- data/classification/${item}
explain:
experiments/scripts/classify.py
@@ -30,7 +30,7 @@ def main(
dataset_name=dataset_name,
device="cuda" if torch.cuda.is_available() else "cpu"
)
test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
test = pd.read_json(f"data/reduced/{dataset_name}/test.jsonl", lines=True)
test_x = test["text"].tolist()
test_y = test["label"]
pred_y = classify(test_x)
experiments/scripts/token_reduce.py (new file)
"""Reduce sample size to 512 tokens"""
import json
import os
import shutil
import uuid
from pathlib import Path

import click
import pandas as pd
import spacy
from tqdm import tqdm
from lpmn_client_biz import Connection, IOType, Task, download, upload

from text_attacks.utils import get_model_and_tokenizer

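
# Field names: TOKENS, ORTH, LEXEMES, LEMMA and MSTAG come from the postagger JSON
# output; TEXT, LEMMAS, TAGS, ORTHS and NER are columns of the dataset JSONL files.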
TOKENS = "tokens"
ORTH = "orth"
LEXEMES = "lexemes"
LEMMA = "lemma"
MSTAG = "mstag"
TEXT = "text"
LEMMAS = "lemmas"
TAGS = "tags"
ORTHS = "orths"
NER = "ner"


def tag_sentences(sentences, lang: str):
    """Tag the given sentences with the CLARIN-PL postagger service.

    Returns a dict mapping sentence index to its lemmas, tags and orths.
    """
    results = {}
    connection = Connection(config_file="experiments/configs/config.yml")
    lpmn = [[{"postagger": {"lang": lang}}], 'makezip']
    # Write each sentence to its own file so the whole batch can be uploaded
    # and processed as a single zipped task.
    input_dir = str(uuid.uuid4())
    os.makedirs(input_dir)
    for idx, sentence in sentences.items():
        with open(f'{input_dir}/file_{idx}', 'w', encoding='utf8') as fout:
            fout.write(sentence)
    uploaded = upload(connection, input_dir)
    task = Task(lpmn, connection)
    result = task.run(uploaded, IOType.FILE, verbose=True)
    archive_path = download(
        connection,
        result,
        IOType.FILE,
        filename=f'{uuid.uuid4()}.zip'
    )
    output_path = archive_path.replace('.zip', '')
    shutil.unpack_archive(archive_path, output_path)
    # Output files keep the "file_<idx>" naming, so sort them by index.
    files = sorted(os.listdir(output_path), key=lambda x: int(x.split('_')[1]))
    for filename in files:
        with open(f'{output_path}/{filename}', 'r') as file:
            lines = [json.loads(line) for line in file.readlines()]
        lemmas, tags, orths = [], [], []
        for line in lines:
            for token in line[TOKENS]:
                lexeme = token[LEXEMES][0]
                lemmas.append(lexeme[LEMMA])
                tags.append(lexeme[MSTAG])
                orths.append(token[ORTH])
        results[int(filename.split('_')[1])] = {
            LEMMAS: lemmas,
            TAGS: tags,
            ORTHS: orths
        }
    shutil.rmtree(input_dir)
    os.remove(archive_path)
    shutil.rmtree(output_path)
    return results
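

# Illustrative (hypothetical) result shape for a single Polish sentence, assuming a
# valid experiments/configs/config.yml for the CLARIN-PL postagger service:
#   tag_sentences({0: "Ala ma kota."}, "pl")
#   -> {0: {"lemmas": ["Ala", "mieć", "kot", "."],
#           "tags": [<NKJP-style morphosyntactic tags>],
#           "orths": ["Ala", "ma", "kota", "."]}}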


def add_ner(sentences, language):
    """Run spaCy NER over the sentences and collect the entity spans."""
    model = "en_core_web_trf" if language == "en" else "pl_core_news_lg"
    nlp = spacy.load(model)
    ner_data = {}
    for idx, text in tqdm(sentences.items()):
        doc = nlp(text)
        doc_ner = list()
        for ent in doc.ents:
            doc_ner.append({
                "text": ent.text,
                "start_char": ent.start_char,
                "end_char": ent.end_char,
                "label": ent.label_,
            })
        ner_data[idx] = doc_ner
    return ner_data
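

# Illustrative (hypothetical) output of add_ner({0: "Barack Obama visited Warsaw."}, "en"):
#   {0: [{"text": "Barack Obama", "start_char": 0, "end_char": 12, "label": "PERSON"},
#        {"text": "Warsaw", "start_char": 21, "end_char": 27, "label": "GPE"}]}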


@click.command()
@click.option(
    "--dataset_name",
    help="Dataset name",
    type=str,
)
@click.option(
    "--output_dir",
    help="Path to output directory",
    type=click.Path(path_type=Path),
)
def main(
    dataset_name: str,
    output_dir: Path,
):
    lang = {
        "enron_spam": "en",
        "poleval": "pl",
        "20_news": "en",
        "wiki_pl": "pl",
    }[dataset_name]
    output_dir.mkdir(parents=True, exist_ok=True)
    # The model itself is not used here; only its tokenizer is needed to
    # measure texts against the 512-token input limit.
    model, tokenizer = get_model_and_tokenizer(
        dataset_name=dataset_name
    )
    model.to("cpu")
    model.eval()
    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
    texts = test["text"].tolist()
    texts_reduced = {}
    for i, sentence in test["text"].items():
        # Encode with the model's 512-token limit, decode back, and cut the
        # original string at the last word that survived truncation.
        encoded = tokenizer.encode(
            sentence, add_special_tokens=True, max_length=512, truncation=True
        )
        decod_res = tokenizer.decode(
            encoded, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        last_word = decod_res.split(" ")[-1]
        # Search only within the first 512 whitespace-separated words so a
        # frequent last word is not matched too far into the document.
        max_len = len(" ".join(sentence.split(" ")[:512]))
        idx = sentence.rfind(last_word, 0, max_len)
        if idx + len(last_word) < len(sentence) and idx > 0:
            texts_reduced[i] = sentence[:idx + len(last_word)]
    print("To reduce", len(texts_reduced), "of", len(texts))
    if len(texts_reduced) > 0:
        # Re-tag and re-run NER on the shortened texts so the lemmas, tags,
        # orths and NER columns stay aligned with the new text.
        tagged_reduced = tag_sentences(texts_reduced, lang)
        ner_reduced = add_ner(texts_reduced, lang)
        for idx, sentence in texts_reduced.items():
            test.loc[idx, TEXT] = sentence
            test.at[idx, LEMMAS] = tagged_reduced[idx][LEMMAS]
            test.at[idx, TAGS] = tagged_reduced[idx][TAGS]
            test.at[idx, ORTHS] = tagged_reduced[idx][ORTHS]
            test.at[idx, NER] = ner_reduced[idx]
    test.to_json(output_dir / "test.jsonl", orient="records", lines=True)


if __name__ == "__main__":
    main()
\ No newline at end of file
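
For context, the core trick in token_reduce.py is tokenizer-driven truncation: each text is encoded with the classifier's 512-token limit, decoded back, and the original string is cut at the last word that survived truncation, so the re-tagged and re-NER'd columns describe exactly the text the model will see. Below is a minimal standalone sketch of that idea; the helper name reduce_to_max_tokens and the bert-base-cased checkpoint are only illustrative, not what the repository uses, and a Hugging Face tokenizer is assumed.

from transformers import AutoTokenizer  # assumption: a Hugging Face tokenizer is used


def reduce_to_max_tokens(text: str, tokenizer, max_tokens: int = 512) -> str:
    """Cut `text` so it fits into `max_tokens` tokens, ending on a word boundary."""
    ids = tokenizer.encode(
        text, add_special_tokens=True, max_length=max_tokens, truncation=True
    )
    decoded = tokenizer.decode(
        ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    last_word = decoded.split(" ")[-1]
    # Same heuristic as token_reduce.py: look for the last surviving word only
    # within the first `max_tokens` whitespace-separated words of the original.
    search_limit = len(" ".join(text.split(" ")[:max_tokens]))
    cut = text.rfind(last_word, 0, search_limit)
    if cut > 0 and cut + len(last_word) < len(text):
        return text[:cut + len(last_word)]
    return text  # already short enough, or no clean boundary found


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint only
print(len(reduce_to_max_tokens("word " * 2000, tokenizer).split()))

Within the pipeline itself, the new stages would typically be reproduced per dataset with DVC, for example dvc repro reduce@enron_spam; classify@enron_spam then picks up data/reduced/enron_spam as its dependency, as wired in the dvc.yaml change above.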