Commit 60c45d3e authored by pwalkow

Add tags

parent 0c324297
dvc.lock
@@ -16,9 +16,9 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name enron_spam
       --output_dir data/models/enron_spam
     deps:
-    - path: data/datasets/enron_spam
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/get_model.py
       md5: 5050f51b4019bba97af47971f6c7cab4
@@ -32,21 +32,21 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name enron_spam
       --output_dir data/classification/enron_spam
     deps:
-    - path: data/datasets/enron_spam/
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
     - path: data/models/enron_spam/
       md5: 3e16b22f59532c66beeadea958e0579a.dir
       size: 18505614
       nfiles: 6
+    - path: data/preprocessed/enron_spam/
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/classify.py
-      md5: 50f55b90eb47cbf448d83f8392dd37b6
-      size: 1102
+      md5: ba9284c90847fbbd0f2a6cca414d9636
+      size: 1106
     outs:
     - path: data/classification/enron_spam
-      md5: c7d42825b98b289f6a5ed3be1af14413.dir
-      size: 2763843
+      md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
+      size: 10674882
       nfiles: 2
   explain@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
@@ -88,13 +88,13 @@ stages:
       size: 1688836
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/poleval/
-      md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
-      size: 501920
-      nfiles: 1
+      md5: 854387459b193c5eba6db1273ca5ad23.dir
+      size: 2277282
+      nfiles: 3
   preprocess_dataset@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
     deps:
@@ -103,10 +103,10 @@ stages:
       size: 53096069
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/enron_spam/
-      md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
-      size: 10639521
-      nfiles: 1
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
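Editor's note: each deps/outs entry in dvc.lock pins content by md5 (plus size and, for directories, nfiles); when a recorded hash no longer matches the workspace, DVC re-runs the stage. A minimal sketch of the single-file case, using only the standard library; entries ending in ".dir" are DVC's hash of a directory manifest, not a plain file hash, so this mirrors only the single-file entries.

# Minimal sketch (not DVC internals): compute the md5 that dvc.lock
# records for a single-file dependency such as tag_dataset.py.
import hashlib

def file_md5(path: str, chunk_size: int = 1 << 20) -> str:
    md5 = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()

# e.g. file_md5("experiments/scripts/tag_dataset.py") should equal the
# recorded md5 above if the working copy is unchanged.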
dvc.yaml
@@ -30,6 +30,7 @@ stages:
   get_model:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -38,12 +39,13 @@ stages:
        --output_dir data/models/${item}
       deps:
      - experiments/scripts/get_model.py
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
      - data/models/${item}/
   classify:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -53,12 +55,13 @@ stages:
       deps:
      - experiments/scripts/classify.py
      - data/models/${item}/
-      - data/datasets/${item}/
+      - data/preprocessed/${item}/
       outs:
      - data/classification/${item}
   explain:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -68,6 +71,6 @@ stages:
       deps:
      - experiments/scripts/explain.py
      - data/models/${item}
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
      - data/explanations/${item}/
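Editor's note: each foreach/do block above generates one stage per listed dataset, named stage@item, so adding poleval yields get_model@poleval, classify@poleval, and explain@poleval alongside the enron_spam stages already pinned in dvc.lock; ${item} is substituted into the command and paths. A small illustration (plain Python, not DVC internals):

# Illustration only: the per-item stage names DVC generates from the
# foreach lists above, matching the dvc.lock keys like classify@enron_spam.
datasets = ["enron_spam", "poleval"]
for stage in ("get_model", "classify", "explain"):
    for item in datasets:
        print(f"{stage}@{item}")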
experiments/scripts/classify.py
@@ -28,7 +28,7 @@ def main(
     classify = get_classify_function(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
     test_y = test["label"]
     pred_y = classify(test_x)
......
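Editor's note: the truncated tail of main presumably scores pred_y against test_y; the diff does not show the metric, so the following is a hedged sketch only, with accuracy as an assumed choice.

# Hypothetical continuation (not from the commit): score the predictions
# that classify(test_x) returns against the gold labels.
import pandas as pd

def evaluate(dataset_name: str, classify) -> float:
    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
    pred_y = classify(test["text"].tolist())
    # Fraction of predictions matching the "label" column.
    return (pd.Series(pred_y) == test["label"]).mean()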
experiments/scripts/explain.py
@@ -43,7 +43,7 @@ def main(
     model, tokenizer = get_model_and_tokenizer(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/adversarial.jsonl", lines=True)
     test_x = test["text"].tolist()
     predict = build_predict_fun(model, tokenizer)
......
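Editor's note: build_predict_fun itself is not part of this diff. Assuming a Hugging Face-style model and tokenizer (which the get_model_and_tokenizer name suggests), it plausibly wraps them as below; names, batching, and the argmax decoding are all assumptions.

# Hypothetical sketch of build_predict_fun, not the repo's implementation.
import torch

def build_predict_fun(model, tokenizer):
    def predict(texts):
        # Tokenize a batch of raw strings into model inputs.
        enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**enc).logits
        # Return the highest-scoring class index per text.
        return logits.argmax(dim=-1).tolist()
    return predict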
experiments/scripts/tag_dataset.py
@@ -17,10 +17,14 @@ LEMMAS = 'lemmas'
 TAGS = 'tags'


-def tag_sentence(connection: Connection, sentence: str, lang: str):
-    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
-                connection=connection)
-    output_file_id = task.run(sentence, IOType.TEXT)
+def tag_sentence(sentence: str, lang: str):
+    connection = Connection(config_file="experiments/configs/config.yml")
+    lpmn = ["morphodita",
+            {"posconverter":
+                 {"input_format": "ccl", "output_format": "json"}}] \
+        if lang == 'pl' else [{"spacy": {"lang": "en"}}]
+    task = Task(lpmn, connection=connection)
+    output_file_id = task.run(str(sentence), IOType.TEXT)
     tokens = []
     try:
         clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
@@ -40,7 +44,7 @@ def tag_sentence(connection: Connection, sentence: str, lang: str):
     return lemmas, tags


-def process_file(dataset_df, connection, lang, output_path):
+def process_file(dataset_df, lang, output_path):
     test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
     cpus = cpu_count()
@@ -49,9 +53,7 @@ def process_file(dataset_df, connection, lang, output_path):
     for idx in tqdm(range(0, len(dataset_df), cpus)):
         end = min(idx+cpus, len(dataset_df) + 1)
         for sentence in dataset_df[TEXT][idx:end]:
-            results.append(pool.apply_async(tag_sentence, args=(connection,
-                                                                sentence,
-                                                                lang,)))
+            results.append(pool.apply_async(tag_sentence, args=[sentence, lang]))
         for res in results:
             lemmas, tags = res.get()
             lemmas_col.append(lemmas)
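Editor's note: moving the Connection inside tag_sentence (and dropping it from apply_async's args) likely exists so that each pool worker opens its own CLARIN-PL connection rather than the parent pickling a shared one into child processes; connection objects are generally not safely picklable. The submission pattern in miniature, with a stand-in worker function:

# Minimal sketch of the apply_async pattern above: submit one task per
# sentence, then collect results in submission order with .get().
from multiprocessing import Pool

def work(sentence, lang):  # stand-in for tag_sentence
    return sentence.upper(), lang

if __name__ == "__main__":
    with Pool(4) as pool:
        results = [pool.apply_async(work, args=[s, "en"]) for s in ["a", "b"]]
        for res in results:
            print(res.get())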
@@ -73,15 +75,21 @@ def process_file(dataset_df, connection, lang, output_path):
 def main(dataset_name: str):
     """Tags the dataset and writes it to the output directory."""
     lang = 'en' if dataset_name == 'enron_spam' else 'pl'
-    conn = Connection(config_file="experiments/configs/config.yml")
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
     input_dir = f"data/datasets/{dataset_name}"
     for file in os.listdir(input_dir):
         if os.path.isfile(os.path.join(input_dir, file)):
-            process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
-                         conn, lang, os.path.join(output_dir, file))
+            if file == "test.jsonl":
+                process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
+                             lang, os.path.join(output_dir, file))
+            else:
+                test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True))
+                test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
+                test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
+                with open(os.path.join(output_dir, file), mode="wt") as fd:
+                    fd.write(test_with_tags.to_json(orient='records', lines=True))


 if __name__ == "__main__":
......
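Editor's note: after this change only test.jsonl is actually tagged; every other split is copied through with empty lemmas/tags columns, so all files under data/preprocessed/<dataset>/ share one schema. An illustration of the two record shapes written out (field values are made up):

# Made-up examples of the JSONL records this stage writes.
tagged = {"text": "Free offer!", "label": 1,
          "lemmas": ["free", "offer", "!"], "tags": ["ADJ", "NOUN", "PUNCT"]}
passthrough = {"text": "Meeting at noon.", "label": 0, "lemmas": "", "tags": ""}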
"""Classification model for enron_spam"""
def get_model_and_tokenizer():
return None, None
def get_classify_function():
def fun(texts):
return "dummy"
return fun
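Editor's note: this stub satisfies the interface the scripts above use (get_model_and_tokenizer, get_classify_function) without a real model; note that fun returns the single string "dummy" rather than one label per input text. In use:

# The stub in action: classify ignores its input and returns "dummy".
classify = get_classify_function()
print(classify(["some text", "more text"]))  # -> "dummy"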