Commit 60c45d3e authored by pwalkow's avatar pwalkow

Add tags

parent 0c324297
--- a/dvc.lock
+++ b/dvc.lock
@@ -16,9 +16,9 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/get_model.py --dataset_name enron_spam
       --output_dir data/models/enron_spam
     deps:
-    - path: data/datasets/enron_spam
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
+    - path: data/preprocessed/enron_spam
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
       nfiles: 3
     - path: experiments/scripts/get_model.py
       md5: 5050f51b4019bba97af47971f6c7cab4
@@ -32,21 +32,21 @@ stages:
     cmd: PYTHONPATH=. python experiments/scripts/classify.py --dataset_name enron_spam
       --output_dir data/classification/enron_spam
     deps:
-    - path: data/datasets/enron_spam/
-      md5: 66d44efedf37990b1989c81bbee085e0.dir
-      size: 53096069
-      nfiles: 3
     - path: data/models/enron_spam/
       md5: 3e16b22f59532c66beeadea958e0579a.dir
       size: 18505614
       nfiles: 6
+    - path: data/preprocessed/enron_spam/
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
     - path: experiments/scripts/classify.py
-      md5: 50f55b90eb47cbf448d83f8392dd37b6
-      size: 1102
+      md5: ba9284c90847fbbd0f2a6cca414d9636
+      size: 1106
     outs:
     - path: data/classification/enron_spam
-      md5: c7d42825b98b289f6a5ed3be1af14413.dir
-      size: 2763843
+      md5: 0450c0b672bc4a5db3cc7be2dac786bd.dir
+      size: 10674882
       nfiles: 2
   explain@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/explain.py --dataset_name enron_spam
@@ -88,13 +88,13 @@ stages:
       size: 1688836
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/poleval/
-      md5: 8daba6ad0597214499ac9b96e8e47c9f.dir
-      size: 501920
-      nfiles: 1
+      md5: 854387459b193c5eba6db1273ca5ad23.dir
+      size: 2277282
+      nfiles: 3
   preprocess_dataset@enron_spam:
     cmd: PYTHONPATH=. python experiments/scripts/tag_dataset.py --dataset_name enron_spam
     deps:
@@ -103,10 +103,10 @@ stages:
       size: 53096069
       nfiles: 3
     - path: experiments/scripts/tag_dataset.py
-      md5: 1d911edcd336cacaec482e6b7570eb1a
-      size: 2716
+      md5: 2c4e097b3a278c12d19858f988232b44
+      size: 3435
     outs:
     - path: data/preprocessed/enron_spam/
-      md5: 80c8dd3aa3bacf3afe8cf3138ab01d00.dir
-      size: 10639521
-      nfiles: 1
+      md5: b75efba1a62182dc8ac32acd1faf92ed.dir
+      size: 61709260
+      nfiles: 3
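Note on the lock entries above: each `.dir` md5 is a checksum DVC computes over a whole output directory, so a changed hash together with new `size`/`nfiles` values means the directory's contents were regenerated. A rough, self-contained sketch of the idea (illustrative only; DVC's actual `.dir` hash is the md5 of a JSON manifest of per-file hashes, not this exact scheme):

```python
import hashlib
import os

def dir_fingerprint(root: str) -> str:
    """Fold relative paths and file contents under root into one digest.

    Not DVC's algorithm, just the general idea: any added, removed,
    renamed, or edited file changes the resulting hash.
    """
    digest = hashlib.md5()
    for dirpath, _, filenames in sorted(os.walk(root)):
        for name in sorted(filenames):
            path = os.path.join(dirpath, name)
            digest.update(os.path.relpath(path, root).encode())
            with open(path, "rb") as fh:
                digest.update(fh.read())
    return digest.hexdigest()
```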
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -30,6 +30,7 @@ stages:
   get_model:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -38,12 +39,13 @@ stages:
         --output_dir data/models/${item}
       deps:
       - experiments/scripts/get_model.py
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
       - data/models/${item}/
   classify:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -53,12 +55,13 @@ stages:
       deps:
       - experiments/scripts/classify.py
       - data/models/${item}/
-      - data/datasets/${item}/
+      - data/preprocessed/${item}/
       outs:
       - data/classification/${item}
   explain:
     foreach:
     - enron_spam
+    - poleval
     do:
       wdir: .
       cmd: >-
@@ -68,6 +71,6 @@ stages:
       deps:
       - experiments/scripts/explain.py
       - data/models/${item}
-      - data/datasets/${item}
+      - data/preprocessed/${item}
       outs:
       - data/explanations/${item}/
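With `poleval` added to each `foreach` list above, `dvc repro` expands the templates into `get_model@poleval`, `classify@poleval`, and `explain@poleval` alongside the existing `@enron_spam` stages; this `stage@item` naming is the same one visible in the dvc.lock hunks above (e.g. `classify@enron_spam`). Every `${item}` occurrence in `cmd`, `deps`, and `outs` is substituted with the list entry, and a single expanded stage can be rebuilt with `dvc repro classify@poleval`.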
--- a/experiments/scripts/classify.py
+++ b/experiments/scripts/classify.py
@@ -28,7 +28,7 @@ def main(
     classify = get_classify_function(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/test.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/test.jsonl", lines=True)
     test_x = test["text"].tolist()
     test_y = test["label"]
    pred_y = classify(test_x)
--- a/experiments/scripts/explain.py
+++ b/experiments/scripts/explain.py
@@ -43,7 +43,7 @@ def main(
     model, tokenizer = get_model_and_tokenizer(
         dataset_name=dataset_name,
     )
-    test = pd.read_json(f"data/datasets/{dataset_name}/adversarial.jsonl", lines=True)
+    test = pd.read_json(f"data/preprocessed/{dataset_name}/adversarial.jsonl", lines=True)
     test_x = test["text"].tolist()
     predict = build_predict_fun(model, tokenizer)
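For context, `build_predict_fun(model, tokenizer)` (defined elsewhere in this script) wraps the model into a plain texts-to-predictions callable. A minimal sketch of such a wrapper, assuming a Hugging Face style sequence classifier; the real body is outside this diff and may differ:

```python
import torch

def build_predict_fun(model, tokenizer):
    """Wrap model + tokenizer into a texts -> predicted label ids callable."""
    def predict(texts):
        # Tokenize a batch of raw strings and run one forward pass.
        batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**batch).logits
        return logits.argmax(dim=-1).tolist()
    return predict
```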
--- a/experiments/scripts/tag_dataset.py
+++ b/experiments/scripts/tag_dataset.py
@@ -17,10 +17,14 @@ LEMMAS = 'lemmas'
 TAGS = 'tags'
 
 
-def tag_sentence(connection: Connection, sentence: str, lang: str):
-    task = Task([{'postagger': {'output_type': 'json', 'lang': lang}}],
-                connection=connection)
-    output_file_id = task.run(sentence, IOType.TEXT)
+def tag_sentence(sentence: str, lang: str):
+    connection = Connection(config_file="experiments/configs/config.yml")
+    lpmn = ["morphodita",
+            {"posconverter":
+                {"input_format": "ccl", "output_format": "json"}}] \
+        if lang == 'pl' else [{"spacy": {"lang": "en"}}]
+    task = Task(lpmn, connection=connection)
+    output_file_id = task.run(str(sentence), IOType.TEXT)
     tokens = []
     try:
         clarin_json = json.loads(download(connection, output_file_id, IOType.TEXT).decode("utf-8"))
@@ -40,7 +44,7 @@ def tag_sentence(connection: Connection, sentence: str, lang: str):
     return lemmas, tags
 
 
-def process_file(dataset_df, connection, lang, output_path):
+def process_file(dataset_df, lang, output_path):
     test_with_tags = pd.DataFrame(dataset_df)
     lemmas_col, tags_col = [], []
     cpus = cpu_count()
@@ -49,9 +53,7 @@ def process_file(dataset_df, connection, lang, output_path):
         for idx in tqdm(range(0, len(dataset_df), cpus)):
             end = min(idx+cpus, len(dataset_df) + 1)
             for sentence in dataset_df[TEXT][idx:end]:
-                results.append(pool.apply_async(tag_sentence, args=(connection,
-                                                                    sentence,
-                                                                    lang,)))
+                results.append(pool.apply_async(tag_sentence, args=[sentence, lang]))
             for res in results:
                 lemmas, tags = res.get()
                 lemmas_col.append(lemmas)
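The `args` change goes hand in hand with the new `tag_sentence` signature: the `Connection` is no longer passed from the parent process (client and connection objects typically cannot be pickled across process boundaries), so each pooled task now builds its own. The pattern in isolation, with a trivial stand-in worker (hypothetical, not the project's code):

```python
from multiprocessing import Pool, cpu_count

def tag_one(sentence, lang):
    # Stand-in for tag_sentence: constructs its own per-task "client"
    # instead of receiving one from the parent process.
    client = {"lang": lang}  # hypothetical lightweight client object
    return sentence.lower(), client["lang"]

if __name__ == "__main__":
    sentences = ["Ala ma kota", "Urgent: claim your prize"]
    with Pool(cpu_count()) as pool:
        # args is a plain [sentence, lang] list, as in the diff above.
        results = [pool.apply_async(tag_one, args=[s, "pl"]) for s in sentences]
        print([r.get() for r in results])
```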
@@ -73,15 +75,21 @@ def process_file(dataset_df, connection, lang, output_path):
 
 def main(dataset_name: str):
     """Tags the dataset and saves it to the output directory."""
     lang = 'en' if dataset_name == 'enron_spam' else 'pl'
-    conn = Connection(config_file="experiments/configs/config.yml")
     output_dir = f"data/preprocessed/{dataset_name}"
     os.makedirs(output_dir, exist_ok=True)
     input_dir = f"data/datasets/{dataset_name}"
     for file in os.listdir(input_dir):
         if os.path.isfile(os.path.join(input_dir, file)):
-            process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
-                         conn, lang, os.path.join(output_dir, file))
+            if file == "test.jsonl":
+                process_file(pd.read_json(os.path.join(input_dir, file), lines=True),
+                             lang, os.path.join(output_dir, file))
+            else:
+                test_with_tags = pd.DataFrame(pd.read_json(os.path.join(input_dir, file), lines=True))
+                test_with_tags[LEMMAS] = ['' for _ in range(len(test_with_tags))]
+                test_with_tags[TAGS] = ['' for _ in range(len(test_with_tags))]
+                with open(os.path.join(output_dir, file), mode="wt") as fd:
+                    fd.write(test_with_tags.to_json(orient='records', lines=True))
 
 
 if __name__ == "__main__":
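The new `else` branch stamps empty `lemmas`/`tags` columns onto the non-test splits, so every preprocessed file shares one schema; this matches the lock file's `nfiles` for `data/preprocessed/*` going from 1 to 3. A self-contained demo of the pandas JSON Lines round trip it relies on (broadcast assignment here is equivalent to the list comprehensions above):

```python
import io
import pandas as pd

raw = '{"text": "hello"}\n{"text": "world"}\n'
df = pd.read_json(io.StringIO(raw), lines=True)
df["lemmas"] = ""  # empty placeholder columns,
df["tags"] = ""    # mirroring the else branch above
print(df.to_json(orient="records", lines=True))
# -> {"text":"hello","lemmas":"","tags":""} ... one JSON object per line
```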
--- /dev/null
+++ b/(path not shown in this capture)
@@ -0,0 +1,11 @@
+"""Classification model for enron_spam"""
+
+
+def get_model_and_tokenizer():
+    return None, None
+
+
+def get_classify_function():
+    def fun(texts):
+        return "dummy"
+    return fun
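The new module is a placeholder: `get_model_and_tokenizer` returns `None, None` and the classify function ignores its input. Note also that classify.py above calls `get_classify_function(dataset_name=dataset_name)`, so a real implementation will need that parameter. A hypothetical non-stub version, assuming a transformers text-classification pipeline (not part of this commit):

```python
from transformers import pipeline

def get_classify_function(dataset_name: str):
    # Load the fine-tuned model saved by the get_model stage.
    clf = pipeline("text-classification", model=f"data/models/{dataset_name}")

    def fun(texts):
        return [pred["label"] for pred in clf(texts)]
    return fun
```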