diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e5bd7a2c75e3512d3d53df746cbc9a197111cabf 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,5 @@ +``` +pip install morfeusz2 +pip install -r requirements.txt +pip install --no-deps git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop +``` diff --git a/dvc.yaml b/dvc.yaml index 533298e55d4632d5a8413d34b917d531bf334396..92afbaa266962d0bbc8d0f316ec7b6e38c6633d9 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -17,6 +17,8 @@ stages: foreach: - enron_spam - poleval + - 20_news + - wiki_pl do: wdir: . cmd: >- @@ -30,7 +32,6 @@ stages: get_model: foreach: - enron_spam - # - poleval do: wdir: . cmd: >- @@ -45,7 +46,8 @@ stages: classify: foreach: - enron_spam - #- poleval + - 20_news + - wiki_pl do: wdir: . cmd: >- @@ -61,7 +63,6 @@ stages: explain: foreach: - enron_spam - #- poleval do: wdir: . cmd: >- diff --git a/experiments/scripts/tag_dataset.py b/experiments/scripts/tag_dataset.py index e1b0671fc6ba9111847dabd3e1dea7a1fb7d71fb..4b24429426e51dcf8b3ba550d9e0d3977b5be7dd 100644 --- a/experiments/scripts/tag_dataset.py +++ b/experiments/scripts/tag_dataset.py @@ -74,7 +74,12 @@ def process_file(dataset_df, lang, output_path): ) def main(dataset_name: str): """Downloads the dataset to the output directory.""" - lang = 'en' if dataset_name == 'enron_spam' else 'pl' + lang = { + "enron_spam": "en", + "poleval": "pl", + "20_news": "en", + "wiki_pl": "pl", + }[dataset_name] output_dir = f"data/preprocessed/{dataset_name}" os.makedirs(output_dir, exist_ok=True) @@ -93,4 +98,4 @@ def main(dataset_name: str): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/requirements.txt b/requirements.txt index 66b509aeeee23df8984167e7e05aa922e01fbfa2..78255946c4d875b554424d2091bbdc35bb3798be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,13 +2,16 @@ datasets transformers click scikit-learn -dvc[s3]==2.46.0 -shap==0.41.0 -lpmn_client_biz +dvc[s3] +shap +tqdm +transformers +tokenizers +sentence-transformers --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.0+cu116 --index-url https://pypi.clarin-pl.eu/simple/ plwn-api -git+ssh://git@gitlab.clarin-pl.eu/adversarial-attacks/textfooling.git@develop +lpmn_client_biz