diff --git a/req2.txt b/req2.txt deleted file mode 100644 index 9bc6c5ef8e961964364a6eb3da3b4a9ec6dc1ec3..0000000000000000000000000000000000000000 --- a/req2.txt +++ /dev/null @@ -1,10 +0,0 @@ -dvc-gdrive==2.19.0 -dvclive==2.12.0 -pandas==1.4.2 -pip-chill==1.0.3 -pydeprecate==0.3.2 -pytorch-lightning==2.0.4 -pytorch-triton-rocm==2.0.2 -sentence-transformers==2.2.0 -#tensorflow==2.8.0 -wandb==0.12.11 diff --git a/requirements.txt b/requirements.txt index bc59ed160620d8fd3ea200a5133defe832d6138b..c11de2f6da1fd5284c1fce2662a71ff6e5f8a40e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ shap==0.41.0 tqdm>=4.65.0 tokenizers==0.13.2 sentence-transformers==2.2.2 +pandas==1.4.2 jupyter matplotlib spacy @@ -15,6 +16,8 @@ https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.5. --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.0+cu116 +pytorch-lightning==2.0.4 +wandb==0.12.11 --index-url https://pypi.clarin-pl.eu/simple/ plwn-api==0.24 diff --git a/storage/data/.gitignore b/storage/data/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e9800040b3eb6fb42a6bbb093e1e7cfe72e8f06e --- /dev/null +++ b/storage/data/.gitignore @@ -0,0 +1 @@ +/unhealthy_conversations diff --git a/storage/data/unhealthy_conversations.dvc b/storage/data/unhealthy_conversations.dvc new file mode 100644 index 0000000000000000000000000000000000000000..cfbee93809ca49ff0d12e6eb857510ce2b0cb90c --- /dev/null +++ b/storage/data/unhealthy_conversations.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 9c3abc0871f83db29875edb972144e54.dir + size: 235178486 + nfiles: 7 + path: unhealthy_conversations diff --git a/storage/trained_models/.gitignore b/storage/trained_models/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e9800040b3eb6fb42a6bbb093e1e7cfe72e8f06e --- /dev/null +++ b/storage/trained_models/.gitignore @@ -0,0 +1 @@ +/unhealthy_conversations diff --git 
"""Classification model for the unhealthy conversations dataset.

Exposes helpers that load the trained baseline classifier together with
the LaBSE tokenizer / language model and wrap them into a single
``predict(texts)`` callable.
"""
import torch
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from personalized_nlp.learning.classifier import Classifier
from personalized_nlp.models import models as models_dict

# Hugging Face identifier shared by the tokenizer and the embedding model.
LANGUAGE_MODEL_NAME = "sentence-transformers/LaBSE"
# Checkpoint holding the trained classifier weights under "state_dict".
CHECKPOINT_PATH = "trained_models/unhealthy/baseline.pt"
# Number of texts embedded per forward pass of the language model.
EMBEDDING_BATCH_SIZE = 200

# NOTE(review): ``UnhealthyDataModule`` is not imported anywhere in this
# module, so the next line raises NameError at import time. Its import
# path is not visible from this file -- TODO add the correct import.
datamodule = UnhealthyDataModule()
output_dim = sum(datamodule.class_dims)
text_embedding_dim = datamodule.text_embedding_dim
model_cls = models_dict["baseline"]


def get_model_and_tokenizer():
    """Build the baseline classifier, load its weights and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple: the ``Classifier`` in eval mode with
        the weights from ``CHECKPOINT_PATH`` loaded, and the LaBSE tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_MODEL_NAME)
    model = model_cls(
        output_dim=output_dim,
        text_embedding_dim=text_embedding_dim,
        annotator_num=datamodule.annotators_number,
        bias_vector_length=len(datamodule.class_dims),
    )
    # The original passed the bare names ``class_dims`` / ``class_names``,
    # which are undefined in this module (NameError on every call).
    # NOTE(review): ``annotation_columns`` is presumed to be the attribute
    # holding the class names -- confirm against the data module.
    model = Classifier(
        model=model,
        lr=1e-2,
        class_dims=datamodule.class_dims,
        class_names=datamodule.annotation_columns,
    )
    # Load onto CPU first so a checkpoint saved on a GPU can still be read
    # on a CPU-only machine; callers move the model to its final device.
    checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()
    return model, tokenizer


def get_embeddings(texts, tokenizer, model, max_seq_len=256, mean_pooling=True):
    """Embed ``texts`` with a transformer language model.

    Args:
        texts: Iterable of texts; every item is converted with ``str``.
        tokenizer: Tokenizer matching ``model``.
        model: Language model called as ``model(**encoding)``.
        max_seq_len: Texts are truncated to this many tokens.
        mean_pooling: When true, average the hidden states of the
            non-padding tokens of each text; otherwise take the hidden
            state at position 0 of every sequence.

    Returns:
        ``{"embeddings": tensor}`` with one row per input text, on the CPU.
    """
    model = model.eval()
    # Run on whatever device the model already lives on instead of a
    # hard-coded "cuda", so the function also works on CPU-only hosts.
    device = next(model.parameters()).device
    texts = [str(t) for t in texts]

    all_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
            batched_texts = texts[start:start + EMBEDDING_BATCH_SIZE]
            batch_encoding = tokenizer.batch_encode_plus(
                batched_texts,
                padding="longest",
                add_special_tokens=True,
                truncation=True,
                max_length=max_seq_len,
                return_tensors="pt",
            ).to(device)
            emb = model(**batch_encoding)

            if mean_pooling:
                mask = batch_encoding["attention_mask"] > 0
                # Average only over real tokens so padding does not
                # dilute the sentence vector.
                for hidden, token_mask in zip(emb[0], mask):
                    pooled = hidden[token_mask, :].mean(dim=0, keepdim=True)
                    all_embeddings.append(pooled)
            else:
                all_embeddings.append(emb.last_hidden_state[:, 0, :])

    return {"embeddings": torch.cat(all_embeddings, dim=0).cpu()}


def get_classify_funcion(device="cpu"):
    """Return a ``predict(texts)`` callable backed by the trained model.

    Args:
        device: Device on which both the classifier and the LaBSE language
            model are placed.

    Returns:
        A function mapping a list of texts to the classifier output.
    """
    model, tokenizer = get_model_and_tokenizer()
    model = model.to(device)
    model.eval()
    # Honour ``device`` here as well; the language model used to be moved
    # to "cuda" unconditionally, ignoring the argument.
    language_model = AutoModel.from_pretrained(LANGUAGE_MODEL_NAME).to(device)
    language_model.eval()

    def predict(texts):
        features = get_embeddings(texts, tokenizer, language_model)
        # get_embeddings returns CPU tensors; move them next to the
        # classifier to avoid a device mismatch.
        features = {key: value.to(device) for key, value in features.items()}
        with torch.no_grad():
            return model(features)

    return predict


# Correctly spelled alias; the original (misspelled) name is kept above so
# existing callers continue to work.
get_classify_function = get_classify_funcion