Commit d390eaee authored by MGniew's avatar MGniew
parent 25bde6b0
dvc-gdrive==2.19.0
dvclive==2.12.0
pandas==1.4.2
pip-chill==1.0.3
pydeprecate==0.3.2
pytorch-lightning==2.0.4
pytorch-triton-rocm==2.0.2
sentence-transformers==2.2.0
#tensorflow==2.8.0
wandb==0.12.11
@@ -7,6 +7,7 @@ shap==0.41.0
tqdm>=4.65.0
tokenizers==0.13.2
sentence-transformers==2.2.2
pandas==1.4.2
jupyter
matplotlib
spacy
@@ -15,6 +16,8 @@ https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.5.
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.12.0+cu116
pytorch-lightning==2.0.4
wandb==0.12.11
--index-url https://pypi.clarin-pl.eu/simple/
plwn-api==0.24
/unhealthy_conversations
outs:
- md5: 9c3abc0871f83db29875edb972144e54.dir
  size: 235178486
  nfiles: 7
  path: unhealthy_conversations
/unhealthy_conversations
outs:
- md5: 5473df6f986edca362433e1de30e98cf.dir
  size: 4978942
  nfiles: 42
  path: unhealthy_conversations
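The two entries above pin DVC-tracked data directories (checksum, size, and file count). A minimal sketch of reading one tracked file through the DVC Python API, assuming the code runs inside a checkout of this repository; the file name inside the directory is hypothetical, since the diff only shows the directory-level hashes:

import dvc.api

# Hypothetical file name inside the tracked "unhealthy_conversations" directory;
# the real file list is not shown in the diff (only nfiles: 7 / 42 above).
with dvc.api.open("unhealthy_conversations/annotations.csv") as fh:
    print(fh.readline())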
"""Classification model for ag_news"""
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from personalized_nlp.learning.classifier import Classifier
from personalized_nlp.models import models as models_dict
# Dataset-specific configuration used to build the personalized baseline model.
datamodule = UnhealthyDataModule()
output_dim = sum(datamodule.class_dims)
text_embedding_dim = datamodule.text_embedding_dim
class_dims = datamodule.class_dims
# NOTE: the line defining ``class_names`` is not visible in this diff; the
# datamodule's annotation column names are assumed here.
class_names = datamodule.annotation_columns
model_cls = models_dict["baseline"]
def get_model_and_tokenizer():
    """Build the personalized baseline classifier and the LaBSE tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
    model = model_cls(
        output_dim=output_dim,
        text_embedding_dim=text_embedding_dim,
        annotator_num=datamodule.annotators_number,
        bias_vector_length=len(datamodule.class_dims),
    )
    model = Classifier(
        model=model, lr=1e-2, class_dims=class_dims, class_names=class_names
    )
    # Restore the trained checkpoint for the unhealthy_conversations baseline.
    model.load_state_dict(
        torch.load("trained_models/unhealthy/baseline.pt")["state_dict"]
    )
    model.eval()
    return model, tokenizer
def get_hf_classify_function(model, tokenizer, device="cuda"):
    """Return a predict() function for a plain HuggingFace sequence-classification model.

    NOTE: only the inner ``fun`` helper is visible in this diff; the enclosing
    function name and signature are reconstructed here so the module parses.
    """
    def fun(texts):
        logits = list()
        # Classify the texts in chunks of 128 to keep GPU memory bounded.
        for chunk in tqdm(
            [texts[pos:pos + 128] for pos in range(0, len(texts), 128)]
        ):
            encoded_inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                logits.append(model(**encoded_inputs).logits.cpu())
        logits = torch.cat(logits, dim=0)
        pred_y = torch.argmax(logits, dim=1).tolist()
        pred_y = [model.config.id2label[p] for p in pred_y]
        return pred_y
    return fun


def get_embeddings(texts, tokenizer, model, max_seq_len=256, mean_pooling=True):
    """Embed texts with a sentence-transformer model, batching to limit memory use."""
    model = model.eval()
    texts = [str(t) for t in texts]

    def batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx : min(ndx + n, l)]
    device = "cuda"
    all_embeddings = []
    for batched_texts in batch(texts, 200):
        with torch.no_grad():
            batch_encoding = tokenizer.batch_encode_plus(
                batched_texts,
                padding="longest",
                add_special_tokens=True,
                truncation=True,
                max_length=max_seq_len,
                return_tensors="pt",
            ).to(device)
            emb = model(**batch_encoding)
        if mean_pooling:
            # Mean-pool token embeddings over non-padding positions.
            mask = batch_encoding["attention_mask"] > 0
            for i in range(emb[0].size()[0]):
                emb_pooled = emb[0][i, mask[i], :].mean(axis=0)[None, :]
                all_embeddings.append(emb_pooled)
        else:
            # Use the [CLS] token embedding of each text.
            emb_pooled = emb.last_hidden_state[:, 0, :]
            all_embeddings.append(emb_pooled)
    return {"embeddings": torch.cat(all_embeddings, dim=0).cpu()}
def get_classify_funcion(device="cpu"):
    """Return a predict() function that embeds texts with LaBSE and feeds the
    embeddings to the personalized baseline classifier."""
    model, tokenizer = get_model_and_tokenizer()
    model.eval()
    model = model.to(device)
    # The embedding model is always placed on CUDA, because get_embeddings moves
    # its inputs to "cuda" internally.
    language_model = AutoModel.from_pretrained(
        "sentence-transformers/LaBSE"
    ).to("cuda")

    def predict(texts):
        embeddings = get_embeddings(texts, tokenizer, language_model)
        return model(embeddings)
    return predict
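A minimal usage sketch for the functions above, assuming the module is importable (the module name below is made up, since the diff does not show the file path) and the trained checkpoint plus LaBSE weights are available locally:

# Hypothetical module name; the actual file path is not visible in this diff.
from classify_unhealthy import get_classify_funcion

predict = get_classify_funcion(device="cpu")
example_texts = [
    "Thanks for the thoughtful reply!",
    "Nobody cares what you think.",
]
print(predict(example_texts))  # raw outputs of the personalized baseline classifier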