Commit d390eaee authored by MGniew's avatar MGniew
parent 25bde6b0
dvc-gdrive==2.19.0
dvclive==2.12.0
pandas==1.4.2
pip-chill==1.0.3
pydeprecate==0.3.2
pytorch-lightning==2.0.4
pytorch-triton-rocm==2.0.2
sentence-transformers==2.2.0
#tensorflow==2.8.0
wandb==0.12.11
@@ -7,6 +7,7 @@ shap==0.41.0
tqdm>=4.65.0
tokenizers==0.13.2
sentence-transformers==2.2.2
pandas==1.4.2
jupyter
matplotlib
spacy
@@ -15,6 +16,8 @@ https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.5.
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.12.0+cu116
pytorch-lightning==2.0.4
wandb==0.12.11
--index-url https://pypi.clarin-pl.eu/simple/
plwn-api==0.24
/unhealthy_conversations
outs:
- md5: 9c3abc0871f83db29875edb972144e54.dir
  size: 235178486
  nfiles: 7
  path: unhealthy_conversations
/unhealthy_conversations
outs:
- md5: 5473df6f986edca362433e1de30e98cf.dir
  size: 4978942
  nfiles: 42
  path: unhealthy_conversations
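The two entries above pin DVC-tracked data directories (checksum, size, and file count). A minimal sketch of reading one tracked file through the DVC Python API, assuming the code runs inside a checkout of this repository; the file name inside the directory is hypothetical, since the diff only shows the directory-level hashes:

import dvc.api

# Hypothetical file name inside the tracked "unhealthy_conversations" directory;
# the real file list is not shown in the diff (only nfiles: 7 / 42 above).
with dvc.api.open("unhealthy_conversations/annotations.csv") as fh:
    print(fh.readline())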
"""Classification model for ag_news"""
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from personalized_nlp.learning.classifier import Classifier
from personalized_nlp.models import models as models_dict
# Dataset-specific configuration used to build the personalized baseline model.
datamodule = UnhealthyDataModule()
output_dim = sum(datamodule.class_dims)
text_embedding_dim = datamodule.text_embedding_dim
class_dims = datamodule.class_dims
# NOTE: the line defining ``class_names`` is not visible in this diff; the
# datamodule's annotation column names are assumed here.
class_names = datamodule.annotation_columns
model_cls = models_dict["baseline"]
def get_model_and_tokenizer():
    """Build the personalized baseline classifier and the LaBSE tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
    model = model_cls(
        output_dim=output_dim,
        text_embedding_dim=text_embedding_dim,
        annotator_num=datamodule.annotators_number,
        bias_vector_length=len(datamodule.class_dims),
    )
    model = Classifier(
        model=model, lr=1e-2, class_dims=class_dims, class_names=class_names
    )
    # Restore the trained checkpoint for the unhealthy_conversations baseline.
    model.load_state_dict(
        torch.load("trained_models/unhealthy/baseline.pt")["state_dict"]
    )
    model.eval()
    return model, tokenizer
def get_hf_classify_function(model, tokenizer, device="cuda"):
    """Return a predict() function for a plain HuggingFace sequence-classification model.

    NOTE: only the inner ``fun`` helper is visible in this diff; the enclosing
    function name and signature are reconstructed here so the module parses.
    """
    def fun(texts):
        logits = list()
        # Classify the texts in chunks of 128 to keep GPU memory bounded.
        for chunk in tqdm(
            [texts[pos:pos + 128] for pos in range(0, len(texts), 128)]
        ):
            encoded_inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)
            with torch.no_grad():
                logits.append(model(**encoded_inputs).logits.cpu())
        logits = torch.cat(logits, dim=0)
        pred_y = torch.argmax(logits, dim=1).tolist()
        pred_y = [model.config.id2label[p] for p in pred_y]
        return pred_y
    return fun


def get_embeddings(texts, tokenizer, model, max_seq_len=256, mean_pooling=True):
    """Embed texts with a sentence-transformer model, batching to limit memory use."""
    model = model.eval()
    texts = [str(t) for t in texts]

    def batch(iterable, n=1):
        l = len(iterable)
        for ndx in range(0, l, n):
            yield iterable[ndx : min(ndx + n, l)]
    device = "cuda"
    all_embeddings = []
    for batched_texts in batch(texts, 200):
        with torch.no_grad():
            batch_encoding = tokenizer.batch_encode_plus(
                batched_texts,
                padding="longest",
                add_special_tokens=True,
                truncation=True,
                max_length=max_seq_len,
                return_tensors="pt",
            ).to(device)
            emb = model(**batch_encoding)
        if mean_pooling:
            # Mean-pool token embeddings over non-padding positions.
            mask = batch_encoding["attention_mask"] > 0
            for i in range(emb[0].size()[0]):
                emb_pooled = emb[0][i, mask[i], :].mean(axis=0)[None, :]
                all_embeddings.append(emb_pooled)
        else:
            # Use the [CLS] token embedding of each text.
            emb_pooled = emb.last_hidden_state[:, 0, :]
            all_embeddings.append(emb_pooled)
    return {"embeddings": torch.cat(all_embeddings, dim=0).cpu()}
def get_classify_funcion(device="cpu"):
    """Return a predict() function that embeds texts with LaBSE and feeds the
    embeddings to the personalized baseline classifier."""
    model, tokenizer = get_model_and_tokenizer()
    model.eval()
    model = model.to(device)
    # The embedding model is always placed on CUDA, because get_embeddings moves
    # its inputs to "cuda" internally.
    language_model = AutoModel.from_pretrained(
        "sentence-transformers/LaBSE"
    ).to("cuda")

    def predict(texts):
        embeddings = get_embeddings(texts, tokenizer, language_model)
        return model(embeddings)
    return predict
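A minimal usage sketch for the functions above, assuming the module is importable (the module name below is made up, since the diff does not show the file path) and the trained checkpoint plus LaBSE weights are available locally:

# Hypothetical module name; the actual file path is not visible in this diff.
from classify_unhealthy import get_classify_funcion

predict = get_classify_funcion(device="cpu")
example_texts = [
    "Thanks for the thoughtful reply!",
    "Nobody cares what you think.",
]
print(predict(example_texts))  # raw outputs of the personalized baseline classifier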