embeddings.py
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
import imodelsx.embeddings
from copy import deepcopy


def get_embs(
    texts: List[str],
    checkpoint: str = "bert-base-uncased",
    batch_size: int = 32,
    aggregate: str = "mean",
) -> np.ndarray:
    """Get embeddings for a list of texts.

    Params
    ------
    texts: List[str]
        List of texts to get embeddings for.
    checkpoint: str
        Name of the checkpoint to use. Use "tf-idf" for linear embeddings.
    batch_size: int
        Batch size to use for inference.
    aggregate: str
        Aggregation method to use for the embeddings. Can be "mean" or
        "first" (to use the CLS token for BERT-style models).
    """
    if checkpoint == "tf-idf":
        return get_embs_linear(texts)

    # load tokenizer and model (fall back to CPU if no GPU is available)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint).to(device)

    # get embeddings for each batch of texts from the corpus
    embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        t = texts[i: i + batch_size]
        with torch.no_grad():
            # tokenize the batch
            inputs = tokenizer(
                t, return_tensors="pt", padding=True, truncation=True
            ).to(device)
            # shape: [batch_size, seq_len, hidden_size]
            outputs = model(**inputs).last_hidden_state.detach().cpu().numpy()
        # aggregate over the sequence length
        if aggregate == "mean":
            emb = np.mean(outputs, axis=1)
        elif aggregate == "first":
            emb = outputs[:, 0, :]  # use CLS token
        else:
            raise ValueError(f"Unknown aggregate method: {aggregate}")
        embs.append(deepcopy(emb))
    embs = np.concatenate(embs)
    return embs
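
# Minimal usage sketch (illustrative only; the texts below are made up, and the
# 768-dimensional output assumes the default "bert-base-uncased" checkpoint):
#
#     embs = get_embs(["a short sentence", "another document"], aggregate="mean")
#     embs.shape  # -> (2, 768)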


def get_embs_linear(texts: List[str]) -> np.ndarray:
    """Get TF-IDF vectors for a list of texts.

    Parameters
    ----------
    texts: List[str]
        List of texts to get TF-IDF vectors for.

    Returns
    -------
    embs: np.ndarray
        TF-IDF vectors for the input texts.
    """
    vectorizer = TfidfVectorizer(
        # tokenizer=AutoTokenizer.from_pretrained(checkpoint).tokenize,
        # preprocessor=lambda x: x,
        # token_pattern=None,
        lowercase=False,
        max_features=10000,
    )
    return vectorizer.fit_transform(texts).toarray()
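

# Hedged end-to-end example: a sketch of how the two functions above can be
# called, not part of the module's original API. The sentences are made up;
# the pairwise-similarity line reuses the cosine_similarity import above.
if __name__ == "__main__":
    example_texts = [
        "the cat sat on the mat",
        "a dog chased the ball",
        "stock prices fell sharply on Monday",
    ]
    # contextual embeddings from a BERT-style checkpoint
    bert_embs = get_embs(example_texts, checkpoint="bert-base-uncased")
    # sparse linear features via TF-IDF
    tfidf_embs = get_embs(example_texts, checkpoint="tf-idf")
    print("BERT embeddings:", bert_embs.shape)     # (3, hidden_size)
    print("TF-IDF embeddings:", tfidf_embs.shape)  # (3, vocabulary_size)
    # pairwise cosine similarity between the contextual embeddings
    print(cosine_similarity(bert_embs).round(2))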