-
Notifications
You must be signed in to change notification settings - Fork 0
/
encode-file.py
41 lines (33 loc) · 1.13 KB
/
encode-file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from sentence_transformers import SentenceTransformer
import h5py
import json
import gzip
import sys
import os
import argparse
# Module-level setup: these statements run when the file is executed or
# imported, so the model is instantiated before json_tweets can be called.
print("** loading model")
# Model identifier handed to SentenceTransformer below; json_tweets also
# records it as an attribute of the output HDF5 file.
modelname = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# Presumably an alternative model name kept for reference (not used):
#'sentence-transformers/all-MiniLM-L6-v2'
# Short nickname for the model; json_tweets uses it to build the default
# output file name and stores it as an HDF5 attribute.
modelnick = "sbert-multi-L12-v2"
# Shared encoder instance used by json_tweets.
model = SentenceTransformer(modelname)
def json_tweets(filename, output=None, key="text"):
    """Encode one text field of a JSON-lines file and save the embeddings as HDF5.

    Every non-blank line of *filename* is parsed as one JSON object and the
    value stored under *key* is collected. The collected texts are encoded
    with the module-level ``model`` and written to *output* as the dataset
    ``"emb"``, alongside the attributes ``filename``, ``modelname`` and
    ``modelnick``.

    Args:
        filename: Path of the input file, one JSON object per line. A path
            ending in ``.gz`` is read through ``gzip``; anything else is
            read as plain text. Input is always decoded as UTF-8.
        output: Path of the HDF5 file to write. When ``None``, the base name
            of *filename* followed by ``--{modelnick}.h5`` is used, i.e. the
            file is created relative to the current working directory.
        key: Name of the JSON field that holds the text to encode.

    Raises:
        AssertionError: If the input yields no text at all.
        KeyError: If a parsed JSON object has no *key* entry.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if output is None:
        output = os.path.basename(filename) + f"--{modelnick}.h5"

    print(f"** reading {filename} dataset")
    # Compressed dumps are read transparently; both openers accept text mode
    # with an explicit encoding, so decoding never depends on the locale.
    opener = gzip.open if str(filename).endswith(".gz") else open
    texts = []
    with opener(filename, "rt", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank/trailing lines are not records; json.loads would
                # reject them.
                continue
            tweet = json.loads(line)
            # do some preprocessing?
            texts.append(tweet[key])

    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type is kept for backward compatibility.
    if not texts:
        raise AssertionError(f"ERROR {filename} is empty")

    print(texts[:10])
    print(f"** encoding with {modelname}")
    emb = model.encode(texts)

    print(f"** saving embeddings in {output}")
    with h5py.File(output, "w") as f:
        f.attrs['filename'] = filename
        f.attrs['modelname'] = modelname
        f.attrs['modelnick'] = modelnick
        # Shape and dtype are taken from the array itself.
        f.create_dataset("emb", data=emb)