-
Notifications
You must be signed in to change notification settings - Fork 0
/
embed_arxiv.py
89 lines (70 loc) · 2.66 KB
/
embed_arxiv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import requests
import cohere
import json
import time
import logging
import os
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_random_exponential
# Module-level configuration. These names are not referenced in the part of
# the file visible here, so the notes below are inferred from the names only.

# Presumably the number of papers handed to embed_batch() per call -- TODO confirm.
BATCH_SIZE = 500
# Presumably the input JSON file read with load_json() -- TODO confirm against caller.
ARXIV_JSON = "data/arxiv_cs.CL.json"
# Presumably the JSON Lines output written by process_embeddings_and_save() -- TODO confirm.
ARXIV_EMBEDDINGS_JSONL = "data/arxiv_cs.CL_embedv3.jsonl"
def load_json(filename):
    """Load and return the parsed contents of a UTF-8 encoded JSON file.

    This is a best-effort loader: a file that cannot be opened, decoded or
    parsed is logged and reported as an empty list instead of raising.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, or ``[]`` when the file could not be
        read or did not contain valid JSON.

    Raises:
        TypeError: If ``filename`` is not a valid path-like argument. Such
            programming errors are deliberately not swallowed.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            return json.load(file)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Anything else points
        # at a bug in the caller and is allowed to propagate.
        logging.error("An error occurred while loading JSON: %s", e)
        return []
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(5))
def embed(co_client, texts):
    """Return the embeddings that ``co_client`` produces for ``texts``.

    Calls ``co_client.embed`` with the ``embed-english-v3.0`` model and the
    ``search_document`` input type, then returns the ``embeddings``
    attribute of the response.

    The ``retry`` decorator re-invokes the whole function when it raises,
    waiting a randomized exponential interval (``min=1``, ``max=5``) between
    tries and stopping after five attempts.

    Args:
        co_client: Client object exposing an ``embed`` method.
        texts: The texts to embed, forwarded unchanged to the client.

    Returns:
        The ``embeddings`` attribute of the client's response.
    """
    response = co_client.embed(
        texts=texts,
        model='embed-english-v3.0',
        input_type='search_document',
    )
    return response.embeddings
def embed_batch(batch, co_client, file):
    """Embed the title and summary of every paper in ``batch`` and write them out.

    All titles are embedded with one ``embed`` call and all summaries with a
    second one. Each paper dict is then updated in place with an
    ``'embeddings'`` entry holding its ``'title'`` and ``'summary'`` vectors
    and is written to ``file`` as one JSON document followed by a newline.

    Args:
        batch: Sequence of paper dicts, each providing ``'title'`` and
            ``'summary'`` keys.
        co_client: Client object passed through to ``embed``.
        file: Writable text file object receiving one JSON line per paper.
    """
    fields = ('title', 'summary')

    # Collect every text first so a missing key fails before any embed call.
    texts_by_field = {
        field: [entry[field] for entry in batch] for field in fields
    }
    # Titles are embedded before summaries, matching the order of `fields`.
    vectors_by_field = {
        field: embed(co_client, texts)
        for field, texts in texts_by_field.items()
    }

    for position, entry in enumerate(batch):
        # Index lookup (not zip) so a short embeddings result raises
        # IndexError instead of silently dropping papers.
        entry['embeddings'] = {
            field: vectors[position]
            for field, vectors in vectors_by_field.items()
        }
        json.dump(entry, file)
        file.write('\n')
def process_embeddings_and_save(data, co_client, filename):
try:
with open(filename, 'w', encoding='utf-8') as file: