-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_utils.py
81 lines (66 loc) · 2.78 KB
/
text_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
from numpy.linalg import norm
from typing import List, Tuple
from openai import OpenAI
import openai
import warnings
warnings.filterwarnings("ignore")
from config import MY_OPENAI_API_KEY,MY_IMG_MODEL,MY_TXT_MODEL
def cosine_similarity(vector_a, vector_b):
    """Calculate the cosine similarity between two vectors.

    Args:
        vector_a: First vector (any array-like accepted by ``np.dot``).
        vector_b: Second vector, with the same length as ``vector_a``.

    Returns:
        The dot product of the inputs divided by the product of their norms.
        When either input has zero norm the similarity is mathematically
        undefined; ``0.0`` is returned in that case instead of NaN.
    """
    dot_product = np.dot(vector_a, vector_b)
    denominator = norm(vector_a) * norm(vector_b)
    # A zero-length vector would turn the division below into 0/0, which
    # yields NaN and is only signalled through a RuntimeWarning. Return a
    # well-defined value so callers never have to filter NaN out of results.
    if denominator == 0:
        return 0.0
    return dot_product / denominator
def create_embeddings(txt_list: List[str], model=MY_TXT_MODEL) -> List[List[float]]:
    """
    Generate an embedding vector for each text in the given list.

    Args:
        txt_list (List[str]): Texts to create embeddings for.
        model (str, optional): Embedding model name. Defaults to MY_TXT_MODEL.

    Returns:
        List[List[float]]: The ``embedding`` attribute of every item in the
        response's ``data``, in the order the response lists them. The values
        are returned exactly as the client provides them (no conversion to
        ``np.ndarray`` is performed). An empty list is returned when
        ``txt_list`` is empty.
    """
    # The embeddings endpoint does not accept an empty input array, so avoid
    # a guaranteed-to-fail network round trip when there is nothing to embed.
    if len(txt_list) == 0:
        return []

    # NOTE(review): no api_key is passed here, so the client falls back to its
    # default credential lookup — confirm that is the intended configuration.
    client = OpenAI()
    response = client.embeddings.create(input=txt_list, model=model)
    return [item.embedding for item in response.data]
from config import MY_OPENAI_API_KEY,MY_IMG_MODEL
def normal_chat_completion(input_prompt: str, model: str = MY_IMG_MODEL) -> dict:
    """
    Request a JSON-formatted chat completion from OpenAI for a single prompt.

    Args:
        input_prompt (str): Text sent as the user message.
        model (str, optional): Model name. Defaults to MY_IMG_MODEL.

    Returns:
        The object returned by ``client.chat.completions.create``.
        NOTE(review): the signature is annotated ``-> dict``, but the response
        is returned as-is without being parsed — confirm what callers expect.
    """
    system_message = {
        "role": "system",
        "content": 'You are a smart and intelligent program that understands information and provides output in JSON format',
    }
    user_message = {"role": "user", "content": input_prompt}

    # response_format "json_object" asks the API to answer with a JSON object.
    return openai.OpenAI().chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[system_message, user_message],
    )
def search_similar_vector(query_feature: np.array, features: List[np.array], topk: int = 10) -> Tuple[np.array, np.array]:
    """
    Rank the given vectors by cosine similarity to ``query_feature``.

    Args:
        query_feature (np.array): Input embedding vector.
        features (List[np.array]): Embedding vectors to compare against; each
            must have the same length as ``query_feature``.
        topk (int, optional): Number of most-similar vectors to return.
            Defaults to 10.

    Returns:
        Tuple[np.array, np.array]: Indices into ``features`` of the ``topk``
        most similar vectors (most similar first) and their cosine
        similarities, in matching order. Both arrays are empty when
        ``features`` is empty.
    """
    # len() rather than truthiness so a 2-D ndarray is also accepted.
    if len(features) == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    query = np.asarray(query_feature, dtype=float).ravel()
    features_stack = np.asarray(np.vstack(features), dtype=float)

    # Row-wise cosine similarity: one dot product and one norm per candidate.
    dot_products = features_stack @ query
    denominators = norm(features_stack, axis=1) * norm(query)

    # Zero-norm vectors have no defined direction; give them similarity 0.0
    # instead of NaN, which would otherwise sort to the top of the ranking.
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    sorted_indices_desc = np.argsort(similarities)[::-1]
    topk_indices = sorted_indices_desc[:topk]
    topk_similarities = similarities[topk_indices]
    return topk_indices, topk_similarities