Adding embeddings results #47

Draft · wants to merge 3 commits into base: main
9 changes: 9 additions & 0 deletions repoqa/provider/embeddings/base.py
@@ -0,0 +1,9 @@
from abc import ABC, abstractmethod
from typing import List


class BaseEmbeddingsProvider(ABC):
    @abstractmethod
    def find_best_match(
        self, description: str, snippets: List[str], threshold: float = 0
    ) -> str:
        ...
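For orientation, a minimal usage sketch of this interface, assuming the OpenAI-backed implementation added below; the model name and inputs here are illustrative, not part of the PR:

# Hypothetical usage; requires OPENAI_API_KEY in the environment.
from repoqa.provider.embeddings.openai import OpenAIEmbeddingsProvider

provider = OpenAIEmbeddingsProvider("text-embedding-3-small")  # illustrative model name
best = provider.find_best_match(
    "a function that parses command-line flags",
    ["def parse_args(argv): ...", "def render_html(tree): ..."],
)
print(best)  # the snippet whose embedding is closest to the description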
34 changes: 34 additions & 0 deletions repoqa/provider/embeddings/openai.py
@@ -0,0 +1,34 @@
import os
from typing import List

from openai import Client
import numpy as np

from repoqa.provider.embeddings.base import BaseEmbeddingsProvider
from repoqa.provider.request.openai import make_auto_embeddings_request


class OpenAIEmbeddingsProvider(BaseEmbeddingsProvider):
    def __init__(self, model: str, base_url: str = None):
        self.model = model
        self.client = Client(
            api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
        )

    def find_best_match(
        self, description: str, snippets: List[str], threshold: float = 0
    ) -> str:
        # Embed the description and all snippets in a single request.
        all_texts = [description] + snippets
        embedded_texts = make_auto_embeddings_request(
            self.client, all_texts, self.model
        )
        query_embedded = np.array(embedded_texts[0])
        query_norm = np.linalg.norm(query_embedded)

        # Track the snippet with the highest cosine similarity to the query;
        # snippets scoring at or below `threshold` are never selected, so the
        # description itself (index 0) is returned if none clears it.
        max_similarity = threshold
        max_sim_index = 0

        for i in range(1, len(embedded_texts)):
            snippet_embedded = np.array(embedded_texts[i])
            # Cosine similarity: https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
            similarity_score = (query_embedded @ snippet_embedded) / (
                query_norm * np.linalg.norm(snippet_embedded)
            )
            if similarity_score > max_similarity:
                max_similarity = similarity_score
                max_sim_index = i

        return all_texts[max_sim_index]
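The matcher above ranks snippets by cosine similarity between embedding vectors. As a self-contained illustration of the same computation (the vectors are made up):

import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # cos(theta) = (a . b) / (||a|| * ||b||)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ~0.7071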
19 changes: 16 additions & 3 deletions repoqa/provider/request/openai.py
@@ -4,6 +4,7 @@

import signal
import time
from typing import List

import openai
from openai.types.chat import ChatCompletion
@@ -30,19 +31,25 @@ def make_request(
        **kwargs,
    )


def make_embeddings_request(
    client: openai.Client,
    texts: List[str],
    model: str,
) -> List[List[float]]:
    response = client.embeddings.create(
        input=texts, model=model, encoding_format="float"
    )
    return [d.embedding for d in response.data]


def handler(signum, frame):
    # swallow signum and frame
    raise Exception("end of time")


def make_request_with_retry(
    func, *args, **kwargs
) -> ChatCompletion | List[List[float]]:
    ret = None
    while ret is None:
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(100)
            ret = func(*args, **kwargs)
            signal.alarm(0)
        except openai.RateLimitError:
            print("Rate limit exceeded. Waiting...")
@@ -61,3 +68,9 @@ def make_auto_request(*args, **kwargs) -> ChatCompletion:
            signal.alarm(0)
            time.sleep(1)
    return ret

def make_auto_request(*args, **kwargs) -> ChatCompletion:
    return make_request_with_retry(make_request, *args, **kwargs)


def make_auto_embeddings_request(*args, **kwargs) -> List[List[float]]:
    return make_request_with_retry(make_embeddings_request, *args, **kwargs)
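One caveat worth noting: the timeout in make_request_with_retry relies on signal.SIGALRM, which exists only on Unix and only works from the main thread. A hedged usage sketch of the new embeddings helper; the client construction and model name are illustrative:

import openai

from repoqa.provider.request.openai import make_auto_embeddings_request

client = openai.Client()  # reads OPENAI_API_KEY from the environment
vectors = make_auto_embeddings_request(
    client, ["hello", "world"], "text-embedding-3-small"  # illustrative model
)
print(len(vectors), len(vectors[0]))  # 2 embeddings, each a list of floats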
50 changes: 44 additions & 6 deletions repoqa/search_needle_function.py
@@ -5,6 +5,7 @@
import json
import os
from typing import List, Tuple
import difflib

from transformers import AutoTokenizer
from tree_sitter_languages import get_language, get_parser
@@ -29,6 +30,16 @@
" please retrieve and repeat the exact described function from the code context in a code block wrapped by ```:"
)

def _find_line(text, index):
    """Return the 0-indexed line number of the character at `index`.

    For example, _find_line("ab\ncd", 3) == 1.
    """
    if index < 0 or index >= len(text):
        raise IndexError(f"index {index} out of range for text of length {len(text)}")
    # Count line breaks before `index`; "\r\n" counts as a single break.
    return text[:index].replace("\r\n", "\n").replace("\r", "\n").count("\n")

def _backward_tokenizable_lines(lines, tokenizer, max_tokens):
"""Return the text and tokens from bottom to top"""
@@ -374,6 +385,8 @@ def evaluate_model(
    eval_ignore_comments: bool = False,  # ignore comments during score computation
    trust_remote_code: bool = False,
    attn_implementation=None,
    is_embedding: bool = False,
    embedding_chunk_line_count: int = 30,
):
    if backend is None:
        if base_url is not None:
@@ -515,9 +528,14 @@ def evaluate_model(
        return

    if backend == "openai":
        if is_embedding:
            from repoqa.provider.embeddings.openai import OpenAIEmbeddingsProvider

            engine = OpenAIEmbeddingsProvider(model, base_url=base_url)
        else:
            from repoqa.provider.openai import OpenAIProvider

            engine = OpenAIProvider(model, base_url=base_url)
elif backend == "vllm":
from repoqa.provider.vllm import VllmProvider

@@ -563,10 +581,30 @@
prompt = ""
for key in task["template"].split("\n"):
prompt += task[key]

replies = engine.generate_reply(
prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message
)

if is_embedding:
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenized_code_context = tokenizer.encode(task["code_context"])
prefix = tokenizer.decode(tokenized_code_context[:task["needle_token_start"]])
needle = tokenizer.decode(tokenized_code_context[task["needle_token_start"]:task["needle_token_end"]])
suffix = tokenizer.decode(tokenized_code_context[task["needle_token_end"]:])

prefix_lines = prefix.splitlines()
suffix_lines = suffix.splitlines()

prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_chunk_line_count, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_chunk_line_count)]
suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_chunk_line_count, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_chunk_line_count)]
snippets = prefix_split + [needle] + suffix_split
snippets = [snippet for snippet in snippets if len(snippet.strip()) > 0]

replies = engine.find_best_match(
task["description"],
snippets
)
else:
replies = engine.generate_reply(
prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message
)
            result = {**task, "output": replies}
            f_out.write(json.dumps(result) + "\n")
            f_out.flush()
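To make the chunking above concrete, a standalone sketch with a small chunk size standing in for embedding_chunk_line_count (values are illustrative):

lines = [f"line {i}" for i in range(7)]
chunk = 3  # stands in for embedding_chunk_line_count
chunks = ["\n".join(lines[i : i + chunk]) for i in range(0, len(lines), chunk)]
print([c.count("\n") + 1 for c in chunks])  # [3, 3, 1]: two full chunks plus a remainder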
2 changes: 1 addition & 1 deletion requirements.txt
@@ -11,4 +11,4 @@ openai
anthropic
google-generativeai
vllm
stop-sequencer