diff --git a/repoqa/provider/embeddings/base.py b/repoqa/provider/embeddings/base.py
new file mode 100644
index 0000000..3be0611
--- /dev/null
+++ b/repoqa/provider/embeddings/base.py
@@ -0,0 +1,9 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+class BaseEmbeddingsProvider(ABC):
+    @abstractmethod
+    def find_best_match(
+        self, description: str, snippets: List[str], threshold: float = 0
+    ) -> str:
+        ...
\ No newline at end of file
diff --git a/repoqa/provider/embeddings/openai.py b/repoqa/provider/embeddings/openai.py
new file mode 100644
index 0000000..49dc5c7
--- /dev/null
+++ b/repoqa/provider/embeddings/openai.py
@@ -0,0 +1,34 @@
+import os
+from typing import List
+
+from openai import Client
+import numpy as np
+
+from repoqa.provider.embeddings.base import BaseEmbeddingsProvider
+from repoqa.provider.request.openai import make_auto_embeddings_request
+
+class OpenAIEmbeddingsProvider(BaseEmbeddingsProvider):
+    def __init__(self, model, base_url: str = None):
+        self.model = model
+        self.client = Client(
+            api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url
+        )
+
+    def find_best_match(
+        self, description: str, snippets: List[str], threshold: float = 0
+    ) -> str:
+        all_texts = [description] + snippets
+        embedded_texts = make_auto_embeddings_request(self.client, all_texts, self.model)
+        query_embedded = np.array(embedded_texts[0])
+        max_similarity = threshold  # a snippet must beat the threshold; index 0 (the description) is the fallback
+        max_sim_index = 0
+
+        query_norm = np.linalg.norm(query_embedded)
+
+        for i in range(1, len(embedded_texts)):
+            similarity_score = (query_embedded @ np.array(embedded_texts[i])) / (query_norm * np.linalg.norm(embedded_texts[i]))  # cosine similarity, cf. https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists
+            if similarity_score > max_similarity:
+                max_similarity = similarity_score
+                max_sim_index = i
+
+        return all_texts[max_sim_index]
\ No newline at end of file
diff --git a/repoqa/provider/request/openai.py b/repoqa/provider/request/openai.py
index 6f6e213..9c6e47b 100644
--- a/repoqa/provider/request/openai.py
+++ b/repoqa/provider/request/openai.py
@@ -4,6 +4,7 @@
 import signal
 import time
+from typing import List
 
 import openai
 from openai.types.chat import ChatCompletion
@@ -30,19 +31,25 @@ def make_request(
         **kwargs,
     )
 
+def make_embeddings_request(
+    client: openai.Client,
+    texts: List[str],
+    model: str,
+) -> List[List[float]]:
+    response = client.embeddings.create(input=texts, model=model, encoding_format="float")
+    return [d.embedding for d in response.data]
 
 def handler(signum, frame):
     # swallow signum and frame
     raise Exception("end of time")
 
-
-def make_auto_request(*args, **kwargs) -> ChatCompletion:
+def make_request_with_retry(func, *args, **kwargs) -> ChatCompletion | List[List[float]]:
     ret = None
     while ret is None:
         try:
             signal.signal(signal.SIGALRM, handler)
             signal.alarm(100)
-            ret = make_request(*args, **kwargs)
+            ret = func(*args, **kwargs)
             signal.alarm(0)
         except openai.RateLimitError:
Waiting...") @@ -61,3 +68,9 @@ def make_auto_request(*args, **kwargs) -> ChatCompletion: signal.alarm(0) time.sleep(1) return ret + +def make_auto_request(*args, **kwargs) -> ChatCompletion: + return make_request_with_retry(make_request, *args, **kwargs) + +def make_auto_embeddings_request(*args, **kwargs) -> List[List[float]]: + return make_request_with_retry(make_embeddings_request, *args, **kwargs) \ No newline at end of file diff --git a/repoqa/search_needle_function.py b/repoqa/search_needle_function.py index 1507902..38a5419 100644 --- a/repoqa/search_needle_function.py +++ b/repoqa/search_needle_function.py @@ -5,6 +5,7 @@ import json import os from typing import List, Tuple +import difflib from transformers import AutoTokenizer from tree_sitter_languages import get_language, get_parser @@ -29,6 +30,16 @@ " please retrieve and repeat the exact described function from the code context in a code block wrapped by ```:" ) +def _find_line(text, index): + if index < 0 or index >= len(text): + raise IndexError() + line = 0 + for i, ch in enumerate(text): + if i == index: + return line + if ch == "\n" or ch == "\r": + line += 1 + return line def _backward_tokenizable_lines(lines, tokenizer, max_tokens): """Return the text and tokens from bottom to top""" @@ -374,6 +385,8 @@ def evaluate_model( eval_ignore_comments: bool = False, # ignore comments during score computation trust_remote_code: bool = False, attn_implementation=None, + is_embedding: bool = False, + embedding_chunk_line_count: int = 30 ): if backend is None: if base_url is not None: @@ -515,9 +528,14 @@ def evaluate_model( return if backend == "openai": - from repoqa.provider.openai import OpenAIProvider + if is_embedding: + from repoqa.provider.embeddings.openai import OpenAIEmbeddingsProvider + + engine = OpenAIEmbeddingsProvider(model, base_url=base_url) + else: + from repoqa.provider.openai import OpenAIProvider - engine = OpenAIProvider(model, base_url=base_url) + engine = OpenAIProvider(model, base_url=base_url) elif backend == "vllm": from repoqa.provider.vllm import VllmProvider @@ -563,10 +581,30 @@ def evaluate_model( prompt = "" for key in task["template"].split("\n"): prompt += task[key] - - replies = engine.generate_reply( - prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message - ) + + if is_embedding: + tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf") + tokenized_code_context = tokenizer.encode(task["code_context"]) + prefix = tokenizer.decode(tokenized_code_context[:task["needle_token_start"]]) + needle = tokenizer.decode(tokenized_code_context[task["needle_token_start"]:task["needle_token_end"]]) + suffix = tokenizer.decode(tokenized_code_context[task["needle_token_end"]:]) + + prefix_lines = prefix.splitlines() + suffix_lines = suffix.splitlines() + + prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_chunk_line_count, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_chunk_line_count)] + suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_chunk_line_count, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_chunk_line_count)] + snippets = prefix_split + [needle] + suffix_split + snippets = [snippet for snippet in snippets if len(snippet.strip()) > 0] + + replies = engine.find_best_match( + task["description"], + snippets + ) + else: + replies = engine.generate_reply( + prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message + ) result = {**task, "output": replies} f_out.write(json.dumps(result) 
+ "\n") f_out.flush() diff --git a/requirements.txt b/requirements.txt index d7e81ac..2d18374 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ openai anthropic google-generativeai vllm -stop-sequencer +stop-sequencer \ No newline at end of file