New experiments #30

Merged on Dec 20, 2024 (36 commits)

Commits
11948a7  update litellm (semio, Aug 28, 2024)
83631be  use latest models in evaluator (semio, Aug 28, 2024)
75f4ae6  improve gpt4 evaluator (semio, Aug 28, 2024)
f912cac  add simple evaluator (semio, Aug 28, 2024)
3586e65  create archive for previous experiment (semio, Aug 29, 2024)
16ec673  add simple evaluator by default (semio, Aug 29, 2024)
7b539fc  gpt4o-2024-08-06 (semio, Aug 31, 2024)
5178d23  fix claude region (semio, Aug 31, 2024)
44250ff  results for claude 3.5 sonnet (semio, Sep 4, 2024)
7e046b7  add fireworks ai configurations (semio, Sep 10, 2024)
aede16c  update evaluator config (semio, Sep 10, 2024)
8a2640e  new experiment for llama3.1 (semio, Sep 10, 2024)
21edc6f  results for llama3.1 (semio, Sep 15, 2024)
eea145b  include all evaluators in results (semio, Sep 16, 2024)
88e0059  notebooks (semio, Sep 16, 2024)
17a04f2  update questions (semio, Sep 21, 2024)
4cdfb02  update deps (semio, Sep 21, 2024)
205c080  BLOCK_NONE is restricted for now (semio, Sep 21, 2024)
8358b67  vertex_location is different between claude and gemini (semio, Sep 21, 2024)
cc9e39d  alibaba now has openai compatible endpoint, use it so we have caching (semio, Sep 21, 2024)
63766ee  add experiment and result for qwen-max-2024-09-19 (semio, Sep 28, 2024)
ed75974  fix model names (semio, Sep 30, 2024)
753424a  make archive for 20240910 experiments (semio, Sep 30, 2024)
63eadb1  update dependencies (semio, Nov 21, 2024)
7d77230  update questions (semio, Nov 21, 2024)
b3abfa9  update scripts and notebooks (semio, Nov 21, 2024)
f1e207d  add xai (semio, Nov 22, 2024)
4bc9c3f  default to use 3 evaluators (semio, Nov 22, 2024)
3aaef54  experiment for xai (semio, Nov 22, 2024)
43647b4  add 60 days ttl to keys because the default for new litellm config is… (semio, Nov 28, 2024)
b521fda  add grok result (semio, Dec 5, 2024)
1aef498  archive previous results (semio, Dec 5, 2024)
ed4aa13  archive grok results (semio, Dec 6, 2024)
4d79f6d  new experiment and results (semio, Dec 6, 2024)
c720753  add claude evaluator (semio, Dec 18, 2024)
a57344f  move folder (semio, Dec 20, 2024)
4 changes: 4 additions & 0 deletions automation-api/.env.example
@@ -15,6 +15,10 @@ VERTEXAI_PROJECT="gapminder-ai"
 VERTEXAI_LOCATIONS="asia-southeast1,asia-east2,asia-northeast1"
 # follow the guide in automation-api/DEV.md#obtaining-developer-specific-service-account-credentials-base64-encoded
 VERTEX_SERVICE_ACCOUNT_CREDENTIALS=""
+# fireworks
+FIREWORKS_API_KEY=""
+# for xai
+XAI_API_KEY=""

 # For local development / notebooks etc
 SERVICE_ACCOUNT_CREDENTIALS=""
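The two new keys are only consumed through the environment. A minimal sketch of picking them up after the usual `read_config()` call (assuming the same import context as the scripts under `yival_experiments`; not part of this diff):

```python
# Sketch: read the newly added keys once lib.config.read_config() has loaded .env.
import os

from lib.config import read_config

read_config()
fireworks_key = os.getenv("FIREWORKS_API_KEY", "")
xai_key = os.getenv("XAI_API_KEY", "")
```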
6,590 changes: 3,411 additions & 3,179 deletions automation-api/poetry.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions automation-api/pyproject.toml
@@ -70,6 +70,7 @@ duckdb = "^0.10.2"
 duckdb-engine = "^0.12.0"
 jupysql = "^0.10.10"
 anthropic = {extras = ["vertex"], version = "^0.25.9"}
+fireworks-ai = "^0.15.1"

@@ -85,6 +86,11 @@ ipykernel = "^6.6.0"
 jupytext = "^1.14.4"
 pytest-mock = "^3.6.1"

+[[tool.poetry.source]]
+name = "pytorch_cpu"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "explicit"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,125 @@
"""
ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations.

The evaluator interfaces with Claude via litellm to present tasks and interpret
the model's responses to determine the quality or correctness of a given
experiment result.
"""
import copy
import logging

import litellm
from claude_evaluator_config import ClaudeEvaluatorConfig
from evaluator_common import (
    CLASSIFY_STR,
    calculate_choice_score,
    choices_to_string,
    completion_with_backpff,
    extract_choice_from_response,
    format_template,
)
from yival.evaluators.base_evaluator import BaseEvaluator
from yival.schemas.evaluator_config import (
    EvaluatorOutput,
    EvaluatorType,
    MethodCalculationMethod,
    MetricCalculatorConfig,
)
from yival.schemas.experiment_config import (
    ExperimentResult,
    InputData,
    MultimodalOutput,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ClaudeEvaluator(BaseEvaluator):
    """Evaluator using Claude for evaluation."""

    default_config = ClaudeEvaluatorConfig(name="claude_evaluator")  # type: ignore

    def __init__(self, config: ClaudeEvaluatorConfig):
        super().__init__(config)
        self.config = config

    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
        """Evaluate the experiment result using Claude."""
        format_dict = copy.deepcopy(experiment_result.input_data.content)
        format_dict["raw_output"] = experiment_result.raw_output.text_output

        prompt = format_template(self.config.prompt, format_dict)
        if isinstance(prompt, str):
            prompt = [{"role": "user", "content": prompt}]

        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
            choices=choices_to_string(self.config.choices)
        )
        response = completion_with_backpff(
            model=self.config.model_name,
            messages=prompt,
            temperature=0.0,
            n=1,
            max_tokens=2000,
            request_timeout=60,
            caching=True,
        )
        response_content = response["choices"][0]["message"]["content"]
        choice = extract_choice_from_response(response_content, self.config.choices)
        score = calculate_choice_score(choice, self.config.choice_scores)
        return EvaluatorOutput(
            name=self.config.name,
            result=score if score is not None else choice,
            display_name=self.config.display_name,
            metric_calculators=self.config.metric_calculators,
        )


BaseEvaluator.register_evaluator(
    "claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig
)


def main():
    """Main function to test the ClaudeEvaluator."""
    from example_evaluator_data import (
        choice_scores,
        choices,
        content,
        prompt,
        raw_output,
    )

    litellm.set_verbose = True

    evaluator_config = ClaudeEvaluatorConfig(
        name="claude_evaluator",
        display_name="correctness test",
        metric_calculators=[
            MetricCalculatorConfig(
                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
            )
        ],
        prompt=prompt,
        choices=choices,
        evaluator_type=EvaluatorType.INDIVIDUAL,
        choice_scores=choice_scores,
    )
    input_data_example = InputData(content=content)

    experiment_result_example = ExperimentResult(
        input_data=input_data_example,
        combination={"wrapper1": "var1", "wrapper2": "var2"},
        raw_output=MultimodalOutput(text_output=raw_output),
        latency=150.0,
        token_usage=50,
    )

    evaluator = ClaudeEvaluator(evaluator_config)
    result = evaluator.evaluate(experiment_result_example)
    print("Result: ", result.result)


if __name__ == "__main__":
    main()
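Note that `evaluate()` calls `completion_with_backpff(..., caching=True)`, so repeated evaluations are only deduplicated when a litellm cache is configured in the process. A minimal sketch, reusing the Redis settings this PR adds to the model_compare script (host, port and TTL are this PR's values and may differ per environment):

```python
# Sketch: enable litellm's Redis cache so the evaluator's caching=True flag
# takes effect. A plain in-memory cache (litellm.Cache()) also works locally.
import litellm

litellm.cache = litellm.Cache(
    type="redis", host="127.0.0.1", port=26379, ttl=60 * 24 * 3600  # 60-day TTL
)
```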
@@ -0,0 +1,18 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Union

from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType


@dataclass
class ClaudeEvaluatorConfig(EvaluatorConfig):
    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
    prompt: Union[str, List[Dict[str, str]]] = ""
    choices: List[str] = field(default_factory=list)
    model_name: str = "claude-3-5-sonnet-20241022"
    description: str = "This is an evaluator that uses Anthropic's Claude model."
    scale_description: str = "0-4"
    choice_scores: Optional[Dict[str, float]] = None

    def asdict(self) -> Dict[str, Any]:
        return asdict(self)
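For reference, a hypothetical instantiation; the real prompt, choices and choice_scores come from `example_evaluator_data` and the experiment configuration, so the values below are placeholders only:

```python
# Hypothetical values for illustration; scale_description above says "0-4".
config = ClaudeEvaluatorConfig(
    name="claude_evaluator",
    prompt="Grade the answer below on a 0-4 scale.\n\n{raw_output}",
    choices=["0", "1", "2", "3", "4"],
    choice_scores={str(i): float(i) for i in range(5)},
)
print(config.asdict()["model_name"])  # claude-3-5-sonnet-20241022
```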
@@ -47,7 +47,6 @@ def __init__(self, config: GPT4EvaluatorConfig):

     def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
         """Evaluate the experiment result using OpenAI's prompt-based evaluation."""
-        assert isinstance(self.config, GPT4EvaluatorConfig)
         format_dict = copy.deepcopy(experiment_result.input_data.content)
         format_dict["raw_output"] = experiment_result.raw_output.text_output

@@ -9,7 +9,7 @@ class Llama3EvaluatorConfig(EvaluatorConfig):
     evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
     prompt: Union[str, List[Dict[str, str]]] = ""
     choices: List[str] = field(default_factory=list)
-    model_name: str = "replicate/meta/meta-llama-3-70b-instruct"
+    model_name: str = "fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct"
     description: str = "This is the description of the evaluator."
     scale_description: str = "0-4"
     choice_scores: Optional[Dict[str, float]] = None
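The Llama evaluator now points at the Fireworks-hosted Llama 3.1 405B Instruct instead of Replicate. A minimal sketch of such a call through litellm (model id taken from the config above; assumes the Fireworks credentials from `.env` are exported where litellm expects them):

```python
# Sketch: direct call to the new default Llama evaluator model via litellm.
import litellm

response = litellm.completion(
    model="fireworks_ai/accounts/fireworks/models/llama-v3p1-405b-instruct",
    messages=[{"role": "user", "content": "Reply with the single word OK."}],
    temperature=0.0,
    max_tokens=10,
)
print(response["choices"][0]["message"]["content"])
```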
@@ -1,18 +1,18 @@
 safety_settings = [
     {
         "category": "HARM_CATEGORY_HARASSMENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_HATE_SPEECH",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
     {
         "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-        "threshold": "BLOCK_NONE",
+        "threshold": "BLOCK_ONLY_HIGH",
     },
 ]
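For context, these settings are only applied when the vendor is Google in the model_compare script below. A minimal sketch of that call path (model id taken from the updated comment in model_compare; this is not the PR's exact code):

```python
# Sketch: the relaxed-but-not-disabled filters passed to a Gemini call via litellm.
import litellm
from yival_experiments.custom_configuration.llms.palm_completion import safety_settings

response = litellm.completion(
    model="vertex_ai/gemini-pro-experimental",
    messages=[{"role": "user", "content": "Hello"}],
    temperature=0.5,
    safety_settings=safety_settings,
)
```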
@@ -12,9 +12,6 @@

 # load env vars
 from lib.config import read_config
-from yival_experiments.custom_configuration.llms.alibaba_complete import (
-    llm_complete as alibaba_llm_complete,
-)
 from yival_experiments.custom_configuration.llms.palm_completion import safety_settings

 read_config()
@@ -26,26 +23,31 @@
 # vendor="OpenAI"
 # )
 # default_model_config = dict(
-# model_id="vertex_ai/gemini-1.5-pro-preview-0409",
+# model_id="vertex_ai/gemini-pro-experimental",
 # params={"temperature": 0.5},
 # vendor="Google",
 # )
+# default_model_config = dict(
+# model_id="vertex_ai/claude-3-opus@20240229",
+# params={"temperature": 0.5},
+# vendor="Anthropic",
+# )
+# default_model_config = dict(
+# model_id="replicate/meta/meta-llama-3-70b-instruct",
+# params={"temperature": 0.5},
+# vendor="Meta",
+# )
-default_model_config = dict(
-    model_id="vertex_ai/claude-3-opus@20240229",
-    params={"temperature": 0.5},
-    vendor="Anthropic",
-)
 default_model_config = dict(
-    model_id="replicate/meta/meta-llama-3-70b-instruct",
-    params={"temperature": 0.5},
-    vendor="Meta",
+    model_id="qwen-max", params={"temperature": 0.5}, vendor="Alibaba"
 )
 # set this to see verbose outputs
 litellm.set_verbose = True
 # enable caching in the evaluator.
 # litellm.cache = litellm.Cache()
 # to not use Redis for caching: uncomment the line above and comment the line below.
-litellm.cache = litellm.Cache(type="redis", host="127.0.0.1", port=26379)
+litellm.cache = litellm.Cache(
+    type="redis", host="127.0.0.1", port=26379, ttl=60 * 24 * 3600
+)


def model_compare(
@@ -96,10 +98,10 @@ def model_compare(
     litellm_params = dict(
         model=model["model_id"],
         messages=litellm_messages,
-        caching=False,
+        caching=True,
         num_retries=10,
         request_timeout=60,
-        **model["params"]
+        **model["params"],
     )
     if model["vendor"] == "Google":
         # choose a vertex project location
@@ -109,23 +111,16 @@
         # google allows changing content filters. We will disable all
         litellm_params["safety_settings"] = safety_settings
     elif model["vendor"] == "Anthropic":
-        if "opus" in model["model_id"]:
-            # there is only one location where claude Opus is available.
-            litellm.vertex_location = "us-east5"
-        else:
-            litellm.vertex_location = "us-central1"
-
+        # all Anthropic models are available in us-east5
+        litellm.vertex_location = "us-east5"
+    elif model["vendor"] == "Alibaba":
+        # Alibaba has openai compatible endpoints
+        litellm_params["model"] = f"openai/{litellm_params['model']}"
+        litellm_params["api_key"] = os.getenv("DASHSCOPE_API_KEY")
+        litellm_params["api_base"] = "https://dashscope.aliyuncs.com/compatible-mode/v1"
     try:
-        if model["vendor"] == "Alibaba":
-            # FIXME: alibaba's complete function doesn't support system prompt.
-            output = alibaba_llm_complete(
-                model_name=model["model_id"], prompt=prompt, **model["params"]
-            )
-            response = Response(output=output).output
-            response_text = response["choices"][0]["message"]["content"]
-        else:
-            response = Response(output=completion(**litellm_params)).output
-            response_text = response["choices"][0]["message"]["content"]
+        response = Response(output=completion(**litellm_params)).output
+        response_text = response["choices"][0]["message"]["content"]
     except KeyboardInterrupt:
         raise
     except Exception as e:
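With the Alibaba-specific completion helper removed, qwen-max now flows through the same litellm path as the other vendors. A condensed sketch of what the Alibaba branch sets up (endpoint and env var taken from the diff above; this is not the exact code in the script):

```python
# Sketch: qwen-max via DashScope's OpenAI-compatible endpoint, so litellm's
# caching and retry handling apply to Alibaba models as well.
import os

import litellm

response = litellm.completion(
    model="openai/qwen-max",
    messages=[{"role": "user", "content": "Hello"}],
    temperature=0.5,
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    api_base="https://dashscope.aliyuncs.com/compatible-mode/v1",
    caching=True,
    num_retries=10,
    request_timeout=60,
)
print(response["choices"][0]["message"]["content"])
```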