From fec74f4c87e1609ef1ff1304342030dc0b12d99d Mon Sep 17 00:00:00 2001 From: Avnish Narayan Date: Mon, 4 Dec 2023 14:17:59 -0800 Subject: [PATCH 1/3] LLMPerfV2 Signed-off-by: Avnish Narayan --- .gitignore | 400 +++++++----- NOTICE.txt | 14 - README.md | 437 +++++++++++-- analyze-raw.ipynb | 588 ------------------ analyze-token-benchmark-results.ipynb | 327 ++++++++++ env_sample.txt | 19 - llm_correctness.py | 309 +++++++++ llmperf.py | 484 -------------- optional.txt | 2 - pre-commit.sh | 5 + pyproject.toml | 23 + requirements-dev.txt | 2 + requirements.txt | 18 - sonnet.txt | 518 --------------- src/llmperf/__init__.py | 1 + src/llmperf/common.py | 38 ++ src/llmperf/common_metrics.py | 17 + src/llmperf/models.py | 21 + src/llmperf/ray_clients/__init__.py | 0 src/llmperf/ray_clients/litellm_client.py | 100 +++ .../openai_chat_completions_client.py | 120 ++++ src/llmperf/ray_clients/sagemaker_client.py | 158 +++++ src/llmperf/ray_clients/vertexai_client.py | 135 ++++ src/llmperf/ray_llm_client.py | 22 + src/llmperf/requests_launcher.py | 48 ++ src/llmperf/sonnet.txt | 84 +++ src/llmperf/utils.py | 147 +++++ token_benchmark_ray.py | 464 ++++++++++++++ 28 files changed, 2647 insertions(+), 1854 deletions(-) delete mode 100644 NOTICE.txt delete mode 100644 analyze-raw.ipynb create mode 100644 analyze-token-benchmark-results.ipynb delete mode 100644 env_sample.txt create mode 100644 llm_correctness.py delete mode 100644 llmperf.py delete mode 100644 optional.txt create mode 100755 pre-commit.sh create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt delete mode 100644 requirements.txt delete mode 100644 sonnet.txt create mode 100644 src/llmperf/__init__.py create mode 100644 src/llmperf/common.py create mode 100644 src/llmperf/common_metrics.py create mode 100644 src/llmperf/models.py create mode 100644 src/llmperf/ray_clients/__init__.py create mode 100644 src/llmperf/ray_clients/litellm_client.py create mode 100644 
src/llmperf/ray_clients/openai_chat_completions_client.py create mode 100644 src/llmperf/ray_clients/sagemaker_client.py create mode 100644 src/llmperf/ray_clients/vertexai_client.py create mode 100644 src/llmperf/ray_llm_client.py create mode 100644 src/llmperf/requests_launcher.py create mode 100644 src/llmperf/sonnet.txt create mode 100644 src/llmperf/utils.py create mode 100644 token_benchmark_ray.py diff --git a/.gitignore b/.gitignore index 17584b8..54047ad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,161 +1,247 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions +# The build output should clearly not be checked in +*test-output.xml +/bazel-* +/python/ray/core +/python/ray/pickle5_files/ +/python/ray/thirdparty_files/ +/python/ray/pyarrow_files/ +/python/ray/jars/ +/python/ray/cpp/ +/python/build +/python/dist +/python/python-driver-* +/python/ray/serve/generated +/thirdparty/pkg/ +/build/java +.jar +/dashboard/client/build + +# Files generated by flatc should be ignored +/src/ray/gcs/format/*_generated.h +/src/ray/object_manager/format/*_generated.h +/src/ray/raylet/format/*_generated.h +/java/runtime/src/main/java/io/ray/runtime/generated/* +/java/serve/src/main/java/io/ray/serve/generated/* + +# Files genrated by c++ worker should be ignored. 
+/cpp/example/thirdparty/ +/cpp/example/bazel-* +/python/ray/cpp + +# Redis temporary files +*dump.rdb + +# Python byte code files +*.pyc +python/.eggs +*.egg-info + +# Backup files +*.bak + +# Emacs temporary files +*~ +*# + +# Compiled Object files +*.slo +*.lo +*.o +*.xo +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries *.so +*.dylib +*.dll +python/ray/_raylet.pyd + +# Incremental linking files +*.ilk + +# Library export files +*.exp + +# Debug symbols +*.pdb + +# Fortran module files +*.mod +!deploy/ray-operator/go.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# Visual Studio files +/packages +*.suo +*.user +*.VC.db +*.VC.opendb + +# Protobuf-generated files +*_pb2.py +*.pb.h +*.pb.cc + +# Ray cluster configuration +scripts/nodes.txt + +# OS X folder attributes +.DS_Store + +# Debug files +*.dSYM/ +*.su + +# Python setup files +*.egg-info + +# Compressed files +*.gz + +# Datasets from examples +**/MNIST_data/ +**/cifar-10-batches-bin/ + +# Generated documentation files +/doc/_build +/doc/source/_static/thumbs +/doc/source/tune/generated_guides/ +/doc/source/**/doc/ + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries +.llvm-local.bazelrc + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries +.idea + +# Website +/site/Gemfile.lock +/site/.sass-cache +/site/_site + +# Pytest Cache +**/.pytest_cache +**/.cache +.benchmarks +python-driver-* + +# Vscode +.vscode/ + +*.iml + +# Java +java/**/target +java/**/lib +java/**/.settings +java/**/.classpath +java/**/.project +java/runtime/native_dependencies/ +java/testng_custom.xml + +dependency-reduced-pom.xml + +# Cpp +cpp/example/thirdparty/ + +.clwb + +# pom.xml files generated from 
pom_template.xml +java/**/pom.xml + +# python virtual env +venv + +# pyenv version file +.python-version + +# Vim +.*.swp +*.swp +.*.swo +*.swo +tags +tags.lock +tags.temp +*.vim + +# Emacs +.#* + +# tools +tools/prometheus* + +# ray project files +project-id +.mypy_cache/ + +# release test related +.anyscale.yaml +test_state.json + +# workflow storage +workflow_data/ + +# vscode java extention generated +.factorypath + +# Jupyter Notebooks +**/.ipynb_checkpoints/ + +### Added by Hedron's Bazel Compile Commands Extractor: https://github.com/hedronvision/bazel-compile-commands-extractor +# The external link: Differs on Windows vs macOS/Linux, so we can't check it in. The pattern needs to not have a trailing / because it's a symlink on macOS/Linux. +/external +# Compiled output -> don't check in +/compile_commands.json +# Directory where clangd puts its indexing work +/.cache/ -# Distribution / packaging -.Python +# Auto-generated tag mapping +tag-mapping.json + +.bazeliskrc + +# ignore tmp files +*.tmp +out +temp* + +# build output build/ -develop-eggs/ dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ -.env -results/ + +# results +output/ +*.json +result_outputs/ + +__pycache__ +**/__pycache__/ \ No newline at end of file diff --git a/NOTICE.txt b/NOTICE.txt deleted file mode 100644 index 4820e73..0000000 --- a/NOTICE.txt +++ /dev/null @@ -1,14 +0,0 @@ -[Project Name] -Copyright 2023-onwards Anyscale, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
\ No newline at end of file diff --git a/README.md b/README.md index 966c4e4..d40604e 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,407 @@ -# llmperf +# LLMPerf -LLMPerf is a tool for benchmarking and validating the performance of LLMs. +A tool for evaluating the performance of LLM APIs. -Benchmarking: LLMPerf measures time to first token (TTFT), -inter-token latency (ITL) and requests that take longer than 3 seconds -to start returning data. +# Installation +```bash +git clone ... +cd LLMPerf +pip install -e . +``` -Validation: we send a simple query to the LLM and ensure the returned data -is valid. In particular it checks for inter-request cross-over -(request A gets the responses for request B). +# Basic Usage -Variation in input and output token lengths is a design parameter -since this is intended to be representative. This is because -there are some optimizations (e.g. continuous batching) that -we know work better with varying input and output length. +We implement 2 tests for evaluating LLMs: a load test to check for performance and a correctness test to check for correctness. -## Supported endpoints +## Load test -Currently supported endpoints include: +The load test spawns a number of concurrent requests to the LLM API and measures the inter-token latency and generation throughput per request and across concurrent requests. The prompt that is sent with each request is of the format: -- Any OpenAI compatible endpoints, including Anyscale Endpoints, -Anyscale Private Endpoints, OpenAI, Fireworks, Perplexity etc -- Any [Huggingface Text Generation Inference](https://github.com/huggingface/text-generation-inference) endpoints -- Together -- Vertex AI -- SageMaker +``` +Randomly stream lines from the following text. Don't generate eos tokens: +LINE 1, +LINE 2, +LINE 3, +... +``` -Please see `requirements.txt` for more details on dependency requirements. +Where the lines are randomly sampled from a collection of lines from Shakespeare sonnets. 
Tokens are counted using the `LlamaTokenizer` regardless of which LLM API is being tested. This is to ensure that the prompts are consistent across different LLM APIs. -## Upcoming refactor +To run the most basic load test you can use the token_benchmark_ray script. -This is prototype code. We are currently refactoring the code to be more -extensible (including a pluggable endpoints, varying traffic load etc). +### OpenAI Compatible APIs +```bash +export OPENAI_API_KEY=secret_abcdefg +export OPENAI_API_BASE="https://api.endpoints.anyscale.com/v1" -In addition we plan to: +python token_benchmark_ray.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api openai \ +--additional-sampling-params '{}' -- Make running the benchmark not only possible from -command line, but also possible to integrate easily into CI/CD or job scheduling -systems. -- Control where the generated files and information go. -- Automate report generation. +``` -We expect this refactor to be complete some time in November 2023. +### Anthropic +```bash +export ANTHROPIC_API_KEY=secret_abcdefg -## A note on rate limits +python token_benchmark_ray.py \ +--model "claude-2" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api anthropic \ +--additional-sampling-params '{}' -Many LLM providers have extremely low rate limits by default (e.g. Perplexity 3 requests per 90 seconds). +``` -You can use the sleep parameter to overcome these difficulties, but it does affect the representativeness of the results. 
+### TogetherAI -Other systems do not have rate limits, but we consider that if the TTFT exceeds 3 second for more than -5% of queries that the system is overloaded. +```bash +export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY" +python token_benchmark_ray.py \ +--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api "litellm" \ +--additional-sampling-params '{}' -## Default values +``` -Default values are the ones that we use for testing Anyscale Endpoints. -The distribution of inputs and outputs roughly mirrors the input and output -patterns we see there. +### Hugging Face API -We recommend setting the seed (or using the provided seed) to reduce variance but -still have randomization. +```bash +export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" +export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT" -Do a python llmperf.py --help to see all options. +python token_benchmark_ray.py \ +--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api "litellm" \ +--additional-sampling-params '{}' -## Usage -1. Provide API base and key in .env file. Check out env_sample.txt -2. Test out Anyscale Endpoint with following command by sending 20 requests -`python llmperf.py -r 20 -m "meta-llama/Llama-2-70b-chat-hf"` -3. 
Control input token numbers by setting min/max lines, and control output token number by setting req-lines and max_tokens -`python llmperf.py -r 20 -f openai -m "gpt-3.5-turbo" --min-lines 8 --max-lines 10` -`python llmperf.py -r 20 -f openai -m "gpt-3.5-turbo" --req-lines 3 --max-tokens 128` -4. Control sleep between rounds to avoid hitting rate limit -`python llmperf.py -r 20 -f fireworks -m "accounts/fireworks/models/llama-v2-70b-chat" --sleep 10` -5. Output will be saved at **framework-timestamp.json** and **framework-timestamp_raw.json** -6. Use Jupyter with analyze-raw.ipynb to visualize and/or interact with the raw data. +``` +### LiteLLM + +LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. + +see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). + +```bash +python token_benchmark_ray.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api "litellm" \ +--additional-sampling-params '{}' + +``` + +### Vertex AI + +Here, --model is used for logging, not for selecting the model. The model is specified in the Vertex AI Endpoint ID. + +The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. + +Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. 
+ +```bash + +gcloud auth application-default login +gcloud config set project YOUR_PROJECT_ID + +export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) +export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID +export GCLOUD_REGION=YOUR_REGION +export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID + +python token_benchmark_ray.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--mean-input-tokens 550 \ +--stddev-input-tokens 150 \ +--mean-output-tokens 150 \ +--stddev-output-tokens 10 \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ +--llm-api "vertexai" \ +--additional-sampling-params '{}' + +``` + +### Sagemaker + +Sagemaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. + +```bash + +export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" +export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY" +export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" +export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" + +python llm_correctness.py \ +--model "llama-2-7b" \ +--llm-api "sagemaker" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +see `python token_benchmark_ray.py --help` for more details on the arguments. + +## Correctness Test + +The correctness test spawns a number of concurrent requests to the LLM API with the following format: + +``` +Convert the following sequence of words into a number: {random_number_in_word_format}. Output just your final answer. +``` + +where random_number_in_word_format could be for example "one hundred and twenty three". The test then checks that the response contains that number in digit format which in this case would be 123. + +The test does this for a number of randomly generated numbers and reports the number of responses that contain a mismatch. + +To run the most basic correctness test you can run the llm_correctness.py script. 
+ +### OpenAI Compatible APIs + +```bash +export OPENAI_API_KEY=secret_abcdefg +export OPENAI_API_BASE=https://console.endpoints.anyscale.com/m/v1 + +python llm_correctness.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--max-num-completed-requests 150 \ +--timeout 600 \ +--num-concurrent-requests 10 \ +--results-dir "result_outputs" +``` + +### Anthropic + +```bash +export ANTHROPIC_API_KEY=secret_abcdefg + +python llm_correctness.py \ +--model "claude-2" \ +--llm-api "anthropic" \ +--max-num-completed-requests 5 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" +``` + +### TogetherAI + +```bash +export TOGETHERAI_API_KEY="YOUR_TOGETHER_KEY" + +python llm_correctness.py \ +--model "together_ai/togethercomputer/CodeLlama-7b-Instruct" \ +--llm-api "litellm" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +### Hugging Face API + +```bash +export HUGGINGFACE_API_KEY="YOUR_HUGGINGFACE_API_KEY" +export HUGGINGFACE_API_BASE="YOUR_HUGGINGFACE_API_ENDPOINT" + +python llm_correctness.py \ +--model "huggingface/meta-llama/Llama-2-7b-chat-hf" \ +--llm-api "litellm" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +### LiteLLM + +LLMPerf can use LiteLLM to send prompts to LLM APIs. To see the environment variables to set for the provider and arguments that one should set for model and additional-sampling-params. + +see the [LiteLLM Provider Documentation](https://docs.litellm.ai/docs/providers). + +```bash +python llm_correctness.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--llm-api "litellm" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +see `python llm_correctness.py --help` for more details on the arguments. + + +### Vertex AI + +Here, --model is used for logging, not for selecting the model. 
The model is specified in the Vertex AI Endpoint ID. + +The GCLOUD_ACCESS_TOKEN needs to be somewhat regularly set, as the token generated by `gcloud auth print-access-token` expires after 15 minutes or so. + +Vertex AI doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. + + +```bash + +gcloud auth application-default login +gcloud config set project YOUR_PROJECT_ID + +export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) +export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID +export GCLOUD_REGION=YOUR_REGION +export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID + +python llm_correctness.py \ +--model "meta-llama/Llama-2-7b-chat-hf" \ +--llm-api "vertexai" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +### Sagemaker + +Sagemaker doesn't return the total number of tokens that are generated by their endpoint, so tokens are counted using the LLama tokenizer. + +```bash + +export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY_ID" +export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_ACCESS_KEY" +export AWS_SESSION_TOKEN="YOUR_SESSION_TOKEN" +export AWS_REGION_NAME="YOUR_ENDPOINTS_REGION_NAME" + +python llm_correctness.py \ +--model "llama-2-7b" \ +--llm-api "sagemaker" \ +--max-num-completed-requests 2 \ +--timeout 600 \ +--num-concurrent-requests 1 \ +--results-dir "result_outputs" \ + +``` + +## Saving Results + +The results of the load test and correctness test are saved in the results directory specified by the `--results-dir` argument. The results are saved in 2 files, one with the summary metrics of the test, and one with metrics from each individual request that is returned. 
+ +# Advanced Usage + +The correctness tests were implemented with the following workflow in mind: + +```python +import ray +from transformers import LlamaTokenizerFast + +from llmperf.ray_clients.openai_chat_completions_client import ( + OpenAIChatCompletionsClient, +) +from llmperf.models import RequestConfig +from llmperf.requests_launcher import RequestsLauncher + + +# Copying the environment variables and passing them to ray.init() is necessary +# For making any clients work. +ray.init(runtime_env={"env_vars": {"OPENAI_API_BASE" : "https://api.endpoints.anyscale.com/v1", + "OPENAI_API_KEY" : "YOUR_API_KEY"}}) + +base_prompt = "hello_world" +tokenizer = LlamaTokenizerFast.from_pretrained( + "hf-internal-testing/llama-tokenizer" +) +base_prompt_len = len(tokenizer.encode(base_prompt)) +prompt = (base_prompt, base_prompt_len) + +# Create a client for spawning requests +clients = [OpenAIChatCompletionsClient.remote()] + +req_launcher = RequestsLauncher(clients) + +req_config = RequestConfig( + model="meta-llama/Llama-2-7b-chat-hf", + prompt=prompt + ) + +req_launcher.launch_requests(req_config) +result = req_launcher.get_next_ready(block=True) +print(result) + +``` + +# Implementing New LLM Clients + +To implement a new LLM client, you need to implement the base class `llmperf.ray_llm_client.LLMClient` and decorate it as a ray actor. + +```python + +from llmperf.ray_llm_client import LLMClient +import ray + + +@ray.remote +class CustomLLMClient(LLMClient): + + def llm_request(self, request_config: RequestConfig) -> Tuple[Metrics, str, RequestConfig]: + """Make a single completion request to a LLM API + + Returns: + Metrics about the performance characteristics of the request. + The text generated by the request to the LLM API. + The request_config used to make the request. This is mainly for logging purposes. + + """ + ... + +``` + +# Legacy Codebase +The old LLMPerf code base can be found in the [llmperf-legacy](https://github.com/ray-project/llmval-legacy) repo. 
diff --git a/analyze-raw.ipynb b/analyze-raw.ipynb deleted file mode 100644 index a272fd7..0000000 --- a/analyze-raw.ipynb +++ /dev/null @@ -1,588 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 39, - "id": "dacfe98a-e81b-4089-9506-97a652993b5b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import time\n", - "import datetime\n" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "17f7abe9-ed9e-466c-b034-577489aaf98b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "df = pd.read_json('anyscale-1697499992_raw.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "56da958f-694d-4e3e-a559-a275ae22d5d4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Validity results:\n", - "Exception 419\n", - "OK 181\n", - "Name: valid, dtype: int64\n", - "Clean DF is: 181\n", - "Mean TTFT: 2209 ms (mean tokens in: 512, out: 138)\n", - "TTFT > 3 s: 24.86%\n", - "ITL (out): 77.51 ms/token, mean tokens/s output (out): 12.93 token/s\n" - ] - } - ], - "source": [ - "# This is the standard analysis we would do in the real script\n", - "\n", - "print('Validity results:')\n", - "print(df['valid'].value_counts())\n", - "cdf = df[df.valid !='Exception'].copy()\n", - "print(f'Clean DF is: {len(cdf)}')\n", - "cdf['inter_tokens_delay'] = cdf.total_time/cdf.tokens_out\n", - "cdf['total_tokens_per_s'] = (cdf.tokens_out + cdf.tokens_in)/cdf.total_time\n", - "cdf['out_tokens_per_s'] = cdf.tokens_out/cdf.total_time\n", - "mean_tokens_in = cdf['tokens_in'].mean() \n", - "mean_tokens_out = cdf['tokens_out'].mean() \n", - "mean_ttft = cdf['ttft'].mean()\n", - "gt_3_ttft = len(cdf[cdf['ttft'] > 3])/len(cdf)\n", - "print(f'Mean TTFT: {mean_ttft*1000:.0f} ms (mean tokens in: {mean_tokens_in:.0f}, out: {mean_tokens_out:.0f})')\n", - "print(f'TTFT > 3 s: {gt_3_ttft*100:.2f}%')\n", - "print(f'ITL (out): 
{cdf.inter_tokens_delay.mean()*1000:.2f} ms/token, mean tokens/s output (out): {cdf.out_tokens_per_s.mean():.2f} token/s')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "99936985-a21f-4738-9021-f9db8d67769a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "181" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(cdf)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "4f5db74b-63ab-4268-b1a7-10b14641efb1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cdf.plot.scatter(y='ttft', x='tokens_in')" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "f08d9085-d994-4754-a545-390cad1f4806", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cdf.plot.scatter(y='total_time', x='tokens_out', ylim=[0,10])" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "6f520389-aea3-4f0f-a23a-9094b57251c8", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ttfttotal_timetokens_intokens_outinter_tokens_delaytotal_tokens_per_sout_tokens_per_s
count181.000000181.000000181.000000181.000000181.000000181.000000181.000000
mean2.20935010.661870511.988950137.9447510.07750961.12358412.934065
std0.9108440.810370136.90519814.2819710.00385012.9826010.658547
min0.2467838.863190258.000000120.0000000.06459233.38616311.382172
25%1.64669610.251724404.000000131.0000000.07563050.12681112.559732
50%2.67537110.556036517.000000136.0000000.07716663.10508312.959061
75%2.99817510.958826630.000000140.0000000.07962071.55723913.222181
max3.19851816.923179738.000000262.0000000.08785784.33061215.481725
\n", - "
" - ], - "text/plain": [ - " ttft total_time tokens_in tokens_out inter_tokens_delay \\\n", - "count 181.000000 181.000000 181.000000 181.000000 181.000000 \n", - "mean 2.209350 10.661870 511.988950 137.944751 0.077509 \n", - "std 0.910844 0.810370 136.905198 14.281971 0.003850 \n", - "min 0.246783 8.863190 258.000000 120.000000 0.064592 \n", - "25% 1.646696 10.251724 404.000000 131.000000 0.075630 \n", - "50% 2.675371 10.556036 517.000000 136.000000 0.077166 \n", - "75% 2.998175 10.958826 630.000000 140.000000 0.079620 \n", - "max 3.198518 16.923179 738.000000 262.000000 0.087857 \n", - "\n", - " total_tokens_per_s out_tokens_per_s \n", - "count 181.000000 181.000000 \n", - "mean 61.123584 12.934065 \n", - "std 12.982601 0.658547 \n", - "min 33.386163 11.382172 \n", - "25% 50.126811 12.559732 \n", - "50% 63.105083 12.959061 \n", - "75% 71.557239 13.222181 \n", - "max 84.330612 15.481725 " - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdf.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "81c9cdee-028f-448b-9179-04aa758e4f37", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cdf['ttft'].plot.hist(bins=50)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "cd59c882", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", 
- " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '',\n", - " '']" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(df[df['valid'] == 'OK']['cause'].to_numpy())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51469ab5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91d5b583-b052-4e07-a063-47c8e10ede52", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/analyze-token-benchmark-results.ipynb b/analyze-token-benchmark-results.ipynb new file mode 100644 index 0000000..d6c5a45 --- /dev/null +++ b/analyze-token-benchmark-results.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "56950450", + "metadata": {}, + "source": [ + "# Token Benchmark Example Analysis\n", + "The following is an example of the analysis that can be done on individual responses that are saved when running `token_benchmark_ray.py` with the flag `--results-dir` which enables the saving of all responses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dacfe98a-e81b-4089-9506-97a652993b5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "17f7abe9-ed9e-466c-b034-577489aaf98b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
error_codeerror_msginter_token_latency_sttft_send_to_end_latency_srequest_output_throughput_token_per_snumber_total_tokensnumber_output_tokensnumber_input_tokens
0NaN[0.5549881670012831, 0.0009654169989510001, 0....0.5549881.61073444.07927270671635
1NaN[0.6019128750049271, 0.007011749999946, 0.0144...0.6019131.72572944.03935773076654
\n", + "
" + ], + "text/plain": [ + " error_code error_msg inter_token_latency_s \\\n", + "0 NaN [0.5549881670012831, 0.0009654169989510001, 0.... \n", + "1 NaN [0.6019128750049271, 0.007011749999946, 0.0144... \n", + "\n", + " ttft_s end_to_end_latency_s request_output_throughput_token_per_s \\\n", + "0 0.554988 1.610734 44.079272 \n", + "1 0.601913 1.725729 44.039357 \n", + "\n", + " number_total_tokens number_output_tokens number_input_tokens \n", + "0 706 71 635 \n", + "1 730 76 654 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# path to the individual responses json file\n", + "df = pd.read_json('/home/ray/default/llmperf/result_outputs/550_150_individual_responses.json')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "565a59e4", + "metadata": {}, + "outputs": [], + "source": [ + "valid_df = df[(df[\"error_code\"] != \"\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "102894bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
error_codeerror_msginter_token_latency_sttft_send_to_end_latency_srequest_output_throughput_token_per_snumber_total_tokensnumber_output_tokensnumber_input_tokens
0NaN[0.5549881670012831, 0.0009654169989510001, 0....0.5549881.61073444.07927270671635
1NaN[0.6019128750049271, 0.007011749999946, 0.0144...0.6019131.72572944.03935773076654
\n", + "
" + ], + "text/plain": [ + " error_code error_msg inter_token_latency_s \\\n", + "0 NaN [0.5549881670012831, 0.0009654169989510001, 0.... \n", + "1 NaN [0.6019128750049271, 0.007011749999946, 0.0144... \n", + "\n", + " ttft_s end_to_end_latency_s request_output_throughput_token_per_s \\\n", + "0 0.554988 1.610734 44.079272 \n", + "1 0.601913 1.725729 44.039357 \n", + "\n", + " number_total_tokens number_output_tokens number_input_tokens \n", + "0 706 71 635 \n", + "1 730 76 654 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valid_df" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c7519fc9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean number of input tokens: 644.5. Mean number of output tokens: 73.5\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "final_df = pd.DataFrame()\n", + "final_df[\"number_input_tokens\"] = valid_df[\"number_input_tokens\"]\n", + "final_df[\"number_output_tokens\"] = valid_df[\"number_output_tokens\"]\n", + "final_df[\"ttft_s\"] = valid_df[\"ttft_s\"]\n", + "final_df[\"end_to_end_latency_s\"] = valid_df[\"end_to_end_latency_s\"]\n", + "final_df[\"generation_throughput\"] = valid_df[\"request_output_throughput_token_per_s\"]\n", + "\n", + "mean_tokens_in = final_df[\"number_input_tokens\"].mean()\n", + "mean_tokens_out = valid_df[\"number_output_tokens\"].mean()\n", + "print(f\"Mean number of input tokens: {mean_tokens_in}. Mean number of output tokens: {mean_tokens_out}\")\n", + "final_df.plot.scatter(x=\"number_input_tokens\", y=\"ttft_s\", title=\"Number of Input Tokens vs. TTFT\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a14de79c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjcAAAGzCAYAAADT4Tb9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAvAklEQVR4nO3de1TUVb/H8c+AAl7wUioocsRILTO19EiEVhaJ6aHMfLyVIpkdE08m6VNmiaaJWZKdMk3zkqdj+pSXWqWWktbpaI8nL1k9XlIjvIGQFxQTEPb5w+U8TWDCODCwfb/WmrWaPXv/5rt31HzW77d/Mw5jjBEAAIAlfLxdAAAAgCcRbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuABTjcDg0cuRIb5dxVXA4HJo4caK3ywCsQrgBLOFwOEr12Lhxo7dLLZO77rpLbdq08cixNm3apIkTJ+rkyZMeOR6AyqmatwsA4Bn/9V//5fJ88eLFWrduXbH2G2+8sSLLqlQ2bdqkSZMmaciQIapXr563y5Ek/fbbb6pWjf8VA57Ef1GAJR555BGX5998843WrVtXrB2VS0BAgLdLAKzDZSngKpKbm6unn35aoaGh8vf3V6tWrfTqq6/KGHPZsVOmTJGPj4/eeOMNZ9uaNWvUpUsX1apVS4GBgerZs6d+/PFHl3FDhgxR7dq1dfjwYfXq1Uu1a9dWw4YNNWbMGBUWFnpkXjt37tSQIUN03XXXKSAgQMHBwXr00Uf166+/OvtMnDhRY8eOlSQ1b97ceZkuLS3N2ee9995Thw4dVKNGDV1zzTXq37+/Dh486PJeFy+T/eMf/1DXrl1Vs2ZNhYSEaPr06cXqOnfunCZOnKiWLVsqICBAjRs3Vu/evbV//35nn5L23Bw+fFiPPvqogoKC5O/vr5tuukkLFiwodvw33nhDN910k2rWrKn69eurY8eOWrJkiTtLCFiFMzfAVcIYo/vvv18bNmzQ0KFD1b59e3322WcaO3asDh8+rNdee+2SY59//nlNnTpVb7/9toYNGybpwmWwuLg4xcTE6OWXX9bZs2c1e/Zsde7cWdu3b1dYWJhzfGFhoWJiYhQREaFXX31V69ev14wZMxQeHq4nnnjiiue2bt06HThwQPHx8QoODtaPP/6ouXPn6scff9Q333wjh8Oh3r17a+/evXr//ff12muvqUGDBpKkhg0bSpJeeuklvfDCC+rbt68ee+wxZWVl6Y033tAdd9yh7du3u1zGOnHihLp3767evXurb9+++vDDD/XMM8/o5ptv1n333eec87/9278pNTVV/fv316hRo3T69GmtW7dOP/zwg8LDw0ucS2Zmpm677Tbnpu6GDRtqzZo1Gjp0qHJycvTUU09JkubNm6cnn3xSffr00ahRo3Tu3Dnt3LlTf//73zVw4MArXlOgSjMArJSQkGB+/5/4qlWrjCQzZcoUl359+vQxDofD7Nu3z9kmySQkJBhjjHn66aeNj4+PWbRokfP106dPm3r16plhw4a5HCsjI8PUrVvXpT0uLs5IMi+++KJL31tuucV06NDhsvO48847zU033fSnfc6ePVus7f333zeSzFdffeVse+WVV4wk8/PPP7v0TUtLM76+vuall15yaf/+++9NtWrVXNrvvPNOI8ksXrzY2ZaXl2eCg4PNQw895GxbsGCBkWRSUlKK1VZUVOT8Z0kmKSnJ+Xzo0KGmcePGJjs722VM//79Td26dZ1zfeCBBy67LsDVistSwFVi9erV8vX11ZNPPunS/vTTT8sYozVr1ri0G2M0cuRIvf7663rvvfcUFxfnfG3dunU6efKkBgwYoOzsbOfD19dXERER2rBhQ7H3Hz58uMvzLl266MCBAx6ZW40aNZz/fO7cOWVnZ+u2226TJG3btu2y41esWKGioiL17dvXZT7BwcFq0aJFsfnUrl3bZS+Tn5+fOnXq5DKf5cu
Xq0GDBvqP//iPYu/ncDhKrMMYo+XLlys2NlbGGJdaYmJidOrUKed86tWrp0OHDun//u//Ljs/4GrDZSngKvHLL7+oSZMmCgwMdGm/ePfUL7/84tK+ePFinTlzRrNnz9aAAQNcXvvpp58kSXfffXeJ71WnTh2X5wEBAc7LPxfVr19fJ06cKPtESnD8+HFNmjRJS5cu1bFjx1xeO3Xq1GXH//TTTzLGqEWLFiW+Xr16dZfnTZs2LRZQ6tevr507dzqf79+/X61atSrTnVBZWVk6efKk5s6dq7lz55bY5+L8nnnmGa1fv16dOnXS9ddfr27dumngwIGKiooq9fsBtiLcAChRVFSUduzYoTfffFN9+/bVNddc43ytqKhI0oV9N8HBwcXG/vED3dfXt1xr7du3rzZt2qSxY8eqffv2ql27toqKitS9e3dnrX+mqKhIDodDa9asKbHW2rVruzy/1HxMKTZmX64O6cKdb78/U/Z7bdu2lXQhlO7Zs0effPKJ1q5dq+XLl+utt97ShAkTNGnSpCuqA6jqCDfAVaJZs2Zav369Tp8+7XL2Zvfu3c7Xf+/666/X9OnTddddd6l79+5KTU11jru4GbZRo0aKjo6uoBmU7MSJE0pNTdWkSZM0YcIEZ/vFs0u/d6nLQeHh4TLGqHnz5mrZsqVH6goPD9ff//53FRQUFDvzcykNGzZUYGCgCgsLS7WutWrVUr9+/dSvXz/l5+erd+/eeumllzRu3DhuMcdVjT03wFWiR48eKiws1JtvvunS/tprr8nhcDjv8vm9tm3bavXq1dq1a5diY2P122+/SZJiYmJUp04dTZ06VQUFBcXGZWVllc8kSnDxLMofz5rMnDmzWN9atWpJUrFvKO7du7d8fX01adKkYscxxrjcUl5aDz30kLKzs4utd0m1XuTr66uHHnpIy5cv1w8//FDs9d+v6x9r8vPzU+vWrWWMKfHfCXA14cwNcJWIjY1V165dNX78eKWlpaldu3b6/PPP9dFHH+mpp5665K3Jt912mz766CP16NFDffr00apVq1SnTh3Nnj1bgwYN0q233qr+/furYcOGSk9P16effqqoqKgSP9TdlZWVpSlTphRrb968uR5++GHdcccdmj59ugoKChQSEqLPP/9cP//8c7H+HTp0kCSNHz9e/fv3V/Xq1RUbG6vw8HBNmTJF48aNU1pamnr16qXAwED9/PPPWrlypR5//HGNGTOmTDUPHjxYixcvVmJiorZs2aIuXbooNzdX69ev14gRI/TAAw+UOG7atGnasGGDIiIiNGzYMLVu3VrHjx/Xtm3btH79eh0/flyS1K1bNwUHBysqKkpBQUHatWuX3nzzTfXs2bPYvirgquOdm7QAlLc/3gpuzIVbuEePHm2aNGliqlevblq0aGFeeeUVl1uTjXG9Ffyijz76yFSrVs3069fPFBYWGmOM2bBhg4mJiTF169Y1AQEBJjw83AwZMsR8++23znFxcXGmVq1axepLSkoqVl9JLt56XdLjnnvuMcYYc+jQIfPggw+aevXqmbp165q//OUv5siRI8VuszbGmMmTJ5uQkBDj4+NT7Lbw5cuXm86dO5tatWqZWrVqmRtuuMEkJCSYPXv2uNRT0i3YcXFxplmzZi5tZ8+eNePHjzfNmzc31atXN8HBwaZPnz5m//79zj4l1ZiZmWkSEhJMaGioc9w999xj5s6d6+zz9ttvmzvuuMNce+21xt/f34SHh5uxY8eaU6dOXXZNAds5jLnCHXAAAACVCHtuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsctV9iV9RUZGOHDmiwMDAS34VOwAAqFyMMTp9+rSaNGkiH58/Pzdz1YWbI0eOKDQ01NtlAAAANxw8eFBNmzb90z5XXbi5+LXkBw8eVJ06dbxcDQAAKI2cnByFhoaW6udFrrpwc/FSVJ06dQg3AABUMaXZUsKGYgAAYBXCDQAAsArhBgAAWIV
wAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwilfDzVdffaXY2Fg1adJEDodDq1atuuyYjRs36tZbb5W/v7+uv/56LVq0qNzrBAAAVYdXw01ubq7atWunWbNmlar/zz//rJ49e6pr167asWOHnnrqKT322GP67LPPyrlSAABQVXj1hzPvu+8+3XfffaXuP2fOHDVv3lwzZsyQJN144436+uuv9dprrykmJqa8ygQAAFVIldpzs3nzZkVHR7u0xcTEaPPmzZcck5eXp5ycHJcHAACwl1fP3JRVRkaGgoKCXNqCgoKUk5Oj3377TTVq1Cg2Jjk5WZMmTaqoEhX27KcV9l6ekjatp7dLAABcAp8rZVelzty4Y9y4cTp16pTzcfDgQW+XBAAAylGVOnMTHByszMxMl7bMzEzVqVOnxLM2kuTv7y9/f/+KKA8AAFQCVerMTWRkpFJTU13a1q1bp8jISC9VBAAAKhuvhpszZ85ox44d2rFjh6QLt3rv2LFD6enpki5cUho8eLCz//Dhw3XgwAH99a9/1e7du/XWW2/pb3/7m0aPHu2N8gEAQCXk1XDz7bff6pZbbtEtt9wiSUpMTNQtt9yiCRMmSJKOHj3qDDqS1Lx5c3366adat26d2rVrpxkzZuidd97hNnAAAODk1T03d911l4wxl3y9pG8fvuuuu7R9+/ZyrAoAAFRlVWrPDQAAwOUQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFa+Hm1mzZiksLEwBAQGKiIjQli1b/rT/zJkz1apVK9WoUUOhoaEaPXq0zp07V0HVAgCAys6r4WbZsmVKTExUUlKStm3bpnbt2ikmJkbHjh0rsf+SJUv07LPPKikpSbt27dL8+fO1bNkyPffccxVcOQAAqKy8Gm5SUlI0bNgwxcfHq3Xr1pozZ45q1qypBQsWlNh/06ZNioqK0sCBAxUWFqZu3bppwIABlz3bAwAArh5eCzf5+fnaunWroqOj/1mMj4+io6O1efPmEsfcfvvt2rp1qzPMHDhwQKtXr1aPHj0u+T55eXnKyclxeQAAAHtV89YbZ2dnq7CwUEFBQS7tQUFB2r17d4ljBg4cqOzsbHXu3FnGGJ0/f17Dhw//08tSycnJmjRpkkdrBwAAlZfXNxSXxcaNGzV16lS99dZb2rZtm1asWKFPP/1UkydPvuSYcePG6dSpU87HwYMHK7BiAABQ0bx25qZBgwby9fVVZmamS3tmZqaCg4NLHPPCCy9o0KBBeuyxxyRJN998s3Jzc/X4449r/Pjx8vEpntX8/f3l7+/v+QkAAIBKyWtnbvz8/NShQwelpqY624qKipSamqrIyMgSx5w9e7ZYgPH19ZUkGWPKr1gAAFBleO3MjSQlJiYqLi5OHTt2VKdOnTRz5kzl5uYqPj5ekjR48GCFhIQoOTlZkhQbG6uUlBTdcsstioiI0L59+/TCCy8oNjbWGXIAAMDVzavhpl+/fsrKytKECROUkZGh9u3ba+3atc5Nxunp6S5nap5//nk5HA49//zzOnz4sBo2bKjY2Fi99NJ
L3poCAACoZBzmKruek5OTo7p16+rUqVOqU6eOx48f9uynHj9meUub1tPbJQAALoHPlQvK8vldpe6WAgAAuBzCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFbfCzYEDBzxdBwAAgEe4FW6uv/56de3aVe+9957OnTvn6ZoAAADc5la42bZtm9q2bavExEQFBwfr3//937Vlyxa3Cpg1a5bCwsIUEBCgiIiIyx7n5MmTSkhIUOPGjeXv76+WLVtq9erVbr03AACwj1vhpn379nr99dd15MgRLViwQEePHlXnzp3Vpk0bpaSkKCsrq1THWbZsmRITE5WUlKRt27apXbt2iomJ0bFjx0rsn5+fr3vvvVdpaWn68MMPtWfPHs2bN08hISHuTAMAAFjoijYUV6tWTb1799YHH3ygl19+Wfv27dOYMWMUGhqqwYMH6+jRo386PiUlRcOGDVN8fLxat26tOXPmqGbNmlqwYEGJ/RcsWKDjx49r1apVioqKUlhYmO688061a9fuSqYBAAAsckXh5ttvv9WIESPUuHFjpaSkaMyYMdq/f7/WrVunI0eO6IEHHrjk2Pz8fG3dulXR0dH/LMbHR9HR0dq8eXOJYz7++GNFRkYqISFBQUFBatOmjaZOnarCwsJLvk9eXp5ycnJcHgAAwF7V3BmUkpKihQsXas+ePerRo4cWL16sHj16yMfnQlZq3ry5Fi1apLCwsEseIzs7W4WFhQoKCnJpDwoK0u7du0scc+DAAX3xxRd6+OGHtXr1au3bt08jRoxQQUGBkpKSShyTnJysSZMmuTNNAABQBbkVbmbPnq1HH31UQ4YMUePGjUvs06hRI82fP/+KivujoqIiNWrUSHPnzpWvr686dOigw4cP65VXXrlkuBk3bpwSExOdz3NychQaGurRugAAQOXhVrj56aefLtvHz89PcXFxl3y9QYMG8vX1VWZmpkt7ZmamgoODSxzTuHFjVa9eXb6+vs62G2+8URkZGcrPz5efn1+xMf7+/vL3979svQAAwA5u7blZuHChPvjgg2LtH3zwgd59991SHcPPz08dOnRQamqqs62oqEipqamKjIwscUxUVJT27dunoqIiZ9vevXvVuHHjEoMNAAC4+rgVbpKTk9WgQYNi7Y0aNdLUqVNLfZzExETNmzdP7777rnbt2qUnnnhCubm5io+PlyQNHjxY48aNc/Z/4okndPz4cY0aNUp79+7Vp59+qqlTpyohIcGdaQAAAAu5dVkqPT1dzZs3L9berFkzpaenl/o4/fr1U1ZWliZMmKCMjAy1b99ea9eudW4yTk9Pd25SlqTQ0FB99tlnGj16tNq2bauQkBCNGjVKzzzzjDvTAAAAFnIr3DRq1Eg7d+4sdjfUd999p2uvvbZMxxo5cqRGjhxZ4msbN24s1hYZGalvvvmmTO8BAACuHm5dlhowYICefPJJbdiwQYWFhSosLNQXX3yhUaNGqX///p6uEQAAoNTcOnMzefJkpaWl6Z577lG1ahcOUVRUpMGDB5dpzw0AAICnuRVu/Pz8tGzZMk2ePFnfffedatSooZtvvlnNmjXzdH0AAABl4la4uahly5Z
q2bKlp2oBAAC4Ym6Fm8LCQi1atEipqak6duyYy/fOSNIXX3zhkeIAAADKyq1wM2rUKC1atEg9e/ZUmzZt5HA4PF0XAACAW9wKN0uXLtXf/vY39ejRw9P1AAAAXBG3bgX38/PT9ddf7+laAAAArphb4ebpp5/W66+/LmOMp+sBAAC4Im5dlvr666+1YcMGrVmzRjfddJOqV6/u8vqKFSs8UhwAAEBZuRVu6tWrpwcffNDTtQAAAFwxt8LNwoULPV0HAACAR7i150aSzp8/r/Xr1+vtt9/W6dOnJUlHjhzRmTNnPFYcAABAWbl15uaXX35R9+7dlZ6erry8PN17770KDAzUyy+/rLy8PM2ZM8fTdQIAAJSKW2duRo0apY4dO+rEiROqUaOGs/3BBx9Uamqqx4oDAAAoK7fO3PzP//yPNm3aJD8/P5f2sLAwHT582COFAQAAuMOtMzdFRUUqLCws1n7o0CEFBgZecVEAAADucivcdOvWTTNnznQ+dzgcOnPmjJKSkvhJBgAA4FVuXZaaMWOGYmJi1Lp1a507d04DBw7UTz/9pAYNGuj999/3dI0AAACl5la4adq0qb777jstXbpUO3fu1JkzZzR06FA9/PDDLhuMAQAAKppb4UaSqlWrpkceecSTtQAAAFwxt8LN4sWL//T1wYMHu1UMAADAlXIr3IwaNcrleUFBgc6ePSs/Pz/VrFmTcAMAALzGrbulTpw44fI4c+aM9uzZo86dO7OhGAAAeJXbvy31Ry1atNC0adOKndUBAACoSB4LN9KFTcZHjhzx5CEBAADKxK09Nx9//LHLc2OMjh49qjfffFNRUVEeKQwAAMAdboWbXr16uTx3OBxq2LCh7r77bs2YMcMTdQEAALjFrXBTVFTk6ToAAAA8wqN7bgAAALzNrTM3iYmJpe6bkpLizlsAAAC4xa1ws337dm3fvl0FBQVq1aqVJGnv3r3y9fXVrbfe6uzncDg8UyUAAEApuRVuYmNjFRgYqHfffVf169eXdOGL/eLj49WlSxc9/fTTHi0SAACgtNzaczNjxgwlJyc7g40k1a9fX1OmTOFuKQAA4FVuhZucnBxlZWUVa8/KytLp06evuCgAAAB3uRVuHnzwQcXHx2vFihU6dOiQDh06pOXLl2vo0KHq3bu3p2sEAAAoNbf23MyZM0djxozRwIEDVVBQcOFA1app6NCheuWVVzxaIAAAQFm4FW5q1qypt956S6+88or2798vSQoPD1etWrU8WhwAAEBZXdGX+B09elRHjx5VixYtVKtWLRljPFUXAACAW9wKN7/++qvuuecetWzZUj169NDRo0clSUOHDuU2cAAA4FVuhZvRo0erevXqSk9PV82aNZ3t/fr109q1az1WHAAAQFm5tefm888/12effaamTZu6tLdo0UK//PKLRwoDAABwh1tnbnJzc13O2Fx0/Phx+fv7X3FRAAAA7nIr3HTp0kWLFy92Pnc4HCoqKtL06dPVtWtXjxUHAABQVm5dlpo+fbruueceffvtt8rPz9df//pX/fjjjzp+/Lj+93//19M1AgAAlJpbZ27atGmjvXv3qnPnznrggQeUm5ur3r17a/v27QoPD/d0jQAAAKVW5jM3BQUF6t69u+bMmaPx48eXR00AAABuK/OZm+rVq2vnzp3lUQsAAMAVc+uy1COPPKL58+d7uhYAAIAr5taG4vPnz2vBggVav369OnToUOw3pVJSUjxSHAAAQFmVKdwcOHBAYWFh+uGHH3TrrbdKkvbu3evSx+FweK46AACAMipTuGnRooWOHj2qDRs2SLrwcwv/+Z//qaCgoHIpDgAAoKzKtOfmj7/6vWbNGuXm5nq0IAAAgCvh1obii/4YdgAAALytTOHG4XAU21PDHhsAAFCZlGnPjTFGQ4YMcf445rlz5zR8+PBid0utWLHCcxUCAACUQZnCTVxcnMvzRx55xKPFAAAAXKkyhZuFCxeWVx0AAAAecUUbigEAACo
bwg0AALBKpQg3s2bNUlhYmAICAhQREaEtW7aUatzSpUvlcDjUq1ev8i0QAABUGV4PN8uWLVNiYqKSkpK0bds2tWvXTjExMTp27NifjktLS9OYMWPUpUuXCqoUAABUBV4PNykpKRo2bJji4+PVunVrzZkzRzVr1tSCBQsuOaawsFAPP/ywJk2apOuuu+5Pj5+Xl6ecnByXBwAAsJdXw01+fr62bt2q6OhoZ5uPj4+io6O1efPmS4578cUX1ahRIw0dOvSy75GcnKy6des6H6GhoR6pHQAAVE5eDTfZ2dkqLCws9sObQUFBysjIKHHM119/rfnz52vevHmleo9x48bp1KlTzsfBgwevuG4AAFB5lel7brzt9OnTGjRokObNm6cGDRqUaoy/v7/zG5UBAID9vBpuGjRoIF9fX2VmZrq0Z2ZmKjg4uFj//fv3Ky0tTbGxsc62oqIiSVK1atW0Z88ehYeHl2/RAACgUvPqZSk/Pz916NBBqampzraioiKlpqYqMjKyWP8bbrhB33//vXbs2OF83H///eratat27NjBfhoAAOD9y1KJiYmKi4tTx44d1alTJ82cOVO5ubmKj4+XJA0ePFghISFKTk5WQECA2rRp4zK+Xr16klSsHQAAXJ28Hm769eunrKwsTZgwQRkZGWrfvr3Wrl3r3GScnp4uHx+v37EOAACqCK+HG0kaOXKkRo4cWeJrGzdu/NOxixYt8nxBAACgyuKUCAAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsUinCzaxZsxQWFqaAgABFRERoy5Ytl+w7b948denSRfXr11f9+vUVHR39p/0BAMDVxevhZtmyZUpMTFRSUpK2bdumdu3aKSYmRseOHSux/8aNGzVgwABt2LBBmzdvVmhoqLp166bDhw9XcOUAAKAy8nq4SUlJ0bBhwxQfH6/WrVtrzpw5qlmzphYsWFBi///+7//WiBEj1L59e91www165513VFRUpNTU1AquHAAAVEZeDTf5+fnaunWroqOjnW0+Pj6Kjo7W5s2bS3WMs2fPqqCgQNdcc02Jr+fl5SknJ8flAQAA7OXVcJOdna3CwkIFBQW5tAcFBSkjI6NUx3jmmWfUpEkTl4D0e8nJyapbt67zERoaesV1AwCAysvrl6WuxLRp07R06VKtXLlSAQEBJfYZN26cTp065XwcPHiwgqsEAAAVqZo337xBgwby9fVVZmamS3tmZqaCg4P/dOyrr76qadOmaf369Wrbtu0l+/n7+8vf398j9QIAgMrPq2du/Pz81KFDB5fNwBc3B0dGRl5y3PTp0zV58mStXbtWHTt2rIhSAQBAFeHVMzeSlJiYqLi4OHXs2FGdOnXSzJkzlZubq/j4eEnS4MGDFRISouTkZEnSyy+/rAkTJmjJkiUKCwtz7s2pXbu2ateu7bV5AACAysHr4aZfv37KysrShAkTlJGRofbt22vt2rXOTcbp6eny8fnnCabZs2crPz9fffr0cTlOUlKSJk6cWJGlAwCASsjr4UaSRo4cqZEjR5b42saNG12ep6WllX9BAACgyqrSd0sBAAD8EeE
GAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxSKcLNrFmzFBYWpoCAAEVERGjLli1/2v+DDz7QDTfcoICAAN18881avXp1BVUKAAAqO6+Hm2XLlikxMVFJSUnatm2b2rVrp5iYGB07dqzE/ps2bdKAAQM0dOhQbd++Xb169VKvXr30ww8/VHDlAACgMvJ6uElJSdGwYcMUHx+v1q1ba86cOapZs6YWLFhQYv/XX39d3bt319ixY3XjjTdq8uTJuvXWW/Xmm29WcOUAAKAyqubNN8/Pz9fWrVs1btw4Z5uPj4+io6O1efPmEsds3rxZiYmJLm0xMTFatWpVif3z8vKUl5fnfH7q1ClJUk5OzhVWX7KivLPlctzyVF5rAQC4cnyuuB7TGHPZvl4NN9nZ2SosLFRQUJBLe1BQkHbv3l3imIyMjBL7Z2RklNg/OTlZkyZNKtYeGhrqZtX2qTvT2xUAAGxSnp8rp0+fVt26df+0j1fDTUUYN26cy5meoqIiHT9+XNdee60cDocXKyt/OTk5Cg0N1cGDB1WnTh1vl1OlsHbuYd3cw7q5j7VzT1VcN2OMTp8+rSZNmly2r1fDTYMGDeTr66vMzEyX9szMTAUHB5c4Jjg4uEz9/f395e/v79JWr14994uugurUqVNl/ngrG9bOPaybe1g397F27qlq63a5MzYXeXVDsZ+fnzp06KDU1FRnW1FRkVJTUxUZGVnimMjISJf+krRu3bpL9gcAAFcXr1+WSkxMVFxcnDp27KhOnTpp5syZys3NVXx8vCRp8ODBCgkJUXJysiRp1KhRuvPOOzVjxgz17NlTS5cu1bfffqu5c+d6cxoAAKCS8Hq46devn7KysjRhwgRlZGSoffv2Wrt2rXPTcHp6unx8/nmC6fbbb9eSJUv0/PPP67nnnlOLFi20atUqtWnTxltTqLT8/f2VlJRU7LIcLo+1cw/r5h7WzX2snXtsXzeHKc09VQAAAFWE17/EDwAAwJMINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwU0V89dVXio2NVZMmTeRwOC75Q6G/l5eXp/Hjx6tZs2by9/dXWFiYy6+tz5s3T126dFH9+vVVv359RUdHa8uWLeU4C+8oj7X7vaVLl8rhcKhXr16eLdzLymvdTp48qYSEBDVu3Fj+/v5q2bKlVq9eXU6zqHjltW4zZ85Uq1atVKNGDYWGhmr06NE6d+5cOc3CO8q6dkOGDJHD4Sj2uOmmm1z6zZo1S2FhYQoICFBERIR1/58rj3VLTk7Wv/7rvyowMFCNGjVSr169tGfPnnKeiecQbqqI3NxctWvXTrNmzSr1mL59+yo1NVXz58/Xnj179P7776tVq1bO1zdu3KgBAwZow4YN2rx5s0JDQ9WtWzcdPny4PKbgNeWxdhelpaVpzJgx6tKliydLrhTKY93y8/N17733Ki0tTR9++KH27NmjefPmKSQkpDym4BXlsW5LlizRs88+q6SkJO3atUvz58/XsmXL9Nxzz5XHFLymrGv3+uuv6+jRo87HwYMHdc011+gvf/mLs8+yZcuUmJiopKQkbdu2Te3atVNMTIy
OHTtWXtOocOWxbl9++aUSEhL0zTffaN26dSooKFC3bt2Um5tbXtPwLIMqR5JZuXLln/ZZs2aNqVu3rvn1119Lfdzz58+bwMBA8+67715hhZWXJ9fu/Pnz5vbbbzfvvPOOiYuLMw888IDnCq1kPLVus2fPNtddd53Jz8/3cIWVk6fWLSEhwdx9990ubYmJiSYqKsoTZVZKpVm7P1q5cqVxOBwmLS3N2dapUyeTkJDgfF5YWGiaNGlikpOTPVVqpeKpdfujY8eOGUnmyy+/vMIKKwZnbiz18ccfq2PHjpo+fbpCQkLUsmVLjRkzRr/99tslx5w9e1YFBQW65pprKrDSyqe0a/fiiy+qUaNGGjp0qJcqrVxKs24ff/yxIiMjlZCQoKCgILVp00ZTp05VYWGhFyv3rtKs2+23366tW7c6L6ccOHBAq1evVo8ePbxVdqU0f/58RUdHq1mzZpIunCncunWroqOjnX18fHwUHR2tzZs3e6vMSueP61aSU6dOSVKV+Xzw+s8voHwcOHBAX3/9tQICArRy5UplZ2drxIgR+vXXX7Vw4cISxzzzzDNq0qSJy/8IrkalWbuvv/5a8+fP144dO7xbbCVSmnU7cOCAvvjiCz388MNavXq19u3bpxEjRqigoEBJSUlenoF3lGbdBg4cqOzsbHXu3FnGGJ0/f17Dhw+37rLUlThy5IjWrFmjJUuWONuys7NVWFjo/Dmfi4KCgrR79+6KLrFSKmnd/qioqEhPPfWUoqKiqs5PHXn71BHKTqU47XjvvfeagIAAc/LkSWfb8uXLjcPhMGfPni3WPzk52dSvX9989913ni63UvHE2uXk5JiwsDCzevVq5+tclird31yLFi1MaGioOX/+vLPPjBkzTHBwcLnU7W2eWrcNGzaYoKAgM2/ePLNz506zYsUKExoaal588cXyLN+rSrN2vzd16lRz7bXXmry8PGfb4cOHjSSzadMml75jx441nTp18lSplYon1u2Phg8fbpo1a2YOHjzogQorBmduLNW4cWOFhISobt26zrYbb7xRxhgdOnRILVq0cLa/+uqrmjZtmtavX6+2bdt6o9xK5XJrl5ubq7S0NMXGxjpfLyoqkiRVq1ZNe/bsUXh4eIXX7W2l+Ztr3LixqlevLl9fX5c+GRkZys/Pl5+fnzdK96rSrNsLL7ygQYMG6bHHHpMk3XzzzcrNzdXjjz+u8ePHu/y48NXIGKMFCxZo0KBBLn9DDRo0kK+vrzIzM136Z2ZmKjg4uKLLrHQutW6/N3LkSH3yySf66quv1LRp0wqu0H1X938RFouKitKRI0d05swZZ9vevXvl4+Pj8gc6ffp0TZ48WWvXrlXHjh29UWqlc7m1u+GGG/T9999rx44dzsf999+vrl27aseOHQoNDfVi9d5Tmr+5qKgo7du3zxkGL/Zp3LjxVRlspNKt29mzZ4sFmIsB0fDbx/ryyy+1b9++Yvvf/Pz81KFDB6WmpjrbioqKlJqaqsjIyIous9K51LpJF/6uRo4cqZUrV+qLL75Q8+bNvVDhFfDeSSOUxenTp8327dvN9u3bjSSTkpJitm/fbn755RdjjDHPPvusGTRokEv/pk2bmj59+pgff/zRfPnll6ZFixbmsccec/aZNm2a8fPzMx9++KE5evSo83H69OkKn195Ko+1+yMbL0uVx7qlp6ebwMBAM3LkSLNnzx7zySefmEaNGpkpU6ZU+PzKS3msW1JSkgkMDDTvv/++OXDggPn8889NeHi46du3b4XPrzyVde0ueuSRR0xERESJx1y6dKnx9/c3ixYtMv/4xz/M448/burVq2cyMjLKdS4VqTzW7YknnjB169Y1GzdudPl8KGlbQ2VEuKkiNmzYYCQVe8TFxRljLny43nnnnS5jdu3aZaKjo02NGjVM06ZNTWJiossfZrNmzUo8ZlJSUsVNrAKUx9r9kY3hprzWbdOmTSYiIsL4+/ub6667zrz00ksue3CquvJYt4K
CAjNx4kQTHh5uAgICTGhoqBkxYoQ5ceJExU2sArizdidPnjQ1atQwc+fOveRx33jjDfMv//Ivxs/Pz3Tq1Ml888035TiLilce61bS8SSZhQsXlu9kPMRhDOc0AQCAPdhzAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACr/D/vevnJpwE9FgAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "all_token_latencies = valid_df['end_to_end_latency_s'].apply(pd.Series).stack()\n", + "all_token_latencies = all_token_latencies.reset_index(drop=True)\n", + "all_token_latencies.plot.hist(title=\"Token Latencies\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/env_sample.txt b/env_sample.txt deleted file mode 100644 index f2d6a4e..0000000 --- a/env_sample.txt +++ /dev/null @@ -1,19 +0,0 @@ -#Anyscale Endpoint -ANYSCALE_API_BASE="https://console.endpoints.anyscale.com/m/v1" -ANYSCALE_API_KEY="secret_xxxxx" - -#OpenAI Endpoint -OPENAI_API_BASE="https://api.openai.com/v1" -OPENAI_API_KEY="sk-xxxxx" - -#Fireworks Endpoint -FIREWORKS_API_BASE="https://api.fireworks.ai/inference/v1" -FIREWORKS_API_KEY="xxxxx" - -#vLLM Endpoint -VLLM_API_BASE="https://localhost/v1" -VLLM_API_KEY="xxxxx' - -# Huggingface Text Generation Inference -TGI_API_BASE="http://localhost:8001" -TGI_API_KEY="enter key" diff --git a/llm_correctness.py b/llm_correctness.py new file mode 100644 index 0000000..c9d102d --- /dev/null +++ b/llm_correctness.py @@ -0,0 +1,309 @@ +import argparse +import json +import os +from pathlib import Path +import random +import re +import time +from typing import Any, Dict, List, Optional, Tuple + +import num2words +import ray +from tqdm import tqdm + +from llmperf import common_metrics +from llmperf.common import SUPPORTED_APIS, construct_clients +from llmperf.models 
import RequestConfig +from llmperf.requests_launcher import RequestsLauncher +from llmperf.utils import ( + LLMPerfResults, +) + +MAX_RANDOM_NUMBER = 10000 + + +def llm_correctness( + model: str, + additional_sampling_params: Optional[Dict[str, Any]] = None, + num_concurrent_requests: int = 1, + max_num_completed_requests: int = 500, + test_timeout_s=90, + llm_api="chat", +) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: + """Get the token throughput and latencies for the given model. + + Args: + model: The name of the model to query. + additional_sampling_params: Additional sampling parameters to send with the request. + For more information see the LLM APIs documentation for the completions + num_concurrent_requests: The number of concurrent requests to make. Increase + this to increase the amount of load and vice versa. + test_timeout_s: The amount of time to run the test for before reporting results. + llm_api: The type of request to make. Either "chat" or "litellm". + + Returns: + A tuple containing summary metrics and raw results from the test. + + """ + + if not additional_sampling_params: + additional_sampling_params = {} + + clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests) + req_launcher = RequestsLauncher(clients) + start_time = time.monotonic() + + num_errored_requests = 0 + num_mismatched_requests = 0 + num_completed_requests = 0 + + sampling_params = {"temperature": 0.0} + sampling_params.update(additional_sampling_params) + completed_requests = [] + iter = 0 + pbar = tqdm(total=max_num_completed_requests) + while ( + time.monotonic() - start_time < test_timeout_s + and num_completed_requests < max_num_completed_requests + ): + iter += 1 + rnd_number = random.randint(0, MAX_RANDOM_NUMBER) + rnd_num_words = num2words.num2words(rnd_number) + + prompt = f"Convert the following sequence of words into a number: {rnd_num_words}.\nPrint the number first." 
+ + request_config = RequestConfig( + model=model, + prompt=(prompt, 0), + sampling_params=sampling_params, + metadata={"rnd_number": rnd_number}, + llm_api=llm_api, + ) + req_launcher.launch_requests(request_config) + + if not (iter % num_concurrent_requests): + completed_requests.extend(req_launcher.get_next_ready()) + pbar.update(len(completed_requests) - num_completed_requests) + num_completed_requests = len(completed_requests) + + pbar.close() + end_time = time.monotonic() + if end_time - start_time >= test_timeout_s: + print("Test timed out before all requests could be completed.") + + raw_results = [] + + print("Mismatched and errored requests.") + for out in completed_requests: + metrics, generated_text, completed_request_config = out + + raw_results.append( + { + "metrics": metrics, + "generated_text": generated_text, + "request_config": dict(completed_request_config), + } + ) + + # if there were no errors when making request. + if not metrics[common_metrics.ERROR_CODE]: + try: + commas_between_numbers_re = r"(\d+),(?=\d)" + gen_text_commas_removed = re.sub( + commas_between_numbers_re, r"\1", generated_text + ) + nums = re.findall(r"\d+", gen_text_commas_removed) + generated_text = gen_text_commas_removed.replace("\n", " ") + + assert str(completed_request_config.metadata["rnd_number"]) in nums + except: + num_mismatched_requests += 1 + print( + f" mismatched request: {generated_text}, expected: {completed_request_config.metadata['rnd_number']}" + ) + else: + num_errored_requests += 1 + print( + f" The request errored: {metrics[common_metrics.ERROR_CODE]}, " + f"{metrics[common_metrics.ERROR_MSG]} " + ) + print() + + error_rate = num_errored_requests / num_completed_requests + mismatch_rate = num_mismatched_requests / num_completed_requests + num_non_errored_requests = num_completed_requests - num_errored_requests + summary_metrics = {} + summary_metrics[common_metrics.NUM_ERRORS] = num_errored_requests + summary_metrics["num_mismatched_requests"] = 
num_mismatched_requests + summary_metrics["error_rate"] = error_rate + summary_metrics["mismatch_rate"] = mismatch_rate + summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests + summary_metrics["num_non_errored_requests"] = num_non_errored_requests + + # Metadata + summary_metrics["model"] = model + summary_metrics["num_concurrent_requests"] = num_concurrent_requests + summary_metrics["additional_sampling_params"] = additional_sampling_params + summary_metrics["llm_api"] = llm_api + + return summary_metrics, raw_results + + +def run( + llm_api: str, + model: str, + test_timeout_s: int, + max_num_completed_requests: int, + num_concurrent_requests: int, + additional_sampling_params: str, + results_dir: str, + user_metadata: Dict[str, str], +): + """ + Args: + llm_api: The type of request to make. Either "chat" or "litellm". + model: The name of the model to query. + max_num_completed_requests: The number of requests to complete before finishing the test. + test_timeout_s: The amount of time to run the test for before reporting results. + num_concurrent_requests: The number of concurrent requests to make. Increase + this to increase the amount of load and vice versa. + mean_input_tokens: The mean number of tokens to send in the prompt for the request. + stddev_input_tokens: The standard deviation of the number of tokens to send in the prompt for the request. + mean_output_tokens: The mean number of tokens to generate per request. + stddev_output_tokens: The standard deviation of the number of tokens to generate per request. + additional_sampling_params: Additional sampling parameters to send with the request. + For more information see the LLM APIs documentation for the completions. + results_dir: The directory to save the results to. 
+ + """ + + summary_metrics, raw_results = llm_correctness( + model=model, + llm_api=llm_api, + test_timeout_s=test_timeout_s, + max_num_completed_requests=max_num_completed_requests, + num_concurrent_requests=num_concurrent_requests, + additional_sampling_params=json.loads(additional_sampling_params), + ) + + time.sleep(2) + + print( + f"Results for llm correctness test for {model} queried with the {llm_api} api." + ) + print( + f"Errors: {summary_metrics[common_metrics.NUM_ERRORS]}, " + f"Error rate: {summary_metrics['error_rate']}" + ) + + print( + f"Mismatched: {summary_metrics['num_mismatched_requests']}, " + f"Mismatch rate: {summary_metrics['mismatch_rate']}" + ) + print(f"Completed: {summary_metrics[common_metrics.NUM_COMPLETED_REQUESTS]}") + print(f"Completed without errors: {summary_metrics['num_non_errored_requests']}") + + if results_dir: + file_name = f"{model}_correctness" + file_name = re.sub(r"[^\w\d-]+", "-", file_name) + file_name = re.sub(r"-{2,}", "-", file_name) + summary_file_name = f"{file_name}_summary" + individual_responses_filename = f"{file_name}_individual_responses" + summary_metrics.update(user_metadata) + results = LLMPerfResults(name=summary_file_name, metadata=summary_metrics) + results_dir = Path(results_dir) + if not results_dir.exists(): + results_dir.mkdir(parents=True) + elif not results_dir.is_dir(): + raise ValueError(f"{results_dir} is not a directory") + with open(results_dir / f"{summary_file_name}.json", "w") as f: + json.dump(results.to_dict(), f, indent=4) + with open(results_dir / f"{individual_responses_filename}.json", "w") as f: + json.dump(raw_results, f, indent=4) + + +args = argparse.ArgumentParser(description="Run a correctness test for a given model.") + +args.add_argument( + "--model", type=str, required=True, help="The model to use for this load test." +) +args.add_argument( + "--num-concurrent-requests", + type=int, + default=10, + help=("The number of concurrent requests to send. 
(default: %(default)s)"), +) +args.add_argument( + "--timeout", + type=int, + default=90, + help="The amount of time to run the load test for. (default: %(default)s)", +) +args.add_argument( + "--max-num-completed-requests", + type=int, + default=50, + help=( + "The number of requests to complete before finishing the test. Note " + "that its possible for the test to timeout first. (default: %(default)s)" + ), +) +args.add_argument( + "--additional-sampling-params", + type=str, + default="{}", + help=( + "Additional sampling params to send with the each request to the LLM API. " + "(default: %(default)s) No additional sampling params are sent." + ), +) +args.add_argument( + "--results-dir", + type=str, + default="", + help=( + "The directory to save the results to. " + "(`default: %(default)s`) No results are saved)" + ), +) +args.add_argument( + "--llm-api", + type=str, + default="openai", + help=( + f"The type of request to make. The supported llm apis are {SUPPORTED_APIS} " + " (`default: %(default)s`)" + ), +) +args.add_argument( + "--metadata", + type=str, + default="", + help=( + "A comma separated list of metadata to include in the results, e.g. " + "name=foo,bar=1. These will be added to the metadata field of the results. " + ), +) + +if __name__ == "__main__": + args = args.parse_args() + + env_vars = dict(os.environ) + ray.init(runtime_env={"env_vars": env_vars}) + # Parse user metadata. 
+ user_metadata = {} + if args.metadata: + for item in args.metadata.split(","): + key, value = item.split("=") + user_metadata[key] = value + + run( + llm_api=args.llm_api, + model=args.model, + test_timeout_s=args.timeout, + max_num_completed_requests=args.max_num_completed_requests, + num_concurrent_requests=args.num_concurrent_requests, + additional_sampling_params=args.additional_sampling_params, + results_dir=args.results_dir, + user_metadata=user_metadata, + ) diff --git a/llmperf.py b/llmperf.py deleted file mode 100644 index 8ae30a2..0000000 --- a/llmperf.py +++ /dev/null @@ -1,484 +0,0 @@ -import argparse -from collections import defaultdict -import ray, openai -from num2words import num2words -import time, os, sys, re, json, datetime -import random -from dotenv import load_dotenv -import pandas as pd -from transformers import LlamaTokenizerFast -from huggingface_hub import InferenceClient - -FRAMEWORKS = [ - "anyscale", - "openai", - "fireworks", - "vertexai", - "sagemaker", - "perplexity", - "together", - "vllm", - "tgi" -] - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -# TODO(mwk): We use one tokenizer for all models, but we should -# consider using each framework's tokenizer - -# TODO(mwk): too much dependence on args globally. Clean up methods to not directly -# read from args to facilitate writing scripts. - -tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer") -sys_prompt = "You are a helpful assistant that respeonds with the answer in the most concise possible way." - - -class LineIterator: - """ - A helper class for parsing the byte stream input. 
- Reference: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/ - """ - - def __init__(self, stream): - self.byte_iterator = iter(stream) - self.buffer = io.BytesIO() - self.read_pos = 0 - self.ttft = 0 - - def __iter__(self): - return self - - def __next__(self): - while True: - self.buffer.seek(self.read_pos) - line = self.buffer.readline() - if line and line[-1] == ord("\n"): - if self.ttft == 0: - self.ttft = time.time() - self.read_pos += len(line) - return line[:-1], self.ttft, time.time() - # kyle: dealing with last ']' for chat output - if line and self.read_pos == self.buffer.getbuffer().nbytes - 1: - self.read_pos += 1 - return line, self.ttft, time.time() - try: - chunk = next(self.byte_iterator) - except StopIteration: - if self.read_pos < self.buffer.getbuffer().nbytes: - continue - raise - if "PayloadPart" not in chunk: - print("Unknown event type:" + chunk) - continue - self.buffer.seek(0, io.SEEK_END) - self.buffer.write(chunk["PayloadPart"]["Bytes"]) - - -# NOTE: The defaults are set to mirror our production traffic -def prompt_generator(num_digits=3, min_lines=15, max_lines=1000, file_lines=[]) -> str: - # Step 1: Generate a random number - # Generate the number of digits specified (e.g. if NUM_DIGITS = 3, then - # any number between 100 and 1000 is OK). - rnd_num = random.randrange(10 ** (num_digits - 1), 10 ** (num_digits)) - max_lines = max_lines if max_lines < len(file_lines) else len(file_lines) - rnd_num_lines = random.randrange(min_lines, max_lines) - rnd_picked_lines = "\n".join(random.sample(file_lines, rnd_num_lines)) - - # Step 2: convert to words. - rnd_num_words = num2words(rnd_num) - - # Step 3: convert to a prompt - user_prompt = f"Convert the following sequence of words into a number: {rnd_num_words}.\nPrint the number first. 
Then pick {args.req_lines} lines from these poem lines:\n{rnd_picked_lines}" - - return user_prompt, rnd_num - - -@ray.remote(num_cpus=0.001) -def validate(ep_config, sample_lines): - # The 4 is for the end and start tokens of the messages - prompt, rnd_num = prompt_generator( - args.num_digits, args.min_lines, args.max_lines, sample_lines - ) - tokens_in = len(tokenizer.encode(prompt)) + len(tokenizer.encode(sys_prompt)) + 4 - words = "" - id = None - st = et = ttft = 0 - if ep_config["framework"] in [ - "anyscale", - "openai", - "fireworks", - "perplexity", - "vllm", - ]: - messages = [ - {"role": "system", "content": sys_prompt}, - {"role": "user", "content": prompt}, - ] - try: - st = time.time() - response = openai.ChatCompletion.create( - model=ep_config["model"], - messages=messages, - api_key=ep_config["api_key"], - api_base=ep_config["api_base"], - max_tokens=args.max_tokens, - # Please keep temp at 0. Otherwise increases the number of mismatches. - temperature=0, - # Do not set to false. You will get bogus results. 
- stream=True, - ) - for tok in response: - id = tok.id - if tok.choices[0].delta: - delta = tok.choices[0].delta - if "content" in delta: - if ttft == 0: - ttft = time.time() - st - words += delta["content"] - et = time.time() - except Exception as e: - return ("Exception", -1, -1, -1, -1, str(e), "") - elif ep_config["framework"] == "together": - try: - st = time.time() - url = ep_config["api_base"] - payload = { - "model": ep_config["model"], - "prompt": sys_prompt + prompt, - "max_tokens": args.max_tokens, - "temperature": 0, - "stream_tokens": True, - } - headers = { - "accept": "application/json", - "content-type": "application/json", - "Authorization": f"Bearer {ep_config['api_key']}", - } - response = requests.post(url, json=payload, headers=headers) - response.raise_for_status() - client = sseclient.SSEClient(response) - for event in client.events(): - if ttft == 0: - ttft = time.time() - st - if event.data == "[DONE]": - break - partial_result = json.loads(event.data) - words += partial_result["choices"][0]["text"] - et = time.time() - except Exception as e: - return ("Exception", -1, -1, -1, -1, str(e), "") - elif ep_config["framework"] == "vertexai": - chat_model = ChatModel.from_pretrained(ep_config["model"]) - chat = chat_model.start_chat( - context=sys_prompt, - ) - try: - st = time.time() - responses = chat.send_message_streaming( - message=prompt, - temperature=0, - max_output_tokens=args.max_tokens, - ) - results = [] - for response in responses: - if ttft == 0: - ttft = time.time() - st - results.append(str(response)) - words = "".join(results) - - et = time.time() - except Exception as e: - return ("Exception", -1, -1, -1, -1, str(e), "") - - elif ep_config["framework"] == "sagemaker": - sm_runtime = boto3.client("sagemaker-runtime", region_name=ep_config["region"]) - message = { - "inputs": [ - [ - {"role": "system", "content": sys_prompt}, - {"role": "user", "content": prompt}, - ] - ], - "parameters": { - "max_new_tokens": args.max_tokens, - 
## we can't set temperature to 0 in SM - "temperature": 0.01, - }, - } - try: - st = time.time() - response = sm_runtime.invoke_endpoint_with_response_stream( - EndpointName=ep_config["endpoint_name"], - ContentType="application/json", - Body=json.dumps(message), - CustomAttributes="accept_eula=true", - ) - event_stream = response["Body"] - json_byte = b"" - for line, ttft, et in LineIterator(event_stream): - json_byte += line - resp = json.loads(json_byte) - ttft = ttft - st - words = resp[0]["generation"]["content"] - et = time.time() - except Exception as e: - return ("Exception", -1, -1, -1, -1, str(e), "") - elif ep_config["framework"] == "tgi": - - model = ep_config["model"] if ep_config["api_base"] is None else ep_config["api_base"] - api_key = ep_config["api_key"] - client = InferenceClient(model=model, token=api_key) - query = f"[INST] {sys_prompt} {prompt} [/INST]" - try: - st = time.time() - response = client.text_generation(query, max_new_tokens=args.max_tokens, temperature=.1, stream=True) - for tok in response: - words += tok - if ttft == 0: - ttft = time.time() - st - et = time.time() - except Exception as e: - return ("Exception", -1, -1, -1, -1, str(e), "") - - # Get rid of commas. - tokens_out = len(tokenizer.encode(words)) - nums = re.findall(r"\d+", words) - if len(nums) > 0: - retval = int(nums[0]) - valid = "OK" - cause = "" - if retval != rnd_num: - valid = "Mismatch" - cause = f"Input = {rnd_num} output = {retval}\n.Output:\n {words}" - else: - valid = "Mismatch" - cause = f"Output unparseable. Input = {rnd_num}. 
Output:\n {words}" - return (valid, ttft, et - st, tokens_in, tokens_out, cause, id) - - -def endpoint_evaluation(ep_config, sample_lines): - query_results = [] - overall_start_time = time.time() - num_rounds = int(args.total_requests / args.concur_requests) - for i in range(num_rounds): - print(f"Starting round {i}") - st = time.time() - futures = [ - validate.remote(ep_config, sample_lines) - for _ in range(args.concur_requests) - ] - results = ray.get(futures) - query_results.extend(results) - et = time.time() - elt = et - st - tosleep = args.sleep - elt - if tosleep > 0: - print("Sleeping for %.4f seconds" % tosleep) - time.sleep(tosleep) - else: - print(f"No need to sleep for the next round") - print(f"Round {i} complete") - overall_end_time = time.time() - print(f"Overall execution time {overall_end_time-overall_start_time}") - return query_results - - -def results_analysis(query_results, results_dict): - df = pd.DataFrame( - query_results, - columns=[ - "valid", - "ttft", - "total_time", - "tokens_in", - "tokens_out", - "cause", - "id", - ], - ) - ts = int(time.time()) - fn = f'{results_dict["framework"]}-{ts}_raw.json' - df.to_json(fn) - print(f"Results saved to: {fn}") - - print("Validity results:") - print(df["valid"].value_counts()) - - value_counts = df["valid"].value_counts() - results_dict["num_valid"] = int(value_counts.get("OK", 0)) - results_dict["num_exceptions"] = int(value_counts.get("Exception", 0)) - results_dict["num_mismatch"] = int(value_counts.get("Mismatch", 0)) - results_dict["valid_rate"] = float( - results_dict["num_valid"] / results_dict["total_requests"] - ) - results_dict["mismatch_rate"] = float( - results_dict["num_mismatch"] / results_dict["total_requests"] - ) - results_dict["exception_rate"] = float( - results_dict["num_exceptions"] / results_dict["total_requests"] - ) - cdf = df[df.valid != "Exception"].copy() - print(f"Clean DF is: {len(cdf)}") - if len(cdf) > 0: - cdf["total_tokens_per_s"] = (cdf.tokens_out + cdf.tokens_in) 
/ cdf.total_time - cdf["out_tokens_per_s"] = cdf.tokens_out / cdf.total_time - cdf["inter_tokens_delay"] = cdf.total_time / cdf.tokens_out - mean_e2e = cdf["total_time"].mean() - mean_tokens_in = cdf["tokens_in"].mean() - mean_tokens_out = cdf["tokens_out"].mean() - mean_ttft = cdf["ttft"].mean() - max_ttft = cdf["ttft"].max() - gt_3_ttft = len(cdf[cdf["ttft"] > 3]) / len(cdf) - print(f"Mean End-to-end: {mean_e2e*1000.0:.0f} ms") - print( - f"Mean TTFT: {mean_ttft*1000:.0f} ms (mean tokens in: {mean_tokens_in:.0f}, out: {mean_tokens_out:.0f})" - ) - print(f"Max TTFT: {max_ttft*1000:.0f} ms") - print(f"TTFT > 3 s: {gt_3_ttft*100:.2f}%") - print( - f"ITL (out): {cdf.inter_tokens_delay.mean()*1000:.2f} ms/token, mean tokens/s output (out): {cdf.out_tokens_per_s.mean():.2f} token/s" - ) - # Put things in a dictionary and save the results - results_dict["end_timestamp"] = datetime.datetime.fromtimestamp(ts).isoformat() - results_dict["total_time"] = float(cdf.total_time.mean()) - results_dict["mean_ttft"] = int(f"{mean_ttft*1000:.0f}") - results_dict["mean_tokens_in"] = mean_tokens_in - results_dict["mean_tokens_out"] = mean_tokens_out - results_dict["total_tokens_per_s"] = float(cdf.total_tokens_per_s.mean()) - results_dict["out_tokens_per_s"] = float(cdf.out_tokens_per_s.mean()) - results_dict["inter_token_delay"] = float(cdf.inter_tokens_delay.mean() * 1000) - - def error_analysis(df): - # Group exceptions based on exceptions cause. 
- exceptions = df[df.valid == "Exception"] - exceptions_by_cause = defaultdict(int) - # Ideally we should group by some error code - for cause in exceptions["cause"]: - exceptions_by_cause[cause] += 1 - print("Exceptions by cause:") - for cause, count in exceptions_by_cause.items(): - print(f" - {count}: {cause}") - - error_analysis(df) - results_dict["raw_output"] = fn - benchmark_result = f"{results_dict['framework']}-{ts}.json" - - with open(benchmark_result, "w") as fw: - fw.write(json.dumps(results_dict)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", "--framework", type=str, default="anyscale", help="Test frame name" - ) - parser.add_argument( - "-m", - "--model", - type=str, - default="meta-llama/Llama-2-70b-chat-hf", - help="model name", - ) - parser.add_argument( - "--random-lines-file-name", - type=str, - default="sonnet.txt", - help="Prompt sample file name", - ) - parser.add_argument("--min-lines", type=int, default=15, help="min number of lines") - parser.add_argument("--max-lines", type=int, default=50, help="max number of lines") - parser.add_argument( - "--req-lines", - type=int, - default=7, - help="Number of lines to request in prompt. Affects tokens out.", - ) - parser.add_argument( - "--num-digits", type=int, default=3, help="number of digits for mismatch search" - ) - parser.add_argument( - "--sleep", - type=int, - default=0, - help="sleep between rounds of requests (to deal with rate limiting)", - ) - parser.add_argument( - "-c", - "--concur-requests", - type=int, - default=10, - help="number of concurrent requests", - ) - parser.add_argument( - "-r", "--total-requests", type=int, default=300, help="number of total requests" - ) - parser.add_argument( - "--max-tokens", - type=int, - default=384, - help="Upper limit on the number of returned tokens to prevent 'runaway LLMs'.", - ) - parser.add_argument( - "--random-seed", - type=int, - default=117, - help="Random seed to standardize results. 
By default fully random.", - ) - args = parser.parse_args() - load_dotenv() - endpoint_config = {} - if args.random_seed >= 0: - random.seed(args.random_seed) - if args.framework not in FRAMEWORKS: - print(f"Choose a framework in {FRAMEWORKS}") - sys.exit(0) - elif args.framework == "anyscale": - endpoint_config["api_base"] = os.environ["ANYSCALE_API_BASE"] - endpoint_config["api_key"] = os.environ["ANYSCALE_API_KEY"] - elif args.framework == "openai": - endpoint_config["api_base"] = os.environ["OPENAI_API_BASE"] - endpoint_config["api_key"] = os.environ["OPENAI_API_KEY"] - elif args.framework == "fireworks": - endpoint_config["api_base"] = os.environ["FIREWORKS_API_BASE"] - endpoint_config["api_key"] = os.environ["FIREWORKS_API_KEY"] - elif args.framework == "perplexity": - endpoint_config["api_base"] = os.environ["PERPLEXITY_API_BASE"] - endpoint_config["api_key"] = os.environ["PERPLEXITY_API_KEY"] - elif args.framework == "together": - import requests, sseclient - - endpoint_config["api_base"] = os.environ["TOGETHER_API_BASE"] - endpoint_config["api_key"] = os.environ["TOGETHER_API_KEY"] - elif args.framework == "vertexai": - import vertexai - from vertexai.preview.language_models import ChatModel - - endpoint_config["api_base"] = "VertexAI Endpoint" - endpoint_config["project_id"] = os.environ["VERTEXAI_PROJECT_ID"] - vertexai.init(project=endpoint_config["project_id"]) - elif args.framework == "sagemaker": - import boto3 - - endpoint_config["api_base"] = "SageMaker Endpoint" - endpoint_config["region"] = os.environ["SAGEMAKER_REGION"] - endpoint_config["endpoint_name"] = os.environ["SAGEMAKER_ENDPOINT_NAME"] - elif args.framework == "vllm": - endpoint_config["api_base"] = os.environ["VLLM_API_BASE"] - endpoint_config["api_key"] = os.environ["VLLM_API_KEY"] - elif args.framework == "tgi": - endpoint_config["api_base"]=os.environ["TGI_API_BASE"] - endpoint_config["api_key"]=os.environ["TGI_API_KEY"] - - endpoint_config["framework"] = args.framework - 
endpoint_config["model"] = args.model - - f = open(args.random_lines_file_name, "r") - sample_lines = f.readlines() - f.close() - - ## Endpoint evaluation - query_results = endpoint_evaluation(endpoint_config, sample_lines) - - ## Results Analysis - args.api_base = endpoint_config["api_base"] - results_analysis(query_results, vars(args)) diff --git a/optional.txt b/optional.txt deleted file mode 100644 index 3d5578f..0000000 --- a/optional.txt +++ /dev/null @@ -1,2 +0,0 @@ -boto3 -google-cloud-aiplatform diff --git a/pre-commit.sh b/pre-commit.sh new file mode 100755 index 0000000..0eb053b --- /dev/null +++ b/pre-commit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +echo "Running pre-hooks before committing..." + +echo "======FORMAT=====" +black . -q diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7687fb2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[build-system] +requires = ["setuptools>=43.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "LLMPerf" +version = "0.1.0" +description = "A framework for load testing LLM APIs" +authors = [{name="Avnish Narayan", email="avnish@anyscale.com"}] +license = {text= "Apache-2.0"} +requires-python = ">=3.8, <3.11" +dependencies = ["pydantic<2.5", + "ray", + "pytest>=6.0", + "seaborn>=0.11", + "awscli>=1.22", + "typer>=0.4", + "litellm>=0.1.738", + "num2words", + "transformers", + "tqdm", + "boto3", + "google-cloud-aiplatform"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..4960786 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +# For lints +black \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 573f8bf..0000000 --- a/requirements.txt +++ /dev/null @@ -1,18 +0,0 @@ -openai -num2words -python_dotenv -pandas -tiktoken -transformers==4.33.3 -matplotlib -scikit-learn -ray[default] -tokenizers==0.13.3 -huggingface-hub -### Extra dependency for Together -#requests 
-#sseclient-py -### Extra dependency for Vertex AI -#google-cloud-aiplatform -### Extra dependency for SageMaker -#boto3 diff --git a/sonnet.txt b/sonnet.txt deleted file mode 100644 index 34c444e..0000000 --- a/sonnet.txt +++ /dev/null @@ -1,518 +0,0 @@ -FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? 
-Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. 
-That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. 
-The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. 
-When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. 
-When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? 
-Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. 
-So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. 
-Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. 
-Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. 
-When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' 
-Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. 
-Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me! 
\ No newline at end of file diff --git a/src/llmperf/__init__.py b/src/llmperf/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/llmperf/__init__.py @@ -0,0 +1 @@ + diff --git a/src/llmperf/common.py b/src/llmperf/common.py new file mode 100644 index 0000000..3efefa1 --- /dev/null +++ b/src/llmperf/common.py @@ -0,0 +1,38 @@ +from typing import List +from llmperf.ray_clients.litellm_client import LiteLLMClient +from llmperf.ray_clients.openai_chat_completions_client import ( + OpenAIChatCompletionsClient, +) +from llmperf.ray_clients.sagemaker_client import SageMakerClient +from llmperf.ray_clients.vertexai_client import VertexAIClient +from llmperf.ray_llm_client import LLMClient + + +SUPPORTED_APIS = ["openai", "anthropic", "litellm"] + + +def construct_clients(llm_api: str, num_clients: int) -> List[LLMClient]: + """Construct LLMClients that will be used to make requests to the LLM API. + + Args: + llm_api: The name of the LLM API to use. + num_clients: The number of concurrent requests to make. 
+ + Returns: + The constructed LLMCLients + + """ + if llm_api == "openai": + clients = [OpenAIChatCompletionsClient.remote() for _ in range(num_clients)] + elif llm_api == "sagemaker": + clients = [SageMakerClient.remote() for _ in range(num_clients)] + elif llm_api == "vertexai": + clients = [VertexAIClient.remote() for _ in range(num_clients)] + elif llm_api in SUPPORTED_APIS: + clients = [LiteLLMClient.remote() for _ in range(num_clients)] + else: + raise ValueError( + f"llm_api must be one of the supported LLM APIs: {SUPPORTED_APIS}" + ) + + return clients diff --git a/src/llmperf/common_metrics.py b/src/llmperf/common_metrics.py new file mode 100644 index 0000000..40e2112 --- /dev/null +++ b/src/llmperf/common_metrics.py @@ -0,0 +1,17 @@ +# TODO (Avnishn): compute metrics in class +INTER_TOKEN_LAT = "inter_token_latency_s" +TTFT = "ttft_s" +E2E_LAT = "end_to_end_latency_s" +NUM_INPUT_TOKENS = "number_input_tokens" +NUM_OUTPUT_TOKENS = "number_output_tokens" +NUM_TOTAL_TOKENS = "number_total_tokens" +REQ_OUTPUT_THROUGHPUT = "request_output_throughput_token_per_s" +ERROR_MSG = "error_msg" +ERROR_CODE = "error_code" +ERROR_CODE_FREQ = "error_code_frequency" +NUM_ERRORS = "number_errors" +OUTPUT_THROUGHPUT = "mean_output_throughput_token_per_s" +NUM_COMPLETED_REQUESTS = "num_completed_requests" +COMPLETED_REQUESTS_PER_MIN = "num_completed_requests_per_min" +ERROR_RATE = "error_rate" +NUM_REQ_STARTED = "num_requests_started" diff --git a/src/llmperf/models.py b/src/llmperf/models.py new file mode 100644 index 0000000..be0d7ea --- /dev/null +++ b/src/llmperf/models.py @@ -0,0 +1,21 @@ +from typing import Any, Dict, List, Optional, Tuple +from pydantic import BaseModel + + +class RequestConfig(BaseModel): + """The configuration for a request to the LLM API. + + Args: + model: The model to use. + prompt: The prompt to provide to the LLM API. + sampling_params: Additional sampling parameters to send with the request. 
+ For more information see the Router app's documentation for the completions + llm_api: The name of the LLM API to send the request to. + metadata: Additional metadata to attach to the request for logging or validation purposes. + """ + + model: str + prompt: Tuple[str, int] + sampling_params: Optional[Dict[str, Any]] = None + llm_api: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None diff --git a/src/llmperf/ray_clients/__init__.py b/src/llmperf/ray_clients/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/llmperf/ray_clients/litellm_client.py b/src/llmperf/ray_clients/litellm_client.py new file mode 100644 index 0000000..b99201e --- /dev/null +++ b/src/llmperf/ray_clients/litellm_client.py @@ -0,0 +1,100 @@ +import time +from typing import Any, Dict +import ray + +from llmperf.ray_llm_client import LLMClient +from llmperf.models import RequestConfig +from llmperf import common_metrics + + +@ray.remote +class LiteLLMClient(LLMClient): + """Client for LiteLLM Completions API.""" + + def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: + # litellm package isn't serializable, so we import it within the function + # to maintain compatibility with ray. + from litellm import completion, validate_environment + + prompt = request_config.prompt + prompt, prompt_len = prompt + + message = [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + assert ( + request_config.llm_api is not None + ), "the request config's llm_api must be set." 
+ if request_config.llm_api == "litellm": + model = request_config.model + else: + model = request_config.llm_api + "/" + request_config.model + validation_result = validate_environment(model) + if validation_result["missing_keys"]: + raise ValueError( + f"The following environment vars weren't found but were necessary for " + f"the model {request_config.model}: {validation_result['missing_keys']}" + ) + body = { + "model": model, + "messages": message, + "stream": True, + } + sampling_params = request_config.sampling_params + body.update(sampling_params or {}) + + time_to_next_token = [] + tokens_received = 0 + ttft = 0 + error_response_code = -1 + generated_text = "" + error_msg = "" + output_throughput = 0 + total_request_time = 0 + + metrics = {} + + metrics[common_metrics.ERROR_CODE] = None + metrics[common_metrics.ERROR_MSG] = "" + + try: + start_time = time.monotonic() + most_recent_received_token_time = time.monotonic() + + response = completion(**body) + ttft = 0 + for tok in response: + if tok.choices[0].delta: + delta = tok.choices[0].delta + if delta.get("content", None): + if ttft == 0: + ttft = time.monotonic() - start_time + time_to_next_token.append(ttft) + else: + time_to_next_token.append( + time.monotonic() - most_recent_received_token_time + ) + generated_text += delta["content"] + most_recent_received_token_time = time.monotonic() + tokens_received += 1 + + total_request_time = time.monotonic() - start_time + + output_throughput = tokens_received / total_request_time + + except Exception as e: + metrics[common_metrics.ERROR_MSG] = error_msg + metrics[common_metrics.ERROR_CODE] = error_response_code + + print(f"Warning Or Error: {e}") + print(error_response_code) + + metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) + metrics[common_metrics.TTFT] = ttft + metrics[common_metrics.E2E_LAT] = total_request_time + metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput + metrics[common_metrics.NUM_TOTAL_TOKENS] = 
tokens_received + prompt_len + metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received + metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len + return metrics, generated_text, request_config diff --git a/src/llmperf/ray_clients/openai_chat_completions_client.py b/src/llmperf/ray_clients/openai_chat_completions_client.py new file mode 100644 index 0000000..f2e0a91 --- /dev/null +++ b/src/llmperf/ray_clients/openai_chat_completions_client.py @@ -0,0 +1,120 @@ +import json +import os +import time +from typing import Any, Dict + +import ray +import requests + +from llmperf.ray_llm_client import LLMClient +from llmperf.models import RequestConfig +from llmperf import common_metrics + + +@ray.remote +class OpenAIChatCompletionsClient(LLMClient): + """Client for OpenAI Chat Completions API.""" + + def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: + prompt = request_config.prompt + prompt, prompt_len = prompt + + message = [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + model = request_config.model + body = { + "model": model, + "messages": message, + "stream": True, + } + sampling_params = request_config.sampling_params + body.update(sampling_params or {}) + time_to_next_token = [] + tokens_received = 0 + ttft = 0 + error_response_code = -1 + generated_text = "" + error_msg = "" + output_throughput = 0 + total_request_time = 0 + + metrics = {} + + metrics[common_metrics.ERROR_CODE] = None + metrics[common_metrics.ERROR_MSG] = "" + + start_time = time.monotonic() + most_recent_received_token_time = time.monotonic() + address = os.environ.get("OPENAI_API_BASE") + if not address: + raise ValueError("the environment variable OPENAI_API_BASE must be set.") + key = os.environ.get("OPENAI_API_KEY") + if not key: + raise ValueError("the environment variable OPENAI_API_KEY must be set.") + headers = {"Authorization": f"Bearer {key}"} + if not address: + raise ValueError("No host provided.") + if not 
address.endswith("/"): + address = address + "/" + address += "chat/completions" + try: + with requests.post( + address, + json=body, + stream=True, + timeout=180, + headers=headers, + ) as response: + if response.status_code != 200: + error_msg = response.text + error_response_code = response.status_code + response.raise_for_status() + for chunk in response.iter_lines(chunk_size=None): + chunk = chunk.strip() + + if not chunk: + continue + stem = "data: " + chunk = chunk[len(stem) :] + if chunk == b"[DONE]": + continue + tokens_received += 1 + data = json.loads(chunk) + + if "error" in data: + error_msg = data["error"]["message"] + error_response_code = data["error"]["code"] + raise RuntimeError(data["error"]["message"]) + + delta = data["choices"][0]["delta"] + if delta.get("content", None): + if not ttft: + ttft = time.monotonic() - start_time + time_to_next_token.append(ttft) + else: + time_to_next_token.append( + time.monotonic() - most_recent_received_token_time + ) + most_recent_received_token_time = time.monotonic() + generated_text += delta["content"] + + total_request_time = time.monotonic() - start_time + output_throughput = tokens_received / total_request_time + + except Exception as e: + metrics[common_metrics.ERROR_MSG] = error_msg + metrics[common_metrics.ERROR_CODE] = error_response_code + print(f"Warning Or Error: {e}") + print(error_response_code) + + metrics[common_metrics.INTER_TOKEN_LAT] = sum(time_to_next_token) #This should be same as metrics[common_metrics.E2E_LAT]. 
Leave it here for now + metrics[common_metrics.TTFT] = ttft + metrics[common_metrics.E2E_LAT] = total_request_time + metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput + metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len + metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received + metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len + + return metrics, generated_text, request_config diff --git a/src/llmperf/ray_clients/sagemaker_client.py b/src/llmperf/ray_clients/sagemaker_client.py new file mode 100644 index 0000000..ce15964 --- /dev/null +++ b/src/llmperf/ray_clients/sagemaker_client.py @@ -0,0 +1,158 @@ +import io +import json +import os +import time +from typing import Any, Dict + +import boto3 +import ray +from transformers import LlamaTokenizerFast + +from llmperf.ray_llm_client import LLMClient +from llmperf.models import RequestConfig +from llmperf import common_metrics + + +@ray.remote +class SageMakerClient(LLMClient): + """Client for OpenAI Chat Completions API.""" + + def __init__(self): + # Sagemaker doesn't return the number of tokens that are generated so we approximate it by + # using the llama tokenizer. 
+ self.tokenizer = LlamaTokenizerFast.from_pretrained( + "hf-internal-testing/llama-tokenizer" + ) + + def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]: + if not os.environ.get("AWS_ACCESS_KEY_ID"): + raise ValueError("AWS_ACCESS_KEY_ID must be set.") + if not os.environ.get("AWS_SECRET_ACCESS_KEY"): + raise ValueError("AWS_SECRET_ACCESS_KEY must be set.") + if not os.environ.get("AWS_REGION_NAME"): + raise ValueError("AWS_REGION_NAME must be set.") + + prompt = request_config.prompt + prompt, prompt_len = prompt + + message = [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + model = request_config.model + sm_runtime = boto3.client( + "sagemaker-runtime", region_name=os.environ.get("AWS_REGION_NAME") + ) + + sampling_params = request_config.sampling_params + + if "max_tokens" in sampling_params: + sampling_params["max_new_tokens"] = sampling_params["max_tokens"] + del sampling_params["max_tokens"] + + message = { + "inputs": [ + [ + {"role": "system", "content": ""}, + {"role": "user", "content": prompt}, + ] + ], + "parameters": { + **request_config.sampling_params, + }, + } + + time_to_next_token = [] + tokens_received = 0 + ttft = 0 + error_response_code = None + generated_text = "" + error_msg = "" + output_throughput = 0 + total_request_time = 0 + metrics = {} + + start_time = time.monotonic() + most_recent_received_token_time = time.monotonic() + + try: + response = sm_runtime.invoke_endpoint_with_response_stream( + EndpointName=model, + ContentType="application/json", + Body=json.dumps(message), + CustomAttributes="accept_eula=true", + ) + + event_stream = response["Body"] + json_byte = b"" + for line, ttft, _ in LineIterator(event_stream): + json_byte += line + time_to_next_token.append( + time.monotonic() - most_recent_received_token_time + ) + most_recent_received_token_time = time.monotonic() + ttft = ttft - start_time + resp = json.loads(json_byte) + total_request_time = time.monotonic() - 
class LineIterator:
    """Iterate over the newline-delimited lines of a streamed response body.

    The wrapped stream must yield event dicts. Bytes found under
    ``event["PayloadPart"]["Bytes"]`` are appended to an internal buffer and
    handed back one line at a time, since a single line may arrive split
    across several events. Events without a ``"PayloadPart"`` key are reported
    on stdout and skipped.

    Each iteration yields a 3-tuple ``(line, ttft, now)``:
        line: The bytes of the line, without the trailing newline.
        ttft: ``time.monotonic()`` captured when the first line was produced.
            This is an absolute timestamp, not a duration, and is ``0`` until
            it has been captured.
        now: ``time.monotonic()`` at the moment this line is returned.

    Reference: https://aws.amazon.com/blogs/machine-learning/elevating-the-generative-ai-experience-introducing-streaming-support-in-amazon-sagemaker-hosting/
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first byte in ``buffer`` that has not been returned yet.
        self.read_pos = 0
        self.ttft = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                if self.ttft == 0:
                    self.ttft = time.monotonic()
                self.read_pos += len(line)
                return line[:-1], self.ttft, time.monotonic()
            # kyle: dealing with last ']' for chat output
            if line and self.read_pos == self.buffer.getbuffer().nbytes - 1:
                self.read_pos += 1
                return line, self.ttft, time.monotonic()
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended without a final newline. ``line`` holds
                    # everything still unread, so hand it back as the last
                    # line. Retrying the read here would loop forever because
                    # no more data can ever arrive.
                    if self.ttft == 0:
                        self.ttft = time.monotonic()
                    self.read_pos += len(line)
                    return line, self.ttft, time.monotonic()
                raise
            if "PayloadPart" not in chunk:
                # ``chunk`` is an event dict, so it must be converted before
                # being concatenated with the message prefix.
                print("Unknown event type:" + str(chunk))
                continue
            # Always append at the end; the read above moved the file position.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])
+ ) + prompt = request_config.prompt + prompt, prompt_len = prompt + + time_to_next_token = [] + tokens_received = 0 + ttft = 0 + generated_text = "" + output_throughput = 0 + total_request_time = 0 + + metrics = {} + + metrics[common_metrics.ERROR_CODE] = None + metrics[common_metrics.ERROR_MSG] = "" + + try: + # Define the URL for the request + url = ( + f"https://{region}-aiplatform.googleapis.com/v1/projects/" + f"{project_id}/locations/{region}/endpoints/{endpoint_id}:predict" + ) + + # Define the headers + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json", + } + + sampling_params = request_config.sampling_params + if "max_new_tokens" in sampling_params: + sampling_params["maxOutputTokens"] = sampling_params.pop( + "max_new_tokens" + ) + + # Define the data payload + data = {"instances": [{"prompt": prompt}], "parameters": sampling_params} + + # Make the POST request + start_time = time.monotonic() + response = requests.post(url, headers=headers, data=json.dumps(data)) + total_request_time = time.monotonic() - start_time + response_code = response.status_code + response.raise_for_status() + # output from the endpoint is in the form: + # {"predictions": ["Input: ... 
\nOutput:\n ..."]} + generated_text = response.json()["predictions"][0].split("\nOutput:\n")[1] + tokens_received = len(self.tokenizer.encode(generated_text)) + ttft = -1 + output_throughput = tokens_received / total_request_time + time_to_next_token = [ + total_request_time / tokens_received for _ in range(tokens_received) + ] + + except Exception as e: + metrics[common_metrics.ERROR_MSG] = str(e) + metrics[common_metrics.ERROR_CODE] = response_code + print(f"Warning Or Error: {e}") + print(response_code) + print(response_code) + + metrics[common_metrics.INTER_TOKEN_LAT] = time_to_next_token + metrics[common_metrics.TTFT] = ttft + metrics[common_metrics.E2E_LAT] = total_request_time + metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = output_throughput + metrics[common_metrics.NUM_TOTAL_TOKENS] = tokens_received + prompt_len + metrics[common_metrics.NUM_OUTPUT_TOKENS] = tokens_received + metrics[common_metrics.NUM_INPUT_TOKENS] = prompt_len + + return metrics, generated_text, request_config + + +if __name__ == "__main__": + # Run these before hand: + + # gcloud auth application-default login + # gcloud config set project YOUR_PROJECT_ID + # export GCLOUD_ACCESS_TOKEN=$(gcloud auth print-access-token) + # export GCLOUD_PROJECT_ID=YOUR_PROJECT_ID + # export GCLOUD_REGION=YOUR_REGION + # export VERTEXAI_ENDPOINT_ID=YOUR_ENDPOINT_ID + + client = VertexAIClient.remote() + request_config = RequestConfig( + prompt=("Give me ten interview questions for the role of program manager.", 10), + model="gpt3", + sampling_params={ + "temperature": 0.2, + "max_new_tokens": 256, + "top_k": 40, + "top_p": 0.95, + }, + ) + ray.get(client.llm_request.remote(request_config)) diff --git a/src/llmperf/ray_llm_client.py b/src/llmperf/ray_llm_client.py new file mode 100644 index 0000000..cc31639 --- /dev/null +++ b/src/llmperf/ray_llm_client.py @@ -0,0 +1,22 @@ +import abc +from typing import Any, Dict, Tuple + +from llmperf.models import RequestConfig + + +class LLMClient: + """A client for 
class RequestsLauncher:
    """Launch requests from LLMClients to their respective LLM APIs."""

    def __init__(self, llm_clients: "List[LLMClient]"):
        self._llm_client_pool = ActorPool(llm_clients)

    def launch_requests(self, request_config: "RequestConfig") -> None:
        """Launch requests to the LLM API.

        The request is only submitted when the pool reports a free client.
        If every client is busy the call returns without submitting anything.

        Args:
            request_config: The configuration for the request.

        """
        if not self._llm_client_pool.has_free():
            return
        self._llm_client_pool.submit(
            lambda client, _request_config: client.llm_request.remote(
                _request_config
            ),
            request_config,
        )

    def get_next_ready(self, block: bool = False) -> List[Any]:
        """Return results that are ready from completed requests.

        Args:
            block: Kept for backward compatibility. When the pool has results
                outstanding, both values drain the pool in the same way. When
                nothing is outstanding an empty list is returned; the previous
                ``block=True`` path spun in a busy loop forever in that case,
                since nothing on this code path could make a result appear.

        Returns:
            A list of results that are ready.

        """
        results = []
        # NOTE(review): per the Ray ActorPool docs, has_next() is true while
        # submitted work is still outstanding and get_next_unordered() waits
        # for the next finished result - confirm against the pinned Ray version.
        while self._llm_client_pool.has_next():
            results.append(self._llm_client_pool.get_next_unordered())
        return results
+Return, forgetful Muse, and straight redeem +In gentle numbers time so idly spent; +Sing to the ear that doth thy lays esteem +And gives thy pen both skill and argument. +Rise, resty Muse, my love's sweet face survey, +If Time have any wrinkle graven there; +If any, be a satire to decay, +And make Time's spoils despised every where. +Give my love fame faster than Time wastes life; +So thou prevent'st his scythe and crooked knife. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +So am I as the rich, whose blessed key +Can bring him to his sweet up-locked treasure, +The which he will not every hour survey, +For blunting the fine point of seldom pleasure. +Therefore are feasts so solemn and so rare, +Since, seldom coming, in the long year set, +Like stones of worth they thinly placed are, +Or captain jewels in the carcanet. +So is the time that keeps you as my chest, +Or as the wardrobe which the robe doth hide, +To make some special instant special blest, +By new unfolding his imprison'd pride. +Blessed are you, whose worthiness gives scope, +Being had, to triumph, being lack'd, to hope. +If there be nothing new, but that which is +Hath been before, how are our brains beguiled, +Which, labouring for invention, bear amiss +The second burden of a former child! 
# Version tag written into every serialized result.
RESULTS_VERSION = "2023-08-31"


class LLMPerfResults:
    """Container for one benchmark result and its metadata.

    Attributes are set in ``__init__``: ``name``, ``metadata`` (with a
    ``"timestamp"`` entry added), ``timestamp`` (integer seconds from
    ``time.time()``) and ``version`` (``RESULTS_VERSION``).
    """

    def __init__(
        self,
        name: str,
        metadata: Dict[str, Any] = None,
    ):
        """Create a result record.

        Args:
            name: The name of the result.
            metadata: Optional extra key/value pairs to store with the result.
                The dict is copied (shallowly), so the caller's object is not
                modified when the timestamp is added.
        """
        self.name = name
        # Copy before stamping so the caller's dict never gains a "timestamp" key.
        self.metadata = dict(metadata) if metadata else {}
        self.timestamp = int(time.time())
        self.metadata["timestamp"] = self.timestamp
        self.version = RESULTS_VERSION

    def to_dict(self):
        """Return version, name and metadata merged and flattened into one dict.

        Metadata entries are applied after ``version`` and ``name``, so a
        metadata key with one of those names overrides it.
        """
        data = {
            "version": self.version,
            "name": self.name,
        }
        data.update(self.metadata)
        return flatten_dict(data)

    def json(self):
        """Return ``to_dict()`` serialized as a JSON string."""
        return json.dumps(self.to_dict())
def randomly_sample_sonnet_lines_prompt(
    prompt_tokens_mean: int = 550,
    prompt_tokens_stddev: int = 250,
    expect_output_tokens: int = 150,
) -> Tuple[str, int]:
    """Generate a prompt that randomly samples lines from the sonnet at sonnet.txt.

    A target prompt length is drawn from a gaussian distribution. The prompt
    starts with a fixed instruction and is then filled with shuffled sonnet
    lines (cycling through them again if needed) until the token budget is
    used up.

    Args:
        prompt_tokens_mean: The mean length, in tokens, of the prompt to generate.
        prompt_tokens_stddev: The standard deviation of the length, in tokens,
            of the prompt to generate.
        expect_output_tokens: The number of output tokens requested in the
            instruction that opens the prompt.

    Note:
        Tokens are counted with the Llama tokenizer. Using one tokenizer
        ensures a fairer comparison across different LLMs. For example, if
        gpt 3.5 tokenizes a prompt in fewer tokens than Llama2, this will be
        reflected in the results since they are fed identical prompts.

    Returns:
        A tuple of the prompt and the sampled target length of the prompt in
        tokens. The last line is cut by characters rather than tokens, so the
        real token count of the prompt can differ from the target.
    """

    tokenizer = LlamaTokenizerFast.from_pretrained(
        "hf-internal-testing/llama-tokenizer"
    )

    def get_token_length(text: str) -> int:
        return len(tokenizer.encode(text))

    prompt = (
        "Randomly stream lines from the following text "
        f"with {expect_output_tokens} output tokens. "
        "Don't generate eos tokens:\n\n"
    )
    # Tokenize the fixed instruction once; it does not change while resampling.
    base_prompt_tokens = get_token_length(prompt)

    # Get a prompt length that is at least as long as the base. The number of
    # resamples is bounded: with a small mean and a tiny (or zero) stddev an
    # unbounded rejection loop would never terminate.
    max_resample_attempts = 1000
    num_prompt_tokens = sample_random_positive_int(
        prompt_tokens_mean, prompt_tokens_stddev
    )
    for _ in range(max_resample_attempts):
        if num_prompt_tokens >= base_prompt_tokens:
            break
        num_prompt_tokens = sample_random_positive_int(
            prompt_tokens_mean, prompt_tokens_stddev
        )
    else:
        # Give up resampling and fall back to the shortest possible prompt.
        num_prompt_tokens = max(num_prompt_tokens, base_prompt_tokens)
    remaining_prompt_tokens = num_prompt_tokens - base_prompt_tokens

    sonnet_path = pathlib.Path(__file__).parent.resolve() / "sonnet.txt"
    with open(sonnet_path, "r", encoding="utf-8") as f:
        sonnet_lines = f.readlines()
    if not sonnet_lines:
        # Nothing to sample from; cycling over an empty list would never end.
        return (prompt, num_prompt_tokens)
    random.shuffle(sonnet_lines)

    while True:
        for line in sonnet_lines:
            line_tokens = get_token_length(line)
            if line_tokens > remaining_prompt_tokens:
                # This will cut off a line in the middle of a word, but that's
                # ok since an llm should be able to handle that. The token
                # budget is used as a character count here.
                prompt += line[:remaining_prompt_tokens]
                return (prompt, num_prompt_tokens)
            prompt += line
            remaining_prompt_tokens -= line_tokens
def flatten_dict(d, parent_key="", sep="_"):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent's key with ``sep``.
    Only values that are ``dict`` instances are descended into; every other
    value is copied over unchanged.

    Args:
        d: The dict to flatten.
        parent_key: Prefix for every key of ``d``. A falsy value (the default
            empty string) adds no prefix.
        sep: The separator placed between a parent key and a child key.

    Returns:
        A new flat dict.
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not parent_key else f"{parent_key}{sep}{key}"
        if isinstance(value, dict):
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat
+ stddev_output_tokens: The standard deviation of the number of tokens to generate per request. + additional_sampling_params: Additional sampling parameters to send with the request. + For more information see the LLM APIs documentation for the completions + num_concurrent_requests: The number of concurrent requests to make. Increase + this to increase the amount of load and vice versa. + test_timeout_s: The amount of time to run the test for before reporting results. + llm_api: The name of the llm api to use. Either "openai" or "litellm". + + Returns: + A summary of the performance metrics collected across all completed requests + (e.g. throughput, latencies, etc.) + The individual metrics for each request. + """ + tokenizer = LlamaTokenizerFast.from_pretrained( + "hf-internal-testing/llama-tokenizer" + ) + get_token_length = lambda text: len(tokenizer.encode(text)) + + if not additional_sampling_params: + additional_sampling_params = {} + + clients = construct_clients(llm_api=llm_api, num_clients=num_concurrent_requests) + req_launcher = RequestsLauncher(clients) + completed_requests = [] + num_completed_requests = 0 + start_time = time.monotonic() + iter = 0 + pbar = tqdm(total=max_num_completed_requests) + while ( + time.monotonic() - start_time < test_timeout_s + and len(completed_requests) < max_num_completed_requests + ): + iter += 1 + num_output_tokens = sample_random_positive_int( + mean_output_tokens, stddev_output_tokens + ) + + prompt = randomly_sample_sonnet_lines_prompt( + prompt_tokens_mean=mean_input_tokens, + prompt_tokens_stddev=stddev_input_tokens, + expect_output_tokens=num_output_tokens, + ) + + default_sampling_params = {"max_tokens": num_output_tokens} + default_sampling_params.update(additional_sampling_params) + request_config = RequestConfig( + model=model, + prompt=prompt, + sampling_params=default_sampling_params, + llm_api=llm_api, + ) + req_launcher.launch_requests(request_config) + # Retrieving results less frequently allows for more 
concurrent requests + # to be launched. This will overall reduce the amount of time it takes + # for the test to run. + if not (iter % num_concurrent_requests): + outs = req_launcher.get_next_ready() + all_metrics = [] + for out in outs: + request_metrics, gen_text, _ = out + num_output_tokens = get_token_length(gen_text) + if num_output_tokens: + request_metrics[common_metrics.INTER_TOKEN_LAT] /= num_output_tokens + else: + request_metrics[common_metrics.INTER_TOKEN_LAT] = 0 + request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens + request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens + all_metrics.append(request_metrics) + completed_requests.extend(all_metrics) + pbar.update(len(completed_requests) - num_completed_requests) + num_completed_requests = len(completed_requests) + + pbar.close() + end_time = time.monotonic() + if end_time - start_time >= test_timeout_s: + print("Test timed out before all requests could be completed.") + + # check one last time that there are no remaining results to collect. 
def metrics_summary(
    metrics: List[Dict[str, Any]], start_time: float, end_time: float
) -> Dict[str, Any]:
    """Generate a summary over metrics generated from potentially multiple instances of this client.

    The summary is printed to stdout while it is being built.

    Args:
        metrics: The metrics to summarize, one dict per request. May be empty,
            in which case the statistics are NaN and the counts are zero.
        start_time: The time the test started.
        end_time: The time the test ended.

    Returns:
        A summary with the following information:
            - Overall throughput (generated tokens / total test time)
            - Number of completed requests
            - Error rate
            - Error code frequency
            - Quantiles (p25-p99), mean, min, max and stddev for the following
              metrics, computed over the requests without an error code:
                - Inter token latency
                - Time to first token
                - User total request time
                - Number of tokens processed per request
                - Number of tokens generated per request
                - User throughput (tokens / s)
    """
    ret = {}

    def flatten(item):
        # Metric values may be scalars or (nested) lists; yield the scalars.
        for sub_item in item:
            if isinstance(sub_item, Iterable) and not isinstance(sub_item, str):
                yield from flatten(sub_item)
            else:
                yield sub_item

    summarized_keys = [
        common_metrics.INTER_TOKEN_LAT,
        common_metrics.TTFT,
        common_metrics.E2E_LAT,
        common_metrics.REQ_OUTPUT_THROUGHPUT,
        common_metrics.NUM_INPUT_TOKENS,
        common_metrics.NUM_OUTPUT_TOKENS,
    ]

    if metrics:
        df = pd.DataFrame(metrics)
    else:
        # A frame built from an empty list has no columns at all, which would
        # make every column lookup below raise KeyError.
        df = pd.DataFrame(columns=[common_metrics.ERROR_CODE, *summarized_keys])
    df_without_errored_req = df[df[common_metrics.ERROR_CODE].isna()]

    for key in summarized_keys:
        print(key)
        ret[key] = {}
        values = list(flatten(df_without_errored_req[key]))
        # Give an empty series an explicit numeric dtype so the statistics
        # below evaluate to NaN instead of depending on pandas' default dtype.
        series = pd.Series(values) if values else pd.Series(values, dtype="float64")
        series = series.dropna()
        quantiles = series.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_dict()
        quantiles_reformatted_keys = {}
        for quantile, value in quantiles.items():
            reformatted_key = f"p{int(quantile * 100)}"
            print(f" {reformatted_key} = {value}")
            quantiles_reformatted_keys[reformatted_key] = value
        ret[key]["quantiles"] = quantiles_reformatted_keys
        mean = series.mean()
        print(f" mean = {mean}")
        ret[key]["mean"] = mean
        minimum = series.min()
        print(f" min = {minimum}")
        ret[key]["min"] = minimum
        maximum = series.max()
        print(f" max = {maximum}")
        ret[key]["max"] = maximum
        stddev = series.std()
        print(f" stddev = {stddev}")
        ret[key]["stddev"] = stddev

    ret[common_metrics.NUM_REQ_STARTED] = len(metrics)

    error_codes = df[common_metrics.ERROR_CODE].dropna()
    num_errors = len(error_codes)
    ret[common_metrics.ERROR_RATE] = num_errors / len(metrics) if len(metrics) else 0
    ret[common_metrics.NUM_ERRORS] = num_errors
    print(f"Number Of Errored Requests: {num_errors}")
    error_code_frequency = dict(error_codes.value_counts())
    if num_errors:
        print("Error Code Frequency")
        print(error_code_frequency)
    ret[common_metrics.ERROR_CODE_FREQ] = str(error_code_frequency)

    test_duration = end_time - start_time

    overall_output_throughput = (
        df_without_errored_req[common_metrics.NUM_OUTPUT_TOKENS].sum() / test_duration
    )

    print(f"Overall Output Throughput: {overall_output_throughput}")
    ret[common_metrics.OUTPUT_THROUGHPUT] = overall_output_throughput

    num_completed_requests = len(df_without_errored_req)
    num_completed_requests_per_min = num_completed_requests / test_duration * 60
    print(f"Number Of Completed Requests: {num_completed_requests}")
    print(f"Completed Requests Per Minute: {num_completed_requests_per_min}")

    ret[common_metrics.NUM_COMPLETED_REQUESTS] = num_completed_requests
    ret[common_metrics.COMPLETED_REQUESTS_PER_MIN] = num_completed_requests_per_min

    return ret
+ mean_output_tokens: The mean number of tokens to generate per request. + stddev_output_tokens: The standard deviation of the number of tokens to generate per request. + additional_sampling_params: Additional sampling parameters to send with the request. + For more information see the LLM APIs documentation for the completions. + results_dir: The directory to save the results to. + user_metadata: Additional metadata to include in the results. + """ + if mean_input_tokens < 40: + print( + "the minimum number of input tokens that will be sent is 41" + " because of the prompting logic right now" + ) + + summary, individual_responses = get_token_throughput_latencies( + model=model, + llm_api=llm_api, + test_timeout_s=test_timeout_s, + max_num_completed_requests=max_num_completed_requests, + mean_input_tokens=mean_input_tokens, + stddev_input_tokens=stddev_input_tokens, + mean_output_tokens=mean_output_tokens, + stddev_output_tokens=stddev_output_tokens, + num_concurrent_requests=num_concurrent_requests, + additional_sampling_params=json.loads(additional_sampling_params), + ) + + if results_dir: + filename = f"{model}_{mean_input_tokens}_{mean_output_tokens}" + filename = re.sub(r"[^\w\d-]+", "-", filename) + filename = re.sub(r"-{2,}", "-", filename) + summary_filename = f"{filename}_summary" + individual_responses_filename = f"{filename}_individual_responses" + + # Update to metadata. 
+ summary.update(user_metadata) + + results = LLMPerfResults(name=summary_filename, metadata=summary) + results_dir = Path(results_dir) + if not results_dir.exists(): + results_dir.mkdir(parents=True) + elif not results_dir.is_dir(): + raise ValueError(f"{results_dir} is not a directory") + + try: + with open(results_dir / f"{summary_filename}.json", "w") as f: + json.dump(results.to_dict(), f, indent=4, default=str) + except Exception as e: + print(results.to_dict()) + raise e + + try: + with open(results_dir / f"{individual_responses_filename}.json", "w") as f: + json.dump(individual_responses, f, indent=4) + except Exception as e: + print(individual_responses) + raise e + + +args = argparse.ArgumentParser( + description="Run a token throughput and latency benchmark." +) + +args.add_argument( + "--model", type=str, required=True, help="The model to use for this load test." +) +args.add_argument( + "--mean-input-tokens", + type=int, + default=550, + help=( + "The mean number of tokens to send in the prompt for the request. " + " (default: %(default)s)" + ), +) +args.add_argument( + "--stddev-input-tokens", + type=int, + default=150, + help=( + "The standard deviation of number of tokens to send in the prompt for the request. " + "(default: %(default)s)" + ), +) +args.add_argument( + "--mean-output-tokens", + type=int, + default=150, + help=( + "The mean number of tokens to generate from each llm request. This is the max_tokens param " + "for the completions API. Note that this is not always the number of tokens returned. " + "(default: %(default)s)" + ), +) +args.add_argument( + "--stddev-output-tokens", + type=int, + default=80, + help=( + "The stdandard deviation on the number of tokens to generate per llm request. 
" + "(default: %(default)s)" + ), +) +args.add_argument( + "--num-concurrent-requests", + type=int, + default=10, + help=("The number of concurrent requests to send (default: %(default)s)"), +) +args.add_argument( + "--timeout", + type=int, + default=90, + help="The amount of time to run the load test for. (default: %(default)s)", +) +args.add_argument( + "--max-num-completed-requests", + type=int, + default=10, + help=( + "The number of requests to complete before finishing the test. Note " + "that its possible for the test to timeout first. (default: %(default)s)" + ), +) +args.add_argument( + "--additional-sampling-params", + type=str, + default="{}", + help=( + "Additional sampling params to send with the each request to the LLM API. " + "(default: %(default)s) No additional sampling params are sent." + ), +) +args.add_argument( + "--results-dir", + type=str, + default="", + help=( + "The directory to save the results to. " + "(`default: %(default)s`) No results are saved)" + ), +) +args.add_argument( + "--llm-api", + type=str, + default="openai", + help=( + f"The name of the llm api to use. Can select from {SUPPORTED_APIS}" + " (default: %(default)s)" + ), +) +args.add_argument( + "--metadata", + type=str, + default="", + help=( + "A comma separated list of metadata to include in the results, e.g. " + "name=foo,bar=1. These will be added to the metadata field of the results. " + ), +) + +if __name__ == "__main__": + env_vars = dict(os.environ) + ray.init(runtime_env={"env_vars": env_vars}) + args = args.parse_args() + + # Parse user metadata. 
+ user_metadata = {} + if args.metadata: + for item in args.metadata.split(","): + key, value = item.split("=") + user_metadata[key] = value + + run_token_benchmark( + llm_api=args.llm_api, + model=args.model, + test_timeout_s=args.timeout, + max_num_completed_requests=args.max_num_completed_requests, + mean_input_tokens=args.mean_input_tokens, + stddev_input_tokens=args.stddev_input_tokens, + mean_output_tokens=args.mean_output_tokens, + stddev_output_tokens=args.stddev_output_tokens, + num_concurrent_requests=args.num_concurrent_requests, + additional_sampling_params=args.additional_sampling_params, + results_dir=args.results_dir, + user_metadata=user_metadata, + ) From 4d1f8efb4d1c3217fc7f0ce1bbb4617d3f40173d Mon Sep 17 00:00:00 2001 From: Avnish Narayan Date: Mon, 4 Dec 2023 14:22:51 -0800 Subject: [PATCH 2/3] Add notice back in Signed-off-by: Avnish Narayan --- NOTICE.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 NOTICE.txt diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 0000000..4820e73 --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,14 @@ +[Project Name] +Copyright 2023-onwards Anyscale, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
\ No newline at end of file From 1e42cb3fc45134ab25d1994b4e2d87b9fcbdd5f5 Mon Sep 17 00:00:00 2001 From: Avnish Narayan Date: Mon, 4 Dec 2023 18:43:10 -0800 Subject: [PATCH 3/3] Merge master Signed-off-by: Avnish Narayan --- configs.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 configs.py diff --git a/configs.py b/configs.py deleted file mode 100644 index ac8e11d..0000000 --- a/configs.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -from enum import Enum - -class Framework(Enum): - ANYSCALE = "anyscale" - OPENAI = "openai" - FIREWORKS = "fireworks" - VERTEXAI = "vertexai" - SAGEMAKER = "sagemaker" - PERPLEXITY = "perplexity" - TOGETHER = "together" - VLLM = "vllm" - TGI = "tgi" - - # helper method to get the list of values/ supported frameworks - @classmethod - def list(cls): - return list(map(lambda c: c.value, cls)) - -# One class for all endpoint configs -@dataclass -class EndpointConfig: - framework: Framework - api_base: str = None - api_key: str = None - model: str = None - region: str = None # Used by SageMaker - endpoint_name: str = None # Used by SageMaker - project_id: str = None # Used by VertexAI