Skip to content

Commit

Permalink
feat: vLLM with bitsandbytes, exponential backoffs in bench
Browse files Browse the repository at this point in the history
  • Loading branch information
av committed Sep 12, 2024
1 parent 876b722 commit de1ee3a
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 5 deletions.
22 changes: 21 additions & 1 deletion bench/src/llm.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { config } from "./config.ts";
import { omit } from './utils.ts';
import { omit, sleep } from './utils.ts';

export type LLMOptions = {
maxTokens?: number;
Expand All @@ -21,6 +21,26 @@ export class LLM {
}

async chat(message: string, options = {}): Promise<string> {
const maxRetries = 4;
let retries = 0;

while (retries < maxRetries) {
try {
return await this.attemptChat(message, options);
} catch (error) {
retries++;
if (retries >= maxRetries) {
throw error;
}
console.warn(`Attempt ${retries} failed. Retrying in ${2 ** retries} seconds...`);
await sleep(2 ** retries * 1000); // Exponential backoff
}
}

throw new Error('Max retries reached');
}

private async attemptChat(message: string, options = {}): Promise<string> {
const completionOptions = {
...(this.llm?.options || {}),
...options,
Expand Down
4 changes: 3 additions & 1 deletion bench/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,6 @@ export function parseArgs(args: string[]) {

return acc;
}, {} as Record<string, string | string[] | boolean>);
}
}

/**
 * Promise-based delay helper: resolves after `ms` milliseconds have elapsed.
 *
 * @param ms - Delay duration in milliseconds.
 * @returns A promise that settles once the timeout fires.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
6 changes: 5 additions & 1 deletion compose.vllm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ services:
vllm:
container_name: ${HARBOR_CONTAINER_PREFIX}.vllm
env_file: ./.env
image: vllm/vllm-openai:${HARBOR_VLLM_VERSION}
build:
context: ./vllm
dockerfile: Dockerfile
args:
HARBOR_VLLM_VERSION: ${HARBOR_VLLM_VERSION}
ports:
- ${HARBOR_VLLM_HOST_PORT}:8000
ipc: host
Expand Down
4 changes: 2 additions & 2 deletions http-catalog/vllm.http
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ curl {{host}}/v1/chat/completions -H 'Content-Type: application/json' -d '{
###

curl {{host}}/v1/chat/completions -H 'Content-Type: application/json' -d '{
"model": "google/gemma-2-2b-it",
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"messages": [
{
"role": "user",
"content": "Answer in one word. Where is Paris?"
"content": "Answer in one word starting with a letter 'A'. Where is Paris?"
}
],
"temperature": 0.2
Expand Down
5 changes: 5 additions & 0 deletions vllm/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Base image tag is injected by compose via the HARBOR_VLLM_VERSION build arg
# (defaults to "latest" when built standalone).
ARG HARBOR_VLLM_VERSION=latest
FROM vllm/vllm-openai:${HARBOR_VLLM_VERSION}

# Install bitsandbytes for additional quantization support.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir bitsandbytes

0 comments on commit de1ee3a

Please sign in to comment.