
fix(llm): fix streaming SSE filter not being run more than once, prompt token count, and missing metadata
tysoekong authored and fffonion committed Nov 29, 2024
1 parent 581571b commit e053601
Showing 4 changed files with 44 additions and 18 deletions.
9 changes: 8 additions & 1 deletion kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
RES_POST_PROCESSING = 7,
}

+-- Filters in these stages are allowed to execute more than once in a request.
+-- TODO: implement singleton support, so that within one iteration of body_filter
+-- each filter only runs once. This is not an issue today, as they are only used in one plugin.
+local REPEATED_PHASES = {
+[STAGES.STREAMING] = true,
+}

local MetaPlugin = {}

local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)
if not f then
kong.log.err("no filter named '" .. name .. "' registered")

-elseif not ai_executed_filters[name] then
+elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
ai_executed_filters[name] = true

kong.log.debug("executing filter ", name)
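For readers unfamiliar with the plugin pipeline: run_stage records every filter it executes in ai_executed_filters and, before this change, skipped any filter already in that table, so streaming filters only ran for the first body_filter chunk. The new REPEATED_PHASES table whitelists stages whose filters may run repeatedly. A minimal, self-contained sketch of that guard (the names STAGES, REPEATED_PHASES and ai_executed_filters come from the diff; the filter table and loop scaffolding are assumed for illustration):

    -- Sketch of the de-duplication guard changed above; not the actual plugin code.
    local STAGES = { STREAMING = 6 }

    -- Stages whose filters may execute more than once per request,
    -- e.g. once for every SSE chunk seen in body_filter.
    local REPEATED_PHASES = {
      [STAGES.STREAMING] = true,
    }

    local ai_executed_filters = {}

    local function run_stage(stage, filters, conf)
      for _, filter in ipairs(filters) do
        -- The old guard was just `not ai_executed_filters[filter.name]`: a streaming
        -- filter ran for the first chunk, was marked executed, and was skipped afterwards.
        if not ai_executed_filters[filter.name] or REPEATED_PHASES[stage] then
          ai_executed_filters[filter.name] = true
          filter.run(conf)
        end
      end
    end

    -- The streaming filter now runs for every chunk:
    local filters = { { name = "normalize-sse-chunk", run = function() print("process chunk") end } }
    run_stage(STAGES.STREAMING, filters, {})
    run_stage(STAGES.STREAMING, filters, {})  -- prints "process chunk" again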
21 changes: 15 additions & 6 deletions kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)


for _, event in ipairs(events) do
+-- TODO: currently only a subset of drivers follows the body, err, metadata pattern;
+-- unify this so that it is always extracted from the body
local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)

if formatted then
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
if body_buffer then
body_buffer:put(token_t)
end

--- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
--- but this is all we can do until OpenAI fixes this...
---
--- essentially, every 4 characters is a token, with minimum of 1*4 per event
-ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
end
end

@@ -141,6 +137,19 @@ local function handle_streaming_frame(conf, chunk, finished)

local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")

+if conf.logging and conf.logging.log_statistics then
+-- no metadata populated in the event stream, so do our own estimation
+if completion_tokens_count == 0 then
+-- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+-- but this is all we can do until OpenAI fixes this...
+--
+-- essentially, every 4 characters is a token, with a minimum of 1*4 per event
+completion_tokens_count = math.ceil(#strip(response) / 4)
+ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count)
+end
+end

-- populate cost
if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
local cost = (prompt_tokens_count * conf.model.options.input_cost +
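Net effect of these two hunks: the 4-characters-per-token estimate is no longer accumulated per event (where it also ran even when the provider reported real usage); instead, once the stream finishes, the estimate is applied to the whole reassembled response, and only when statistics logging is enabled and no completion token count was populated from the provider's metadata. A rough standalone sketch of that fallback (strip and the heuristic mirror the diff; the helper name and sample values are illustrative):

    -- Illustrative sketch of the fallback estimation applied after the stream ends.
    local function strip(s)
      return (s:gsub("^%s+", ""):gsub("%s+$", ""))
    end

    local function estimate_completion_tokens(response, reported_count)
      -- Prefer the provider-reported count when the event stream carried usage metadata.
      if reported_count and reported_count > 0 then
        return reported_count
      end
      -- Loose heuristic: roughly one token per 4 characters of the full response.
      return math.ceil(#strip(response) / 4)
    end

    print(estimate_completion_tokens("Hello there, how can I help you today?", 0))  --> 10
    print(estimate_completion_tokens("ignored", 13))                                --> 13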
8 changes: 4 additions & 4 deletions spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do
local _, first_got = next(log_message.ai)
local actual_llm_latency = first_got.meta.llm_latency
local actual_time_per_token = first_got.usage.time_per_token
-local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

first_got.meta.llm_latency = 1
first_got.usage.time_per_token = 1

assert.same(first_expected, first_got)
assert.is_true(actual_llm_latency >= 0)
-assert.same(actual_time_per_token, time_per_token)
+assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
assert.same(first_got.meta.request_model, "gpt-3.5-turbo")
end)
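The test change mirrors the metric change: time_per_token is no longer floored to an integer, so the assertion compares the reported and locally derived values after rounding both to three decimal places rather than expecting exact float equality. Roughly (values below are made up for illustration):

    -- Comparing float time_per_token values the way the updated assertions do.
    local actual_llm_latency    = 25        -- assumed latency from the log entry, in ms
    local completion_tokens     = 13
    local actual_time_per_token = 1.9231    -- assumed value reported in the log entry
    local time_per_token        = actual_llm_latency / completion_tokens  -- 1.923076...

    local function round3(x)
      return tonumber(string.format("%.3f", x))
    end

    -- Rounding both sides to three decimals avoids spurious failures from float noise.
    assert(round3(actual_time_per_token) == round3(time_per_token))  -- 1.923 == 1.923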

@@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do

local actual_llm_latency = first_got.meta.llm_latency
local actual_time_per_token = first_got.usage.time_per_token
-local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

first_got.meta.llm_latency = 1
first_got.usage.time_per_token = 1

assert.same(first_expected, first_got)
assert.is_true(actual_llm_latency >= 0)
-assert.same(actual_time_per_token, time_per_token)
+assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
end)

it("logs payloads", function()
24 changes: 17 additions & 7 deletions spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = {
},
usage = {
prompt_tokens = 18,
-completion_tokens = 7,
-total_tokens = 25,
+completion_tokens = 13, -- this value comes from the estimation fallback
+total_tokens = 31,
time_per_token = 1,
-cost = 0.00037,
+cost = 0.00031,
},
}
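The revised expectations are internally consistent: completion_tokens = 13 comes from the estimation fallback, total_tokens = 18 + 13 = 31, and the cost works out if the input_cost/output_cost values added to the fixtures below are interpreted as a price per million tokens (that scaling is inferred from these numbers, not shown in the hunks). A quick check of the arithmetic:

    -- Reconstructing the expected stats above; the / 1e6 scaling is an inference
    -- from the fixture prices (10.0) and the expected cost (0.00031).
    local prompt_tokens     = 18
    local completion_tokens = 13      -- from the 4-chars-per-token estimate
    local input_cost        = 10.0    -- per million prompt tokens (assumed unit)
    local output_cost       = 10.0    -- per million completion tokens (assumed unit)

    local total_tokens = prompt_tokens + completion_tokens              --> 31
    local cost = (prompt_tokens * input_cost
                + completion_tokens * output_cost) / 1e6                --> 0.00031

    print(total_tokens, cost)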

@@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good"
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good",
input_cost = 10.0,
output_cost = 10.0,
},
},
},
@@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial"
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial",
input_cost = 10.0,
output_cost = 10.0,
},
},
},
@@ -454,7 +458,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
-upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
@@ -492,6 +498,8 @@
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
anthropic_version = "2023-06-01",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
@@ -527,7 +535,9 @@
options = {
max_tokens = 256,
temperature = 1.0,
-upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
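All of the streaming fixtures gain the same two pricing options so the cost metric above is exercised end to end. Abridged, one fixture's model block now looks roughly like this (fields and values not visible in the hunks, such as name, provider, and the mock host/port stand-ins, are assumed from the surrounding spec):

    -- Abridged fixture config; the cost-related additions are from the hunks above,
    -- everything else is assumed for illustration.
    local helpers   = { mock_upstream_host = "127.0.0.1" }  -- stand-in for Kong's test helpers
    local MOCK_PORT = 62349                                  -- illustrative mock server port

    local model = {
      name = "gpt-3.5-turbo",
      provider = "openai",
      options = {
        max_tokens   = 256,
        temperature  = 1.0,
        upstream_url = "http://" .. helpers.mock_upstream_host .. ":" .. MOCK_PORT .. "/openai/llm/v1/chat/good",
        input_cost   = 10.0,   -- price per million prompt tokens (see the cost check above)
        output_cost  = 10.0,   -- price per million completion tokens
      },
    }

    print(model.options.upstream_url)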

1 comment on commit e053601

@github-actions
Contributor


Bazel Build

Docker image available kong/kong:e053601b40e4b642575cb184e64204a17414d484
Artifacts available https://github.com/Kong/kong/actions/runs/12079695620
