
fix(llm): fix streaming SSE filter not being run more than once, prompt token count, and missing metadata
tysoekong authored and fffonion committed Nov 29, 2024
1 parent 581571b commit e053601
Showing 4 changed files with 44 additions and 18 deletions.
9 changes: 8 additions & 1 deletion kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
RES_POST_PROCESSING = 7,
}

+-- Filters in these stages are allowed to execute more than once in a request.
+-- TODO: implement singleton support, so that within one iteration of body_filter
+-- each filter only runs once. This is not an issue today, as they are only used in one plugin.
+local REPEATED_PHASES = {
+[STAGES.STREAMING] = true,
+}

local MetaPlugin = {}

local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)
if not f then
kong.log.err("no filter named '" .. name .. "' registered")

-elseif not ai_executed_filters[name] then
+elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
ai_executed_filters[name] = true

kong.log.debug("executing filter ", name)
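For readers unfamiliar with the plugin pipeline: run_stage records every filter it executes in ai_executed_filters and, before this change, skipped any filter already in that table, so streaming filters only ran for the first body_filter chunk. The new REPEATED_PHASES table whitelists stages whose filters may run repeatedly. A minimal, self-contained sketch of that guard (the names STAGES, REPEATED_PHASES and ai_executed_filters come from the diff; the filter table and loop scaffolding are assumed for illustration):

    -- Sketch of the de-duplication guard changed above; not the actual plugin code.
    local STAGES = { STREAMING = 6 }

    -- Stages whose filters may execute more than once per request,
    -- e.g. once for every SSE chunk seen in body_filter.
    local REPEATED_PHASES = {
      [STAGES.STREAMING] = true,
    }

    local ai_executed_filters = {}

    local function run_stage(stage, filters, conf)
      for _, filter in ipairs(filters) do
        -- The old guard was just `not ai_executed_filters[filter.name]`: a streaming
        -- filter ran for the first chunk, was marked executed, and was skipped afterwards.
        if not ai_executed_filters[filter.name] or REPEATED_PHASES[stage] then
          ai_executed_filters[filter.name] = true
          filter.run(conf)
        end
      end
    end

    -- The streaming filter now runs for every chunk:
    local filters = { { name = "normalize-sse-chunk", run = function() print("process chunk") end } }
    run_stage(STAGES.STREAMING, filters, {})
    run_stage(STAGES.STREAMING, filters, {})  -- prints "process chunk" again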
21 changes: 15 additions & 6 deletions kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)


for _, event in ipairs(events) do
+-- TODO: currently only a subset of drivers follows the body, err, metadata pattern;
+-- unify this so that it is always extracted from the body
local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)

if formatted then
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
if body_buffer then
body_buffer:put(token_t)
end

--- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
--- but this is all we can do until OpenAI fixes this...
---
--- essentially, every 4 characters is a token, with minimum of 1*4 per event
-ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
end
end

@@ -141,6 +137,19 @@ local function handle_streaming_frame(conf, chunk, finished)

local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")

+if conf.logging and conf.logging.log_statistics then
+-- no metadata populated in the event stream, so do our own estimation
+if completion_tokens_count == 0 then
+-- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+-- but this is all we can do until OpenAI fixes this...
+--
+-- essentially, every 4 characters is a token, with a minimum of 1*4 per event
+completion_tokens_count = math.ceil(#strip(response) / 4)
+ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count)
+end
+end

-- populate cost
if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then
local cost = (prompt_tokens_count * conf.model.options.input_cost +
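Net effect of these two hunks: the 4-characters-per-token estimate is no longer accumulated per event (where it also ran even when the provider reported real usage); instead, once the stream finishes, the estimate is applied to the whole reassembled response, and only when statistics logging is enabled and no completion token count was populated from the provider's metadata. A rough standalone sketch of that fallback (strip and the heuristic mirror the diff; the helper name and sample values are illustrative):

    -- Illustrative sketch of the fallback estimation applied after the stream ends.
    local function strip(s)
      return (s:gsub("^%s+", ""):gsub("%s+$", ""))
    end

    local function estimate_completion_tokens(response, reported_count)
      -- Prefer the provider-reported count when the event stream carried usage metadata.
      if reported_count and reported_count > 0 then
        return reported_count
      end
      -- Loose heuristic: roughly one token per 4 characters of the full response.
      return math.ceil(#strip(response) / 4)
    end

    print(estimate_completion_tokens("Hello there, how can I help you today?", 0))  --> 10
    print(estimate_completion_tokens("ignored", 13))                                --> 13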
8 changes: 4 additions & 4 deletions spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua
@@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do
local _, first_got = next(log_message.ai)
local actual_llm_latency = first_got.meta.llm_latency
local actual_time_per_token = first_got.usage.time_per_token
-local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

first_got.meta.llm_latency = 1
first_got.usage.time_per_token = 1

assert.same(first_expected, first_got)
assert.is_true(actual_llm_latency >= 0)
-assert.same(actual_time_per_token, time_per_token)
+assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
assert.same(first_got.meta.request_model, "gpt-3.5-turbo")
end)
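The test change mirrors the metric change: time_per_token is no longer floored to an integer, so the assertion compares the reported and locally derived values after rounding both to three decimal places rather than expecting exact float equality. Roughly (values below are made up for illustration):

    -- Comparing float time_per_token values the way the updated assertions do.
    local actual_llm_latency    = 25        -- assumed latency from the log entry, in ms
    local completion_tokens     = 13
    local actual_time_per_token = 1.9231    -- assumed value reported in the log entry
    local time_per_token        = actual_llm_latency / completion_tokens  -- 1.923076...

    local function round3(x)
      return tonumber(string.format("%.3f", x))
    end

    -- Rounding both sides to three decimals avoids spurious failures from float noise.
    assert(round3(actual_time_per_token) == round3(time_per_token))  -- 1.923 == 1.923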

@@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do

local actual_llm_latency = first_got.meta.llm_latency
local actual_time_per_token = first_got.usage.time_per_token
-local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens)
+local time_per_token = actual_llm_latency / first_got.usage.completion_tokens

first_got.meta.llm_latency = 1
first_got.usage.time_per_token = 1

assert.same(first_expected, first_got)
assert.is_true(actual_llm_latency >= 0)
-assert.same(actual_time_per_token, time_per_token)
+assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token)))
end)

it("logs payloads", function()
24 changes: 17 additions & 7 deletions spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua
@@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = {
},
usage = {
prompt_tokens = 18,
-completion_tokens = 7,
-total_tokens = 25,
+completion_tokens = 13, -- this value comes from the estimation fallback
+total_tokens = 31,
time_per_token = 1,
-cost = 0.00037,
+cost = 0.00031,
},
}
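The revised expectations are internally consistent: completion_tokens = 13 comes from the estimation fallback, total_tokens = 18 + 13 = 31, and the cost works out if the input_cost/output_cost values added to the fixtures below are interpreted as a price per million tokens (that scaling is inferred from these numbers, not shown in the hunks). A quick check of the arithmetic:

    -- Reconstructing the expected stats above; the / 1e6 scaling is an inference
    -- from the fixture prices (10.0) and the expected cost (0.00031).
    local prompt_tokens     = 18
    local completion_tokens = 13      -- from the 4-chars-per-token estimate
    local input_cost        = 10.0    -- per million prompt tokens (assumed unit)
    local output_cost       = 10.0    -- per million completion tokens (assumed unit)

    local total_tokens = prompt_tokens + completion_tokens              --> 31
    local cost = (prompt_tokens * input_cost
                + completion_tokens * output_cost) / 1e6                --> 0.00031

    print(total_tokens, cost)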

@@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good"
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good",
input_cost = 10.0,
output_cost = 10.0,
},
},
},
@@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial"
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial",
input_cost = 10.0,
output_cost = 10.0,
},
},
},
@@ -454,7 +458,9 @@ for _, strategy in helpers.all_strategies() do
options = {
max_tokens = 256,
temperature = 1.0,
-upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
@@ -492,6 +498,8 @@
temperature = 1.0,
upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
anthropic_version = "2023-06-01",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
@@ -527,7 +535,9 @@
options = {
max_tokens = 256,
temperature = 1.0,
-upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+input_cost = 10.0,
+output_cost = 10.0,
},
},
},
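All of the streaming fixtures gain the same two pricing options so the cost metric above is exercised end to end. Abridged, one fixture's model block now looks roughly like this (fields and values not visible in the hunks, such as name, provider, and the mock host/port stand-ins, are assumed from the surrounding spec):

    -- Abridged fixture config; the cost-related additions are from the hunks above,
    -- everything else is assumed for illustration.
    local helpers   = { mock_upstream_host = "127.0.0.1" }  -- stand-in for Kong's test helpers
    local MOCK_PORT = 62349                                  -- illustrative mock server port

    local model = {
      name = "gpt-3.5-turbo",
      provider = "openai",
      options = {
        max_tokens   = 256,
        temperature  = 1.0,
        upstream_url = "http://" .. helpers.mock_upstream_host .. ":" .. MOCK_PORT .. "/openai/llm/v1/chat/good",
        input_cost   = 10.0,   -- price per million prompt tokens (see the cost check above)
        output_cost  = 10.0,   -- price per million completion tokens
      },
    }

    print(model.options.upstream_url)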

1 comment on commit e053601

@github-actions
Contributor


Bazel Build

Docker image available kong/kong:e053601b40e4b642575cb184e64204a17414d484
Artifacts available https://github.com/Kong/kong/actions/runs/12079695620
