Server: fix seed for multiple slots #6835

Merged · 4 commits · Apr 24, 2024
2 changes: 2 additions & 0 deletions common/common.cpp
@@ -242,7 +242,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
// This is temporary; in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
}
if (arg == "-t" || arg == "--threads") {
13 changes: 12 additions & 1 deletion common/sampling.cpp
@@ -1,4 +1,6 @@
#define LLAMA_API_INTERNAL
#include "sampling.h"
#include <random>

struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

result->prev.resize(params.n_prev);

llama_sampling_set_rng_seed(result, params.seed);

return result;
}

@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
ctx->cur.clear();
}

void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
if (seed == LLAMA_DEFAULT_SEED) {
seed = time(NULL);
}
ctx->rng.seed(seed);
}

void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(

sampler_queue(ctx_main, params, cur_p, min_keep);

id = llama_sample_token(ctx_main, &cur_p);
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);

//{
// const int n_top = 10;
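The new llama_sampling_set_rng_seed follows the standard <random> idiom: a caller-supplied seed makes the per-context std::mt19937 reproducible, while the LLAMA_DEFAULT_SEED sentinel falls back to the wall clock. A minimal standalone sketch of that behaviour (the DEFAULT_SEED constant below merely stands in for LLAMA_DEFAULT_SEED; it is not the llama.cpp API):

    #include <cstdint>
    #include <ctime>
    #include <iostream>
    #include <random>

    constexpr uint32_t DEFAULT_SEED = 0xFFFFFFFF; // stand-in for LLAMA_DEFAULT_SEED

    // Mirrors llama_sampling_set_rng_seed: the sentinel seed means "use the clock".
    static void set_rng_seed(std::mt19937 & rng, uint32_t seed) {
        if (seed == DEFAULT_SEED) {
            seed = (uint32_t) time(NULL);
        }
        rng.seed(seed);
    }

    int main() {
        std::mt19937 rng_a, rng_b;
        set_rng_seed(rng_a, 42);
        set_rng_seed(rng_b, 42);

        // Identically seeded engines produce identical streams, which is what
        // makes per-slot sampling reproducible.
        std::cout << (rng_a() == rng_b() ? "equal" : "different") << "\n";
        return 0;
    }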
47 changes: 27 additions & 20 deletions common/sampling.h
@@ -4,9 +4,10 @@

#include "grammar-parser.h"

#include <random>
#include <string>
#include <vector>
#include <unordered_map>
#include <vector>

// sampler types
enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {

// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;

std::mt19937 rng;
};

#include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);

// Set the sampler seed
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);

// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

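With seed now part of llama_sampling_params and an rng member in every llama_sampling_context, each server slot can own an independently seeded sampler. A rough usage sketch, assuming only the declarations shown in this header (llama_sampling_init, llama_sampling_set_rng_seed and llama_sampling_free are the existing entry points):

    // Sketch only: relies on the declarations from common/sampling.h above.
    #include "sampling.h"

    void make_independent_samplers() {
        llama_sampling_params sparams_a;
        sparams_a.seed = 42;                 // fixed seed -> reproducible slot

        llama_sampling_params sparams_b;     // seed left at LLAMA_DEFAULT_SEED,
                                             // so init falls back to time(NULL)

        llama_sampling_context * ctx_a = llama_sampling_init(sparams_a);
        llama_sampling_context * ctx_b = llama_sampling_init(sparams_b);

        // Each context owns its own std::mt19937: reseeding one slot does not
        // disturb the other, unlike the previous llama_context-wide RNG.
        llama_sampling_set_rng_seed(ctx_b, 1234);

        llama_sampling_free(ctx_a);
        llama_sampling_free(ctx_b);
    }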
1 change: 0 additions & 1 deletion examples/lookup/lookup-stats.cpp
@@ -30,7 +30,6 @@ int main(int argc, char ** argv){

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

// tokenize the prompt
1 change: 0 additions & 1 deletion examples/lookup/lookup.cpp
@@ -38,7 +38,6 @@ int main(int argc, char ** argv){

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

// tokenize the prompt
1 change: 0 additions & 1 deletion examples/main/main.cpp
@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
return 1;
}
session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed);
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
3 changes: 1 addition & 2 deletions examples/server/server.cpp
@@ -854,7 +854,7 @@ struct server_context {
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
slot.params.seed = json_value(data, "seed", default_params.seed);
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);

@@ -1028,7 +1028,6 @@ struct server_context {
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
return false;
}
llama_set_rng_seed(ctx, slot.params.seed);
}

slot.command = SLOT_COMMAND_LOAD_PROMPT;
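Because the per-request seed now lands in slot.sparams rather than in the shared llama_context RNG, two requests that send the same prompt and the same seed should produce the same completion even when they are served by different slots. A simplified stand-in for that parameter plumbing, assuming nlohmann/json; the local json_value template only approximates the server's own helper:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // Simplified version of the server's json_value(): fall back to a default
    // when the key is absent.
    template <typename T>
    static T json_value(const json & body, const std::string & key, const T & def) {
        return body.contains(key) ? body.at(key).get<T>() : def;
    }

    int main() {
        const uint32_t default_seed = 0xFFFFFFFF; // LLAMA_DEFAULT_SEED equivalent

        // Two concurrent requests with the same prompt and the same seed...
        json req_a = json::parse(R"({"prompt": "Write a very long story about AI.", "seed": 42})");
        json req_b = json::parse(R"({"prompt": "Write a very long story about AI.", "seed": 42})");

        // ...now feed each slot's own sampling params, so their completions are
        // expected to match.
        uint32_t seed_a = json_value(req_a, "seed", default_seed);
        uint32_t seed_b = json_value(req_b, "seed", default_seed);

        std::cout << seed_a << " " << seed_b << "\n";
        return 0;
    }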
57 changes: 57 additions & 0 deletions examples/server/tests/features/results.feature
@@ -0,0 +1,57 @@
@llama.cpp
@results
Feature: Results

Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 128 as batch size
And 256 KV cache size
And 128 max tokens to predict

Scenario Outline: Multi users completion
Given <n_slots> slots
And continuous batching
Then the server is starting
Then the server is healthy

Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""

Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""

Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""

Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""

Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""

Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_slots |
| 1 |
| 2 |
34 changes: 34 additions & 0 deletions examples/server/tests/features/steps/steps.py
@@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_metrics = False
context.server_process = None
context.seed = None
context.draft = None
context.server_seed = None
context.user_api_key = None
context.response_format = None
@@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl


@step('{draft:d} as draft')
def step_draft(context, draft):
context.draft = draft


@step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx
@@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.completion, predicted_n)


@step('all predictions are equal')
@async_run_until_complete
async def step_predictions_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_equal(context.tasks_result)
context.tasks_result = []


@step('the completion is truncated')
def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '')
@@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')

def assert_all_predictions_equal(completion_responses):
content_0 = completion_responses[0]['content']

if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content 0: {content_0}")

i = 1
for response in completion_responses[1:]:
content = response['content']

if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content {i}: {content}")

assert content == content_0, "contents not equal"

i += 1


async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
@@ -1148,6 +1180,8 @@ def start_server_background(context):
server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.draft is not None:
server_args.extend(['--draft', context.draft])
if context.server_continuous_batching:
server_args.append('--cont-batching')
if context.server_embeddings:
7 changes: 5 additions & 2 deletions llama.cpp
@@ -13478,7 +13478,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
return result;
}

llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
GGML_ASSERT(ctx);

const int64_t t_start_sample_us = ggml_time_us();
@@ -13491,7 +13490,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
}

std::discrete_distribution<> dist(probs.begin(), probs.end());
auto & rng = ctx->rng;
int idx = dist(rng);

llama_token result = candidates->data[idx].id;
@@ -13501,6 +13500,10 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
return result;
}

llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
}

void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
const int64_t t_start_sample_us = ggml_time_us();

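llama_sample_token is now a thin wrapper that forwards ctx->rng, while callers that manage multiple sequences can pass their own engine. The point of threading the RNG through explicitly is isolation: with one engine per sequence, the draws one sequence sees no longer depend on how other sequences' sampling calls interleave. A standalone illustration of that property using only the standard library (not llama.cpp code):

    #include <iostream>
    #include <random>
    #include <vector>

    // Draw one index from a categorical distribution, the same way the sampler
    // uses std::discrete_distribution over token probabilities.
    static int draw(std::mt19937 & rng, const std::vector<float> & probs) {
        std::discrete_distribution<> dist(probs.begin(), probs.end());
        return dist(rng);
    }

    int main() {
        const std::vector<float> probs = {0.1f, 0.2f, 0.3f, 0.4f};

        // One engine per "slot", both seeded with 42: slot B's draws stay the
        // same no matter how much work slot A does in between.
        std::mt19937 rng_a(42), rng_b(42);
        draw(rng_a, probs);                       // extra sampling on slot A...
        std::cout << draw(rng_b, probs) << "\n";  // ...does not change slot B

        // A single shared engine (the old behaviour) would not have this property.
        return 0;
    }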
9 changes: 7 additions & 2 deletions llama.h
@@ -987,7 +987,7 @@
struct llama_context * ctx,
llama_token_data_array * candidates);

/// @details Randomly selects a token from the candidates based on their probabilities.
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx,
llama_token_data_array * candidates);
@@ -1074,8 +1074,9 @@ extern "C" {
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL

#include <vector>
#include <random>
#include <string>
#include <vector>

struct ggml_tensor;

@@ -1112,6 +1113,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);

// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);

#endif // LLAMA_API_INTERNAL

#endif // LLAMA_H