Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llama : allow gguf RoPE keys to be overridden with defaults #3240

Merged
merged 4 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --cfg-negative-prompt-file FNAME\n");
printf(" negative prompt file to use for guidance. (default: empty)\n");
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
Expand Down
4 changes: 2 additions & 2 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
Expand Down
60 changes: 24 additions & 36 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -929,23 +929,22 @@ static const size_t kB = 1024;
static const size_t MB = kB*kB;
static const size_t GB = kB*kB*kB;

// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx_train = 2048; // the context size used during training
uint32_t n_ctx = 512; // the context size used during inference
uint32_t n_embd = 4096;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
uint32_t n_ff = 11008;

float f_norm_eps = 1e-5;
float f_norm_rms_eps = 1e-5;

float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f;
uint32_t n_vocab;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_ctx; // context size used during inference
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;

float f_norm_eps;
float f_norm_rms_eps;

float rope_freq_base;
float rope_freq_scale;

bool operator!=(const llama_hparams & other) const {
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
Expand Down Expand Up @@ -1076,7 +1075,7 @@ struct llama_model {

std::string name = "n/a";

llama_hparams hparams;
llama_hparams hparams = {};
llama_vocab vocab;

struct ggml_tensor * tok_embeddings;
Expand Down Expand Up @@ -1674,28 +1673,17 @@ static void llm_load_hparams(
hparams.n_head_kv = hparams.n_head;
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

// TODO: manually setting rope freq base and scale should override this
// FIXME: partial fix when the param specified is not the default value, but
// will not work for overriding the model value to the params default

llama_context_params defaults = llama_context_default_params();

// rope_freq_base
{
float ropebase = 10000.0f;
GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
rope_freq_base = ropebase;
}
// rope_freq_base (optional)
if (rope_freq_base == 0.0f) {
rope_freq_base = 10000.0f;
GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
}

// rope_freq_scale (inverse of the kv) is optional
{
if (rope_freq_scale == 0.0f) {
float ropescale = 1.0f;
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
rope_freq_scale = 1.0f/ropescale;
}
rope_freq_scale = 1.0f/ropescale;
}

// sanity check for n_rot (optional)
Expand Down Expand Up @@ -6188,8 +6176,8 @@ struct llama_context_params llama_context_default_params() {
/*.n_gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rope_freq_base =*/ 10000.0f,
/*.rope_freq_scale =*/ 1.0f,
/*.rope_freq_base =*/ 0.0f,
/*.rope_freq_scale =*/ 0.0f,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false,
Expand Down