diff --git a/integration/embed_test.go b/integration/embed_test.go
index 10333d5dfa9..4a68af68ae7 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 5717c17a905..8e08b850f8e 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
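The server.cpp change above is the core of the fix: with MEAN or CLS pooling, upstream llama.cpp sizes its pooling buffers by n_tokens, indexes them by seq_id, and asserts seq_id < n_tokens. The ext_server uses the slot id as the seq_id, so a short embedding prompt scheduled onto a high-numbered slot could trip that assertion, which the now-deleted 08-pooling.diff patch worked around by resizing those buffers to n_seq_max. Pinning every embedding task to slot 0 keeps the seq_id in range and lets the local patch be dropped. The sketch below shows the client-visible behaviour this is meant to preserve: concurrent embedding requests all funnel through slot 0 but still return correct results. It assumes the api.ClientFromEnvironment and Client.Embed helpers that the integration tests above rely on; the model name and prompts are illustrative only, not part of this change.

    // Sketch only: fire concurrent embedding requests at a running server.
    // Assumed helpers: api.ClientFromEnvironment, Client.Embed (as used by
    // the integration tests above).
    package main

    import (
        "context"
        "fmt"
        "sync"

        "github.com/ollama/ollama/api"
    )

    func main() {
        client, err := api.ClientFromEnvironment()
        if err != nil {
            panic(err)
        }

        prompts := []string{"why is the sky blue?", "why is the grass green?"}

        var wg sync.WaitGroup
        for _, p := range prompts {
            wg.Add(1)
            go func(prompt string) {
                defer wg.Done()
                // Embedding tasks are pinned to slot 0 server-side, so these
                // requests serialize there but must still return correct vectors.
                resp, err := client.Embed(context.Background(), &api.EmbedRequest{
                    Model: "all-minilm",
                    Input: prompt,
                })
                if err != nil {
                    fmt.Println("embed failed:", err)
                    return
                }
                fmt.Printf("%q -> %d prompt tokens, %d dimensions\n",
                    prompt, resp.PromptEvalCount, len(resp.Embeddings[0]))
            }(p)
        }
        wg.Wait()
    }
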
diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff
deleted file mode 100644
index 2e4fe11eef9..00000000000
--- a/llm/patches/08-pooling.diff
+++ /dev/null
@@ -1,60 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 721b8f4e..cfe7ac40 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8420,14 +8420,14 @@ struct llm_build_context {
-     }
- 
-     struct ggml_tensor * build_inp_mean() {
--        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
-+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
-         cb(lctx.inp_mean, "inp_mean", -1);
-         ggml_set_input(lctx.inp_mean);
-         return lctx.inp_mean;
-     }
- 
-     struct ggml_tensor * build_inp_cls() {
--        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
-         cb(lctx.inp_cls, "inp_cls", -1);
-         ggml_set_input(lctx.inp_cls);
-         return lctx.inp_cls;
-@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
- 
-         float * data = (float *) lctx.inp_mean->data;
--        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
-+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
- 
-         std::vector<uint64_t> sum(n_tokens, 0);
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
--
--            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
--
-             sum[seq_id] += 1;
-         }
- 
--        std::vector<float> div(n_tokens, 0.0f);
--        for (int i = 0; i < n_tokens; ++i) {
-+        std::vector<float> div(cparams.n_seq_max, 0.0f);
-+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
-             const uint64_t s = sum[i];
-             if (s > 0) {
-                 div[i] = 1.0f/float(s);
-@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
- 
-         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
--        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
-+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
- 
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-             const llama_pos pos = batch.pos[i];
--
--            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
--
-             if (pos == 0) {
-                 data[seq_id] = i;
-             }
diff --git a/server/sched.go b/server/sched.go
index 9d8c4144699..58071bf035c 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
+					// Embedding models should always be loaded with parallel=1
+					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+						numParallel = 1
+					}
+
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
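The scheduler guard relies on CheckCapabilities(CapabilityCompletion) returning a non-nil error exactly when a model cannot do completion, i.e. for embedding-only models such as all-minilm, and in that case forces numParallel to 1 so the runner only ever allocates a single slot/sequence, consistent with the slot-0 pinning in server.cpp above. The sketch below isolates that branch using hypothetical stand-ins for the real Model and capability types in server/; it illustrates the pattern and is not the project's API.

    // Sketch of the parallel=1 guard with hypothetical stand-in types.
    package main

    import (
        "errors"
        "fmt"
    )

    type capability string

    const capabilityCompletion capability = "completion"

    // model is a hypothetical stand-in for the scheduler's pending.model.
    type model struct {
        name         string
        capabilities map[capability]bool
    }

    // checkCapabilities mirrors the shape of CheckCapabilities: nil means every
    // requested capability is present, non-nil reports what is missing.
    func (m model) checkCapabilities(want ...capability) error {
        for _, c := range want {
            if !m.capabilities[c] {
                return errors.New(m.name + " does not support " + string(c))
            }
        }
        return nil
    }

    // pickParallel applies the guard: embedding-only models (no completion
    // capability) are always loaded with parallel=1.
    func pickParallel(m model, requested int) int {
        if m.checkCapabilities(capabilityCompletion) != nil {
            return 1
        }
        return requested
    }

    func main() {
        embedder := model{name: "all-minilm", capabilities: map[capability]bool{}}
        chat := model{name: "llama3", capabilities: map[capability]bool{capabilityCompletion: true}}

        fmt.Println(pickParallel(embedder, 4)) // 1: no completion capability
        fmt.Println(pickParallel(chat, 4))     // 4: unchanged
    }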