Fix embeddings memory corruption (ollama#6467)
* Fix embeddings memory corruption

The patch was leading to a buffer overrun corruption. Once removed, though, parallelism
in server.cpp led to hitting an assert due to slot/seq IDs being >= token count. To
work around this, only use slot 0 for embeddings (sketched below).

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
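
The following is a minimal standalone sketch, not code from this commit or from llama.cpp. It only illustrates the constraint named in the first bullet above: a slot/sequence ID used for a pooled embedding must stay strictly below the prompt's token count, which is why slot 0 is always safe. The function and variable names (pool_embedding, n_tokens, etc.) are invented for illustration, and the pooling-buffer detail is an assumption.

// Illustrative sketch only -- not the actual llama.cpp implementation.
#include <cassert>
#include <cstdio>
#include <vector>

// Mimics the constraint behind the assert mentioned in the commit message:
// the pooled-embedding path indexes a per-batch buffer by sequence (slot) ID,
// so the ID must be strictly less than the number of tokens in the batch.
void pool_embedding(int seq_id, const std::vector<float>& token_embeddings, int n_tokens) {
    assert(seq_id < n_tokens && "slot/seq ID must be < token count for embeddings");
    (void)token_embeddings; // real code would mean-pool into row `seq_id` here
    std::printf("pooled %d tokens into row %d\n", n_tokens, seq_id);
}

int main() {
    const int n_tokens = 4;                      // a short embedding prompt
    std::vector<float> embs(n_tokens * 8, 0.0f); // dummy per-token embeddings

    pool_embedding(0, embs, n_tokens);    // slot 0: 0 < 4, always safe
    // pool_embedding(5, embs, n_tokens); // slot 5 on a 4-token prompt would trip the assert
    return 0;
}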
dhiltgen authored Aug 22, 2024
1 parent 6bd8a4b commit 90ca841
Showing 4 changed files with 16 additions and 65 deletions.
8 changes: 4 additions & 4 deletions integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

8 changes: 7 additions & 1 deletion llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
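
Note on the change above: pinning embedding tasks to slot 0 keeps the sequence ID at 0, which is strictly less than the token count of any non-empty prompt, so the assert described in the commit message can no longer fire. The trade-off, presumably accepted here, is that embedding requests on this path are effectively serialized onto a single slot.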
60 changes: 0 additions & 60 deletions llm/patches/08-pooling.diff

This file was deleted.

5 changes: 5 additions & 0 deletions server/sched.go
@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
+				// Embedding models should always be loaded with parallel=1
+				if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+					numParallel = 1
+				}
+
 				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 				if len(gpus) == 1 && gpus[0].Library == "cpu" {
 					// simplifying assumption of defaultParallel when in CPU mode
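
Note on the change above: a model that fails the CapabilityCompletion check is presumably an embedding-only model, so the scheduler caps numParallel at 1. Together with the slot-0 change in server.cpp, this keeps embedding requests on a single slot/sequence ID.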
