From e49ea0123b960bed9392576d9d7d6187449f47e5 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 13 May 2024 19:07:51 +0200
Subject: [PATCH] feat(llama.cpp): add `flash_attention` and `no_kv_offloading`
 (#2310)

feat(llama.cpp): add flash_attn and no_kv_offload

Signed-off-by: Ettore Di Giacinto
---
 backend/backend.proto             | 3 +++
 backend/cpp/llama/grpc-server.cpp | 3 +++
 core/backend/options.go           | 2 ++
 core/config/backend_config.go     | 3 +++
 4 files changed, 11 insertions(+)

diff --git a/backend/backend.proto b/backend/backend.proto
index 778a96ffac97..cb87fe02d46f 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -212,6 +212,9 @@ message ModelOptions {
   float YarnBetaSlow = 47;
 
   string Type = 49;
+
+  bool FlashAttention = 56;
+  bool NoKVOffload = 57;
 }
 
 message Result {
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 6fb086585f4e..f9673b33ccfa 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2254,6 +2254,9 @@ static void params_parse(const backend::ModelOptions* request,
     }
     params.use_mlock = request->mlock();
     params.use_mmap = request->mmap();
+    params.flash_attn = request->flashattention();
+    params.no_kv_offload = request->nokvoffload();
+
     params.embedding = request->embeddings();
 
     if (request->ropescaling() == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
diff --git a/core/backend/options.go b/core/backend/options.go
index 4a7435e68586..c638ebd5ed83 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -77,6 +77,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		MaxModelLen:        int32(c.MaxModelLen),
 		TensorParallelSize: int32(c.TensorParallelSize),
 		MMProj:             c.MMProj,
+		FlashAttention:     c.FlashAttention,
+		NoKVOffload:        c.NoKVOffloading,
 		YarnExtFactor:      c.YarnExtFactor,
 		YarnAttnFactor:     c.YarnAttnFactor,
 		YarnBetaFast:       c.YarnBetaFast,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index cb1b7c2a360b..41c792fb1da3 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -132,6 +132,9 @@ type LLMConfig struct {
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
 	MMProj             string `yaml:"mmproj"`
 
+	FlashAttention bool `yaml:"flash_attention"`
+	NoKVOffloading bool `yaml:"no_kv_offloading"`
+
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`
 
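
Usage sketch (illustrative, not part of the patch): the two options are exposed per-model through the YAML tags declared in LLMConfig (`flash_attention`, `no_kv_offloading`); gRPCModelOpts copies them into the ModelOptions message, and params_parse applies them to llama.cpp as params.flash_attn and params.no_kv_offload. The model name, backend id, and GGUF filename below are hypothetical placeholders:

    # hypothetical LocalAI model YAML enabling both new options
    name: example-model              # placeholder model name
    backend: llama-cpp               # assumed llama.cpp backend id
    parameters:
      model: example-model.gguf      # placeholder GGUF file
    flash_attention: true            # -> ModelOptions.FlashAttention -> params.flash_attn
    no_kv_offloading: true           # -> ModelOptions.NoKVOffload -> params.no_kv_offload

Setting no_kv_offloading: true keeps the KV cache in host memory instead of offloading it to the GPU; leaving it unset (false) retains the default offloading behavior.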