From 83133b6c2d77acd2949c4a7352355dd1cd6bafc7 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 5 Dec 2024 21:35:20 +0100
Subject: [PATCH] feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache

Signed-off-by: Ettore Di Giacinto
---
 backend/backend.proto             | 3 +++
 backend/cpp/llama/grpc-server.cpp | 2 ++
 core/backend/options.go           | 2 ++
 core/config/backend_config.go     | 6 ++++--
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index 48b0101b4b29..0a341ca2a9ed 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -242,6 +242,9 @@ message ModelOptions {
   repeated float LoraScales = 61;
 
   repeated string Options = 62;
+
+  string CacheTypeKey = 63;
+  string CacheTypeValue = 64;
 }
 
 message Result {
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 0fde74cbd3a6..2059ccda317e 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2241,6 +2241,8 @@ static void params_parse(const backend::ModelOptions* request,
     }
     // params.model_alias ??
     params.model_alias = request->modelfile();
+    params.cache_type_k = request->cachetypekey();
+    params.cache_type_v = request->cachetypevalue();
     params.n_ctx = request->contextsize();
     //params.memory_f16 = request->f16memory();
     params.cpuparams.n_threads = request->threads();
diff --git a/core/backend/options.go b/core/backend/options.go
index 1f88122fc2b2..f6247c605668 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		TensorParallelSize:   int32(c.TensorParallelSize),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
+		CacheTypeKey:         c.CacheTypeK,
+		CacheTypeValue:       c.CacheTypeV,
 		NoKVOffload:          c.NoKVOffloading,
 		YarnExtFactor:        c.YarnExtFactor,
 		YarnAttnFactor:       c.YarnAttnFactor,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 1de540f94382..0ff347699932 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -155,8 +155,10 @@ type LLMConfig struct {
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
 	MMProj             string `yaml:"mmproj"`
 
-	FlashAttention bool `yaml:"flash_attention"`
-	NoKVOffloading bool `yaml:"no_kv_offloading"`
+	FlashAttention bool   `yaml:"flash_attention"`
+	NoKVOffloading bool   `yaml:"no_kv_offloading"`
+	CacheTypeK     string `yaml:"cache_type_k"`
+	CacheTypeV     string `yaml:"cache_type_v"`
 
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType   string `yaml:"type"`
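
Usage sketch: with the patch applied, the `cache_type_k` and `cache_type_v`
YAML tags added to LLMConfig above can be set per model to request a
quantized KV cache. A minimal, hypothetical model config follows; the model
name, GGUF file, and backend label are placeholders, and the cache type
values are assumed to be llama.cpp's cache type names (e.g. f16, q8_0, q4_0):

    name: my-model                # hypothetical model name
    backend: llama-cpp
    parameters:
      model: my-model.gguf        # hypothetical GGUF file
    flash_attention: true         # llama.cpp expects flash attention for a quantized V cache
    cache_type_k: q8_0            # forwarded to params.cache_type_k via CacheTypeKey
    cache_type_v: q8_0            # forwarded to params.cache_type_v via CacheTypeValue

The two strings flow unmodified from the YAML config through the new proto
fields (CacheTypeKey/CacheTypeValue) into params_parse, so any value
llama.cpp accepts for its cache types should work here.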