fix: add rope settings during model load, fix CUDA (#821)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
mudler authored Jul 27, 2023
1 parent 147cae9 commit 096d98c
Showing 7 changed files with 176 additions and 148 deletions.
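In short, this commit threads the RoPE frequency settings (RopeFreqBase, RopeFreqScale) through the gRPC ModelOptions so the llama backend applies them when the model is loaded, not only per prediction, and bumps go-llama.cpp to a revision whose setters take the float32 values directly. A minimal sketch of the resulting load path follows; the option and field names are the ones introduced in this diff, while the llama.New constructor, import path, and surrounding wiring are assumptions for illustration, not the exact LocalAI code:

```go
package backendsketch

import (
	llama "github.com/go-skynet/go-llama.cpp"
)

// modelOpts stands in for the fields of pb.ModelOptions touched by this commit.
type modelOpts struct {
	ContextSize   int32
	RopeFreqBase  float32
	RopeFreqScale float32
}

// loadModel mirrors the updated Load() in pkg/grpc/llm/llama/llama.go:
// RoPE settings are now passed as model (load-time) options.
func loadModel(path string, opts modelOpts) (*llama.LLama, error) {
	llamaOpts := []llama.ModelOption{
		llama.WithRopeFreqBase(opts.RopeFreqBase),
		llama.WithRopeFreqScale(opts.RopeFreqScale),
	}
	if opts.ContextSize != 0 {
		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
	}
	return llama.New(path, llamaOpts...)
}
```

Applying these at load time matters because llama.cpp consumes RoPE scaling parameters when the context is created, which is presumably why setting them only in the predict options was not enough.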
3 changes: 1 addition & 2 deletions Makefile
@@ -4,8 +4,7 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai

# llama.cpp versions
-# Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
-GOLLAMA_VERSION?=562d2b5a71195627a63bb34f639e0fb0e2b2df3f
+GOLLAMA_VERSION?=6ba16de8e965e5aa0f32d25ef9d6149bb6586565

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
7 changes: 4 additions & 3 deletions README.md
@@ -13,15 +13,13 @@

**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.

-For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).

In a nutshell:

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models:
-- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... and more)
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
@@ -31,6 +29,8 @@ In a nutshell:

LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!

+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!

See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).


@@ -53,6 +53,7 @@ See the [Getting started](https://localai.io/basics/getting_started/index.html)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [ ] Enable gallery management directly from the webui.
- [x] 🔥 OpenAI functions: https://github.com/go-skynet/LocalAI/issues/588
+- [ ] 🔥 GPTQ support: https://github.com/go-skynet/LocalAI/issues/796

## News

Expand Down
28 changes: 15 additions & 13 deletions api/backend/options.go
@@ -15,19 +15,21 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
b = c.Batch
}
return &pb.ModelOptions{
-ContextSize: int32(c.ContextSize),
-Seed: int32(c.Seed),
-NBatch: int32(b),
-F16Memory: c.F16,
-MLock: c.MMlock,
-NUMA: c.NUMA,
-Embeddings: c.Embeddings,
-LowVRAM: c.LowVRAM,
-NGPULayers: int32(c.NGPULayers),
-MMap: c.MMap,
-MainGPU: c.MainGPU,
-Threads: int32(c.Threads),
-TensorSplit: c.TensorSplit,
+ContextSize: int32(c.ContextSize),
+Seed: int32(c.Seed),
+NBatch: int32(b),
+F16Memory: c.F16,
+MLock: c.MMlock,
+RopeFreqBase: c.RopeFreqBase,
+RopeFreqScale: c.RopeFreqScale,
+NUMA: c.NUMA,
+Embeddings: c.Embeddings,
+LowVRAM: c.LowVRAM,
+NGPULayers: int32(c.NGPULayers),
+MMap: c.MMap,
+MainGPU: c.MainGPU,
+Threads: int32(c.Threads),
+TensorSplit: c.TensorSplit,
}
}

36 changes: 18 additions & 18 deletions extra/grpc/huggingface/backend_pb2.py

Some generated files are not rendered by default.

27 changes: 15 additions & 12 deletions pkg/grpc/llm/llama/llama.go
@@ -17,7 +17,10 @@ type LLM struct {
}

func (llm *LLM) Load(opts *pb.ModelOptions) error {
-llamaOpts := []llama.ModelOption{}
+llamaOpts := []llama.ModelOption{
+	llama.WithRopeFreqBase(opts.RopeFreqBase),
+	llama.WithRopeFreqScale(opts.RopeFreqScale),
+}

if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
@@ -56,15 +59,15 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {

func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions := []llama.PredictOption{
-llama.SetTemperature(float64(opts.Temperature)),
-llama.SetTopP(float64(opts.TopP)),
+llama.SetTemperature(opts.Temperature),
+llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
-llama.SetRopeFreqBase(float64(opts.RopeFreqBase)),
-llama.SetRopeFreqScale(float64(opts.RopeFreqScale)),
-llama.SetNegativePromptScale(float64(opts.NegativePromptScale)),
+llama.SetRopeFreqBase(opts.RopeFreqBase),
+llama.SetRopeFreqScale(opts.RopeFreqScale),
+llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}

@@ -86,11 +89,11 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
}

if opts.MirostatETA != 0 {
-predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
+predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}

if opts.MirostatTAU != 0 {
-predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
+predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}

if opts.Debug {
@@ -100,7 +103,7 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

if opts.PresencePenalty != 0 {
-predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
+predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}

if opts.NKeep != 0 {
@@ -125,13 +128,13 @@

//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

-predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
+predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
-predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
+predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
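A side effect of the GOLLAMA_VERSION bump is visible throughout buildPredictOptions above: the float64(...) casts are dropped, so the float32 values carried by pb.PredictOptions are handed to the binding setters directly. A self-contained sketch of the simplified call pattern; the setters are the ones used in this diff, while the function and parameter names are illustrative:

```go
package backendsketch

import (
	llama "github.com/go-skynet/go-llama.cpp"
)

// samplingOpts is a sketch only: it assumes the post-bump go-llama.cpp setters
// take float32, as implied by the removed float64() casts in this diff.
func samplingOpts(temperature, topP float32, topK int) []llama.PredictOption {
	return []llama.PredictOption{
		llama.SetTemperature(temperature), // was llama.SetTemperature(float64(temperature))
		llama.SetTopP(topP),
		llama.SetTopK(topK),
	}
}
```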

(Diffs for the remaining changed files are not rendered here.)

