fix: add rope settings during model load, fix CUDA (#821)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
mudler authored Jul 27, 2023
1 parent 147cae9 commit 096d98c
Showing 7 changed files with 176 additions and 148 deletions.
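In short, this commit threads the RoPE frequency settings (RopeFreqBase, RopeFreqScale) through the gRPC ModelOptions so the llama backend applies them when the model is loaded, not only per prediction, and bumps go-llama.cpp to a revision whose setters take the float32 values directly. A minimal sketch of the resulting load path follows; the option and field names are the ones introduced in this diff, while the llama.New constructor, import path, and surrounding wiring are assumptions for illustration, not the exact LocalAI code:

```go
package backendsketch

import (
	llama "github.com/go-skynet/go-llama.cpp"
)

// modelOpts stands in for the fields of pb.ModelOptions touched by this commit.
type modelOpts struct {
	ContextSize   int32
	RopeFreqBase  float32
	RopeFreqScale float32
}

// loadModel mirrors the updated Load() in pkg/grpc/llm/llama/llama.go:
// RoPE settings are now passed as model (load-time) options.
func loadModel(path string, opts modelOpts) (*llama.LLama, error) {
	llamaOpts := []llama.ModelOption{
		llama.WithRopeFreqBase(opts.RopeFreqBase),
		llama.WithRopeFreqScale(opts.RopeFreqScale),
	}
	if opts.ContextSize != 0 {
		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
	}
	return llama.New(path, llamaOpts...)
}
```

Applying these at load time matters because llama.cpp consumes RoPE scaling parameters when the context is created, which is presumably why setting them only in the predict options was not enough.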
3 changes: 1 addition & 2 deletions Makefile
@@ -4,8 +4,7 @@ GOVET=$(GOCMD) vet
BINARY_NAME=local-ai

# llama.cpp versions
-# Temporarly pinned to https://github.com/go-skynet/go-llama.cpp/pull/124
-GOLLAMA_VERSION?=562d2b5a71195627a63bb34f639e0fb0e2b2df3f
+GOLLAMA_VERSION?=6ba16de8e965e5aa0f32d25ef9d6149bb6586565

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
7 changes: 4 additions & 3 deletions README.md
@@ -13,15 +13,13 @@

**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.

-For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).

In a nutshell:

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either
- Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
- Supports multiple models:
-- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... and more)
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
@@ -31,6 +29,8 @@ In a nutshell:

LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!

+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!

See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).


@@ -53,6 +53,7 @@ See the [Getting started](https://localai.io/basics/getting_started/index.html)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [ ] Enable gallery management directly from the webui.
- [x] 🔥 OpenAI functions: https://github.com/go-skynet/LocalAI/issues/588
+- [ ] 🔥 GPTQ support: https://github.com/go-skynet/LocalAI/issues/796

## News

Expand Down
28 changes: 15 additions & 13 deletions api/backend/options.go
@@ -15,19 +15,21 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
b = c.Batch
}
return &pb.ModelOptions{
-ContextSize: int32(c.ContextSize),
-Seed: int32(c.Seed),
-NBatch: int32(b),
-F16Memory: c.F16,
-MLock: c.MMlock,
-NUMA: c.NUMA,
-Embeddings: c.Embeddings,
-LowVRAM: c.LowVRAM,
-NGPULayers: int32(c.NGPULayers),
-MMap: c.MMap,
-MainGPU: c.MainGPU,
-Threads: int32(c.Threads),
-TensorSplit: c.TensorSplit,
+ContextSize: int32(c.ContextSize),
+Seed: int32(c.Seed),
+NBatch: int32(b),
+F16Memory: c.F16,
+MLock: c.MMlock,
+RopeFreqBase: c.RopeFreqBase,
+RopeFreqScale: c.RopeFreqScale,
+NUMA: c.NUMA,
+Embeddings: c.Embeddings,
+LowVRAM: c.LowVRAM,
+NGPULayers: int32(c.NGPULayers),
+MMap: c.MMap,
+MainGPU: c.MainGPU,
+Threads: int32(c.Threads),
+TensorSplit: c.TensorSplit,
}
}

36 changes: 18 additions & 18 deletions extra/grpc/huggingface/backend_pb2.py

Some generated files are not rendered by default.

27 changes: 15 additions & 12 deletions pkg/grpc/llm/llama/llama.go
@@ -17,7 +17,10 @@ type LLM struct {
}

func (llm *LLM) Load(opts *pb.ModelOptions) error {
-llamaOpts := []llama.ModelOption{}
+llamaOpts := []llama.ModelOption{
+	llama.WithRopeFreqBase(opts.RopeFreqBase),
+	llama.WithRopeFreqScale(opts.RopeFreqScale),
+}

if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
@@ -56,15 +59,15 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {

func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions := []llama.PredictOption{
-llama.SetTemperature(float64(opts.Temperature)),
-llama.SetTopP(float64(opts.TopP)),
+llama.SetTemperature(opts.Temperature),
+llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
-llama.SetRopeFreqBase(float64(opts.RopeFreqBase)),
-llama.SetRopeFreqScale(float64(opts.RopeFreqScale)),
-llama.SetNegativePromptScale(float64(opts.NegativePromptScale)),
+llama.SetRopeFreqBase(opts.RopeFreqBase),
+llama.SetRopeFreqScale(opts.RopeFreqScale),
+llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}

@@ -86,11 +89,11 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
}

if opts.MirostatETA != 0 {
-predictOptions = append(predictOptions, llama.SetMirostatETA(float64(opts.MirostatETA)))
+predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}

if opts.MirostatTAU != 0 {
-predictOptions = append(predictOptions, llama.SetMirostatTAU(float64(opts.MirostatTAU)))
+predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}

if opts.Debug {
@@ -100,7 +103,7 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))

if opts.PresencePenalty != 0 {
-predictOptions = append(predictOptions, llama.SetPenalty(float64(opts.PresencePenalty)))
+predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}

if opts.NKeep != 0 {
@@ -125,13 +128,13 @@

//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))

-predictOptions = append(predictOptions, llama.SetFrequencyPenalty(float64(opts.FrequencyPenalty)))
+predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(float64(opts.TailFreeSamplingZ)))
-predictOptions = append(predictOptions, llama.SetTypicalP(float64(opts.TypicalP)))
+predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
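A side effect of the GOLLAMA_VERSION bump is visible throughout buildPredictOptions above: the float64(...) casts are dropped, so the float32 values carried by pb.PredictOptions are handed to the binding setters directly. A self-contained sketch of the simplified call pattern; the setters are the ones used in this diff, while the function and parameter names are illustrative:

```go
package backendsketch

import (
	llama "github.com/go-skynet/go-llama.cpp"
)

// samplingOpts is a sketch only: it assumes the post-bump go-llama.cpp setters
// take float32, as implied by the removed float64() casts in this diff.
func samplingOpts(temperature, topP float32, topK int) []llama.PredictOption {
	return []llama.PredictOption{
		llama.SetTemperature(temperature), // was llama.SetTemperature(float64(temperature))
		llama.SetTopP(topP),
		llama.SetTopK(topK),
	}
}
```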

(Diffs for the remaining changed files are not rendered here.)

