diff --git a/README.md b/README.md
index e997993..dfbfbba 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:
 
 - ✅ Easy to deploy: single binary with no dependencies
-- ✅ Single yaml configuration file
+- ✅ Easy to configure: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
-- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
+- ✅ Direct access to the proxied upstream HTTP server via `/upstream/:model_id`
 
 ## Releases
 
@@ -73,6 +74,12 @@ models:
       --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
     proxy: http://127.0.0.1:8999
 
+  # unlisted models do not show up in /v1/models or /upstream lists
+  # but they can still be requested as normal
+  "qwen-unlisted":
+    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+    unlisted: true
+
   # profiles make it easy to manage multi-model (and gpu) configurations.
   #
   # Tips:
diff --git a/config.example.yaml b/config.example.yaml
index 583b4b2..093ff9e 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -33,6 +33,7 @@ models:
       - env1=hello
     cmd: build/simple-responder --port 8999
     proxy: http://127.0.0.1:8999
+    unlisted: true
 
   # use "none" to skip check. Caution: this may cause some requests to fail
   # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
   "broken":
     cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
     proxy: http://127.0.0.1:8999
+    unlisted: true
   "broken_timeout":
     cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
     proxy: http://127.0.0.1:9000
+    unlisted: true
 
 # creating a coding profile with models for code generation and general questions
 profiles:
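For anyone trying out the new option, here is a minimal config sketch; the model names, file paths, and ports are hypothetical, not part of this diff. An unlisted model behaves exactly like any other model except that it is omitted from `/v1/models` and the `/upstream` index:

```yaml
models:
  # listed normally in /v1/models and the /upstream index
  "qwen":
    cmd: llama-server --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:8999

  # hidden from both listings, but still requestable by name
  "qwen-hidden":
    cmd: llama-server --port 9999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9999
    unlisted: true
```
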
diff --git a/misc/assets/favicon-raw.png b/misc/assets/favicon-raw.png
new file mode 100644
index 0000000..191c37c
Binary files /dev/null and b/misc/assets/favicon-raw.png differ
diff --git a/proxy/config.go b/proxy/config.go
index f17e019..f3bef77 100644
--- a/proxy/config.go
+++ b/proxy/config.go
@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
 }
 
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
diff --git a/proxy/html/favicon.ico b/proxy/html/favicon.ico
new file mode 100644
index 0000000..388ef73
Binary files /dev/null and b/proxy/html/favicon.ico differ
diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go
index 0c26990..b418243 100644
--- a/proxy/proxymanager.go
+++ b/proxy/proxymanager.go
@@ -2,10 +2,12 @@ package proxy
 
 import (
 	"bytes"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 
+//go:embed html/favicon.ico
+var faviconData []byte
+
+//go:embed html/logs.html
+var logsHTML []byte
+
+// make sure embed is kept there by the IDE auto-package importer
+var _ = embed.FS{}
+
 type ProxyManager struct {
 	sync.Mutex
 
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
 
-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
+	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+
+	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
+		c.Data(http.StatusOK, "image/x-icon", faviconData)
+	})
 
 	// Disable console color for testing
 	gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
 
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
 		data = append(data, map[string]interface{}{
 			"id":     id,
 			"object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()
 
-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }
 
+func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
+	requestedModel := c.Param("model_id")
+
+	if requestedModel == "" {
+		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
+		return
+	}
+
+	if process, err := pm.swapModel(requestedModel); err != nil {
+		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
+	} else {
+		// rewrite the path
+		c.Request.URL.Path = c.Param("upstreamPath")
+		process.ProxyRequest(c.Writer, c.Request)
+	}
+}
+
+func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
+	var html strings.Builder
+
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+
+	// extract model IDs, skipping any marked unlisted
+	modelIDs := make([]string, 0, len(pm.config.Models))
+	for modelID, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+		modelIDs = append(modelIDs, modelID)
+	}
+
+	// sort the model IDs for a stable listing
+	sort.Strings(modelIDs)
+
+	// emit one link per model
+	for _, modelID := range modelIDs {
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+	}
+	html.WriteString("</ul></body></html>")
+	c.Header("Content-Type", "text/html")
+	c.String(http.StatusOK, html.String())
+}
+
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }
 
-func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
-	// since maps are unordered, just use the first available process if one exists
-	for _, process := range pm.currentProcesses {
-		process.ProxyRequest(c.Writer, c.Request)
-		return
-	}
-
-	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
-}
-
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }
diff --git a/proxy/proxymanager_loghandlers.go b/proxy/proxymanager_loghandlers.go
index fc2ac4d..36cea1c 100644
--- a/proxy/proxymanager_loghandlers.go
+++ b/proxy/proxymanager_loghandlers.go
@@ -1,7 +1,6 @@
 package proxy
 
 import (
-	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )
 
-//go:embed html/logs.html
-var logsHTML []byte
-
-// make sure embed is kept there by the IDE auto-package importer
-var _ = embed.FS{}
-
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
 	accept := c.GetHeader("Accept")