Port remaining highlighting code to javascript
This change also sneaks in a new /slotz administrative endpoint. The new
llamafiler chat completions endpoint is now consistent with the chatbot
in terms of output. We now have a STOP button in the new web GUI.
jart committed Nov 6, 2024
1 parent 566cdc1 commit fdfdb13
Showing 56 changed files with 9,125 additions and 773 deletions.
2 changes: 2 additions & 0 deletions llama.cpp/common.cpp

@@ -284,6 +284,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         if (params.n_threads_batch <= 0) {
             params.n_threads_batch = cpu_get_num_math(); // [jart]
         }
+        FLAG_threads_batch = params.n_threads_batch; // [jart]
         return true;
     }
     if (arg == "-td" || arg == "--threads-draft") {
@@ -832,6 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
+        FLAG_split_mode = params.split_mode; // [jart]
         // #ifndef GGML_USE_CUDA_SYCL_VULKAN // [jart]
         //     fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
         // #endif // GGML_USE_CUDA_SYCL_VULKAN
8 changes: 8 additions & 0 deletions llamafile/flags.cpp

@@ -68,6 +68,7 @@ int FLAG_seed = LLAMA_DEFAULT_SEED;
 int FLAG_slots = 1;
 int FLAG_split_mode = LLAMA_SPLIT_MODE_LAYER;
 int FLAG_threads;
+int FLAG_threads_batch;
 int FLAG_token_burst = 100;
 int FLAG_token_cidr = 24;
 int FLAG_ubatch = 512;
@@ -315,6 +316,13 @@ void llamafile_get_flags(int argc, char **argv) {
             continue;
         }
 
+        if (!strcmp(flag, "-tb") || !strcmp(flag, "--threads-batch")) {
+            if (i == argc)
+                missing("--threads-batch");
+            FLAG_threads_batch = atoi(argv[i++]);
+            continue;
+        }
+
         if (!strcmp(flag, "-b") || !strcmp(flag, "--batch-size")) {
             if (i == argc)
                 missing("--batch-size");
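With -tb / --threads-batch wired into llamafile_get_flags, the batch-phase thread count can now be tuned independently of -t / --threads; for example, a hypothetical invocation like llamafile -m model.gguf -t 8 -tb 16 would store 16 in FLAG_threads_batch via the atoi() call above.
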
26 changes: 23 additions & 3 deletions llamafile/highlight_markdown.cpp

@@ -35,6 +35,7 @@ enum {
     INCODE,
     INCODE2,
     INCODE2_TICK,
+    INCODE2_TICK2,
     EMPHASIS,
     EMPHASIS_BACKSLASH,
 };
@@ -81,6 +82,7 @@ void HighlightMarkdown::feed(std::string *r, std::string_view input) {
                 // handle \*\*not bold\*\* etc.
                 t_ = BACKSLASH;
                 *r += '\\';
+                bol_ = false;
             } else {
                 lf::append_wchar(r, c);
             }
@@ -119,6 +121,7 @@ void HighlightMarkdown::feed(std::string *r, std::string_view input) {
                 if (c == '\\')
                     t_ = EMPHASIS_BACKSLASH;
             }
+            bol_ = false;
             break;
 
         case EMPHASIS:
@@ -171,13 +174,20 @@
 
         case TICK:
             if (c == '`') {
-                t_ = TICK_TICK;
+                if (bol_) {
+                    t_ = TICK_TICK;
+                } else {
+                    *r += HI_INCODE;
+                    *r += "``";
+                    t_ = INCODE2;
+                }
             } else {
                 *r += HI_INCODE;
                 *r += '`';
                 lf::append_wchar(r, c);
                 t_ = INCODE;
             }
+            bol_ = false;
             break;
 
         case INCODE:
@@ -202,13 +212,22 @@
         case INCODE2_TICK:
             lf::append_wchar(r, c);
             if (c == '`') {
-                *r += HI_RESET;
-                t_ = NORMAL;
+                t_ = INCODE2_TICK2;
             } else {
                 t_ = INCODE2;
             }
             break;
 
+        case INCODE2_TICK2:
+            if (c == '`') {
+                *r += '`';
+            } else {
+                *r += HI_RESET;
+                t_ = NORMAL;
+                goto Normal;
+            }
+            break;
+
         case TICK_TICK:
             if (c == '`') {
                 t_ = LANG;
@@ -300,6 +319,7 @@ void HighlightMarkdown::flush(std::string *r) {
     case INCODE:
     case INCODE2:
    case INCODE2_TICK:
+    case INCODE2_TICK2:
     case STRONG:
     case STRONG_BACKSLASH:
     case STRONG_STAR:
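The new INCODE2_TICK2 state teaches the inline-code scanner that a double-backtick span may contain single backticks, while a backtick run at the start of a line (the bol_ check in TICK) still goes down the code-fence path. A minimal sketch of driving the streaming interface above; the driver code and input string are illustrative, not part of this commit:

    HighlightMarkdown hl;
    std::string out;
    hl.feed(&out, "see ``a `tick` inside`` here");  // mid-line `` opens an inline INCODE2 span
    hl.flush(&out);                                 // closes any highlight state still open at end of input
    // out now holds the input with the span wrapped in HI_INCODE ... HI_RESET

Because the scanner keeps its state in the object across calls, splitting the same text over many feed() calls highlights identically, which is what lets output be colorized as tokens stream in.
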
1 change: 1 addition & 0 deletions llamafile/llamafile.h

@@ -43,6 +43,7 @@ extern int FLAG_seed;
 extern int FLAG_slots;
 extern int FLAG_split_mode;
 extern int FLAG_threads;
+extern int FLAG_threads_batch;
 extern int FLAG_token_burst;
 extern int FLAG_token_cidr;
 extern int FLAG_ubatch;
2 changes: 2 additions & 0 deletions llamafile/server/client.cpp

@@ -636,6 +636,8 @@ Client::dispatcher()
         return embedding();
     if (p1 == "v1/chat/completions")
         return v1_chat_completions();
+    if (p1 == "slotz")
+        return slotz();
 
     // serve static endpoints
     int infd;
2 changes: 2 additions & 0 deletions llamafile/server/client.h

@@ -114,4 +114,6 @@ struct Client
 
     bool v1_chat_completions() __wur;
     bool get_v1_chat_completions_params(V1ChatCompletionParams*) __wur;
+
+    bool slotz() __wur;
 };
23 changes: 17 additions & 6 deletions llamafile/server/slot.cpp

@@ -72,19 +72,21 @@ Slot::start()
     cparams.n_batch = FLAG_batch;
     cparams.n_ubatch = FLAG_ubatch;
     cparams.n_seq_max = 1;
-    cparams.n_threads = FLAG_threads;
-    cparams.n_threads_batch = MIN(FLAG_threads, 20);
-    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE;
-    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+    cparams.n_threads = MIN(FLAG_threads, 20);
+    cparams.n_threads_batch = FLAG_threads;
+    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    cparams.attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED;
     cparams.rope_freq_base = 0;
     cparams.yarn_ext_factor = -1;
     cparams.yarn_attn_factor = 1;
     cparams.yarn_beta_fast = 32;
     cparams.yarn_beta_slow = 1;
     cparams.yarn_orig_ctx = 0;
     cparams.defrag_thold = -1;
-    cparams.type_k = GGML_TYPE_F16;
-    cparams.type_v = GGML_TYPE_F16;
+    cparams.offload_kqv = true;
+    cparams.type_k = X86_HAVE(AVX512_BF16) ? GGML_TYPE_BF16 : GGML_TYPE_F16;
+    cparams.type_v = X86_HAVE(AVX512_BF16) ? GGML_TYPE_BF16 : GGML_TYPE_F16;
     cparams.flash_attn = FLAG_flash_attn;
     system_fingerprint_ = generate_system_fingerprint(&cparams);
     if (!(ctx_ = llama_new_context_with_model(model_, cparams)))
@@ -155,3 +157,12 @@ Slot::prefill(const std::vector<int>& tokens)
              reuse_count);
     return eval_tokens(new_tokens);
 }
+
+std::string
+Slot::dump()
+{
+    std::string r;
+    for (size_t i = 0; i < history_.size(); ++i)
+        r += llama_token_to_piece(ctx_, history_[i], RENDER_SPECIAL_TOKENS);
+    return r;
+}
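Slot::dump() re-renders the slot's cached history_ tokens back into text, special tokens included, which is what the new /slotz endpoint below serves.
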
2 changes: 2 additions & 0 deletions llamafile/server/slot.h

@@ -16,6 +16,7 @@
 // limitations under the License.
 
 #pragma once
+#include <string>
 #include <time.h>
 #include <utility>
 #include <vector>
@@ -38,4 +39,5 @@ struct Slot
     bool eval_token(int);
     bool eval_tokens(std::vector<int>);
     bool prefill(const std::vector<int>&);
+    std::string dump();
 };
1 change: 1 addition & 0 deletions llamafile/server/slots.cpp

@@ -48,6 +48,7 @@ Slots::start(int count)
         if (slot->start()) {
             ++made;
             slots_.emplace(slot);
+            all_slots_.push_back(slot);
         } else {
             delete slot;
         }
1 change: 1 addition & 0 deletions llamafile/server/slots.h

@@ -28,6 +28,7 @@ struct Slots
 {
     llama_model* model_;
     std::multiset<SlotEntry> slots_;
+    std::vector<Slot*> all_slots_;
     pthread_mutex_t lock_;
     pthread_cond_t cond_;
 
39 changes: 39 additions & 0 deletions llamafile/server/slotz.cpp

@@ -0,0 +1,39 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "client.h"
+#include "server.h"
+#include "slot.h"
+#include "slots.h"
+#include "utils.h"
+#include "worker.h"
+
+bool
+Client::slotz()
+{
+    std::string s = std::string(or_empty(param("add_special")));
+    int id = atoi(s.c_str());
+    if (id < 0)
+        return send_error(400);
+    if (id >= worker_->server_->slots_->all_slots_.size())
+        return send_error(404);
+    Slot* slot = worker_->server_->slots_->all_slots_[id];
+    std::string dump = slot->dump();
+    char* p = append_http_response_message(obuf_.p, 200);
+    p = stpcpy(p, "Content-Type: text/plain\r\n");
+    return send_response(obuf_.p, p, dump);
+}
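Per the handler above, the slot index travels in the add_special query parameter, so GET /slotz?add_special=1 returns slot 1's rendered history as text/plain. Since atoi() on an empty string yields 0, a bare GET /slotz dumps the first slot; a negative index draws a 400 and an out-of-range one a 404.
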
35 changes: 32 additions & 3 deletions llamafile/server/www/chatbot.css

@@ -11,6 +11,11 @@ body {
   background: #f5f5f5;
 }
 
+p {
+  margin: 1rem auto;
+  box-sizing: border-box;
+}
+
 .chat-container {
   max-width: 800px;
   margin: 2rem auto;
@@ -30,9 +35,12 @@
 }
 
 .chat-header h1 {
-  font-size: 1.25rem;
+  font-size: 2rem;
   color: #212529;
-  vertical-align: center;
+}
+
+.chat-header img {
+  vertical-align: middle;
 }
 
 .chat-messages {
@@ -105,8 +113,29 @@
   cursor: not-allowed;
 }
 
+.stop-button {
+  padding: 0.75rem 1.5rem;
+  background: #dc3545;
+  color: white;
+  border: none;
+  border-radius: 6px;
+  cursor: pointer;
+  font-size: 1rem;
+  transition: background-color 0.2s;
+}
+
+.stop-button:hover {
+  background: #bb2d3b;
+}
+
+.stop-button:disabled {
+  background: #6c757d;
+  cursor: not-allowed;
+}
+
 .message pre {
-  background: #f8f9fa;
+  margin: 1rem auto;
+  background: #fefefe;
   padding: 0.5rem;
   border-radius: 4px;
   overflow-x: auto;
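The .stop-button rules style the STOP control mentioned in the commit message: presumably a <button class="stop-button"> element that the chat GUI enables while a completion is streaming, with the :disabled variant graying it out otherwise. The JavaScript that drives it is among the files not shown on this page.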