Port remaining highlighting code to javascript
This change also sneaks in a new /slotz administrative endpoint. The new
llamafiler chat completions endpoint is now consistent with the chatbot
in terms of output. We now have a STOP button in the new web GUI.
jart committed Nov 6, 2024
1 parent 566cdc1 commit fdfdb13
Showing 56 changed files with 9,125 additions and 773 deletions.
2 changes: 2 additions & 0 deletions llama.cpp/common.cpp

@@ -284,6 +284,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         if (params.n_threads_batch <= 0) {
             params.n_threads_batch = cpu_get_num_math(); // [jart]
         }
+        FLAG_threads_batch = params.n_threads_batch; // [jart]
         return true;
     }
     if (arg == "-td" || arg == "--threads-draft") {
@@ -832,6 +833,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
+        FLAG_split_mode = params.split_mode; // [jart]
         // #ifndef GGML_USE_CUDA_SYCL_VULKAN // [jart]
         //     fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
         // #endif // GGML_USE_CUDA_SYCL_VULKAN
8 changes: 8 additions & 0 deletions llamafile/flags.cpp

@@ -68,6 +68,7 @@ int FLAG_seed = LLAMA_DEFAULT_SEED;
 int FLAG_slots = 1;
 int FLAG_split_mode = LLAMA_SPLIT_MODE_LAYER;
 int FLAG_threads;
+int FLAG_threads_batch;
 int FLAG_token_burst = 100;
 int FLAG_token_cidr = 24;
 int FLAG_ubatch = 512;
@@ -315,6 +316,13 @@ void llamafile_get_flags(int argc, char **argv) {
             continue;
         }
 
+        if (!strcmp(flag, "-tb") || !strcmp(flag, "--threads-batch")) {
+            if (i == argc)
+                missing("--threads-batch");
+            FLAG_threads_batch = atoi(argv[i++]);
+            continue;
+        }
+
         if (!strcmp(flag, "-b") || !strcmp(flag, "--batch-size")) {
             if (i == argc)
                 missing("--batch-size");
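With -tb / --threads-batch wired into llamafile_get_flags, the batch-phase thread count can now be tuned independently of -t / --threads; for example, a hypothetical invocation like llamafile -m model.gguf -t 8 -tb 16 would store 16 in FLAG_threads_batch via the atoi() call above.
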
26 changes: 23 additions & 3 deletions llamafile/highlight_markdown.cpp

@@ -35,6 +35,7 @@ enum {
     INCODE,
     INCODE2,
     INCODE2_TICK,
+    INCODE2_TICK2,
     EMPHASIS,
     EMPHASIS_BACKSLASH,
 };
@@ -81,6 +82,7 @@ void HighlightMarkdown::feed(std::string *r, std::string_view input) {
                 // handle \*\*not bold\*\* etc.
                 t_ = BACKSLASH;
                 *r += '\\';
+                bol_ = false;
             } else {
                 lf::append_wchar(r, c);
             }
@@ -119,6 +121,7 @@ void HighlightMarkdown::feed(std::string *r, std::string_view input) {
                 if (c == '\\')
                     t_ = EMPHASIS_BACKSLASH;
             }
+            bol_ = false;
             break;
 
         case EMPHASIS:
@@ -171,13 +174,20 @@
 
         case TICK:
             if (c == '`') {
-                t_ = TICK_TICK;
+                if (bol_) {
+                    t_ = TICK_TICK;
+                } else {
+                    *r += HI_INCODE;
+                    *r += "``";
+                    t_ = INCODE2;
+                }
             } else {
                 *r += HI_INCODE;
                 *r += '`';
                 lf::append_wchar(r, c);
                 t_ = INCODE;
             }
+            bol_ = false;
             break;
 
         case INCODE:
@@ -202,13 +212,22 @@
         case INCODE2_TICK:
             lf::append_wchar(r, c);
             if (c == '`') {
-                *r += HI_RESET;
-                t_ = NORMAL;
+                t_ = INCODE2_TICK2;
             } else {
                 t_ = INCODE2;
             }
             break;
 
+        case INCODE2_TICK2:
+            if (c == '`') {
+                *r += '`';
+            } else {
+                *r += HI_RESET;
+                t_ = NORMAL;
+                goto Normal;
+            }
+            break;
+
         case TICK_TICK:
             if (c == '`') {
                 t_ = LANG;
@@ -300,6 +319,7 @@ void HighlightMarkdown::flush(std::string *r) {
     case INCODE:
     case INCODE2:
    case INCODE2_TICK:
+    case INCODE2_TICK2:
     case STRONG:
     case STRONG_BACKSLASH:
     case STRONG_STAR:
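The new INCODE2_TICK2 state teaches the inline-code scanner that a double-backtick span may contain single backticks, while a backtick run at the start of a line (the bol_ check in TICK) still goes down the code-fence path. A minimal sketch of driving the streaming interface above; the driver code and input string are illustrative, not part of this commit:

    HighlightMarkdown hl;
    std::string out;
    hl.feed(&out, "see ``a `tick` inside`` here");  // mid-line `` opens an inline INCODE2 span
    hl.flush(&out);                                 // closes any highlight state still open at end of input
    // out now holds the input with the span wrapped in HI_INCODE ... HI_RESET

Because the scanner keeps its state in the object across calls, splitting the same text over many feed() calls highlights identically, which is what lets output be colorized as tokens stream in.
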
1 change: 1 addition & 0 deletions llamafile/llamafile.h

@@ -43,6 +43,7 @@ extern int FLAG_seed;
 extern int FLAG_slots;
 extern int FLAG_split_mode;
 extern int FLAG_threads;
+extern int FLAG_threads_batch;
 extern int FLAG_token_burst;
 extern int FLAG_token_cidr;
 extern int FLAG_ubatch;
2 changes: 2 additions & 0 deletions llamafile/server/client.cpp

@@ -636,6 +636,8 @@ Client::dispatcher()
         return embedding();
     if (p1 == "v1/chat/completions")
         return v1_chat_completions();
+    if (p1 == "slotz")
+        return slotz();
 
     // serve static endpoints
     int infd;
2 changes: 2 additions & 0 deletions llamafile/server/client.h

@@ -114,4 +114,6 @@ struct Client
 
     bool v1_chat_completions() __wur;
     bool get_v1_chat_completions_params(V1ChatCompletionParams*) __wur;
+
+    bool slotz() __wur;
 };
23 changes: 17 additions & 6 deletions llamafile/server/slot.cpp

@@ -72,19 +72,21 @@ Slot::start()
     cparams.n_batch = FLAG_batch;
     cparams.n_ubatch = FLAG_ubatch;
     cparams.n_seq_max = 1;
-    cparams.n_threads = FLAG_threads;
-    cparams.n_threads_batch = MIN(FLAG_threads, 20);
-    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE;
-    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+    cparams.n_threads = MIN(FLAG_threads, 20);
+    cparams.n_threads_batch = FLAG_threads;
+    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    cparams.pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    cparams.attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED;
     cparams.rope_freq_base = 0;
     cparams.yarn_ext_factor = -1;
     cparams.yarn_attn_factor = 1;
     cparams.yarn_beta_fast = 32;
     cparams.yarn_beta_slow = 1;
     cparams.yarn_orig_ctx = 0;
     cparams.defrag_thold = -1;
-    cparams.type_k = GGML_TYPE_F16;
-    cparams.type_v = GGML_TYPE_F16;
+    cparams.offload_kqv = true;
+    cparams.type_k = X86_HAVE(AVX512_BF16) ? GGML_TYPE_BF16 : GGML_TYPE_F16;
+    cparams.type_v = X86_HAVE(AVX512_BF16) ? GGML_TYPE_BF16 : GGML_TYPE_F16;
     cparams.flash_attn = FLAG_flash_attn;
     system_fingerprint_ = generate_system_fingerprint(&cparams);
     if (!(ctx_ = llama_new_context_with_model(model_, cparams)))
@@ -155,3 +157,12 @@ Slot::prefill(const std::vector<int>& tokens)
              reuse_count);
     return eval_tokens(new_tokens);
 }
+
+std::string
+Slot::dump()
+{
+    std::string r;
+    for (size_t i = 0; i < history_.size(); ++i)
+        r += llama_token_to_piece(ctx_, history_[i], RENDER_SPECIAL_TOKENS);
+    return r;
+}
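Slot::dump() re-renders the slot's cached history_ tokens back into text, special tokens included, which is what the new /slotz endpoint below serves.
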
2 changes: 2 additions & 0 deletions llamafile/server/slot.h

@@ -16,6 +16,7 @@
 // limitations under the License.
 
 #pragma once
+#include <string>
 #include <time.h>
 #include <utility>
 #include <vector>
@@ -38,4 +39,5 @@ struct Slot
     bool eval_token(int);
     bool eval_tokens(std::vector<int>);
     bool prefill(const std::vector<int>&);
+    std::string dump();
 };
1 change: 1 addition & 0 deletions llamafile/server/slots.cpp

@@ -48,6 +48,7 @@ Slots::start(int count)
         if (slot->start()) {
             ++made;
             slots_.emplace(slot);
+            all_slots_.push_back(slot);
         } else {
             delete slot;
         }
1 change: 1 addition & 0 deletions llamafile/server/slots.h

@@ -28,6 +28,7 @@ struct Slots
 {
     llama_model* model_;
     std::multiset<SlotEntry> slots_;
+    std::vector<Slot*> all_slots_;
     pthread_mutex_t lock_;
     pthread_cond_t cond_;
 
39 changes: 39 additions & 0 deletions llamafile/server/slotz.cpp

@@ -0,0 +1,39 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "client.h"
+#include "server.h"
+#include "slot.h"
+#include "slots.h"
+#include "utils.h"
+#include "worker.h"
+
+bool
+Client::slotz()
+{
+    std::string s = std::string(or_empty(param("add_special")));
+    int id = atoi(s.c_str());
+    if (id < 0)
+        return send_error(400);
+    if (id >= worker_->server_->slots_->all_slots_.size())
+        return send_error(404);
+    Slot* slot = worker_->server_->slots_->all_slots_[id];
+    std::string dump = slot->dump();
+    char* p = append_http_response_message(obuf_.p, 200);
+    p = stpcpy(p, "Content-Type: text/plain\r\n");
+    return send_response(obuf_.p, p, dump);
+}
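Per the handler above, the slot index travels in the add_special query parameter, so GET /slotz?add_special=1 returns slot 1's rendered history as text/plain. Since atoi() on an empty string yields 0, a bare GET /slotz dumps the first slot; a negative index draws a 400 and an out-of-range one a 404.
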
35 changes: 32 additions & 3 deletions llamafile/server/www/chatbot.css

@@ -11,6 +11,11 @@ body {
   background: #f5f5f5;
 }
 
+p {
+  margin: 1rem auto;
+  box-sizing: border-box;
+}
+
 .chat-container {
   max-width: 800px;
   margin: 2rem auto;
@@ -30,9 +35,12 @@
 }
 
 .chat-header h1 {
-  font-size: 1.25rem;
+  font-size: 2rem;
   color: #212529;
-  vertical-align: center;
+}
+
+.chat-header img {
+  vertical-align: middle;
 }
 
 .chat-messages {
@@ -105,8 +113,29 @@
   cursor: not-allowed;
 }
 
+.stop-button {
+  padding: 0.75rem 1.5rem;
+  background: #dc3545;
+  color: white;
+  border: none;
+  border-radius: 6px;
+  cursor: pointer;
+  font-size: 1rem;
+  transition: background-color 0.2s;
+}
+
+.stop-button:hover {
+  background: #bb2d3b;
+}
+
+.stop-button:disabled {
+  background: #6c757d;
+  cursor: not-allowed;
+}
+
 .message pre {
-  background: #f8f9fa;
+  margin: 1rem auto;
+  background: #fefefe;
   padding: 0.5rem;
   border-radius: 4px;
   overflow-x: auto;
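The .stop-button rules style the STOP control mentioned in the commit message: presumably a <button class="stop-button"> element that the chat GUI enables while a completion is streaming, with the :disabled variant graying it out otherwise. The JavaScript that drives it is among the files not shown on this page.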