server: Add "custom" chat template that uses input_prefix and input_suffix #10425

Draft: wants to merge 7 commits into base: master
common/arg.cpp: 4 changes (2 additions & 2 deletions)
@@ -852,15 +852,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
-).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
[](common_params & params, const std::string & value) {
params.input_suffix = value;
params.enable_chat_template = false;
}
-).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
add_opt(common_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
examples/server/server.cpp: 15 changes (10 additions & 5 deletions)
@@ -2829,7 +2829,7 @@ int main(int argc, char ** argv) {
return;
}

-json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template, params.input_prefix, params.input_suffix);

std::vector<server_task> tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION);
ctx_server.queue_results.add_waiting_tasks(tasks);
@@ -3220,15 +3220,20 @@ int main(int argc, char ** argv) {

// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
-if (!ctx_server.validate_model_chat_template()) {
+if (!params.input_prefix.empty() || !params.input_suffix.empty()) {
Collaborator:

I think we can refactor a bit here:

  • if both chat_template and suffix/prefix are set, throw an error saying "only template or suffix/prefix can be set, but not both"
  • if chat_template is set, use it
  • if suffix/prefix are set, use them and discard chat_template
  • if none is specified, use the model's default template

In any case, it would be nice to output a test formatted chat here (see the sketch below).
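
A rough, hypothetical sketch of that precedence as a self-contained helper (the function name and parameters are invented for illustration; this is not code from the PR):

    #include <stdexcept>
    #include <string>

    // Decide which chat formatting the server should use, following the bullets above.
    static std::string resolve_chat_template(const std::string & chat_template,
                                             const std::string & input_prefix,
                                             const std::string & input_suffix,
                                             bool model_template_supported) {
        const bool has_affix = !input_prefix.empty() || !input_suffix.empty();

        if (!chat_template.empty() && has_affix) {
            // reject the ambiguous combination up front
            throw std::invalid_argument("only a chat template or prefix/suffix can be set, but not both");
        }
        if (!chat_template.empty()) {
            return chat_template;                        // an explicit template wins
        }
        if (has_affix) {
            return "custom";                             // prefix/suffix formatting
        }
        return model_template_supported ? "" : "chatml"; // "" means: use the model's own template
    }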

Contributor (author):

  • if both chat_template and suffix/prefix are set, throw an error saying "only template or suffix/prefix can be set, but not both"

I think that prioritizing the chat template and ignoring prefix/suffix is enough - and we can replicate the same behaviour in the UI by hiding or dimming the prefix/suffix input fields when "custom" is not selected.

Again, I'm not sure about leaving this in a nameless state where only the presence of prefix/suffix indicates that the custom template is used. Clearly stating what it is would be better, I think.

Collaborator:

For the UI, I think we can go with something simple for now: just a checkbox that says "use custom chat template" and, when it is checked, we show the prefix/suffix inputs. No dropdown is needed.

Let's skip the ability to select named templates like "llama3", "mistral", etc. for now, because it is difficult to maintain two lists of templates, one in llama.cpp and another in index.html.

LOG_WRN("%s: Prefix and suffix are used instead of a chat template. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "custom";
} else if (!ctx_server.validate_model_chat_template()) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
+} else if (!params.input_prefix.empty() || !params.input_suffix.empty()) {
+LOG_WRN("%s: Prefix and suffix are not used because a chat template is defined.\n", __func__);
+} else {
+// print sample chat example to make it clear which template is used
+LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
}

-// print sample chat example to make it clear which template is used
-LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());

ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));

examples/server/utils.hpp: 19 changes (14 additions & 5 deletions)
@@ -300,8 +300,9 @@ static llama_tokens format_infill(
}

// Format given chat. If tmpl is empty, we take the template from model metadata
Collaborator:

It would be nice to leave a comment on how and when prefix/suffix is used, for example:
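
One possible wording (an illustration only, not text from the PR):

    // Format the given chat.
    // When `tmpl` is "custom" (the server sets this when only --in-prefix/--in-suffix are given),
    // each user message is emitted as `prefix + content + suffix` and all other messages are
    // appended verbatim; no chat template is applied.
    // Otherwise `tmpl` is applied, or the template from the model metadata if `tmpl` is empty.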

-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
+inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::string & prefix, const std::string & suffix, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
+std::string formatted_chat;

for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
@@ -325,10 +326,16 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}

-chat.push_back({role, content});
+if (tmpl == "custom") {
Collaborator:

Maybe we can do this too?

Suggested change:
-if (tmpl == "custom") {
+bool is_custom = !prefix.empty() || !suffix.empty();
+if (is_custom) {

Contributor (author):

I'm not sure about this - using "custom" as a defined template name also helps with selection in the UI. Is there a reason against it that I'm missing?

Collaborator:

The UI logic and the underlying logic should be decoupled. If you have "custom" here, it becomes redundant because you already have bool is_custom = !prefix.empty() || !suffix.empty(), so two variables end up controlling the same state.

+// simple format using prefix and suffix
+if (role == "user") formatted_chat += prefix + content + suffix;
+else formatted_chat += content;
+} else {
+chat.push_back({role, content});
+}
}

-const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+if (tmpl != "custom") formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
Collaborator:

Then:

Suggested change:
-if (tmpl != "custom") formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+if (!is_custom) formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
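
Taken together with the earlier suggestion, the loop could end up roughly like this self-contained sketch (json handling and common_chat_apply_template are replaced with stand-ins; the names are illustrative, not code from the PR):

    #include <string>
    #include <utility>
    #include <vector>

    static std::string format_chat_sketch(const std::string & prefix,
                                          const std::string & suffix,
                                          const std::vector<std::pair<std::string, std::string>> & messages) {
        const bool is_custom = !prefix.empty() || !suffix.empty();
        std::string formatted_chat;
        std::vector<std::pair<std::string, std::string>> chat; // stands in for std::vector<common_chat_msg>

        for (const auto & [role, content] : messages) {
            if (is_custom) {
                // wrap user turns with prefix/suffix, pass other roles through verbatim
                formatted_chat += (role == "user") ? prefix + content + suffix : content;
            } else {
                chat.push_back({role, content});
            }
        }
        if (!is_custom) {
            // in the real code: formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
            formatted_chat = "<apply the model/tmpl chat template to `chat` here>";
        }
        return formatted_chat;
    }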

LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

return formatted_chat;
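
As an illustration of the custom path (the prefix/suffix values are examples, not defaults): with --in-prefix "<|user|>\n" and --in-suffix "<|assistant|>\n", the messages user: "Hi", assistant: "Hello", user: "How are you?" would be formatted as "<|user|>\nHi<|assistant|>\nHello<|user|>\nHow are you?<|assistant|>\n", i.e. only user turns are wrapped and the trailing suffix leaves the prompt ready for the assistant's reply.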
@@ -597,13 +604,15 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
static json oaicompat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
-const std::string & chat_template) {
+const std::string & chat_template,
+const std::string & input_prefix,
+const std::string & input_suffix) {
json llama_params;

llama_params["__oaicompat"] = true;

// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
llama_params["prompt"] = format_chat(model, chat_template, input_prefix, input_suffix, body.at("messages"));

// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {