Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/nix/package.nix
#	CMakePresets.json
#	README.md
#	flake.lock
#	ggml/src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
LostRuins committed Jul 2, 2024
2 parents 82202ae + 023b880 commit 0fc18d2
Showing 25 changed files with 1,455 additions and 1,370 deletions.
16 changes: 13 additions & 3 deletions common/common.cpp
@@ -1015,16 +1015,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
CHECK_ARG
params.input_prefix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
CHECK_ARG
params.input_suffix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--spm-infill") {
@@ -1407,7 +1410,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
@@ -2669,12 +2672,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return formatted;
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}

std::string llama_chat_format_example(const struct llama_model * model,
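For context on the llama_chat_format_single change above: the function renders the conversation twice, once without and once with the newest message, and returns only the trailing part the new message contributes; the switch to an ostringstream additionally lets it re-emit a trailing newline from the past rendering that the plain substring would otherwise drop. Below is a minimal standalone sketch of that suffix-diff idea, where apply_template() is a hypothetical stand-in for llama_chat_apply_template rather than the real API.

#include <iostream>
#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

// hypothetical renderer standing in for llama_chat_apply_template:
// it emits "<role>: <content>\n" for every message in order
static std::string apply_template(const std::vector<chat_msg> & chat) {
    std::string out;
    for (const auto & m : chat) {
        out += m.role + ": " + m.content + "\n";
    }
    return out;
}

// return only the text that new_msg contributes to the formatted conversation
static std::string format_single(std::vector<chat_msg> chat, const chat_msg & new_msg) {
    const std::string fmt_past = apply_template(chat);
    chat.push_back(new_msg);
    const std::string fmt_all = apply_template(chat);
    // the "diff part": whatever the full rendering appends beyond the past rendering
    return fmt_all.substr(fmt_past.size());
}

int main() {
    std::vector<chat_msg> past = { {"system", "You are helpful."}, {"user", "Hi"} };
    std::cout << format_single(past, {"assistant", "Hello!"});   // prints "assistant: Hello!\n"
}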
1 change: 1 addition & 0 deletions common/common.h
@@ -217,6 +217,7 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
bool enable_chat_template = true;

std::vector<std::string> api_keys;

51 changes: 38 additions & 13 deletions convert-hf-to-gguf.py
@@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)

def _set_vocab_sentencepiece(self):
def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece()

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)

def _create_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor

tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
return tokens, scores, toktypes

def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2

def set_vocab(self):
self._set_vocab_llama_hf()
tokens, scores, toktypes = self._create_vocab_sentencepiece()
# hack: This is required so that we can properly use start/end-of-turn for chat template
for i in range(108):
# including <unusedX>, <start_of_turn>, <end_of_turn>
toktypes[i] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
@@ -2369,6 +2386,12 @@ def set_gguf_parameters(self):
self.gguf_writer.add_final_logit_softcapping(
self.hparams["final_logit_softcapping"]
)
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

# sanity check
attn_scalar = self.hparams["query_pre_attn_scalar"]
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
@@ -3097,7 +3120,8 @@ def main() -> None:
"auto": gguf.LlamaFileType.GUESSED,
}

if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
if args.use_temp_file and is_split:
logger.error("Error: Cannot use temp file when splitting")
sys.exit(1)

@@ -3134,11 +3158,12 @@ def main() -> None:
if args.vocab_only:
logger.info("Exporting model vocab...")
model_instance.write_vocab()
logger.info("Model vocab successfully exported.")
logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
else:
logger.info("Exporting model...")
model_instance.write()
logger.info("Model successfully exported.")
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}")


if __name__ == '__main__':
11 changes: 7 additions & 4 deletions examples/main/main.cpp
@@ -262,7 +262,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;

{
auto prompt = params.conversation
auto prompt = (params.conversation && params.enable_chat_template)
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -811,7 +811,9 @@ int main(int argc, char ** argv) {
is_antiprompt = true;
}

chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
if (params.enable_chat_template) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
printf("\n");
}
@@ -873,12 +875,13 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer);
}

std::string user_inp = params.conversation
bool format_chat = params.conversation && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
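Read together with the common.cpp hunks earlier, the change is a single gate: --in-prefix, --in-suffix, and --in-prefix-bos now set enable_chat_template to false, and the main loop only routes the system prompt and user turns through the chat template when both conversation mode and that flag are enabled. A simplified sketch of the gating, using a pared-down stand-in for gpt_params rather than the real struct:

#include <string>

struct params_t {                        // pared-down stand-in for gpt_params
    bool conversation         = false;
    bool enable_chat_template = true;
    std::string input_prefix;
    std::string input_suffix;
};

// a manual prefix or suffix implies the user is doing their own formatting,
// so the automatic chat template is switched off
void set_in_prefix(params_t & p, const std::string & v) {
    p.input_prefix         = v;
    p.enable_chat_template = false;
}

void set_in_suffix(params_t & p, const std::string & v) {
    p.input_suffix         = v;
    p.enable_chat_template = false;
}

// the main loop formats a turn through the template only when both hold
bool should_format_chat(const params_t & p) {
    return p.conversation && p.enable_chat_template;
}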
14 changes: 7 additions & 7 deletions ggml/src/ggml-common.h
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
#define QR6_K 2

#define QI2_XXS (QK_K / (4*QR2_XXS))
#define QR2_XXS 8
#define QR2_XXS 4

#define QI2_XS (QK_K / (4*QR2_XS))
#define QR2_XS 8
#define QR2_XS 4

#define QI2_S (QK_K / (4*QR2_S))
#define QR2_S 8
#define QR2_S 4

#define QI3_XXS (QK_K / (4*QR3_XXS))
#define QR3_XXS 8
#define QR3_XXS 4

#define QI3_XS (QK_K / (4*QR3_XS))
#define QR3_XS 8
#define QR3_XS 4

#define QI1_S (QK_K / (4*QR1_S))
#define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
#define QR4_NL 2

#define QI4_XS (QK_K / (4*QR4_XS))
#define QR4_XS 8
#define QR4_XS 2

#define QI3_S (QK_K / (4*QR3_S))
#define QR3_S 8
#define QR3_S 4

#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

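The ggml-common.h hunks lower the QR_* ratios for several i-quant types (for example QR2_XXS goes from 8 to 4 and QR4_XS from 8 to 2), which in turn raises the derived QI_* values, since each QI is defined as QK_K / (4*QR). A small worked example of that arithmetic, assuming the default super-block size QK_K = 256:

#include <cstdio>

#define QK_K 256   // default super-block size assumed here

int main() {
    const int qr2_xxs_old = 8;   // value before this commit
    const int qr2_xxs_new = 4;   // value after this commit

    // QI2_XXS is defined as QK_K / (4*QR2_XXS), so halving QR doubles QI
    // (roughly: more 32-bit integers of quantized data handled per super-block):
    std::printf("old QI2_XXS = %d\n", QK_K / (4 * qr2_xxs_old));   // prints 8
    std::printf("new QI2_XXS = %d\n", QK_K / (4 * qr2_xxs_new));   // prints 16
    return 0;
}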
59 changes: 35 additions & 24 deletions ggml/src/ggml-cuda.cu
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

bool any_gpus_with_slow_fp16 = false;

if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
}

const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}

// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
#endif // GGML_CUDA_FORCE_DMMV

// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -2717,27 +2715,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
struct ggml_tensor * a;
struct ggml_tensor * b;
struct ggml_tensor * a = op->src[0];
if (op->op == GGML_OP_MUL_MAT) {
a = op->src[0];
b = op->src[1];
} else {
a = op->src[2];
b = op->src[1];
}
if (a->ne[3] != b->ne[3]) {
return false;
}
ggml_type a_type = a->type;
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
struct ggml_tensor * b = op->src[1];
if (a->ne[3] != b->ne[3]) {
return false;
}
}
return true;
switch (a->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
return true;
default:
return false;
}
} break;
case GGML_OP_GET_ROWS:
{
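Two separate changes land in ggml-cuda.cu above: the check that prefers mmvq over dmmv (unless GGML_CUDA_FORCE_DMMV is defined) is now evaluated before the per-device capability loop, and ggml_backend_cuda_supports_op answers MUL_MAT/MUL_MAT_ID support with an explicit whitelist of source types instead of special-casing only the IQ types. A minimal sketch of both ideas, with the build define modeled as a plain boolean and a toy enum standing in for ggml_type:

enum class qtype { f32, f16, q4_0, q8_0, iq2_xxs, iq4_nl, bf16 /* ... */ };

// mmvq is the better choice whenever it is usable; dmmv is only kept when
// mmvq is not, or when it was forced at build time (force_dmmv stands in
// for the GGML_CUDA_FORCE_DMMV define)
bool use_dmmv(bool dmmv_possible, bool mmvq_possible, bool force_dmmv) {
    if (force_dmmv) {
        return dmmv_possible;
    }
    return dmmv_possible && !mmvq_possible;
}

// explicit whitelist: any type not listed here (including ones added to the
// library later) is reported as unsupported by default
bool mul_mat_src_type_supported(qtype t) {
    switch (t) {
        case qtype::f32:
        case qtype::f16:
        case qtype::q4_0:
        case qtype::q8_0:
        case qtype::iq2_xxs:
        case qtype::iq4_nl:
            return true;
        default:   // e.g. bf16
            return false;
    }
}

The switch-with-default-false shape means any tensor type added later has to be opted in explicitly before the backend claims support for it.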
(Diffs for the remaining changed files are not shown here.)
