Refactor lora adapter support #8332

Merged: 42 commits, Jul 15, 2024
Changes from 8 commits
Commits
42 commits
67c5e14
lora: load to devide buft
ngxson Jul 6, 2024
e9d7b6c
add patch tensor function
ngxson Jul 6, 2024
4e28ad4
correct tensor patch
ngxson Jul 6, 2024
1b4ffba
llama_lora_adapter_apply
ngxson Jul 6, 2024
b88ce0f
correct ggml_backend_tensor_copy
ngxson Jul 6, 2024
f6d090d
add llm_build_mm
ngxson Jul 7, 2024
a1666aa
Merge branch 'master' into xsn/fix_lora
ngxson Jul 7, 2024
30faf1f
fix auto merge
ngxson Jul 7, 2024
79e2982
update based on review comments
ngxson Jul 8, 2024
847135a
add convert script
ngxson Jul 8, 2024
712fecb
no more transpose A
ngxson Jul 8, 2024
84288ff
add f16 convert
ngxson Jul 8, 2024
41ced24
Merge branch 'master' into xsn/fix_lora
ngxson Jul 8, 2024
0e16188
add metadata check
ngxson Jul 8, 2024
6c617e2
add sanity check
ngxson Jul 8, 2024
7a83f20
fix ftype
ngxson Jul 8, 2024
d52455f
add requirements
ngxson Jul 8, 2024
802565c
fix requirements
ngxson Jul 8, 2024
95b3eb0
fix outfile
ngxson Jul 8, 2024
03d24ca
Merge pull request #8 from ngxson/xsn/fix_lora_convert
ngxson Jul 8, 2024
ee2b35c
conversion: only allow selected models
ngxson Jul 9, 2024
713665d
fix types
ngxson Jul 9, 2024
f15167a
cuda : do not use dmmv if the tensor does not have enough cols
slaren Jul 10, 2024
9841fbd
llama : lora fixes
slaren Jul 10, 2024
4fe0861
Merge pull request #9 from ggerganov/sl/fix_fix_lora
ngxson Jul 10, 2024
1faf7e5
do not disable mmap with lora
ngxson Jul 10, 2024
e68344c
Merge branch 'master' into xsn/fix_lora
ngxson Jul 10, 2024
916e959
llm_build_lora_mm_id
ngxson Jul 10, 2024
9d96328
convert_lora : MoE LoRA conversion support
compilade Jul 9, 2024
8956543
convert_hf : simplify modify_tensors for InternLM2
compilade Jul 15, 2024
87301bd
llama : use llm_build_lora_mm in most model graphs
compilade Jul 15, 2024
703573f
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
42415a4
auto scale
ngxson Jul 15, 2024
5b18118
Revert "auto scale"
ngxson Jul 15, 2024
f68d092
remove redundant params
ngxson Jul 15, 2024
b704448
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
9175f4b
Apply suggestions from code review
ngxson Jul 15, 2024
0ba23ba
change kv metadata
ngxson Jul 15, 2024
b1c4069
move add_type to __init__
ngxson Jul 15, 2024
4d9ac0f
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
d09382f
convert_hf : move add_type to main()
compilade Jul 15, 2024
383b6bc
Merge branch 'master' into xsn/fix_lora
ngxson Jul 15, 2024
11 changes: 3 additions & 8 deletions common/common.cpp
@@ -2080,19 +2080,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
+        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }
 
     if (params.ignore_eos) {
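For reference, a minimal standalone sketch of the new call sequence used above. The model and adapter paths and the 1.0 scale are placeholders, not values from this PR; unlike the old llama_model_apply_lora_from_file, nothing is merged into the model tensors, so the adapter is only enabled for the given context.

// sketch: load a model and attach a LoRA adapter to a context with the new API
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // load the adapter once; it stays associated with the model until the model is freed
    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n");
        llama_free(ctx);
        llama_free_model(model);
        return 1;
    }

    // enable the adapter for this context; the base weights are not modified
    llama_lora_adapter_set(ctx, adapter, 1.0f);

    // ... run inference with ctx ...

    llama_free(ctx);
    llama_free_model(model); // also frees the adapter
    llama_backend_free();
    return 0;
}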
4 changes: 2 additions & 2 deletions ggml/src/ggml.c
@@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
 
     fprintf(fp, "digraph G {\n");
     fprintf(fp, " newrank = true;\n");
-    fprintf(fp, " rankdir = LR;\n");
+    fprintf(fp, " rankdir = TB;\n");
 
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
@@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         }
 
         fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        if (ggml_nelements(node) < 5) {
+        if (ggml_nelements(node) < 5 && node->data != NULL) {
             fprintf(fp, " | (");
             for (int j = 0; j < ggml_nelements(node); j++) {
                 if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
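These two ggml.c tweaks only affect the DOT debug output: the graph is now laid out top-to-bottom, and constant nodes without a data buffer are no longer dereferenced when printing values. A small sketch (not part of the PR) of how such a dump is typically produced; the tensor shapes and the graph.dot filename are arbitrary:

// build a tiny graph and dump it; render with: dot -Tpng graph.dot -o graph.png
#include "ggml.h"

int main() {
    ggml_init_params params = {
        /* mem_size   */ 16*1024*1024,
        /* mem_buffer */ nullptr,
        /* no_alloc   */ false,
    };
    ggml_context * ctx = ggml_init(params);

    // C = A * B
    ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // writes the DOT file; with this change the layout is top-to-bottom
    ggml_graph_dump_dot(gf, nullptr, "graph.dot");

    ggml_free(ctx);
    return 0;
}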
37 changes: 25 additions & 12 deletions include/llama.h
@@ -408,6 +408,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -507,18 +510,28 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);
 
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads);
+    // Load a LoRA adapter from file
+    // The loaded adapter will be associated with the given model and will be freed when the model is deleted
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_model * model,
+            const char * path_lora);
+
+    // Add a loaded LoRA adapter to the given context
+    // This will not modify the model's weights
+    LLAMA_API int32_t llama_lora_adapter_set(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter,
+            float scale);
+
+    // Remove a LoRA adapter from the given context
+    // Returns -1 if the adapter is not present in the context
+    LLAMA_API int32_t llama_lora_adapter_remove(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter);
+
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be freed when the associated model is deleted
+    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
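A hedged sketch of how the remove/free half of this API can be combined with init/set. It assumes ctx and model already exist, and style_a.gguf / style_b.gguf are placeholder adapter files; since adapters are attached per context rather than merged into the weights, they can be swapped or rescaled without reloading the model.

// swap LoRA adapters on a live context using the new API
#include "llama.h"

static void swap_adapters(llama_context * ctx, llama_model * model) {
    llama_lora_adapter * a = llama_lora_adapter_init(model, "style_a.gguf");
    llama_lora_adapter * b = llama_lora_adapter_init(model, "style_b.gguf");
    if (a == nullptr || b == nullptr) {
        return;
    }

    // run with adapter A at full strength
    llama_lora_adapter_set(ctx, a, 1.0f);
    // ... decode some tokens ...

    // detach A and enable B at half strength; the base weights never change
    llama_lora_adapter_remove(ctx, a);
    llama_lora_adapter_set(ctx, b, 0.5f);
    // ... decode more tokens ...

    // adapters are freed with the model, but A can be released early
    llama_lora_adapter_free(a);
}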