From 5aaf24766aa7d85077816b7857f38d0d1d6b982b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 11:01:53 +0300
Subject: [PATCH 01/42] llama : add infill sampler

---
 common/common.h            | 2 ++
 examples/server/server.cpp | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/common/common.h b/common/common.h
index 5ca8fd391ab74..2fb92ae143c54 100644
--- a/common/common.h
+++ b/common/common.h
@@ -117,6 +117,8 @@ struct common_sampler_params {
     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float dynatemp_range = 0.00f; // 0.0 = disabled
     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    float infill_p = 0.80f;
+    float infill_p_eog = 0.01f;
     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float penalty_repeat = 1.00f; // 1.0 = disabled
     float penalty_freq = 0.00f; // 0.0 = disabled
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3992108e7f383..e9621ba93c956 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -873,6 +873,8 @@ struct server_context {
     slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
     slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
     slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
+    slot.sparams.infill_p = json_value(data, "infill_p", default_sparams.infill_p);
+    slot.sparams.infill_p_eog = json_value(data, "infill_p_eog", default_sparams.infill_p_eog);
     slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
     slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
     slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
@@ -1241,6 +1243,8 @@ struct server_context {
     {"xtc_threshold", slot.sparams.xtc_threshold},
     {"tfs_z", slot.sparams.tfs_z},
     {"typical_p", slot.sparams.typ_p},
+    {"infill_p", slot.sparams.infill_p},
+    {"infill_p_eog", slot.sparams.infill_p_eog},
     {"repeat_last_n", slot.sparams.penalty_last_n},
     {"repeat_penalty", slot.sparams.penalty_repeat},
     {"presence_penalty", slot.sparams.penalty_present},

From 0566c695316a077b2c46b79cc32c19802da6ae03 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 11:01:30 +0300
Subject: [PATCH 02/42] llama.vim : neovim plugin

---
 examples/llama.vim | 199 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 examples/llama.vim

diff --git a/examples/llama.vim b/examples/llama.vim
new file mode 100644
index 0000000000000..30a717181062b
--- /dev/null
+++ b/examples/llama.vim
@@ -0,0 +1,199 @@
+" sample config:
+"
+" - Ctrl+F - trigger FIM completion
+"
+" copy paste this in your .vimrc:
+"
+"augroup llama_cpp
+" autocmd!
+" autocmd InsertEnter * inoremap <C-F> <Esc>:call llama#fim()<CR>a
+"augroup END
+"
+
+" color of the suggested text
+highlight llama_hint guifg=#ff772f
+
+let s:default_config = {
+    \ 'endpoint': 'http://127.0.0.1:8012/infill',
+    \ 'n_prefix': 32,
+    \ 'n_suffix': 32,
+    \ 'n_predict': 64,
+    \ 'n_probs': 3,
+    \ 'temperature': 0.1,
+    \ 'stop': ["\n"]
+    \ }
+
+let g:llama_config = get(g:, 'llama_config', s:default_config)
+
+function! llama#fim() abort
+    let l:pos_x = col('.')
+    let l:pos_y = line('.')
+    let l:max_y = line('$')
+
+    let l:lines_prefix = getline(max([1, l:pos_y - g:llama_config.n_prefix]), l:pos_y - 1)
+    let l:lines_suffix = getline(l:pos_y + 1, min([l:max_y, l:pos_y + g:llama_config.n_suffix]))
+
+    let l:line_cur = getline('.')
+    let l:line_cur_prefix = strpart(l:line_cur, 0, l:pos_x)
+    let l:line_cur_suffix = strpart(l:line_cur, l:pos_x)
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+        \ . l:line_cur_prefix
+
+    let l:suffix = ""
+        \ . l:line_cur_suffix
+        \ . join(l:lines_suffix, "\n")
+        \ . "\n"
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': l:prefix,
+        \ 'input_suffix': l:suffix,
+        "\ 'stop': g:llama_config.stop,
+        \ 'n_predict': g:llama_config.n_predict,
+        "\ 'n_probs': g:llama_config.n_probs,
+        \ 'penalty_last_n': 0,
+        \ 'temperature': g:llama_config.temperature,
+        \ 'top_k': 5,
+        \ 'infill_p': 0.20,
+        \ 'infill_p_eog': 0.001,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k", "infill"]
+        \ })
+
+    " request completion from the server
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    let l:can_accept = v:true
+    let s:content = []
+
+    let l:raw = system(l:curl_command)
+    if l:can_accept && v:shell_error
+        call add(s:content, "<| curl error: is the server on? |>")
+        let l:can_accept = v:false
+    endif
+
+    if l:can_accept && l:raw == ""
+        call add(s:content, "<| empty response: is the server on? |>")
+        let l:can_accept = v:false
+    endif
+
+    " get the generated suggestion
+    if l:can_accept
+        let l:response = json_decode(l:raw)
+
+        for l:part in split(get(l:response, 'content', ''), "\n", 1)
+            call add(s:content, l:part)
+        endfor
+
+        " remove trailing new lines
+        while len(s:content) > 0 && s:content[-1] == ""
+            call remove(s:content, -1)
+        endwhile
+    endif
+
+    if len(s:content) == 0
+        call add(s:content, "<| nothing to suggest |>")
+        let l:can_accept = v:false
+    endif
+
+    let s:pos_dx = len(s:content[-1])
+    let s:content[-1] .= l:line_cur_suffix
+
+    " display virtual text with the suggestion
+    let l:bufnr = bufnr('%')
+    let s:ns_id = nvim_create_namespace('llama_virtual_text')
+
+    call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, {
+        \ 'virt_text': [[s:content[0], 'llama_hint']],
+        \ 'virt_text_win_col': virtcol('.')
+        \ })
+
+    call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, {
+        \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hint']]}),
+        \ 'virt_text_win_col': virtcol('.')
+        \ })
+
+    " accept suggestion with Tab and reject it with any other key
+    if l:can_accept
+        inoremap :call llama#accept_virtual_text()
+    else
+        inoremap :call llama#cancel_virtual_text()
+    endif
+
+    for l:key in range(33, 127) + [8, 27]
+        if l:key != 0x7C
+            if l:key == 8
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 27
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 127
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            else
+                execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key)
+            endif
+        endif
+    endfor
+
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+endfunction
+
+function! llama#accept_virtual_text()
+    let l:pos_x = col('.')
+    let l:pos_y = line('.')
+
+    let l:line_cur = getline('.')
+
+    let l:pos0 = l:pos_x - 2
+
+    if l:pos_x == len(l:line_cur)
+        let l:pos0 = l:pos_x - 1
+    endif
+
+    " insert the suggestion at the cursor location
+    call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0])
+    if len(s:content) > 1
+        call append(l:pos_y, s:content[1:-1])
+    endif
+
+    " move the cursor to the end of the accepted text
+    call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx)
+
+    call llama#cancel_virtual_text()
+endfunction
+
+function! llama#cancel_virtual_text()
+    " clear the virtual text
+    let l:bufnr = bufnr('%')
+    call nvim_buf_clear_namespace(l:bufnr, s:ns_id, 0, -1)
+
+    " remove the mappings
+    iunmap
+
+    for l:key in range(33, 127) + [8, 27]
+        if l:key != 0x7C
+            if l:key == 8
+                execute 'iunmap '
+            elseif l:key == 27
+                execute 'iunmap '
+            elseif l:key == 127
+                execute 'iunmap '
+            else
+                execute 'iunmap ' . nr2char(l:key)
+            endif
+        endif
+    endfor
+
+    iunmap
+    iunmap
+    iunmap
+    iunmap
+endfunction

From 0c649c8967f60ff7f8247f0ca4aca86e55b891e4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 12:36:56 +0300
Subject: [PATCH 03/42] llama.vim : fix suffix construction + fix virt text offset

---
 examples/llama.vim | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 30a717181062b..10f81f73331fb 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -44,6 +44,7 @@ function! llama#fim() abort
     let l:suffix = ""
         \ . l:line_cur_suffix
+        \ . "\n"
         \ . join(l:lines_suffix, "\n")
         \ . "\n"

     let l:request = json_encode({
@@ -111,7 +112,7 @@ function! llama#fim() abort
     call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hint']],
-        \ 'virt_text_win_col': virtcol('.')
+        \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.')
         \ })

     call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, {

From 07e7dd47f21bf3e0af19cae282b5748ad430313c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 12:57:44 +0300
Subject: [PATCH 04/42] llama.vim : handle space

---
 examples/llama.vim | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 10f81f73331fb..24289fbe05bf1 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -127,14 +127,16 @@ function! llama#fim() abort
         inoremap :call llama#cancel_virtual_text()
     endif

-    for l:key in range(33, 127) + [8, 27]
+    for l:key in range(32, 127) + [8, 27]
         if l:key != 0x7C
             if l:key == 8
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
             elseif l:key == 27
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 32
+                execute 'inoremap :call llama#cancel_virtual_text()'
             elseif l:key == 127
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
             else
                 execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key)
             endif
         endif
     endfor
@@ -153,11 +155,7 @@ function! llama#accept_virtual_text()
     let l:line_cur = getline('.')

-    let l:pos0 = l:pos_x - 2
-
-    if l:pos_x == len(l:line_cur)
-        let l:pos0 = l:pos_x - 1
-    endif
+    let l:pos0 = l:pos_x == len(l:line_cur) ? l:pos_x - 1 : l:pos_x - 2

     " insert the suggestion at the cursor location
     call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0])
@@ -179,12 +177,14 @@ function!
llama#cancel_virtual_text() " remove the mappings iunmap - for l:key in range(33, 127) + [8, 27] + for l:key in range(32, 127) + [8, 27] if l:key != 0x7C if l:key == 8 execute 'iunmap ' elseif l:key == 27 execute 'iunmap ' + elseif l:key == 32 + execute 'iunmap ' elseif l:key == 127 execute 'iunmap ' else From 9d13e87b1b1dd57b815f96a4099849c7d41af2be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:08:31 +0300 Subject: [PATCH 05/42] llama.vim : add processing info overlay --- examples/llama.vim | 87 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 24289fbe05bf1..febef637ce9e8 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -11,12 +11,13 @@ " " color of the suggested text -highlight llama_hint guifg=#ff772f +highlight llama_hl_hint guifg=#ff772f +highlight llama_hl_info guifg=#77ff2f let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 32, - \ 'n_suffix': 32, + \ 'n_prefix': 128, + \ 'n_suffix': 128, \ 'n_predict': 64, \ 'n_probs': 3, \ 'temperature': 0.1, @@ -71,6 +72,16 @@ function! llama#fim() abort \ ) let l:can_accept = v:true + let l:has_timing = v:false + + let l:n_prompt = 0 + let l:t_prompt_ms = 1.0 + let l:s_prompt = 0 + + let l:n_gen = 0 + let l:t_gen_ms = 1.0 + let l:s_gen = 0 + let s:content = [] let l:raw = system(l:curl_command) @@ -96,6 +107,20 @@ function! llama#fim() abort while len(s:content) > 0 && s:content[-1] == "" call remove(s:content, -1) endwhile + + " if response.timings + if len(get(l:response, 'timings', {})) > 0 + let l:has_timing = v:true + let l:timings = get(l:response, 'timings', {}) + + let l:n_prompt = get(l:timings, 'prompt_n', 0) + let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) + let l:s_prompt = get(l:timings, 'prompt_per_second', 0) + + let l:n_gen = get(l:timings, 'predicted_n', 0) + let l:t_gen_ms = get(l:timings, 'predicted_ms', 1) + let l:s_gen = get(l:timings, 'predicted_per_second', 0) + endif endif if len(s:content) == 0 @@ -108,48 +133,62 @@ function! llama#fim() abort " display virtual text with the suggestion let l:bufnr = bufnr('%') - let s:ns_id = nvim_create_namespace('llama_virtual_text') - call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, { - \ 'virt_text': [[s:content[0], 'llama_hint']], + let s:id_vt_fim = nvim_create_namespace('vt_fim') + let s:id_vt_info = nvim_create_namespace('vt_info') + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') \ }) - call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, { - \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hint']]}), + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) + " construct the info message: + if l:has_timing + let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s)", + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_gen, l:t_gen_ms, l:s_gen) + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[l:info, 'llama_hl_info']], + \ 'virt_text_pos': 'right_align', + \ }) + endif + " accept suggestion with Tab and reject it with any other key if l:can_accept - inoremap :call llama#accept_virtual_text() + inoremap :call llama#accept_vt_fim() else - inoremap :call llama#cancel_virtual_text() + inoremap :call llama#cancel_vt_fim() endif for l:key in range(32, 127) + [8, 27] if l:key != 0x7C if l:key == 8 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 27 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 32 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 127 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' else - execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key) + execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_vt_fim()' . nr2char(l:key) endif endif endfor - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() endfunction -function! llama#accept_virtual_text() +function! llama#accept_vt_fim() let l:pos_x = col('.') let l:pos_y = line('.') @@ -166,13 +205,15 @@ function! llama#accept_virtual_text() " move the cursor to the end of the accepted text call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx) - call llama#cancel_virtual_text() + call llama#cancel_vt_fim() endfunction -function! llama#cancel_virtual_text() +function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') - call nvim_buf_clear_namespace(l:bufnr, s:ns_id, 0, -1) + + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) " remove the mappings iunmap From 6e82a03b9dada237be3a3e358176b3e8f68e5330 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:26:19 +0300 Subject: [PATCH 06/42] llama.vim : display realtime [no ci] --- examples/llama.vim | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index febef637ce9e8..54bb87cece245 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -27,6 +27,8 @@ let s:default_config = { let g:llama_config = get(g:, 'llama_config', s:default_config) function! llama#fim() abort + let l:t_start = reltime() + let l:pos_x = col('.') let l:pos_y = line('.') let l:max_y = line('$') @@ -149,9 +151,11 @@ function! 
llama#fim() abort " construct the info message: if l:has_timing - let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s)", + let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_gen, l:t_gen_ms, l:s_gen) + \ l:n_gen, l:t_gen_ms, l:s_gen, + \ 1000.0 * reltimefloat(reltime(l:t_start)) + \ ) call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], From 26a0c61e8af157aaa1321c34705bb9fcbb4ece0d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:44:14 +0300 Subject: [PATCH 07/42] llama.vim : allow repeated suggestions [no ci] --- examples/llama.vim | 89 +++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 54bb87cece245..1544887c2c664 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -74,7 +74,7 @@ function! llama#fim() abort \ ) let l:can_accept = v:true - let l:has_timing = v:false + let l:has_info = v:false let l:n_prompt = 0 let l:t_prompt_ms = 1.0 @@ -112,8 +112,8 @@ function! llama#fim() abort " if response.timings if len(get(l:response, 'timings', {})) > 0 - let l:has_timing = v:true - let l:timings = get(l:response, 'timings', {}) + let l:has_info = v:true + let l:timings = get(l:response, 'timings', {}) let l:n_prompt = get(l:timings, 'prompt_n', 0) let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) @@ -133,25 +133,21 @@ function! llama#fim() abort let s:pos_dx = len(s:content[-1]) let s:content[-1] .= l:line_cur_suffix + call llama#cancel_vt_fim() + " display virtual text with the suggestion let l:bufnr = bufnr('%') let s:id_vt_fim = nvim_create_namespace('vt_fim') let s:id_vt_info = nvim_create_namespace('vt_info') - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { - \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.') - \ }) - - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { - \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), - \ 'virt_text_win_col': virtcol('.') - \ }) - " construct the info message: - if l:has_timing - let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + if l:has_info + " prefix the info string with whitespace in order to offset it to the right of the fim overlay + let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) + + let l:info = printf("%s // prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, \ 1000.0 * reltimefloat(reltime(l:t_start)) @@ -159,11 +155,23 @@ function! llama#fim() abort call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], - \ 'virt_text_pos': 'right_align', + \ 'virt_text_pos': 'eol', \ }) endif + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint']], + \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') + \ }) + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), + \ 'virt_text_win_col': virtcol('.') + \ }) + " accept suggestion with Tab and reject it with any other key + let s:mapping_on = v:true + if l:can_accept inoremap :call llama#accept_vt_fim() else @@ -216,30 +224,37 @@ function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') + let s:id_vt_fim = nvim_create_namespace('vt_fim') + let s:id_vt_info = nvim_create_namespace('vt_info') + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) - " remove the mappings - iunmap - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'iunmap ' - elseif l:key == 27 - execute 'iunmap ' - elseif l:key == 32 - execute 'iunmap ' - elseif l:key == 127 - execute 'iunmap ' - else - execute 'iunmap ' . nr2char(l:key) + " remove the key mappings + if exists('s:mapping_on') && s:mapping_on + iunmap + + for l:key in range(32, 127) + [8, 27] + if l:key != 0x7C + if l:key == 8 + execute 'iunmap ' + elseif l:key == 27 + execute 'iunmap ' + elseif l:key == 32 + execute 'iunmap ' + elseif l:key == 127 + execute 'iunmap ' + else + execute 'iunmap ' . nr2char(l:key) + endif endif - endif - endfor + endfor - iunmap - iunmap - iunmap - iunmap + iunmap + iunmap + iunmap + iunmap + + let s:mapping_on = v:false + endif endfunction From 7e0b5062af42e96ba1709f7283d4d0cfd1eb6b55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 16:07:24 +0300 Subject: [PATCH 08/42] llama.vim : reduce scope of ids to local [no ci] --- examples/llama.vim | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 1544887c2c664..b8cfa5906bab3 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -138,33 +138,33 @@ function! llama#fim() abort " display virtual text with the suggestion let l:bufnr = bufnr('%') - let s:id_vt_fim = nvim_create_namespace('vt_fim') - let s:id_vt_info = nvim_create_namespace('vt_info') + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') " construct the info message: if l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) - let l:info = printf("%s // prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, \ 1000.0 * reltimefloat(reltime(l:t_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', \ }) endif - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') \ }) - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, { \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) @@ -224,11 +224,11 @@ function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') - let s:id_vt_fim = nvim_create_namespace('vt_fim') - let s:id_vt_info = nvim_create_namespace('vt_info') + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') - call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) - call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) " remove the key mappings if exists('s:mapping_on') && s:mapping_on From 41053f92d305468844cb7ad539d0ff752c1e9d6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 08:38:57 +0300 Subject: [PATCH 09/42] llama.vim : simplify init and cancel + auto-fim --- examples/llama.vim | 210 +++++++++++++++++++++++---------------------- 1 file changed, 109 insertions(+), 101 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index b8cfa5906bab3..de889678dd04a 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -2,12 +2,9 @@ " " - Ctrl+F - trigger FIM completion " -" copy paste this in your .vimrc: +" run this once to initialise the plugin: " -"augroup llama_cpp -" autocmd! -" autocmd InsertEnter * inoremap :call llama#fim()a -"augroup END +" :call llama#init() " " color of the suggested text @@ -21,24 +18,76 @@ let s:default_config = { \ 'n_predict': 64, \ 'n_probs': 3, \ 'temperature': 0.1, + \ 'auto_fim': v:true, \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) -function! llama#fim() abort +function! llama#init() + let s:pos_x = 0 + let s:pos_y = 0 + let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case + + let s:line_cur = '' + + let s:pos_dx = 0 + let s:content = [] + let s:can_accept = v:false + + let s:timer_fim = -1 + let s:t_fim_last = reltime() + + augroup llama + autocmd! + autocmd InsertEnter * inoremap :call llama#fim(v:false) + augroup END + + silent! call llama#fim_cancel() +endfunction + +" setup accept/cancel events +function! llama#on_hint(id_timer) + inoremap :call llama#fim_accept() + inoremap :call llama#fim_cancel() + + augroup llama_insert + autocmd! + autocmd CursorMovedI * call llama#fim_cancel() + augroup END +endfunction + +function! llama#fim_auto() + if reltimefloat(reltime(s:t_fim_last)) < 0.50 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + endif + + let s:t_fim_last = reltime() + let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) +endfunction + +function! llama#fim(is_auto) abort let l:t_start = reltime() - let l:pos_x = col('.') - let l:pos_y = line('.') + let s:content = [] + let s:can_accept = v:false + + let s:pos_x = col('.') + let s:pos_y = line('.') let l:max_y = line('$') - let l:lines_prefix = getline(max([1, l:pos_y - g:llama_config.n_prefix]), l:pos_y - 1) - let l:lines_suffix = getline(l:pos_y + 1, min([l:max_y, l:pos_y + g:llama_config.n_suffix])) + let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) + let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) + + let s:line_cur = getline('.') + + let s:pos_x0 = s:pos_x == len(s:line_cur) ? 
s:pos_x : s:pos_x - 1 - let l:line_cur = getline('.') - let l:line_cur_prefix = strpart(l:line_cur, 0, l:pos_x) - let l:line_cur_suffix = strpart(l:line_cur, l:pos_x) + let l:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) + let l:line_cur_suffix = strpart(s:line_cur, s:pos_x0) let l:prefix = "" \ . join(l:lines_prefix, "\n") @@ -73,7 +122,7 @@ function! llama#fim() abort \ g:llama_config.endpoint, shellescape(l:request) \ ) - let l:can_accept = v:true + let s:can_accept = v:true let l:has_info = v:false let l:n_prompt = 0 @@ -84,21 +133,24 @@ function! llama#fim() abort let l:t_gen_ms = 1.0 let l:s_gen = 0 - let s:content = [] - + " TODO: async this let l:raw = system(l:curl_command) - if l:can_accept && v:shell_error - call add(s:content, "<| curl error: is the server on? |>") - let l:can_accept = v:false + if s:can_accept && v:shell_error + if !a:is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false endif - if l:can_accept && l:raw == "" - call add(s:content, "<| empty response: is the server on? |>") - let l:can_accept = v:false + if s:can_accept && l:raw == "" + if !a:is_auto + call add(s:content, "<| empty response: is the server on? |>") + endif + let s:can_accept = v:false endif " get the generated suggestion - if l:can_accept + if s:can_accept let l:response = json_decode(l:raw) for l:part in split(get(l:response, 'content', ''), "\n", 1) @@ -126,14 +178,20 @@ function! llama#fim() abort endif if len(s:content) == 0 - call add(s:content, "<| nothing to suggest |>") - let l:can_accept = v:false + if !a:is_auto + call add(s:content, "<| nothing to suggest |>") + endif + let s:can_accept = v:false + endif + + if len(s:content) == 0 + return endif let s:pos_dx = len(s:content[-1]) let s:content[-1] .= l:line_cur_suffix - call llama#cancel_vt_fim() + call llama#fim_cancel() " display virtual text with the suggestion let l:bufnr = bufnr('%') @@ -153,74 +211,42 @@ function! llama#fim() abort \ 1000.0 * reltimefloat(reltime(l:t_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', \ }) endif - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.') + \ 'virt_text_win_col': s:pos_x == len(s:line_cur) ? virtcol('.') : virtcol('.') - 1 \ }) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) - " accept suggestion with Tab and reject it with any other key - let s:mapping_on = v:true - - if l:can_accept - inoremap :call llama#accept_vt_fim() - else - inoremap :call llama#cancel_vt_fim() - endif - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 27 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 32 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 127 - execute 'inoremap :call llama#cancel_vt_fim()' - else - execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_vt_fim()' . 
nr2char(l:key) - endif - endif - endfor - - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() + " need to async this call because the in insert mode causes the cursor to move when at the end of the line + call timer_start(0, 'llama#on_hint') endfunction -function! llama#accept_vt_fim() - let l:pos_x = col('.') - let l:pos_y = line('.') - - let l:line_cur = getline('.') - - let l:pos0 = l:pos_x == len(l:line_cur) ? l:pos_x - 1 : l:pos_x - 2 - +function! llama#fim_accept() " insert the suggestion at the cursor location - call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0]) - if len(s:content) > 1 - call append(l:pos_y, s:content[1:-1]) - endif + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + if len(s:content) > 1 + call append(s:pos_y, s:content[1:-1]) + endif - " move the cursor to the end of the accepted text - call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx) + " move the cursor to the end of the accepted text + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + endif - call llama#cancel_vt_fim() + call llama#fim_cancel() endfunction -function! llama#cancel_vt_fim() +function! llama#fim_cancel() " clear the virtual text let l:bufnr = bufnr('%') @@ -230,31 +256,13 @@ function! llama#cancel_vt_fim() call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) - " remove the key mappings - if exists('s:mapping_on') && s:mapping_on - iunmap - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'iunmap ' - elseif l:key == 27 - execute 'iunmap ' - elseif l:key == 32 - execute 'iunmap ' - elseif l:key == 127 - execute 'iunmap ' - else - execute 'iunmap ' . nr2char(l:key) - endif - endif - endfor - - iunmap - iunmap - iunmap - iunmap + silent! iunmap + silent! iunmap - let s:mapping_on = v:false - endif + augroup llama_insert + autocmd! + if g:llama_config.auto_fim + autocmd CursorMovedI * call llama#fim_auto() + endif + augroup END endfunction From c507a65af5025fb22bacdc7c89badedb4df29c65 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 12:27:34 +0300 Subject: [PATCH 10/42] llama.vim : async --- examples/llama.vim | 174 ++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 73 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index de889678dd04a..d727948ea53fb 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,6 +1,6 @@ " sample config: " -" - Ctrl+F - trigger FIM completion +" - Ctrl+F - trigger FIM completion manually " " run this once to initialise the plugin: " @@ -31,46 +31,30 @@ function! llama#init() let s:line_cur = '' + let s:line_cur_prefix = '' + let s:line_cur_suffix = '' + let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false let s:timer_fim = -1 - let s:t_fim_last = reltime() + let s:t_fim_last = reltime() + let s:t_fim_start = reltime() + + let s:current_job = v:null augroup llama autocmd! autocmd InsertEnter * inoremap :call llama#fim(v:false) + autocmd InsertLeave * call llama#fim_cancel() augroup END silent! call llama#fim_cancel() endfunction -" setup accept/cancel events -function! llama#on_hint(id_timer) - inoremap :call llama#fim_accept() - inoremap :call llama#fim_cancel() - - augroup llama_insert - autocmd! - autocmd CursorMovedI * call llama#fim_cancel() - augroup END -endfunction - -function! 
llama#fim_auto() - if reltimefloat(reltime(s:t_fim_last)) < 0.50 - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif - endif - - let s:t_fim_last = reltime() - let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) -endfunction - function! llama#fim(is_auto) abort - let l:t_start = reltime() + let s:t_fim_start = reltime() let s:content = [] let s:can_accept = v:false @@ -86,16 +70,16 @@ function! llama#fim(is_auto) abort let s:pos_x0 = s:pos_x == len(s:line_cur) ? s:pos_x : s:pos_x - 1 - let l:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) - let l:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" - \ . l:line_cur_prefix + \ . s:line_cur_prefix let l:suffix = "" - \ . l:line_cur_suffix + \ . s:line_cur_suffix \ . "\n" \ . join(l:lines_suffix, "\n") \ . "\n" @@ -116,12 +100,80 @@ function! llama#fim(is_auto) abort \ 'samplers': ["top_k", "infill"] \ }) - " request completion from the server let l:curl_command = printf( \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", \ g:llama_config.endpoint, shellescape(l:request) \ ) + " send the request asynchronously + let s:current_job = jobstart(l:curl_command, { + \ 'on_stdout': function('s:fim_on_stdout'), + \ 'on_exit': function('s:fim_on_exit'), + \ 'stdout_buffered': v:true, + \ 'is_auto': a:is_auto + \ }) +endfunction + +function! llama#fim_accept() + " insert the suggestion at the cursor location + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + if len(s:content) > 1 + call append(s:pos_y, s:content[1:-1]) + endif + + " move the cursor to the end of the accepted text + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + endif + + call llama#fim_cancel() +endfunction + +function! llama#fim_cancel() + if s:current_job != v:null + call jobstop(s:current_job) + endif + + " clear the virtual text + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') + + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) + + silent! iunmap + silent! iunmap + + augroup llama_insert + autocmd! + if g:llama_config.auto_fim + autocmd CursorMovedI * call s:fim_auto() + endif + augroup END +endfunction + +function! s:fim_auto() + if s:current_job != v:null + call jobstop(s:current_job) + endif + + if reltimefloat(reltime(s:t_fim_last)) < 0.001*250 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + endif + + let s:t_fim_last = reltime() + let s:timer_fim = timer_start(250, {-> llama#fim(v:true)}) +endfunction + + +function! s:fim_on_stdout(job_id, data, event) dict + let l:raw = join(a:data, "\n") + let s:can_accept = v:true let l:has_info = v:false @@ -133,17 +185,15 @@ function! llama#fim(is_auto) abort let l:t_gen_ms = 1.0 let l:s_gen = 0 - " TODO: async this - let l:raw = system(l:curl_command) if s:can_accept && v:shell_error - if !a:is_auto + if !self.is_auto call add(s:content, "<| curl error: is the server on? |>") endif let s:can_accept = v:false endif if s:can_accept && l:raw == "" - if !a:is_auto + if !self.is_auto call add(s:content, "<| empty response: is the server on? |>") endif let s:can_accept = v:false @@ -178,7 +228,7 @@ function! 
llama#fim(is_auto) abort endif if len(s:content) == 0 - if !a:is_auto + if !self.is_auto call add(s:content, "<| nothing to suggest |>") endif let s:can_accept = v:false @@ -189,7 +239,7 @@ function! llama#fim(is_auto) abort endif let s:pos_dx = len(s:content[-1]) - let s:content[-1] .= l:line_cur_suffix + let s:content[-1] .= s:line_cur_suffix call llama#fim_cancel() @@ -202,13 +252,13 @@ function! llama#fim(is_auto) abort " construct the info message: if l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay - let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) + let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, - \ 1000.0 * reltimefloat(reltime(l:t_start)) + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { @@ -227,42 +277,20 @@ function! llama#fim(is_auto) abort \ 'virt_text_win_col': virtcol('.') \ }) - " need to async this call because the in insert mode causes the cursor to move when at the end of the line - call timer_start(0, 'llama#on_hint') -endfunction - -function! llama#fim_accept() - " insert the suggestion at the cursor location - if s:can_accept && len(s:content) > 0 - call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) - if len(s:content) > 1 - call append(s:pos_y, s:content[1:-1]) - endif - - " move the cursor to the end of the accepted text - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) - endif - - call llama#fim_cancel() -endfunction - -function! llama#fim_cancel() - " clear the virtual text - let l:bufnr = bufnr('%') - - let l:id_vt_fim = nvim_create_namespace('vt_fim') - let l:id_vt_info = nvim_create_namespace('vt_info') - - call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) - call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) - - silent! iunmap - silent! iunmap + " setup accept/cancel events + inoremap :call llama#fim_accept() + inoremap :call llama#fim_cancel() augroup llama_insert autocmd! - if g:llama_config.auto_fim - autocmd CursorMovedI * call llama#fim_auto() - endif + autocmd CursorMovedI * call llama#fim_cancel() augroup END endfunction + +function! s:fim_on_exit(job_id, exit_code, event) dict + if a:exit_code != 0 + echom "Job failed with exit code: " . 
a:exit_code + endif + + let s:current_job = v:null +endfunction From 6669b550dbc79a78c0d8152cd7e60b640a8563bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 17:06:50 +0300 Subject: [PATCH 11/42] llama.vim : set time limit for the generation phase --- examples/llama.vim | 61 ++++++++++++++++++++++++---------------------- src/llama.cpp | 4 +++ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index d727948ea53fb..5ab43f2c9e386 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -12,14 +12,14 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f let s:default_config = { - \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, - \ 'n_predict': 64, - \ 'n_probs': 3, - \ 'temperature': 0.1, - \ 'auto_fim': v:true, - \ 'stop': ["\n"] + \ 'endpoint': 'http://127.0.0.1:8012/infill', + \ 'n_prefix': 128, + \ 'n_suffix': 128, + \ 'n_predict': 64, + \ 't_max_prompt_ms': 300, + \ 't_max_predict_ms': 200, + \ 'auto_fim': v:true, + \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) @@ -48,6 +48,8 @@ function! llama#init() autocmd! autocmd InsertEnter * inoremap :call llama#fim(v:false) autocmd InsertLeave * call llama#fim_cancel() + + autocmd CursorMoved * call llama#fim_cancel() augroup END silent! call llama#fim_cancel() @@ -85,19 +87,20 @@ function! llama#fim(is_auto) abort \ . "\n" let l:request = json_encode({ - \ 'prompt': "", - \ 'input_prefix': l:prefix, - \ 'input_suffix': l:suffix, - "\ 'stop': g:llama_config.stop, - \ 'n_predict': g:llama_config.n_predict, - "\ 'n_probs': g:llama_config.n_probs, - \ 'penalty_last_n': 0, - \ 'temperature': g:llama_config.temperature, - \ 'top_k': 5, - \ 'infill_p': 0.20, - \ 'infill_p_eog': 0.001, - \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"] + \ 'prompt': "", + \ 'input_prefix': l:prefix, + \ 'input_suffix': l:suffix, + "\ 'stop': g:llama_config.stop, + \ 'n_predict': g:llama_config.n_predict, + \ 'penalty_last_n': 0, + \ 'top_k': 5, + \ 'infill_p': 0.20, + \ 'infill_p_eog': 0.001, + \ 'stream': v:false, + \ 'samplers': ["top_k", "infill"], + \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms, + \ 'cache_prompt': v:true \ }) let l:curl_command = printf( @@ -181,9 +184,9 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_prompt_ms = 1.0 let l:s_prompt = 0 - let l:n_gen = 0 - let l:t_gen_ms = 1.0 - let l:s_gen = 0 + let l:n_predict = 0 + let l:t_predict_ms = 1.0 + let l:s_predict = 0 if s:can_accept && v:shell_error if !self.is_auto @@ -221,9 +224,9 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) let l:s_prompt = get(l:timings, 'prompt_per_second', 0) - let l:n_gen = get(l:timings, 'predicted_n', 0) - let l:t_gen_ms = get(l:timings, 'predicted_ms', 1) - let l:s_gen = get(l:timings, 'predicted_per_second', 0) + let l:n_predict = get(l:timings, 'predicted_n', 0) + let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) + let l:s_predict = get(l:timings, 'predicted_per_second', 0) endif endif @@ -256,8 +259,8 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, - \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_gen, l:t_gen_ms, l:s_gen, + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) diff --git a/src/llama.cpp b/src/llama.cpp index 1813dd29be2b2..80cc939314b3d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6725,6 +6725,10 @@ static void llm_load_vocab( vocab.special_eog_ids.insert(vocab.special_eom_id); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); } + + if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) { + vocab.special_eog_ids.insert(vocab.special_fim_sep_id); + } } // build special tokens cache From 2e8c350a5f5f70c913b68c539f065a4be22458a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 18:31:46 +0300 Subject: [PATCH 12/42] llama.vim : fix edge cases --- examples/llama.vim | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 5ab43f2c9e386..3f747b3603e35 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -98,9 +98,9 @@ function! llama#fim(is_auto) abort \ 'infill_p_eog': 0.001, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], + "\ 'cache_prompt': v:true, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, - \ 't_max_predict_ms': g:llama_config.t_max_predict_ms, - \ 'cache_prompt': v:true + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) let l:curl_command = printf( @@ -111,10 +111,21 @@ function! llama#fim(is_auto) abort " send the request asynchronously let s:current_job = jobstart(l:curl_command, { \ 'on_stdout': function('s:fim_on_stdout'), - \ 'on_exit': function('s:fim_on_exit'), + \ 'on_exit': function('s:fim_on_exit'), \ 'stdout_buffered': v:true, \ 'is_auto': a:is_auto \ }) + + " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line + if !a:is_auto + augroup llama_insert + autocmd! + augroup END + + if g:llama_config.auto_fim + call timer_start(0, {-> s:fim_auto_enable()}) + endif + endif endfunction function! llama#fim_accept() @@ -151,9 +162,16 @@ function! llama#fim_cancel() augroup llama_insert autocmd! - if g:llama_config.auto_fim - autocmd CursorMovedI * call s:fim_auto() - endif + augroup END + + if g:llama_config.auto_fim + call s:fim_auto_enable() + endif +endfunction + +function! s:fim_auto_enable() + augroup llama_insert + autocmd CursorMovedI * call s:fim_auto() augroup END endfunction @@ -176,6 +194,9 @@ endfunction function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") + if len(l:raw) == 0 + return + endif let s:can_accept = v:true let l:has_info = v:false @@ -195,13 +216,6 @@ function! s:fim_on_stdout(job_id, data, event) dict let s:can_accept = v:false endif - if s:can_accept && l:raw == "" - if !self.is_auto - call add(s:content, "<| empty response: is the server on? |>") - endif - let s:can_accept = v:false - endif - " get the generated suggestion if s:can_accept let l:response = json_decode(l:raw) @@ -232,7 +246,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict if len(s:content) == 0 if !self.is_auto - call add(s:content, "<| nothing to suggest |>") + call add(s:content, "<| EOT |>") endif let s:can_accept = v:false endif @@ -272,7 +286,7 @@ function! s:fim_on_stdout(job_id, data, event) dict call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': s:pos_x == len(s:line_cur) ? virtcol('.') : virtcol('.') - 1 + \ 'virt_text_win_col': virtcol('.') - 1 \ }) call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { From 4b1bd81661142cb8c9f768e465befbd678f64278 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 20:36:25 +0300 Subject: [PATCH 13/42] llama : simplify infill sampler --- common/common.h | 2 -- examples/llama.vim | 8 +++----- examples/server/server.cpp | 4 ---- src/llama-sampling.cpp | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/common/common.h b/common/common.h index 2fb92ae143c54..5ca8fd391ab74 100644 --- a/common/common.h +++ b/common/common.h @@ -117,8 +117,6 @@ struct common_sampler_params { float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - float infill_p = 0.80f; - float infill_p_eog = 0.01f; int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/examples/llama.vim b/examples/llama.vim index 3f747b3603e35..c89ddea65385b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -93,9 +93,7 @@ function! llama#fim(is_auto) abort "\ 'stop': g:llama_config.stop, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, - \ 'top_k': 5, - \ 'infill_p': 0.20, - \ 'infill_p_eog': 0.001, + \ 'top_k': 100, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], "\ 'cache_prompt': v:true, @@ -180,7 +178,7 @@ function! s:fim_auto() call jobstop(s:current_job) endif - if reltimefloat(reltime(s:t_fim_last)) < 0.001*250 + if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 if s:timer_fim != -1 call timer_stop(s:timer_fim) let s:timer_fim = -1 @@ -188,7 +186,7 @@ function! 
s:fim_auto() endif let s:t_fim_last = reltime() - let s:timer_fim = timer_start(250, {-> llama#fim(v:true)}) + let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e9621ba93c956..3992108e7f383 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -873,8 +873,6 @@ struct server_context { slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.infill_p = json_value(data, "infill_p", default_sparams.infill_p); - slot.sparams.infill_p_eog = json_value(data, "infill_p_eog", default_sparams.infill_p_eog); slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); @@ -1243,8 +1241,6 @@ struct server_context { {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, - {"infill_p", slot.sparams.infill_p}, - {"infill_p_eog", slot.sparams.infill_p_eog}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d71516153cf82..4a5b922c44a9d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1792,6 +1792,10 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } +<<<<<<< HEAD +======= + float p_max = 0.0f; +>>>>>>> af919ec1 (llama : simplify infill sampler) float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; @@ -1803,12 +1807,20 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } +<<<<<<< HEAD const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); if (3*p_eog_sum*cur_p->size > p_txt_sum) { LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); +======= + const float rat = p_txt_sum / p_eog_sum; + LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) { + LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum); +>>>>>>> af919ec1 (llama : simplify infill sampler) // keep just the EOG tokens const auto size_org = cur_p->size; @@ -1879,6 +1891,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } +<<<<<<< HEAD size_t n_non_eog = 0; size_t size_org = cur_p->size; @@ -1895,6 +1908,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ if (cur_p->data[i].p < thold && !is_eog) { continue; +======= + // mask non-EOG tokens with prob < 0.2 + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + cur_p->data[i].logit = -INFINITY; +>>>>>>> af919ec1 (llama : simplify infill sampler) } if (!is_eog) { From 865d9bc48a903287649784e15b4a9d48934a9ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 12:26:22 +0300 Subject: [PATCH 14/42] llama : clean-up ggml-ci --- examples/llama.vim | 111 +++++++++++++++++++++++++++++++---------- src/llama-sampling.cpp | 20 +------- 2 files changed, 85 insertions(+), 46 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index c89ddea65385b..99712d234b9ba 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,31 +1,72 @@ +" LLM-based text completion using llama.cpp +" +" requires: +" +" - neovim +" - curl +" - llama.cpp server instance +" - FIM-compatible model +" " sample config: " -" - Ctrl+F - trigger FIM completion manually +" - Tab - accept the current suggestion +" - Shift+Tab - accept just the first line of the segguestion +" - Ctrl+F - trigger FIM completion manually +" +" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim +" +" start the llama.cpp server with a FIM-compatible model. for example: +" +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 +" +" --batch-size [512, model max context] +" +" adjust the batch size to control how much of the provided context will be used during the inference +" lower values will use smaller part of the context around the cursor, which will result in faster processing " -" run this once to initialise the plugin: +" --ubatch-size [64, 2048] " -" :call llama#init() +" chunks the batch into smaller chunks for faster processing +" depends on the specific hardware. 
use llama-bench to profile and determine the best size +" +" run this once to initialise llama.vim: +" +" :call llama#init() " " color of the suggested text highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines to include in the prefix +" n_suffix: number of lines to include in the suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max alloted time for the text generation +" show_info: show extra info about the inference +" auto_fim: trigger FIM completion automatically on cursor movement let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, + \ 'n_prefix': 256, + \ 'n_suffix': 256, \ 'n_predict': 64, - \ 't_max_prompt_ms': 300, + \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 200, + \ 'show_info': v:true, \ 'auto_fim': v:true, - \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) function! llama#init() - let s:pos_x = 0 + if !executable('curl') + echohl WarningMsg + echo 'llama.vim requires the "curl" command to be available' + echohl None + return + endif + + let s:pos_x = 0 " cursor position upon start of completion let s:pos_y = 0 let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case @@ -46,8 +87,8 @@ function! llama#init() augroup llama autocmd! - autocmd InsertEnter * inoremap :call llama#fim(v:false) - autocmd InsertLeave * call llama#fim_cancel() + autocmd InsertEnter * inoremap :call llama#fim(v:false) + autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call llama#fim_cancel() augroup END @@ -90,7 +131,6 @@ function! llama#fim(is_auto) abort \ 'prompt': "", \ 'input_prefix': l:prefix, \ 'input_suffix': l:suffix, - "\ 'stop': g:llama_config.stop, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, \ 'top_k': 100, @@ -126,16 +166,23 @@ function! llama#fim(is_auto) abort endif endfunction -function! llama#fim_accept() +" if first_line == v:true accept only the first line of the response +function! llama#fim_accept(first_line) " insert the suggestion at the cursor location if s:can_accept && len(s:content) > 0 call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) if len(s:content) > 1 - call append(s:pos_y, s:content[1:-1]) + if !a:first_line + call append(s:pos_y, s:content[1:-1]) + endif endif " move the cursor to the end of the accepted text - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + if !a:first_line + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + else + call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1) + endif endif call llama#fim_cancel() @@ -146,6 +193,11 @@ function! llama#fim_cancel() call jobstop(s:current_job) endif + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + " clear the virtual text let l:bufnr = bufnr('%') @@ -155,7 +207,9 @@ function! llama#fim_cancel() call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) + " remove the mappings silent! iunmap + silent! iunmap silent! iunmap augroup llama_insert @@ -173,6 +227,8 @@ function! s:fim_auto_enable() augroup END endfunction +" auto-start a fim job a short time after the cursor has moved +" if there is already a job queued - cancel it function! s:fim_auto() if s:current_job != v:null call jobstop(s:current_job) @@ -189,7 +245,7 @@ function! 
s:fim_auto() let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction - +" callback that processes the result from the server function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") if len(l:raw) == 0 @@ -199,6 +255,13 @@ function! s:fim_on_stdout(job_id, data, event) dict let s:can_accept = v:true let l:has_info = v:false + if s:can_accept && v:shell_error + if !self.is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false + endif + let l:n_prompt = 0 let l:t_prompt_ms = 1.0 let l:s_prompt = 0 @@ -207,13 +270,6 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_predict_ms = 1.0 let l:s_predict = 0 - if s:can_accept && v:shell_error - if !self.is_auto - call add(s:content, "<| curl error: is the server on? |>") - endif - let s:can_accept = v:false - endif - " get the generated suggestion if s:can_accept let l:response = json_decode(l:raw) @@ -227,7 +283,7 @@ function! s:fim_on_stdout(job_id, data, event) dict call remove(s:content, -1) endwhile - " if response.timings + " if response.timings is available if len(get(l:response, 'timings', {})) > 0 let l:has_info = v:true let l:timings = get(l:response, 'timings', {}) @@ -264,8 +320,8 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_fim = nvim_create_namespace('vt_fim') let l:id_vt_info = nvim_create_namespace('vt_info') - " construct the info message: - if l:has_info + " construct the info message and display it to the right of the current line + if g:llama_config.show_info && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) @@ -282,6 +338,7 @@ function! s:fim_on_stdout(job_id, data, event) dict \ }) endif + " display the suggestion call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': virtcol('.') - 1 @@ -293,8 +350,8 @@ function! s:fim_on_stdout(job_id, data, event) dict \ }) " setup accept/cancel events - inoremap :call llama#fim_accept() - inoremap :call llama#fim_cancel() + inoremap :call llama#fim_accept(v:false) + inoremap :call llama#fim_accept(v:true) augroup llama_insert autocmd! diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 4a5b922c44a9d..96a97901844bc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1791,11 +1791,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ for (size_t i = 0; i < cur_p->size; ++i) { LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } +#endif -<<<<<<< HEAD -======= - float p_max = 0.0f; ->>>>>>> af919ec1 (llama : simplify infill sampler) float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; @@ -1807,20 +1804,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } -<<<<<<< HEAD const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
 
     LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
 
     if (3*p_eog_sum*cur_p->size > p_txt_sum) {
         LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
-=======
-    const float rat = p_txt_sum / p_eog_sum;
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
-
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
->>>>>>> af919ec1 (llama : simplify infill sampler)
 
         // keep just the EOG tokens
         const auto size_org = cur_p->size;
@@ -1891,7 +1880,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         }
     }
 
-<<<<<<< HEAD
     size_t n_non_eog = 0;
     size_t size_org = cur_p->size;
@@ -1908,12 +1896,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
-=======
-    // mask non-EOG tokens with prob < 0.2
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
-            cur_p->data[i].logit = -INFINITY;
->>>>>>> af919ec1 (llama : simplify infill sampler)
         }
 
         if (!is_eog) {

From c9a46f4bd7386804c127a4e3bbe0456e62edd06c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 11 Oct 2024 13:36:56 +0300
Subject: [PATCH 15/42] llama.vim : minor

[no ci]
---
 examples/llama.vim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 99712d234b9ba..e23373f3b2064 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -325,7 +325,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", + let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, From 5624e919df7e2937880773b12b5b3fbf16382694 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 19:39:44 +0300 Subject: [PATCH 16/42] llama.vim : fix docs [no ci] --- examples/llama.vim | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index e23373f3b2064..56a876b0de27e 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -39,10 +39,11 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f " endpoint: llama.cpp server endpoint -" n_prefix: number of lines to include in the prefix -" n_suffix: number of lines to include in the suffix +" n_prefix: number of lines before the cursor location to include in the prefix +" n_suffix: number of lines after the cursor location to include in the suffix " n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the text generation +" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference " auto_fim: trigger FIM completion automatically on cursor movement let s:default_config = { From 491f211b4caf36f90eb350ecf53d570029ce91ad Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 21:14:47 +0300 Subject: [PATCH 17/42] llama : improve infill sampler ggml-ci --- src/llama-sampling.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 96a97901844bc..d71516153cf82 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1791,7 +1791,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ for (size_t i = 0; i < cur_p->size; ++i) { LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } -#endif float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; From 4f46e29b09d53722c7d73e447dd84fd02cb91abd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:42:16 +0300 Subject: [PATCH 18/42] llama : print more info about control tokens --- src/llama.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 80cc939314b3d..1813dd29be2b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6725,10 +6725,6 @@ static void llm_load_vocab( vocab.special_eog_ids.insert(vocab.special_eom_id); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); } - - if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) { - vocab.special_eog_ids.insert(vocab.special_fim_sep_id); - } } // build special tokens cache From b8890229b6b6910667ffd71f3e8a64f5b4960ffa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:42:56 +0300 Subject: [PATCH 19/42] llama.vim : add ring context from opened files and yanked text --- 
examples/llama.vim | 134 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 114 insertions(+), 20 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 56a876b0de27e..2818b754e6325 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -38,27 +38,49 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f -" endpoint: llama.cpp server endpoint -" n_prefix: number of lines before the cursor location to include in the prefix -" n_suffix: number of lines after the cursor location to include in the suffix -" n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) -" t_max_predict_ms: max alloted time for the prediction -" show_info: show extra info about the inference -" auto_fim: trigger FIM completion automatically on cursor movement +" general parameters: +" +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines before the cursor location to include in the prefix +" n_suffix: number of lines after the cursor location to include in the suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_predict_ms: max alloted time for the prediction +" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) +" auto_fim: trigger FIM completion automatically on cursor movement +" +" ring buffer of chunks, accumulated with time upon: +" +" - completion request +" - yank +" - reading a file +" +" ring context parameters: +" +" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) +" ring_chunk_size: max size of the chunks (in number of lines) +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 256, - \ 'n_suffix': 256, + \ 'n_prefix': 128, + \ 'n_suffix': 128, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 200, - \ 'show_info': v:true, + \ 'show_info': 2, \ 'auto_fim': v:true, + \ 'ring_n_chunks': 32, + \ 'ring_chunk_size': 64, + \ 'ring_scope': 1024, \ } let g:llama_config = get(g:, 'llama_config', s:default_config) +function! s:rand(i0, i1) abort + return a:i0 + rand() % (a:i1 - a:i0 + 1) +endfunction + function! llama#init() if !executable('curl') echohl WarningMsg @@ -76,6 +98,9 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' + let s:ring_n_chunks = [] + + let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false @@ -91,12 +116,55 @@ function! llama#init() autocmd InsertEnter * inoremap :call llama#fim(v:false) autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call llama#fim_cancel() + + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) augroup END silent! call llama#fim_cancel() endfunction +function! 
s:pick_chunk(text, no_mod) + " do not pick chunks from buffers with pending changes or buffers that are not files + if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) + return + endif + + if g:llama_config.ring_n_chunks <= 0 + return + endif + + if len(a:text) + 1 < g:llama_config.ring_chunk_size + let l:chunk = join(a:text, "\n") + else + let l:l0 = s:rand(0, len(a:text) - g:llama_config.ring_chunk_size) + let l:l1 = l:l0 + g:llama_config.ring_chunk_size + + let l:chunk = join(a:text[l:l0:l:l1], "\n") + endif + + " check if this chunk is already added + let l:exist = v:false + for i in range(len(s:ring_n_chunks)) + if s:ring_n_chunks[i] == l:chunk + let l:exist = v:true + break + endif + endfor + + if l:exist + return + endif + + if len(s:ring_n_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_n_chunks, 0) + endif + + call add(s:ring_n_chunks, l:chunk) +endfunction + function! llama#fim(is_auto) abort let s:t_fim_start = reltime() @@ -128,6 +196,20 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . "\n" + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " only gather chunks if the cursor has moved a lot + if a:is_auto && l:delta_y > 32 + " pick a prefix chunk + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) + + "" pick a suffix chunk + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) + + let s:pos_y_pick = s:pos_y + endif + let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': l:prefix, @@ -137,7 +219,8 @@ function! llama#fim(is_auto) abort \ 'top_k': 100, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], - "\ 'cache_prompt': v:true, + \ 'cache_prompt': v:true, + \ 'extra_context': s:ring_n_chunks, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) @@ -235,6 +318,7 @@ function! s:fim_auto() call jobstop(s:current_job) endif + " TODO: when job cancellation is implemented on the server, reduce these timeouts if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 if s:timer_fim != -1 call timer_stop(s:timer_fim) @@ -284,6 +368,11 @@ function! s:fim_on_stdout(job_id, data, event) dict call remove(s:content, -1) endwhile + let l:generation_settings = get(l:response, 'generation_settings', {}) + let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) + + let l:n_cached = get(l:response, 'tokens_cached', 0) + " if response.timings is available if len(get(l:response, 'timings', {})) > 0 let l:has_info = v:true @@ -322,21 +411,26 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_info = nvim_create_namespace('vt_info') " construct the info message and display it to the right of the current line - if g:llama_config.show_info && l:has_info + if g:llama_config.show_info > 0 && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", - \ l:prefix, + let l:info = printf("%s | context: %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + \ g:llama_config.show_info == 2 ? 
l:prefix : '', + \ l:n_cached, l:n_ctx, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { - \ 'virt_text': [[l:info, 'llama_hl_info']], - \ 'virt_text_pos': 'eol', - \ }) + if g:llama_config.show_info == 1 + let &statusline = l:info + elseif g:llama_config.show_info == 2 + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { + \ 'virt_text': [[l:info, 'llama_hl_info']], + \ 'virt_text_pos': 'eol', + \ }) + endif endif " display the suggestion From 27bc11da0f6349eff044c5861bc30d23267281ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:57:19 +0300 Subject: [PATCH 20/42] llama.vim : update server command [no ci] --- examples/llama.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 2818b754e6325..130af3a2671f1 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512 " " --batch-size [512, model max context] " From f794549baedc4d78a5580fb40f646f80da8598e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 14:17:58 +0300 Subject: [PATCH 21/42] llama.vim : gather chunk on leaving buffer [no ci] --- examples/llama.vim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 130af3a2671f1..8d85fb8621d02 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -120,7 +120,9 @@ function! llama#init() autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + " gather chunks upon entering/leaving a buffer autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true) augroup END silent! call llama#fim_cancel() @@ -146,6 +148,7 @@ function! s:pick_chunk(text, no_mod) endif " check if this chunk is already added + " TODO: smarter check for string similarity to evict old chunks that are very similart to the new one let l:exist = v:false for i in range(len(s:ring_n_chunks)) if s:ring_n_chunks[i] == l:chunk @@ -204,7 +207,7 @@ function! 
llama#fim(is_auto) abort " pick a prefix chunk call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) - "" pick a suffix chunk + " pick a suffix chunk call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) let s:pos_y_pick = s:pos_y From 27d53cb4ee92fe96dde9528c84738e3232810584 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 16:11:38 +0300 Subject: [PATCH 22/42] llama.vim : logic to evict old chunks that are similar to new one --- examples/llama.vim | 67 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 8d85fb8621d02..6e1840a548914 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -98,7 +98,8 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' - let s:ring_n_chunks = [] + let s:ring_chunks = [] + let s:ring_n_evict = 0 let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 @@ -128,6 +129,25 @@ function! llama#init() silent! call llama#fim_cancel() endfunction +" TODO: figure out something better +function! s:chunk_sim(c0, c1) + let l:lines0 = len(a:c0) + let l:lines1 = len(a:c1) + + let l:common = 0 + + for l:line0 in a:c0 + for l:line1 in a:c1 + if l:line0 == l:line1 + let l:common += 1 + break + endif + endfor + endfor + + return 2.0 * l:common / (l:lines0 + l:lines1) +endfunction + function! s:pick_chunk(text, no_mod) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) @@ -138,20 +158,25 @@ function! s:pick_chunk(text, no_mod) return endif + if len(a:text) < 3 + return + endif + if len(a:text) + 1 < g:llama_config.ring_chunk_size - let l:chunk = join(a:text, "\n") + let l:chunk = a:text else - let l:l0 = s:rand(0, len(a:text) - g:llama_config.ring_chunk_size) - let l:l1 = l:l0 + g:llama_config.ring_chunk_size + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size, len(a:text)]) - let l:chunk = join(a:text[l:l0:l:l1], "\n") + let l:chunk = a:text[l:l0:l:l1] endif + let l:chunk_str = join(l:chunk, "\n") + " check if this chunk is already added - " TODO: smarter check for string similarity to evict old chunks that are very similart to the new one let l:exist = v:false - for i in range(len(s:ring_n_chunks)) - if s:ring_n_chunks[i] == l:chunk + for i in range(len(s:ring_chunks)) + if s:ring_chunks[i].data == l:chunk let l:exist = v:true break endif @@ -161,11 +186,19 @@ function! s:pick_chunk(text, no_mod) return endif - if len(s:ring_n_chunks) == g:llama_config.ring_n_chunks - call remove(s:ring_n_chunks, 0) + " evict chunks that are very similar to the new one + for i in range(len(s:ring_chunks) - 1, 0, -1) + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + endif + endfor + + if len(s:ring_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_chunks, 0) endif - call add(s:ring_n_chunks, l:chunk) + call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) endfunction function! llama#fim(is_auto) abort @@ -213,6 +246,12 @@ function! 
llama#fim(is_auto) abort let s:pos_y_pick = s:pos_y endif + " array of strings + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, l:chunk.str) + endfor + let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': l:prefix, @@ -223,7 +262,7 @@ function! llama#fim(is_auto) abort \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], \ 'cache_prompt': v:true, - \ 'extra_context': s:ring_n_chunks, + \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) @@ -418,9 +457,9 @@ function! s:fim_on_stdout(job_id, data, event) dict " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | context: %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : '', - \ l:n_cached, l:n_ctx, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From d81a0ac185fe9d9ca1e191ea19327582f6880aa2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 16:53:32 +0300 Subject: [PATCH 23/42] llama.vim : do not evict certain chunks [no ci] --- examples/llama.vim | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 6e1840a548914..3fe69e339751b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -71,7 +71,7 @@ let s:default_config = { \ 'show_info': 2, \ 'auto_fim': v:true, \ 'ring_n_chunks': 32, - \ 'ring_chunk_size': 64, + \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, \ } @@ -119,11 +119,11 @@ function! llama#init() autocmd CursorMoved * call llama#fim_cancel() - autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer - autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) - autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true) + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -148,7 +148,7 @@ function! s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction -function! s:pick_chunk(text, no_mod) +function! 
s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) return @@ -165,8 +165,8 @@ function! s:pick_chunk(text, no_mod) if len(a:text) + 1 < g:llama_config.ring_chunk_size let l:chunk = a:text else - let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size])) - let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size, len(a:text)]) + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) let l:chunk = a:text[l:l0:l:l1] endif @@ -189,8 +189,12 @@ function! s:pick_chunk(text, no_mod) " evict chunks that are very similar to the new one for i in range(len(s:ring_chunks) - 1, 0, -1) if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 - call remove(s:ring_chunks, i) - let s:ring_n_evict += 1 + if a:do_evict + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + else + return + endif endif endfor @@ -237,11 +241,12 @@ function! llama#fim(is_auto) abort " only gather chunks if the cursor has moved a lot if a:is_auto && l:delta_y > 32 - " pick a prefix chunk - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) - - " pick a suffix chunk - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) + " randomly pick a prefix or a suffix chunk + if s:rand(0, 1) + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + else + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) + endif let s:pos_y_pick = s:pos_y endif From 2960510153a45f384f075985d91cba957f69ef79 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 17:17:01 +0300 Subject: [PATCH 24/42] llama.vim : do not auto-fim when far from the end of the line [no ci] --- examples/llama.vim | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llama.vim b/examples/llama.vim index 3fe69e339751b..bf56a5e5f9e93 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -48,6 +48,7 @@ highlight llama_hl_info guifg=#77ff2f " t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) " auto_fim: trigger FIM completion automatically on cursor movement +" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor " " ring buffer of chunks, accumulated with time upon: " @@ -70,6 +71,7 @@ let s:default_config = { \ 't_max_predict_ms': 200, \ 'show_info': 2, \ 'auto_fim': v:true, + \ 'max_line_suffix': 8, \ 'ring_n_chunks': 32, \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, @@ -124,6 +126,9 @@ function! 
llama#init() " gather chunks upon entering/leaving a buffer autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + + " gather chunk upon saving the file + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -225,6 +230,10 @@ function! llama#fim(is_auto) abort let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix + return + endif + let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" From bc2857b88c69b913e42b09c2470acf13af37a640 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 18:23:22 +0300 Subject: [PATCH 25/42] llama.vim : async context processing ggml-ci --- examples/llama.vim | 94 +++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index bf56a5e5f9e93..bc61ea8ba6981 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512 " " --batch-size [512, model max context] " @@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f " " - completion request " - yank -" - reading a file +" - entering a buffer +" - leaving a buffer +" - writing a file " " ring context parameters: " @@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict) endif call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) + + " send asynchronous job with the new extra context so that it is ready for the next FIM + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, l:chunk.str) + endfor + + let l:request = json_encode({ + \ 'prompt': "", + \ 'input_prefix': "", + \ 'input_suffix': "", + \ 'n_predict': 1, + \ 'penalty_last_n': 0, + \ 'top_k': 100, + \ 'stream': v:false, + \ 'samplers': ["top_k", "infill"], + \ 'cache_prompt': v:true, + \ 'extra_context': l:extra_context, + \ 't_max_prompt_ms': 1, + \ 't_max_predict_ms': 1 + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + call jobstart(l:curl_command, { + \ 'on_exit': function('s:fim_on_exit') + \ }) endfunction function! llama#fim(is_auto) abort @@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . 
"\n" - " TODO: per-file location - let l:delta_y = abs(s:pos_y - s:pos_y_pick) - - " only gather chunks if the cursor has moved a lot - if a:is_auto && l:delta_y > 32 - " randomly pick a prefix or a suffix chunk - if s:rand(0, 1) - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) - else - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) - endif - - let s:pos_y_pick = s:pos_y - endif - " array of strings let l:extra_context = [] for l:chunk in s:ring_chunks @@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort \ 'is_auto': a:is_auto \ }) + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " only gather chunks if the cursor has moved a lot + if a:is_auto && l:delta_y > 32 + " randomly pick a prefix or a suffix chunk + if s:rand(0, 1) + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + else + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) + endif + + let s:pos_y_pick = s:pos_y + endif + " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line if !a:is_auto augroup llama_insert @@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:generation_settings = get(l:response, 'generation_settings', {}) let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) - let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:truncated = get(l:response, 'truncated', v:false) " if response.timings is available if len(get(l:response, 'timings', {})) > 0 @@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_fim = nvim_create_namespace('vt_fim') let l:id_vt_info = nvim_create_namespace('vt_info') - " construct the info message and display it to the right of the current line + " construct the info message if g:llama_config.show_info > 0 && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", - \ g:llama_config.show_info == 2 ? l:prefix : '', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, - \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_predict, l:t_predict_ms, l:s_predict, - \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) - \ ) + if l:truncated + let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx + \ ) + else + let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + \ g:llama_config.show_info == 2 ? 
l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) + \ ) + endif if g:llama_config.show_info == 1 + "" display it in the statusline let &statusline = l:info elseif g:llama_config.show_info == 2 + " display it to the right of the current line call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', From 916c2ee3fd95976213f95922b5242d2c4834dec9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 18:50:36 +0300 Subject: [PATCH 26/42] llama : simplify infill sampler --- examples/llama.vim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index bc61ea8ba6981..5a2027021d1b4 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -70,11 +70,11 @@ let s:default_config = { \ 'n_suffix': 128, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 200, + \ 't_max_predict_ms': 500, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, - \ 'ring_n_chunks': 32, + \ 'ring_n_chunks': 16, \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, \ } From ae76a092b850283121af2cf2eb83a764b214fb6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 21:36:02 +0300 Subject: [PATCH 27/42] llama.vim : pass filenames for each chunk ggml-ci --- examples/llama.vim | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 5a2027021d1b4..919055876d9f6 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -178,7 +178,7 @@ function! s:pick_chunk(text, no_mod, do_evict) let l:chunk = a:text[l:l0:l:l1] endif - let l:chunk_str = join(l:chunk, "\n") + let l:chunk_str = join(l:chunk, "\n") . "\n" " check if this chunk is already added let l:exist = v:false @@ -209,12 +209,16 @@ function! s:pick_chunk(text, no_mod, do_evict) call remove(s:ring_chunks, 0) endif - call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) + call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) " send asynchronous job with the new extra context so that it is ready for the next FIM let l:extra_context = [] for l:chunk in s:ring_chunks - call add(l:extra_context, l:chunk.str) + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) endfor let l:request = json_encode({ @@ -277,10 +281,14 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . "\n" - " array of strings + " prepare the extra context data let l:extra_context = [] for l:chunk in s:ring_chunks - call add(l:extra_context, l:chunk.str) + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) endfor let l:request = json_encode({ From 9f8fa900f68ece27185e6c0ac9690a2a5b251cbd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 21:56:29 +0300 Subject: [PATCH 28/42] llama.vim : fix repetitions [no ci] --- examples/llama.vim | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/llama.vim b/examples/llama.vim index 919055876d9f6..0065d5e5a6431 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -499,6 +499,11 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let s:pos_dx = len(s:content[-1]) let s:content[-1] .= s:line_cur_suffix + " truncate the suggestion if it repeats the next line + if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) + let s:content = [s:content[0]] + endif + call llama#fim_cancel() " display virtual text with the suggestion From 25ecb35c4f37b009d6c5930049bdfe49942bde5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 14 Oct 2024 15:50:08 +0300 Subject: [PATCH 29/42] llama.vim : simplify job logic + improve robustness and responsivness --- examples/llama.vim | 129 ++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 78 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 0065d5e5a6431..90d08a8e5d62e 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -95,7 +95,6 @@ function! llama#init() let s:pos_x = 0 " cursor position upon start of completion let s:pos_y = 0 - let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case let s:line_cur = '' @@ -105,32 +104,40 @@ function! llama#init() let s:ring_chunks = [] let s:ring_n_evict = 0 + let s:hint_shown = v:false let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false - let s:timer_fim = -1 - let s:t_fim_last = reltime() - let s:t_fim_start = reltime() + let s:t_fim_start = reltime() " used to measure total FIM time let s:current_job = v:null augroup llama autocmd! - autocmd InsertEnter * inoremap :call llama#fim(v:false) - autocmd InsertLeavePre * call llama#fim_cancel() + autocmd InsertEnter * inoremap a + autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call llama#fim_cancel() + autocmd CompleteChanged * call llama#fim_cancel() - autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif + if g:llama_config.auto_fim + autocmd InsertEnter * call llama#fim(v:true, v:false) + autocmd CursorMovedI * call llama#fim(v:true, v:false) + autocmd CursorHoldI * call llama#fim(v:true, v:true) + else + autocmd CursorMovedI * call llama#fim_cancel() + endif + + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer - autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) - autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) " gather chunk upon saving the file - autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), 
v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -241,18 +248,27 @@ function! s:pick_chunk(text, no_mod, do_evict) \ g:llama_config.endpoint, shellescape(l:request) \ ) - call jobstart(l:curl_command, { - \ 'on_exit': function('s:fim_on_exit') - \ }) + call jobstart(l:curl_command, {}) endfunction -function! llama#fim(is_auto) abort +function! llama#fim(is_auto, on_hold) abort + if a:on_hold && s:hint_shown + return + endif + + call llama#fim_cancel() + + if reltimefloat(reltime(s:t_fim_start)) < 0.5 + let s:t_fim_start = reltime() + return + endif + let s:t_fim_start = reltime() let s:content = [] let s:can_accept = v:false - let s:pos_x = col('.') + let s:pos_x = col('.') - 1 let s:pos_y = line('.') let l:max_y = line('$') @@ -261,10 +277,8 @@ function! llama#fim(is_auto) abort let s:line_cur = getline('.') - let s:pos_x0 = s:pos_x == len(s:line_cur) ? s:pos_x : s:pos_x - 1 - - let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) - let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix return @@ -311,11 +325,17 @@ function! llama#fim(is_auto) abort \ g:llama_config.endpoint, shellescape(l:request) \ ) + if s:current_job != v:null + call jobstop(s:current_job) + endif + " send the request asynchronously let s:current_job = jobstart(l:curl_command, { \ 'on_stdout': function('s:fim_on_stdout'), \ 'on_exit': function('s:fim_on_exit'), \ 'stdout_buffered': v:true, + \ 'pos_x': s:pos_x, + \ 'pos_y': s:pos_y, \ 'is_auto': a:is_auto \ }) @@ -333,24 +353,13 @@ function! llama#fim(is_auto) abort let s:pos_y_pick = s:pos_y endif - - " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line - if !a:is_auto - augroup llama_insert - autocmd! - augroup END - - if g:llama_config.auto_fim - call timer_start(0, {-> s:fim_auto_enable()}) - endif - endif endfunction " if first_line == v:true accept only the first line of the response function! llama#fim_accept(first_line) " insert the suggestion at the cursor location if s:can_accept && len(s:content) > 0 - call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) if len(s:content) > 1 if !a:first_line call append(s:pos_y, s:content[1:-1]) @@ -361,7 +370,7 @@ function! llama#fim_accept(first_line) if !a:first_line call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) else - call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1) + call cursor(s:pos_y, s:pos_x + len(s:content[0])) endif endif @@ -369,14 +378,7 @@ function! llama#fim_accept(first_line) endfunction function! llama#fim_cancel() - if s:current_job != v:null - call jobstop(s:current_job) - endif - - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif + let s:hint_shown = v:false " clear the virtual text let l:bufnr = bufnr('%') @@ -391,39 +393,6 @@ function! llama#fim_cancel() silent! iunmap silent! iunmap silent! iunmap - - augroup llama_insert - autocmd! - augroup END - - if g:llama_config.auto_fim - call s:fim_auto_enable() - endif -endfunction - -function! s:fim_auto_enable() - augroup llama_insert - autocmd CursorMovedI * call s:fim_auto() - augroup END -endfunction - -" auto-start a fim job a short time after the cursor has moved -" if there is already a job queued - cancel it -function! 
s:fim_auto() - if s:current_job != v:null - call jobstop(s:current_job) - endif - - " TODO: when job cancellation is implemented on the server, reduce these timeouts - if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif - endif - - let s:t_fim_last = reltime() - let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction " callback that processes the result from the server @@ -433,6 +402,13 @@ function! s:fim_on_stdout(job_id, data, event) dict return endif + if self.pos_x != col('.') - 1 || self.pos_y != line('.') + return + endif + + let s:pos_x = self.pos_x + let s:pos_y = self.pos_y + let s:can_accept = v:true let l:has_info = v:false @@ -559,10 +535,7 @@ function! s:fim_on_stdout(job_id, data, event) dict inoremap :call llama#fim_accept(v:false) inoremap :call llama#fim_accept(v:true) - augroup llama_insert - autocmd! - autocmd CursorMovedI * call llama#fim_cancel() - augroup END + let s:hint_shown = v:true endfunction function! s:fim_on_exit(job_id, exit_code, event) dict From e4be74b4b70b7431d17c40d04f3d087f9d592e0a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 09:34:26 +0300 Subject: [PATCH 30/42] llama.vim : add top_p + improve responsivness + fix edge cases --- examples/llama.vim | 64 +++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 90d08a8e5d62e..a80b5d5d024f2 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -66,16 +66,16 @@ highlight llama_hl_info guifg=#77ff2f " let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, + \ 'n_prefix': 256, + \ 'n_suffix': 8, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 500, + \ 't_max_predict_ms': 200, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, - \ 'ring_n_chunks': 16, - \ 'ring_chunk_size': 128, + \ 'ring_n_chunks': 64, + \ 'ring_chunk_size': 64, \ 'ring_scope': 1024, \ } @@ -110,13 +110,14 @@ function! llama#init() let s:content = [] let s:can_accept = v:false + let s:timer_fim = -1 let s:t_fim_start = reltime() " used to measure total FIM time let s:current_job = v:null augroup llama autocmd! - autocmd InsertEnter * inoremap a + autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call llama#fim_cancel() @@ -125,7 +126,7 @@ function! llama#init() if g:llama_config.auto_fim autocmd InsertEnter * call llama#fim(v:true, v:false) autocmd CursorMovedI * call llama#fim(v:true, v:false) - autocmd CursorHoldI * call llama#fim(v:true, v:true) + "autocmd CursorHoldI * call llama#fim(v:true, v:true) else autocmd CursorMovedI * call llama#fim_cancel() endif @@ -202,7 +203,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " evict chunks that are very similar to the new one for i in range(len(s:ring_chunks) - 1, 0, -1) - if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 if a:do_evict call remove(s:ring_chunks, i) let s:ring_n_evict += 1 @@ -234,9 +235,10 @@ function! 
s:pick_chunk(text, no_mod, do_evict) \ 'input_suffix': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, - \ 'top_k': 100, + \ 'top_k': 40, + \ 'top_p': 0.99, \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"], + \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, @@ -251,15 +253,27 @@ function! s:pick_chunk(text, no_mod, do_evict) call jobstart(l:curl_command, {}) endfunction +function! llama#fim_inline(is_auto, on_hold) abort + call llama#fim(a:is_auto, a:on_hold) + return '' +endfunction + function! llama#fim(is_auto, on_hold) abort - if a:on_hold && s:hint_shown + if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) return endif call llama#fim_cancel() - if reltimefloat(reltime(s:t_fim_start)) < 0.5 + " avoid sending repeated requests too fast + if reltimefloat(reltime(s:t_fim_start)) < 0.6 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + let s:t_fim_start = reltime() + let s:timer_fim = timer_start(600, {-> llama#fim(v:true, v:true)}) return endif @@ -287,6 +301,8 @@ function! llama#fim(is_auto, on_hold) abort let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" + + let l:prompt = "" \ . s:line_cur_prefix let l:suffix = "" @@ -306,14 +322,15 @@ function! llama#fim(is_auto, on_hold) abort endfor let l:request = json_encode({ - \ 'prompt': "", \ 'input_prefix': l:prefix, + \ 'prompt': l:prompt, \ 'input_suffix': l:suffix, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, - \ 'top_k': 100, + \ 'top_k': 40, + \ 'top_p': 0.99, \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"], + \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, @@ -343,13 +360,10 @@ function! llama#fim(is_auto, on_hold) abort let l:delta_y = abs(s:pos_y - s:pos_y_pick) " only gather chunks if the cursor has moved a lot + " TODO: something more clever? reranking? if a:is_auto && l:delta_y > 32 - " randomly pick a prefix or a suffix chunk - if s:rand(0, 1) - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) - else - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) - endif + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) let s:pos_y_pick = s:pos_y endif @@ -367,7 +381,7 @@ function! llama#fim_accept(first_line) endif " move the cursor to the end of the accepted text - if !a:first_line + if !a:first_line && len(s:content) > 1 call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) else call cursor(s:pos_y, s:pos_x + len(s:content[0])) @@ -462,9 +476,7 @@ function! s:fim_on_stdout(job_id, data, event) dict endif if len(s:content) == 0 - if !self.is_auto - call add(s:content, "<| EOT |>") - endif + call add(s:content, "") let s:can_accept = v:false endif @@ -475,7 +487,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let s:pos_dx = len(s:content[-1]) let s:content[-1] .= s:line_cur_suffix - " truncate the suggestion if it repeats the next line + " truncate the suggestion if it repeats the following lines if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) let s:content = [s:content[0]] endif From 0c1f51b73e781b16b3593e716ff1e4eab0131ea8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 09:37:26 +0300 Subject: [PATCH 31/42] llama : improve infill sampler ggml-ci --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d71516153cf82..1a297aa3866e6 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1764,7 +1764,7 @@ struct llama_sampler * llama_sampler_init_logit_bias( // infill -//#define GGML_DEBUG_SAMPLER_INFILL +#define GGML_DEBUG_SAMPLER_INFILL struct llama_sampler_infill { const struct llama_vocab * vocab; From 42a9008b31a0a31ad206a43bc2136733da2e31bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 10:50:18 +0300 Subject: [PATCH 32/42] llama.vim : process extra chunks in the background [no ci] --- examples/llama.vim | 86 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index a80b5d5d024f2..b184faa7e5989 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,11 +17,11 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512 " " --batch-size [512, model max context] " -" adjust the batch size to control how much of the provided context will be used during the inference +" adjust the batch size to control how much of the provided local context will be used during the inference " lower values will use smaller part of the context around the cursor, which will result in faster processing " " --ubatch-size [64, 2048] @@ -58,11 +58,12 @@ highlight llama_hl_info guifg=#77ff2f " - leaving a buffer " - writing a file " -" ring context parameters: +" parameters for the ring-buffer with extra context: " " ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) " ring_chunk_size: max size of the chunks (in number of lines) " ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" ring_update_ms: how often to process queued chunks in normal mode " let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', @@ -77,6 +78,7 @@ let s:default_config = { \ 'ring_n_chunks': 64, \ 'ring_chunk_size': 64, \ 'ring_scope': 1024, + \ 'ring_update_ms': 1000, \ } let g:llama_config = get(g:, 'llama_config', s:default_config) @@ -101,7 +103,8 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' - let s:ring_chunks = [] + let s:ring_chunks = [] " current set of chunks used as extra context + let s:ring_queued = [] " chunks that are queued to be sent for processing let s:ring_n_evict = 0 let s:hint_shown = v:false @@ -112,6 +115,7 @@ function! llama#init() let s:timer_fim = -1 let s:t_fim_start = reltime() " used to measure total FIM time + let s:t_last_move = reltime() " last time the cursor moved let s:current_job = v:null @@ -120,15 +124,14 @@ function! 
llama#init() autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call s:on_move() + autocmd CursorMovedI * call s:on_move() autocmd CompleteChanged * call llama#fim_cancel() if g:llama_config.auto_fim autocmd InsertEnter * call llama#fim(v:true, v:false) autocmd CursorMovedI * call llama#fim(v:true, v:false) "autocmd CursorHoldI * call llama#fim(v:true, v:true) - else - autocmd CursorMovedI * call llama#fim_cancel() endif autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif @@ -142,6 +145,11 @@ function! llama#init() augroup END silent! call llama#fim_cancel() + + " init background update of the ring buffer + if g:llama_config.ring_n_chunks > 0 + call s:ring_update() + endif endfunction " TODO: figure out something better @@ -163,6 +171,7 @@ function! s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction +" pick a chunk from the provided text and queue it for processing function! s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) @@ -190,6 +199,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " check if this chunk is already added let l:exist = v:false + for i in range(len(s:ring_chunks)) if s:ring_chunks[i].data == l:chunk let l:exist = v:true @@ -197,11 +207,30 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + for i in range(len(s:ring_queued)) + if s:ring_queued[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + if l:exist return endif " evict chunks that are very similar to the new one + for i in range(len(s:ring_queued) - 1, 0, -1) + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.5 + if a:do_evict + call remove(s:ring_queued, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " also from s:ring_chunks for i in range(len(s:ring_chunks) - 1, 0, -1) if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 if a:do_evict @@ -213,11 +242,36 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + if len(s:ring_queued) == 16 + call remove(s:ring_queued, 0) + endif + + call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + + "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) +endfunction + +" called every g:llama_config.ring_update_ms, processed chunks are moved to s:ring_chunks +function! s:ring_update() + call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) + + " update only if in normal mode or if the cursor hasn't moved for a while + if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 + return + endif + + if len(s:ring_queued) == 0 + return + endif + + " move the first queued chunk to the ring buffer if len(s:ring_chunks) == g:llama_config.ring_n_chunks call remove(s:ring_chunks, 0) endif - call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + call add(s:ring_chunks, remove(s:ring_queued, 0)) + + "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) " send asynchronous job with the new extra context so that it is ready for the next FIM let l:extra_context = [] @@ -229,16 +283,16 @@ function! 
s:pick_chunk(text, no_mod, do_evict) \ }) endfor + " no samplers needed here let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': "", \ 'input_suffix': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, - \ 'top_k': 40, - \ 'top_p': 0.99, + \ 'temperature': 0.0, \ 'stream': v:false, - \ 'samplers': ["top_k", "top_p", "infill"], + \ 'samplers': ["temperature"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, @@ -409,6 +463,12 @@ function! llama#fim_cancel() silent! iunmap endfunction +function! s:on_move() + let s:t_last_move = reltime() + + call llama#fim_cancel() +endfunction + " callback that processes the result from the server function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") @@ -511,9 +571,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | context: %d / %d / r=%d / q=%d / e=%d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), len(s:ring_queued), s:ring_n_evict, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From 060573f7e81b11dc02a6f0fd0a6f047937f3d1d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 11:34:32 +0300 Subject: [PATCH 33/42] llama.vim : add comments [no ci] --- examples/llama.vim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index b184faa7e5989..6ae6d2b39ad88 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -62,7 +62,9 @@ highlight llama_hl_info guifg=#77ff2f " " ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) " ring_chunk_size: max size of the chunks (in number of lines) -" ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" note: adjust these numbers so that you don't overrun your context +" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM " ring_update_ms: how often to process queued chunks in normal mode " let s:default_config = { @@ -416,7 +418,10 @@ function! llama#fim(is_auto, on_hold) abort " only gather chunks if the cursor has moved a lot " TODO: something more clever? reranking? 
if a:is_auto && l:delta_y > 32 + " expand the prefix even further call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + + " pick a suffix chunk call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) let s:pos_y_pick = s:pos_y From 847c8c023e03e4599a6d55c2dc4ad12a53de7123 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 11:49:20 +0300 Subject: [PATCH 34/42] llama.vim : update infill API params [no ci] --- examples/llama.vim | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 6ae6d2b39ad88..a09ecfe7c315b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -287,16 +287,16 @@ function! s:ring_update() " no samplers needed here let l:request = json_encode({ - \ 'prompt': "", \ 'input_prefix': "", \ 'input_suffix': "", + \ 'input_extra': l:extra_context, + \ 'prompt': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, \ 'temperature': 0.0, \ 'stream': v:false, \ 'samplers': ["temperature"], \ 'cache_prompt': v:true, - \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, \ 't_max_predict_ms': 1 \ }) @@ -379,8 +379,9 @@ function! llama#fim(is_auto, on_hold) abort let l:request = json_encode({ \ 'input_prefix': l:prefix, - \ 'prompt': l:prompt, \ 'input_suffix': l:suffix, + \ 'input_extra': l:extra_context, + \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, \ 'top_k': 40, @@ -388,7 +389,6 @@ function! llama#fim(is_auto, on_hold) abort \ 'stream': v:false, \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, - \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) From 4583aef12bbfb7a49216bfa55a4741f953bb962b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 17:18:32 +0300 Subject: [PATCH 35/42] llama.vim : final touches ggml-ci --- examples/llama.vim | 35 +++++++++++++++++++++++++++-------- src/llama-sampling.cpp | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index a09ecfe7c315b..3b115c49ccb30 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 64 " " --batch-size [512, model max context] " @@ -33,8 +33,10 @@ " " :call llama#init() " +" more info: https://github.com/ggerganov/llama.cpp/pull/9787/files +" -" color of the suggested text +" colors (adjust to your liking) highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f @@ -154,6 +156,8 @@ function! llama#init() endif endfunction +" compute how similar two chunks of text are +" 0 - no similarity, 1 - high similarity " TODO: figure out something better function! s:chunk_sim(c0, c1) let l:lines0 = len(a:c0) @@ -173,17 +177,23 @@ function! 
s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction -" pick a chunk from the provided text and queue it for processing +" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing +" +" no_mod - do not pick chunks from buffers with pending changes +" do_evict - evict chunks that are very similar to the new one +" function! s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) return endif + " if the extra context option is disabled - do nothing if g:llama_config.ring_n_chunks <= 0 return endif + " don't pick very small chunks if len(a:text) < 3 return endif @@ -220,9 +230,9 @@ function! s:pick_chunk(text, no_mod, do_evict) return endif - " evict chunks that are very similar to the new one + " evict queued chunks that are very similar to the new one for i in range(len(s:ring_queued) - 1, 0, -1) - if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.5 + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 if a:do_evict call remove(s:ring_queued, i) let s:ring_n_evict += 1 @@ -234,7 +244,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " also from s:ring_chunks for i in range(len(s:ring_chunks) - 1, 0, -1) - if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 if a:do_evict call remove(s:ring_chunks, i) let s:ring_n_evict += 1 @@ -244,6 +254,7 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + " TODO: become parameter ? if len(s:ring_queued) == 16 call remove(s:ring_queued, 0) endif @@ -253,7 +264,8 @@ function! s:pick_chunk(text, no_mod, do_evict) "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) endfunction -" called every g:llama_config.ring_update_ms, processed chunks are moved to s:ring_chunks +" picks a queued chunk, sends it for processing and adds it to s:ring_chunks +" called every g:llama_config.ring_update_ms function! s:ring_update() call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) @@ -306,15 +318,21 @@ function! s:ring_update() \ g:llama_config.endpoint, shellescape(l:request) \ ) + " no callbacks because we don't need to process the response call jobstart(l:curl_command, {}) endfunction +" necessary for 'inoremap ' function! llama#fim_inline(is_auto, on_hold) abort call llama#fim(a:is_auto, a:on_hold) return '' endfunction +" the main FIM call +" takes local context around the cursor and sends it together with the extra context +" to the llama.cpp server for completion function! llama#fim(is_auto, on_hold) abort + " we already have a suggestion for the current cursor position if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) return endif @@ -415,6 +433,7 @@ function! llama#fim(is_auto, on_hold) abort " TODO: per-file location let l:delta_y = abs(s:pos_y - s:pos_y_pick) + " gather some extra context nearby and process it in the background " only gather chunks if the cursor has moved a lot " TODO: something more clever? reranking? if a:is_auto && l:delta_y > 32 @@ -474,7 +493,7 @@ function! s:on_move() call llama#fim_cancel() endfunction -" callback that processes the result from the server +" callback that processes the FIM result from the server and displays the suggestion function! 
s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") if len(l:raw) == 0 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 1a297aa3866e6..d71516153cf82 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1764,7 +1764,7 @@ struct llama_sampler * llama_sampler_init_logit_bias( // infill -#define GGML_DEBUG_SAMPLER_INFILL +//#define GGML_DEBUG_SAMPLER_INFILL struct llama_sampler_infill { const struct llama_vocab * vocab; From d1b8b215d51df9c5e17fd5921eb3d05f419c3fae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 16:16:19 +0300 Subject: [PATCH 36/42] llama.vim : fix repetitions of existing text --- examples/llama.vim | 62 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 3b115c49ccb30..3d328556304ee 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -33,7 +33,7 @@ " " :call llama#init() " -" more info: https://github.com/ggerganov/llama.cpp/pull/9787/files +" more info: https://github.com/ggerganov/llama.cpp/pull/9787 " " colors (adjust to your liking) @@ -46,7 +46,7 @@ highlight llama_hl_info guifg=#77ff2f " n_prefix: number of lines before the cursor location to include in the prefix " n_suffix: number of lines after the cursor location to include in the suffix " n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) " t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) " auto_fim: trigger FIM completion automatically on cursor movement @@ -99,8 +99,8 @@ function! llama#init() return endif - let s:pos_x = 0 " cursor position upon start of completion - let s:pos_y = 0 + let s:pos_x = 0 " cursor position upon start of completion + let s:pos_y = 0 let s:line_cur = '' @@ -329,8 +329,7 @@ function! llama#fim_inline(is_auto, on_hold) abort endfunction " the main FIM call -" takes local context around the cursor and sends it together with the extra context -" to the llama.cpp server for completion +" takes local context around the cursor and sends it together with the extra context to the server for completion function! llama#fim(is_auto, on_hold) abort " we already have a suggestion for the current cursor position if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) @@ -569,13 +568,50 @@ function! s:fim_on_stdout(job_id, data, event) dict endif let s:pos_dx = len(s:content[-1]) - let s:content[-1] .= s:line_cur_suffix - " truncate the suggestion if it repeats the following lines - if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) - let s:content = [s:content[0]] + " NOTE: the following is logic for discarding predictions that repeat existing text + " the code is quite ugly and there is very likely a simpler and more canonical way to implement this + " + " still, I wonder if there is some better way that avoids having to do these special hacks? + " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would + " start generating whatever we have given it via the extra context. 
but on the other hand, it's not very + " helpful to re-generate the same code that is already there + + " truncate the suggestion if the first line is empty + if s:content[0] == "" + let s:content = [""] + endif + + " truncate the suggestion if it repeats the suffix + if len(s:content) == 1 && s:content[0] == s:line_cur_suffix + let s:content = [""] endif + " find the first non-empty line (strip whitespace) + let l:cmp_y = s:pos_y + 1 + while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' + let l:cmp_y += 1 + endwhile + + if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y) + " truncate the suggestion if it repeats the next line + if len(s:content) == 1 + let s:content = [""] + endif + + " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 + if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] + let s:content = [""] + endif + + " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) + if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") + let s:content = [""] + endif + endif + + let s:content[-1] .= s:line_cur_suffix + call llama#fim_cancel() " display virtual text with the suggestion @@ -595,9 +631,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | context: %d / %d / r=%d / q=%d / e=%d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | c: %d / %d, r: %d, e: %d, q: %d | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), len(s:ring_queued), s:ring_n_evict, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, len(s:ring_queued), \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) @@ -627,7 +663,7 @@ function! s:fim_on_stdout(job_id, data, event) dict \ 'virt_text_win_col': virtcol('.') \ }) - " setup accept/cancel events + " setup accept shortcuts inoremap :call llama#fim_accept(v:false) inoremap :call llama#fim_accept(v:true) From 1600d846b6e4ea528384eb6fce8dbad63a056e4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 22:09:47 +0300 Subject: [PATCH 37/42] llama.vim : complete only whithin the local scope [no ci] --- examples/llama.vim | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 3d328556304ee..f5cbef624e6c9 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -73,9 +73,9 @@ let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', \ 'n_prefix': 256, \ 'n_suffix': 8, - \ 'n_predict': 64, + \ 'n_predict': 128, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 200, + \ 't_max_predict_ms': 1000, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, @@ -394,12 +394,16 @@ function! llama#fim(is_auto, on_hold) abort \ }) endfor + " the indentation of the current line + let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + let l:request = json_encode({ \ 'input_prefix': l:prefix, \ 'input_suffix': l:suffix, \ 'input_extra': l:extra_context, \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, + \ 'n_indent': l:indent, \ 'penalty_last_n': 0, \ 'top_k': 40, \ 'top_p': 0.99, @@ -567,8 +571,6 @@ function! 
s:fim_on_stdout(job_id, data, event) dict return endif - let s:pos_dx = len(s:content[-1]) - " NOTE: the following is logic for discarding predictions that repeat existing text " the code is quite ugly and there is very likely a simpler and more canonical way to implement this " @@ -578,7 +580,12 @@ function! s:fim_on_stdout(job_id, data, event) dict " helpful to re-generate the same code that is already there " truncate the suggestion if the first line is empty - if s:content[0] == "" + if len(s:content) == 1 && s:content[0] == "" + let s:content = [""] + endif + + " ... and the next lines are repeated + if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) let s:content = [""] endif @@ -610,6 +617,17 @@ function! s:fim_on_stdout(job_id, data, event) dict endif endif + " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix + "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + "for i in range(1, len(s:content) - 1) + " if strlen(matchstr(s:content[i], '^\s*')) < l:indent + " let s:content = s:content[:i - 1] + " break + " endif + "endfor + + let s:pos_dx = len(s:content[-1]) + let s:content[-1] .= s:line_cur_suffix call llama#fim_cancel() From 6bb6e6dd8094c426b3cdc3077fe34e239e1d1835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 09:47:14 +0300 Subject: [PATCH 38/42] llama.vim : display ring capacity [no ci] --- examples/llama.vim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index f5cbef624e6c9..16434e570247c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -649,9 +649,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | c: %d / %d, r: %d, e: %d, q: %d | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", + let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, len(s:ring_queued), + \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From fe78c3939911975c9508653162efb4dbd1a33474 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 13:48:00 +0300 Subject: [PATCH 39/42] llama.vim : fix large chunk accept + comments [no ci] --- examples/llama.vim | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 16434e570247c..e06cdff38a6b5 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 64 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 " " --batch-size [512, model max context] " @@ -29,6 +29,12 @@ " chunks the batch into smaller chunks for faster processing " depends on the specific hardware. 
use llama-bench to profile and determine the best size " +" --cache-reuse (ge:llama_config.n_predict, 1024] +" +" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict +" using non-zero value enables context reuse on the server side which dramatically improves the performance at +" large contexts. a value of 256 should be good for all cases +" " run this once to initialise llama.vim: " " :call llama#init() @@ -43,8 +49,8 @@ highlight llama_hl_info guifg=#77ff2f " general parameters: " " endpoint: llama.cpp server endpoint -" n_prefix: number of lines before the cursor location to include in the prefix -" n_suffix: number of lines after the cursor location to include in the suffix +" n_prefix: number of lines before the cursor location to include in the local prefix +" n_suffix: number of lines after the cursor location to include in the local suffix " n_predict: max number of tokens to predict " t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) " t_max_predict_ms: max alloted time for the prediction @@ -72,7 +78,7 @@ highlight llama_hl_info guifg=#77ff2f let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', \ 'n_prefix': 256, - \ 'n_suffix': 8, + \ 'n_suffix': 64, \ 'n_predict': 128, \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 1000, @@ -463,7 +469,7 @@ function! llama#fim_accept(first_line) " move the cursor to the end of the accepted text if !a:first_line && len(s:content) > 1 - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) else call cursor(s:pos_y, s:pos_x + len(s:content[0])) endif From b8efb0725de3b16bef35ac05761bcd07e7e0de46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 22:45:23 +0300 Subject: [PATCH 40/42] llama.vim : minor [no ci] --- examples/llama.vim | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index e06cdff38a6b5..cf915ff4ec48c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -144,6 +144,7 @@ function! llama#init() "autocmd CursorHoldI * call llama#fim(v:true, v:true) endif + " gather chunks upon yanking autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer @@ -310,7 +311,6 @@ function! s:ring_update() \ 'input_extra': l:extra_context, \ 'prompt': "", \ 'n_predict': 1, - \ 'penalty_last_n': 0, \ 'temperature': 0.0, \ 'stream': v:false, \ 'samplers': ["temperature"], @@ -410,7 +410,6 @@ function! llama#fim(is_auto, on_hold) abort \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, \ 'n_indent': l:indent, - \ 'penalty_last_n': 0, \ 'top_k': 40, \ 'top_p': 0.99, \ 'stream': v:false, From 32927e68b7fbfd6dfa82e531d186f1b6b22612ae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 21 Oct 2024 12:32:38 +0300 Subject: [PATCH 41/42] llama.vim : remove on-hold code + fixes [no ci] --- examples/llama.vim | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index cf915ff4ec48c..24e4a7cd18690 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -131,7 +131,7 @@ function! llama#init() augroup llama autocmd! 
- autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) + autocmd InsertEnter * inoremap llama#fim_inline(v:false) autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call s:on_move() @@ -139,9 +139,7 @@ function! llama#init() autocmd CompleteChanged * call llama#fim_cancel() if g:llama_config.auto_fim - autocmd InsertEnter * call llama#fim(v:true, v:false) - autocmd CursorMovedI * call llama#fim(v:true, v:false) - "autocmd CursorHoldI * call llama#fim(v:true, v:true) + autocmd CursorMovedI * call llama#fim(v:true) endif " gather chunks upon yanking @@ -329,16 +327,17 @@ function! s:ring_update() endfunction " necessary for 'inoremap ' -function! llama#fim_inline(is_auto, on_hold) abort - call llama#fim(a:is_auto, a:on_hold) +function! llama#fim_inline(is_auto) abort + call llama#fim(a:is_auto) return '' endfunction " the main FIM call " takes local context around the cursor and sends it together with the extra context to the server for completion -function! llama#fim(is_auto, on_hold) abort +function! llama#fim(is_auto) abort " we already have a suggestion for the current cursor position - if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) + if s:hint_shown && !a:is_auto + call llama#fim_cancel() return endif @@ -352,7 +351,7 @@ function! llama#fim(is_auto, on_hold) abort endif let s:t_fim_start = reltime() - let s:timer_fim = timer_start(600, {-> llama#fim(v:true, v:true)}) + let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) return endif @@ -512,6 +511,11 @@ function! s:fim_on_stdout(job_id, data, event) dict return endif + " show the suggestion only in insert mode + if mode() !=# 'i' + return + endif + let s:pos_x = self.pos_x let s:pos_y = self.pos_y From 8fb51545477f2e1ef636a7c10f47e345e4b5d985 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 21 Oct 2024 15:57:15 +0300 Subject: [PATCH 42/42] llama.vim : minor [no ci] --- examples/llama.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 24e4a7cd18690..e75872cae0e9c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -11,7 +11,7 @@ " " - Tab - accept the current suggestion " - Shift+Tab - accept just the first line of the segguestion -" - Ctrl+F - trigger FIM completion manually +" - Ctrl+F - toggle FIM completion manually " " make symlink or copy this file to ~/.config/nvim/autoload/llama.vim "
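
Usage sketch (reference only, not part of the patch series): one way to wire up the plugin after applying the series. It assumes examples/llama.vim has been copied or symlinked to ~/.config/nvim/autoload/llama.vim as the plugin header describes, and that llama-server is reachable at the endpoint below; the g:llama_config values simply mirror the defaults introduced by these patches, so adjust them to your setup.

" example init.vim snippet (all values are the series' defaults; the endpoint/port are assumptions)
let g:llama_config = {
    \ 'endpoint':         'http://127.0.0.1:8012/infill',
    \ 'n_prefix':         256,
    \ 'n_suffix':         64,
    \ 'n_predict':        128,
    \ 't_max_prompt_ms':  500,
    \ 't_max_predict_ms': 1000,
    \ 'show_info':        2,
    \ 'auto_fim':         v:true,
    \ 'max_line_suffix':  8,
    \ 'ring_n_chunks':    64,
    \ 'ring_chunk_size':  64,
    \ 'ring_scope':       1024,
    \ 'ring_update_ms':   1000,
    \ }

" initialise the plugin once at startup
call llama#init()

With this in place, suggestions are requested automatically while typing (auto_fim), Tab accepts the current suggestion, Shift+Tab accepts only its first line, and Ctrl+F toggles a completion manually.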