From 5aaf24766aa7d85077816b7857f38d0d1d6b982b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 11:01:53 +0300
Subject: [PATCH 01/42] llama : add infill sampler

---
 common/common.h            | 2 ++
 examples/server/server.cpp | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/common/common.h b/common/common.h
index 5ca8fd391ab74..2fb92ae143c54 100644
--- a/common/common.h
+++ b/common/common.h
@@ -117,6 +117,8 @@ struct common_sampler_params {
     float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
     float dynatemp_range = 0.00f; // 0.0 = disabled
     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    float infill_p = 0.80f;
+    float infill_p_eog = 0.01f;
     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float penalty_repeat = 1.00f; // 1.0 = disabled
     float penalty_freq = 0.00f; // 0.0 = disabled
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3992108e7f383..e9621ba93c956 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -873,6 +873,8 @@ struct server_context {
     slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
     slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
     slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
+    slot.sparams.infill_p = json_value(data, "infill_p", default_sparams.infill_p);
+    slot.sparams.infill_p_eog = json_value(data, "infill_p_eog", default_sparams.infill_p_eog);
     slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
     slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
     slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
@@ -1241,6 +1243,8 @@ struct server_context {
     {"xtc_threshold", slot.sparams.xtc_threshold},
     {"tfs_z", slot.sparams.tfs_z},
     {"typical_p", slot.sparams.typ_p},
+    {"infill_p", slot.sparams.infill_p},
+    {"infill_p_eog", slot.sparams.infill_p_eog},
     {"repeat_last_n", slot.sparams.penalty_last_n},
     {"repeat_penalty", slot.sparams.penalty_repeat},
     {"presence_penalty", slot.sparams.penalty_present},

From 0566c695316a077b2c46b79cc32c19802da6ae03 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 11:01:30 +0300
Subject: [PATCH 02/42] llama.vim : neovim plugin

---
 examples/llama.vim | 199 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 examples/llama.vim

diff --git a/examples/llama.vim b/examples/llama.vim
new file mode 100644
index 0000000000000..30a717181062b
--- /dev/null
+++ b/examples/llama.vim
@@ -0,0 +1,199 @@
+" sample config:
+"
+" - Ctrl+F - trigger FIM completion
+"
+" copy paste this in your .vimrc:
+"
+"augroup llama_cpp
+" autocmd!
+" autocmd InsertEnter * inoremap <C-F> <Esc>:call llama#fim()<CR>a
+"augroup END
+"
+
+" color of the suggested text
+highlight llama_hint guifg=#ff772f
+
+let s:default_config = {
+    \ 'endpoint': 'http://127.0.0.1:8012/infill',
+    \ 'n_prefix': 32,
+    \ 'n_suffix': 32,
+    \ 'n_predict': 64,
+    \ 'n_probs': 3,
+    \ 'temperature': 0.1,
+    \ 'stop': ["\n"]
+    \ }
+
+let g:llama_config = get(g:, 'llama_config', s:default_config)
+
+function! llama#fim() abort
+    let l:pos_x = col('.')
+    let l:pos_y = line('.')
+    let l:max_y = line('$')
+
+    let l:lines_prefix = getline(max([1, l:pos_y - g:llama_config.n_prefix]), l:pos_y - 1)
+    let l:lines_suffix = getline(l:pos_y + 1, min([l:max_y, l:pos_y + g:llama_config.n_suffix]))
+
+    let l:line_cur = getline('.')
+    let l:line_cur_prefix = strpart(l:line_cur, 0, l:pos_x)
+    let l:line_cur_suffix = strpart(l:line_cur, l:pos_x)
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+        \ . l:line_cur_prefix
+
+    let l:suffix = ""
+        \ . l:line_cur_suffix
+        \ . join(l:lines_suffix, "\n")
+        \ . "\n"
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': l:prefix,
+        \ 'input_suffix': l:suffix,
+        "\ 'stop': g:llama_config.stop,
+        \ 'n_predict': g:llama_config.n_predict,
+        "\ 'n_probs': g:llama_config.n_probs,
+        \ 'penalty_last_n': 0,
+        \ 'temperature': g:llama_config.temperature,
+        \ 'top_k': 5,
+        \ 'infill_p': 0.20,
+        \ 'infill_p_eog': 0.001,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k", "infill"]
+        \ })
+
+    " request completion from the server
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    let l:can_accept = v:true
+    let s:content = []
+
+    let l:raw = system(l:curl_command)
+    if l:can_accept && v:shell_error
+        call add(s:content, "<| curl error: is the server on? |>")
+        let l:can_accept = v:false
+    endif
+
+    if l:can_accept && l:raw == ""
+        call add(s:content, "<| empty response: is the server on? |>")
+        let l:can_accept = v:false
+    endif
+
+    " get the generated suggestion
+    if l:can_accept
+        let l:response = json_decode(l:raw)
+
+        for l:part in split(get(l:response, 'content', ''), "\n", 1)
+            call add(s:content, l:part)
+        endfor
+
+        " remove trailing new lines
+        while len(s:content) > 0 && s:content[-1] == ""
+            call remove(s:content, -1)
+        endwhile
+    endif
+
+    if len(s:content) == 0
+        call add(s:content, "<| nothing to suggest |>")
+        let l:can_accept = v:false
+    endif
+
+    let s:pos_dx = len(s:content[-1])
+    let s:content[-1] .= l:line_cur_suffix
+
+    " display virtual text with the suggestion
+    let l:bufnr = bufnr('%')
+    let s:ns_id = nvim_create_namespace('llama_virtual_text')
+
+    call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, {
+        \ 'virt_text': [[s:content[0], 'llama_hint']],
+        \ 'virt_text_win_col': virtcol('.')
+        \ })
+
+    call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, {
+        \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hint']]}),
+        \ 'virt_text_win_col': virtcol('.')
+        \ })
+
+    " accept suggestion with Tab and reject it with any other key
+    if l:can_accept
+        inoremap :call llama#accept_virtual_text()
+    else
+        inoremap :call llama#cancel_virtual_text()
+    endif
+
+    for l:key in range(33, 127) + [8, 27]
+        if l:key != 0x7C
+            if l:key == 8
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 27
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 127
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            else
+                execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key)
+            endif
+        endif
+    endfor
+
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+    inoremap :call llama#cancel_virtual_text()
+endfunction
+
+function! llama#accept_virtual_text()
+    let l:pos_x = col('.')
+    let l:pos_y = line('.')
+
+    let l:line_cur = getline('.')
+
+    let l:pos0 = l:pos_x - 2
+
+    if l:pos_x == len(l:line_cur)
+        let l:pos0 = l:pos_x - 1
+    endif
+
+    " insert the suggestion at the cursor location
+    call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0])
+    if len(s:content) > 1
+        call append(l:pos_y, s:content[1:-1])
+    endif
+
+    " move the cursor to the end of the accepted text
+    call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx)
+
+    call llama#cancel_virtual_text()
+endfunction
+
+function! llama#cancel_virtual_text()
+    " clear the virtual text
+    let l:bufnr = bufnr('%')
+    call nvim_buf_clear_namespace(l:bufnr, s:ns_id, 0, -1)
+
+    " remove the mappings
+    iunmap
+
+    for l:key in range(33, 127) + [8, 27]
+        if l:key != 0x7C
+            if l:key == 8
+                execute 'iunmap '
+            elseif l:key == 27
+                execute 'iunmap '
+            elseif l:key == 127
+                execute 'iunmap '
+            else
+                execute 'iunmap ' . nr2char(l:key)
+            endif
+        endif
+    endfor
+
+    iunmap
+    iunmap
+    iunmap
+    iunmap
+endfunction

From 0c649c8967f60ff7f8247f0ca4aca86e55b891e4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 12:36:56 +0300
Subject: [PATCH 03/42] llama.vim : fix suffix construction + fix virt text offset

---
 examples/llama.vim | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 30a717181062b..10f81f73331fb 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -44,6 +44,7 @@ function! llama#fim() abort
     let l:suffix = ""
         \ . l:line_cur_suffix
+        \ . "\n"
         \ . join(l:lines_suffix, "\n")
         \ . "\n"

     let l:request = json_encode({
@@ -111,7 +112,7 @@ function! llama#fim() abort
     call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, {
         \ 'virt_text': [[s:content[0], 'llama_hint']],
-        \ 'virt_text_win_col': virtcol('.')
+        \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.')
         \ })

     call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, {

From 07e7dd47f21bf3e0af19cae282b5748ad430313c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 9 Oct 2024 12:57:44 +0300
Subject: [PATCH 04/42] llama.vim : handle space

---
 examples/llama.vim | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 10f81f73331fb..24289fbe05bf1 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -127,14 +127,16 @@ function! llama#fim() abort
         inoremap :call llama#cancel_virtual_text()
     endif

-    for l:key in range(33, 127) + [8, 27]
+    for l:key in range(32, 127) + [8, 27]
         if l:key != 0x7C
             if l:key == 8
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
             elseif l:key == 27
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
+            elseif l:key == 32
+                execute 'inoremap :call llama#cancel_virtual_text()'
             elseif l:key == 127
-                execute 'inoremap :call llama#cancel_virtual_text()'
+                execute 'inoremap :call llama#cancel_virtual_text()'
             else
                 execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key)
             endif
         endif
     endfor
@@ -153,11 +155,7 @@ function! llama#accept_virtual_text()
     let l:line_cur = getline('.')

-    let l:pos0 = l:pos_x - 2
-
-    if l:pos_x == len(l:line_cur)
-        let l:pos0 = l:pos_x - 1
-    endif
+    let l:pos0 = l:pos_x == len(l:line_cur) ? l:pos_x - 1 : l:pos_x - 2

     " insert the suggestion at the cursor location
     call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0])
@@ -179,12 +177,14 @@ function!
llama#cancel_virtual_text() " remove the mappings iunmap - for l:key in range(33, 127) + [8, 27] + for l:key in range(32, 127) + [8, 27] if l:key != 0x7C if l:key == 8 execute 'iunmap ' elseif l:key == 27 execute 'iunmap ' + elseif l:key == 32 + execute 'iunmap ' elseif l:key == 127 execute 'iunmap ' else From 9d13e87b1b1dd57b815f96a4099849c7d41af2be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:08:31 +0300 Subject: [PATCH 05/42] llama.vim : add processing info overlay --- examples/llama.vim | 87 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 24289fbe05bf1..febef637ce9e8 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -11,12 +11,13 @@ " " color of the suggested text -highlight llama_hint guifg=#ff772f +highlight llama_hl_hint guifg=#ff772f +highlight llama_hl_info guifg=#77ff2f let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 32, - \ 'n_suffix': 32, + \ 'n_prefix': 128, + \ 'n_suffix': 128, \ 'n_predict': 64, \ 'n_probs': 3, \ 'temperature': 0.1, @@ -71,6 +72,16 @@ function! llama#fim() abort \ ) let l:can_accept = v:true + let l:has_timing = v:false + + let l:n_prompt = 0 + let l:t_prompt_ms = 1.0 + let l:s_prompt = 0 + + let l:n_gen = 0 + let l:t_gen_ms = 1.0 + let l:s_gen = 0 + let s:content = [] let l:raw = system(l:curl_command) @@ -96,6 +107,20 @@ function! llama#fim() abort while len(s:content) > 0 && s:content[-1] == "" call remove(s:content, -1) endwhile + + " if response.timings + if len(get(l:response, 'timings', {})) > 0 + let l:has_timing = v:true + let l:timings = get(l:response, 'timings', {}) + + let l:n_prompt = get(l:timings, 'prompt_n', 0) + let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) + let l:s_prompt = get(l:timings, 'prompt_per_second', 0) + + let l:n_gen = get(l:timings, 'predicted_n', 0) + let l:t_gen_ms = get(l:timings, 'predicted_ms', 1) + let l:s_gen = get(l:timings, 'predicted_per_second', 0) + endif endif if len(s:content) == 0 @@ -108,48 +133,62 @@ function! llama#fim() abort " display virtual text with the suggestion let l:bufnr = bufnr('%') - let s:ns_id = nvim_create_namespace('llama_virtual_text') - call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, l:pos_x - 1, { - \ 'virt_text': [[s:content[0], 'llama_hint']], + let s:id_vt_fim = nvim_create_namespace('vt_fim') + let s:id_vt_info = nvim_create_namespace('vt_info') + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') \ }) - call nvim_buf_set_extmark(l:bufnr, s:ns_id, l:pos_y - 1, 0, { - \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hint']]}), + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) + " construct the info message: + if l:has_timing + let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s)", + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_gen, l:t_gen_ms, l:s_gen) + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[l:info, 'llama_hl_info']], + \ 'virt_text_pos': 'right_align', + \ }) + endif + " accept suggestion with Tab and reject it with any other key if l:can_accept - inoremap :call llama#accept_virtual_text() + inoremap :call llama#accept_vt_fim() else - inoremap :call llama#cancel_virtual_text() + inoremap :call llama#cancel_vt_fim() endif for l:key in range(32, 127) + [8, 27] if l:key != 0x7C if l:key == 8 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 27 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 32 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' elseif l:key == 127 - execute 'inoremap :call llama#cancel_virtual_text()' + execute 'inoremap :call llama#cancel_vt_fim()' else - execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_virtual_text()' . nr2char(l:key) + execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_vt_fim()' . nr2char(l:key) endif endif endfor - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() - inoremap :call llama#cancel_virtual_text() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() + inoremap :call llama#cancel_vt_fim() endfunction -function! llama#accept_virtual_text() +function! llama#accept_vt_fim() let l:pos_x = col('.') let l:pos_y = line('.') @@ -166,13 +205,15 @@ function! llama#accept_virtual_text() " move the cursor to the end of the accepted text call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx) - call llama#cancel_virtual_text() + call llama#cancel_vt_fim() endfunction -function! llama#cancel_virtual_text() +function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') - call nvim_buf_clear_namespace(l:bufnr, s:ns_id, 0, -1) + + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) " remove the mappings iunmap From 6e82a03b9dada237be3a3e358176b3e8f68e5330 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:26:19 +0300 Subject: [PATCH 06/42] llama.vim : display realtime [no ci] --- examples/llama.vim | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index febef637ce9e8..54bb87cece245 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -27,6 +27,8 @@ let s:default_config = { let g:llama_config = get(g:, 'llama_config', s:default_config) function! llama#fim() abort + let l:t_start = reltime() + let l:pos_x = col('.') let l:pos_y = line('.') let l:max_y = line('$') @@ -149,9 +151,11 @@ function! 
llama#fim() abort " construct the info message: if l:has_timing - let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s)", + let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_gen, l:t_gen_ms, l:s_gen) + \ l:n_gen, l:t_gen_ms, l:s_gen, + \ 1000.0 * reltimefloat(reltime(l:t_start)) + \ ) call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], From 26a0c61e8af157aaa1321c34705bb9fcbb4ece0d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 15:44:14 +0300 Subject: [PATCH 07/42] llama.vim : allow repeated suggestions [no ci] --- examples/llama.vim | 89 +++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 54bb87cece245..1544887c2c664 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -74,7 +74,7 @@ function! llama#fim() abort \ ) let l:can_accept = v:true - let l:has_timing = v:false + let l:has_info = v:false let l:n_prompt = 0 let l:t_prompt_ms = 1.0 @@ -112,8 +112,8 @@ function! llama#fim() abort " if response.timings if len(get(l:response, 'timings', {})) > 0 - let l:has_timing = v:true - let l:timings = get(l:response, 'timings', {}) + let l:has_info = v:true + let l:timings = get(l:response, 'timings', {}) let l:n_prompt = get(l:timings, 'prompt_n', 0) let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) @@ -133,25 +133,21 @@ function! llama#fim() abort let s:pos_dx = len(s:content[-1]) let s:content[-1] .= l:line_cur_suffix + call llama#cancel_vt_fim() + " display virtual text with the suggestion let l:bufnr = bufnr('%') let s:id_vt_fim = nvim_create_namespace('vt_fim') let s:id_vt_info = nvim_create_namespace('vt_info') - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { - \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.') - \ }) - - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { - \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), - \ 'virt_text_win_col': virtcol('.') - \ }) - " construct the info message: - if l:has_timing - let l:info = printf("prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + if l:has_info + " prefix the info string with whitespace in order to offset it to the right of the fim overlay + let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) + + let l:info = printf("%s // prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, \ 1000.0 * reltimefloat(reltime(l:t_start)) @@ -159,11 +155,23 @@ function! llama#fim() abort call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], - \ 'virt_text_pos': 'right_align', + \ 'virt_text_pos': 'eol', \ }) endif + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + \ 'virt_text': [[s:content[0], 'llama_hl_hint']], + \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') + \ }) + + call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), + \ 'virt_text_win_col': virtcol('.') + \ }) + " accept suggestion with Tab and reject it with any other key + let s:mapping_on = v:true + if l:can_accept inoremap :call llama#accept_vt_fim() else @@ -216,30 +224,37 @@ function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') + let s:id_vt_fim = nvim_create_namespace('vt_fim') + let s:id_vt_info = nvim_create_namespace('vt_info') + call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) - " remove the mappings - iunmap - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'iunmap ' - elseif l:key == 27 - execute 'iunmap ' - elseif l:key == 32 - execute 'iunmap ' - elseif l:key == 127 - execute 'iunmap ' - else - execute 'iunmap ' . nr2char(l:key) + " remove the key mappings + if exists('s:mapping_on') && s:mapping_on + iunmap + + for l:key in range(32, 127) + [8, 27] + if l:key != 0x7C + if l:key == 8 + execute 'iunmap ' + elseif l:key == 27 + execute 'iunmap ' + elseif l:key == 32 + execute 'iunmap ' + elseif l:key == 127 + execute 'iunmap ' + else + execute 'iunmap ' . nr2char(l:key) + endif endif - endif - endfor + endfor - iunmap - iunmap - iunmap - iunmap + iunmap + iunmap + iunmap + iunmap + + let s:mapping_on = v:false + endif endfunction From 7e0b5062af42e96ba1709f7283d4d0cfd1eb6b55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 9 Oct 2024 16:07:24 +0300 Subject: [PATCH 08/42] llama.vim : reduce scope of ids to local [no ci] --- examples/llama.vim | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 1544887c2c664..b8cfa5906bab3 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -138,33 +138,33 @@ function! llama#fim() abort " display virtual text with the suggestion let l:bufnr = bufnr('%') - let s:id_vt_fim = nvim_create_namespace('vt_fim') - let s:id_vt_info = nvim_create_namespace('vt_info') + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') " construct the info message: if l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) - let l:info = printf("%s // prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f ms", + let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, \ 1000.0 * reltimefloat(reltime(l:t_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, s:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', \ }) endif - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': l:pos_x == 1 ? 
0 : virtcol('.') \ }) - call nvim_buf_set_extmark(l:bufnr, s:id_vt_fim, l:pos_y - 1, 0, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, { \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) @@ -224,11 +224,11 @@ function! llama#cancel_vt_fim() " clear the virtual text let l:bufnr = bufnr('%') - let s:id_vt_fim = nvim_create_namespace('vt_fim') - let s:id_vt_info = nvim_create_namespace('vt_info') + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') - call nvim_buf_clear_namespace(l:bufnr, s:id_vt_fim, 0, -1) - call nvim_buf_clear_namespace(l:bufnr, s:id_vt_info, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) " remove the key mappings if exists('s:mapping_on') && s:mapping_on From 41053f92d305468844cb7ad539d0ff752c1e9d6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 08:38:57 +0300 Subject: [PATCH 09/42] llama.vim : simplify init and cancel + auto-fim --- examples/llama.vim | 210 +++++++++++++++++++++++---------------------- 1 file changed, 109 insertions(+), 101 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index b8cfa5906bab3..de889678dd04a 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -2,12 +2,9 @@ " " - Ctrl+F - trigger FIM completion " -" copy paste this in your .vimrc: +" run this once to initialise the plugin: " -"augroup llama_cpp -" autocmd! -" autocmd InsertEnter * inoremap :call llama#fim()a -"augroup END +" :call llama#init() " " color of the suggested text @@ -21,24 +18,76 @@ let s:default_config = { \ 'n_predict': 64, \ 'n_probs': 3, \ 'temperature': 0.1, + \ 'auto_fim': v:true, \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) -function! llama#fim() abort +function! llama#init() + let s:pos_x = 0 + let s:pos_y = 0 + let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case + + let s:line_cur = '' + + let s:pos_dx = 0 + let s:content = [] + let s:can_accept = v:false + + let s:timer_fim = -1 + let s:t_fim_last = reltime() + + augroup llama + autocmd! + autocmd InsertEnter * inoremap :call llama#fim(v:false) + augroup END + + silent! call llama#fim_cancel() +endfunction + +" setup accept/cancel events +function! llama#on_hint(id_timer) + inoremap :call llama#fim_accept() + inoremap :call llama#fim_cancel() + + augroup llama_insert + autocmd! + autocmd CursorMovedI * call llama#fim_cancel() + augroup END +endfunction + +function! llama#fim_auto() + if reltimefloat(reltime(s:t_fim_last)) < 0.50 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + endif + + let s:t_fim_last = reltime() + let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) +endfunction + +function! llama#fim(is_auto) abort let l:t_start = reltime() - let l:pos_x = col('.') - let l:pos_y = line('.') + let s:content = [] + let s:can_accept = v:false + + let s:pos_x = col('.') + let s:pos_y = line('.') let l:max_y = line('$') - let l:lines_prefix = getline(max([1, l:pos_y - g:llama_config.n_prefix]), l:pos_y - 1) - let l:lines_suffix = getline(l:pos_y + 1, min([l:max_y, l:pos_y + g:llama_config.n_suffix])) + let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1) + let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix])) + + let s:line_cur = getline('.') + + let s:pos_x0 = s:pos_x == len(s:line_cur) ? 
s:pos_x : s:pos_x - 1 - let l:line_cur = getline('.') - let l:line_cur_prefix = strpart(l:line_cur, 0, l:pos_x) - let l:line_cur_suffix = strpart(l:line_cur, l:pos_x) + let l:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) + let l:line_cur_suffix = strpart(s:line_cur, s:pos_x0) let l:prefix = "" \ . join(l:lines_prefix, "\n") @@ -73,7 +122,7 @@ function! llama#fim() abort \ g:llama_config.endpoint, shellescape(l:request) \ ) - let l:can_accept = v:true + let s:can_accept = v:true let l:has_info = v:false let l:n_prompt = 0 @@ -84,21 +133,24 @@ function! llama#fim() abort let l:t_gen_ms = 1.0 let l:s_gen = 0 - let s:content = [] - + " TODO: async this let l:raw = system(l:curl_command) - if l:can_accept && v:shell_error - call add(s:content, "<| curl error: is the server on? |>") - let l:can_accept = v:false + if s:can_accept && v:shell_error + if !a:is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false endif - if l:can_accept && l:raw == "" - call add(s:content, "<| empty response: is the server on? |>") - let l:can_accept = v:false + if s:can_accept && l:raw == "" + if !a:is_auto + call add(s:content, "<| empty response: is the server on? |>") + endif + let s:can_accept = v:false endif " get the generated suggestion - if l:can_accept + if s:can_accept let l:response = json_decode(l:raw) for l:part in split(get(l:response, 'content', ''), "\n", 1) @@ -126,14 +178,20 @@ function! llama#fim() abort endif if len(s:content) == 0 - call add(s:content, "<| nothing to suggest |>") - let l:can_accept = v:false + if !a:is_auto + call add(s:content, "<| nothing to suggest |>") + endif + let s:can_accept = v:false + endif + + if len(s:content) == 0 + return endif let s:pos_dx = len(s:content[-1]) let s:content[-1] .= l:line_cur_suffix - call llama#cancel_vt_fim() + call llama#fim_cancel() " display virtual text with the suggestion let l:bufnr = bufnr('%') @@ -153,74 +211,42 @@ function! llama#fim() abort \ 1000.0 * reltimefloat(reltime(l:t_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', \ }) endif - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x - 1, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': l:pos_x == 1 ? 0 : virtcol('.') + \ 'virt_text_win_col': s:pos_x == len(s:line_cur) ? virtcol('.') : virtcol('.') - 1 \ }) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, { + call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}), \ 'virt_text_win_col': virtcol('.') \ }) - " accept suggestion with Tab and reject it with any other key - let s:mapping_on = v:true - - if l:can_accept - inoremap :call llama#accept_vt_fim() - else - inoremap :call llama#cancel_vt_fim() - endif - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 27 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 32 - execute 'inoremap :call llama#cancel_vt_fim()' - elseif l:key == 127 - execute 'inoremap :call llama#cancel_vt_fim()' - else - execute 'inoremap ' . nr2char(l:key) . ' :call llama#cancel_vt_fim()' . 
nr2char(l:key) - endif - endif - endfor - - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() - inoremap :call llama#cancel_vt_fim() + " need to async this call because the in insert mode causes the cursor to move when at the end of the line + call timer_start(0, 'llama#on_hint') endfunction -function! llama#accept_vt_fim() - let l:pos_x = col('.') - let l:pos_y = line('.') - - let l:line_cur = getline('.') - - let l:pos0 = l:pos_x == len(l:line_cur) ? l:pos_x - 1 : l:pos_x - 2 - +function! llama#fim_accept() " insert the suggestion at the cursor location - call setline(l:pos_y, l:line_cur[:l:pos0] . s:content[0]) - if len(s:content) > 1 - call append(l:pos_y, s:content[1:-1]) - endif + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + if len(s:content) > 1 + call append(s:pos_y, s:content[1:-1]) + endif - " move the cursor to the end of the accepted text - call cursor(l:pos_y + len(s:content) - 1, l:pos_x + s:pos_dx) + " move the cursor to the end of the accepted text + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + endif - call llama#cancel_vt_fim() + call llama#fim_cancel() endfunction -function! llama#cancel_vt_fim() +function! llama#fim_cancel() " clear the virtual text let l:bufnr = bufnr('%') @@ -230,31 +256,13 @@ function! llama#cancel_vt_fim() call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) - " remove the key mappings - if exists('s:mapping_on') && s:mapping_on - iunmap - - for l:key in range(32, 127) + [8, 27] - if l:key != 0x7C - if l:key == 8 - execute 'iunmap ' - elseif l:key == 27 - execute 'iunmap ' - elseif l:key == 32 - execute 'iunmap ' - elseif l:key == 127 - execute 'iunmap ' - else - execute 'iunmap ' . nr2char(l:key) - endif - endif - endfor - - iunmap - iunmap - iunmap - iunmap + silent! iunmap + silent! iunmap - let s:mapping_on = v:false - endif + augroup llama_insert + autocmd! + if g:llama_config.auto_fim + autocmd CursorMovedI * call llama#fim_auto() + endif + augroup END endfunction From c507a65af5025fb22bacdc7c89badedb4df29c65 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 12:27:34 +0300 Subject: [PATCH 10/42] llama.vim : async --- examples/llama.vim | 174 ++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 73 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index de889678dd04a..d727948ea53fb 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,6 +1,6 @@ " sample config: " -" - Ctrl+F - trigger FIM completion +" - Ctrl+F - trigger FIM completion manually " " run this once to initialise the plugin: " @@ -31,46 +31,30 @@ function! llama#init() let s:line_cur = '' + let s:line_cur_prefix = '' + let s:line_cur_suffix = '' + let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false let s:timer_fim = -1 - let s:t_fim_last = reltime() + let s:t_fim_last = reltime() + let s:t_fim_start = reltime() + + let s:current_job = v:null augroup llama autocmd! autocmd InsertEnter * inoremap :call llama#fim(v:false) + autocmd InsertLeave * call llama#fim_cancel() augroup END silent! call llama#fim_cancel() endfunction -" setup accept/cancel events -function! llama#on_hint(id_timer) - inoremap :call llama#fim_accept() - inoremap :call llama#fim_cancel() - - augroup llama_insert - autocmd! - autocmd CursorMovedI * call llama#fim_cancel() - augroup END -endfunction - -function! 
llama#fim_auto() - if reltimefloat(reltime(s:t_fim_last)) < 0.50 - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif - endif - - let s:t_fim_last = reltime() - let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) -endfunction - function! llama#fim(is_auto) abort - let l:t_start = reltime() + let s:t_fim_start = reltime() let s:content = [] let s:can_accept = v:false @@ -86,16 +70,16 @@ function! llama#fim(is_auto) abort let s:pos_x0 = s:pos_x == len(s:line_cur) ? s:pos_x : s:pos_x - 1 - let l:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) - let l:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" - \ . l:line_cur_prefix + \ . s:line_cur_prefix let l:suffix = "" - \ . l:line_cur_suffix + \ . s:line_cur_suffix \ . "\n" \ . join(l:lines_suffix, "\n") \ . "\n" @@ -116,12 +100,80 @@ function! llama#fim(is_auto) abort \ 'samplers': ["top_k", "infill"] \ }) - " request completion from the server let l:curl_command = printf( \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", \ g:llama_config.endpoint, shellescape(l:request) \ ) + " send the request asynchronously + let s:current_job = jobstart(l:curl_command, { + \ 'on_stdout': function('s:fim_on_stdout'), + \ 'on_exit': function('s:fim_on_exit'), + \ 'stdout_buffered': v:true, + \ 'is_auto': a:is_auto + \ }) +endfunction + +function! llama#fim_accept() + " insert the suggestion at the cursor location + if s:can_accept && len(s:content) > 0 + call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + if len(s:content) > 1 + call append(s:pos_y, s:content[1:-1]) + endif + + " move the cursor to the end of the accepted text + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + endif + + call llama#fim_cancel() +endfunction + +function! llama#fim_cancel() + if s:current_job != v:null + call jobstop(s:current_job) + endif + + " clear the virtual text + let l:bufnr = bufnr('%') + + let l:id_vt_fim = nvim_create_namespace('vt_fim') + let l:id_vt_info = nvim_create_namespace('vt_info') + + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) + call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) + + silent! iunmap + silent! iunmap + + augroup llama_insert + autocmd! + if g:llama_config.auto_fim + autocmd CursorMovedI * call s:fim_auto() + endif + augroup END +endfunction + +function! s:fim_auto() + if s:current_job != v:null + call jobstop(s:current_job) + endif + + if reltimefloat(reltime(s:t_fim_last)) < 0.001*250 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + endif + + let s:t_fim_last = reltime() + let s:timer_fim = timer_start(250, {-> llama#fim(v:true)}) +endfunction + + +function! s:fim_on_stdout(job_id, data, event) dict + let l:raw = join(a:data, "\n") + let s:can_accept = v:true let l:has_info = v:false @@ -133,17 +185,15 @@ function! llama#fim(is_auto) abort let l:t_gen_ms = 1.0 let l:s_gen = 0 - " TODO: async this - let l:raw = system(l:curl_command) if s:can_accept && v:shell_error - if !a:is_auto + if !self.is_auto call add(s:content, "<| curl error: is the server on? |>") endif let s:can_accept = v:false endif if s:can_accept && l:raw == "" - if !a:is_auto + if !self.is_auto call add(s:content, "<| empty response: is the server on? |>") endif let s:can_accept = v:false @@ -178,7 +228,7 @@ function! 
llama#fim(is_auto) abort endif if len(s:content) == 0 - if !a:is_auto + if !self.is_auto call add(s:content, "<| nothing to suggest |>") endif let s:can_accept = v:false @@ -189,7 +239,7 @@ function! llama#fim(is_auto) abort endif let s:pos_dx = len(s:content[-1]) - let s:content[-1] .= l:line_cur_suffix + let s:content[-1] .= s:line_cur_suffix call llama#fim_cancel() @@ -202,13 +252,13 @@ function! llama#fim(is_auto) abort " construct the info message: if l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay - let l:prefix = repeat(' ', len(s:content[0]) - len(l:line_cur_suffix) + 3) + let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_gen, l:t_gen_ms, l:s_gen, - \ 1000.0 * reltimefloat(reltime(l:t_start)) + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { @@ -227,42 +277,20 @@ function! llama#fim(is_auto) abort \ 'virt_text_win_col': virtcol('.') \ }) - " need to async this call because the in insert mode causes the cursor to move when at the end of the line - call timer_start(0, 'llama#on_hint') -endfunction - -function! llama#fim_accept() - " insert the suggestion at the cursor location - if s:can_accept && len(s:content) > 0 - call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) - if len(s:content) > 1 - call append(s:pos_y, s:content[1:-1]) - endif - - " move the cursor to the end of the accepted text - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) - endif - - call llama#fim_cancel() -endfunction - -function! llama#fim_cancel() - " clear the virtual text - let l:bufnr = bufnr('%') - - let l:id_vt_fim = nvim_create_namespace('vt_fim') - let l:id_vt_info = nvim_create_namespace('vt_info') - - call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) - call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) - - silent! iunmap - silent! iunmap + " setup accept/cancel events + inoremap :call llama#fim_accept() + inoremap :call llama#fim_cancel() augroup llama_insert autocmd! - if g:llama_config.auto_fim - autocmd CursorMovedI * call llama#fim_auto() - endif + autocmd CursorMovedI * call llama#fim_cancel() augroup END endfunction + +function! s:fim_on_exit(job_id, exit_code, event) dict + if a:exit_code != 0 + echom "Job failed with exit code: " . 
a:exit_code + endif + + let s:current_job = v:null +endfunction From 6669b550dbc79a78c0d8152cd7e60b640a8563bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 17:06:50 +0300 Subject: [PATCH 11/42] llama.vim : set time limit for the generation phase --- examples/llama.vim | 61 ++++++++++++++++++++++++---------------------- src/llama.cpp | 4 +++ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index d727948ea53fb..5ab43f2c9e386 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -12,14 +12,14 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f let s:default_config = { - \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, - \ 'n_predict': 64, - \ 'n_probs': 3, - \ 'temperature': 0.1, - \ 'auto_fim': v:true, - \ 'stop': ["\n"] + \ 'endpoint': 'http://127.0.0.1:8012/infill', + \ 'n_prefix': 128, + \ 'n_suffix': 128, + \ 'n_predict': 64, + \ 't_max_prompt_ms': 300, + \ 't_max_predict_ms': 200, + \ 'auto_fim': v:true, + \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) @@ -48,6 +48,8 @@ function! llama#init() autocmd! autocmd InsertEnter * inoremap :call llama#fim(v:false) autocmd InsertLeave * call llama#fim_cancel() + + autocmd CursorMoved * call llama#fim_cancel() augroup END silent! call llama#fim_cancel() @@ -85,19 +87,20 @@ function! llama#fim(is_auto) abort \ . "\n" let l:request = json_encode({ - \ 'prompt': "", - \ 'input_prefix': l:prefix, - \ 'input_suffix': l:suffix, - "\ 'stop': g:llama_config.stop, - \ 'n_predict': g:llama_config.n_predict, - "\ 'n_probs': g:llama_config.n_probs, - \ 'penalty_last_n': 0, - \ 'temperature': g:llama_config.temperature, - \ 'top_k': 5, - \ 'infill_p': 0.20, - \ 'infill_p_eog': 0.001, - \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"] + \ 'prompt': "", + \ 'input_prefix': l:prefix, + \ 'input_suffix': l:suffix, + "\ 'stop': g:llama_config.stop, + \ 'n_predict': g:llama_config.n_predict, + \ 'penalty_last_n': 0, + \ 'top_k': 5, + \ 'infill_p': 0.20, + \ 'infill_p_eog': 0.001, + \ 'stream': v:false, + \ 'samplers': ["top_k", "infill"], + \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms, + \ 'cache_prompt': v:true \ }) let l:curl_command = printf( @@ -181,9 +184,9 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_prompt_ms = 1.0 let l:s_prompt = 0 - let l:n_gen = 0 - let l:t_gen_ms = 1.0 - let l:s_gen = 0 + let l:n_predict = 0 + let l:t_predict_ms = 1.0 + let l:s_predict = 0 if s:can_accept && v:shell_error if !self.is_auto @@ -221,9 +224,9 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1) let l:s_prompt = get(l:timings, 'prompt_per_second', 0) - let l:n_gen = get(l:timings, 'predicted_n', 0) - let l:t_gen_ms = get(l:timings, 'predicted_ms', 1) - let l:s_gen = get(l:timings, 'predicted_per_second', 0) + let l:n_predict = get(l:timings, 'predicted_n', 0) + let l:t_predict_ms = get(l:timings, 'predicted_ms', 1) + let l:s_predict = get(l:timings, 'predicted_per_second', 0) endif endif @@ -256,8 +259,8 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", \ l:prefix, - \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_gen, l:t_gen_ms, l:s_gen, + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) diff --git a/src/llama.cpp b/src/llama.cpp index 1813dd29be2b2..80cc939314b3d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6725,6 +6725,10 @@ static void llm_load_vocab( vocab.special_eog_ids.insert(vocab.special_eom_id); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); } + + if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) { + vocab.special_eog_ids.insert(vocab.special_fim_sep_id); + } } // build special tokens cache From 2e8c350a5f5f70c913b68c539f065a4be22458a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 18:31:46 +0300 Subject: [PATCH 12/42] llama.vim : fix edge cases --- examples/llama.vim | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 5ab43f2c9e386..3f747b3603e35 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -98,9 +98,9 @@ function! llama#fim(is_auto) abort \ 'infill_p_eog': 0.001, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], + "\ 'cache_prompt': v:true, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, - \ 't_max_predict_ms': g:llama_config.t_max_predict_ms, - \ 'cache_prompt': v:true + \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) let l:curl_command = printf( @@ -111,10 +111,21 @@ function! llama#fim(is_auto) abort " send the request asynchronously let s:current_job = jobstart(l:curl_command, { \ 'on_stdout': function('s:fim_on_stdout'), - \ 'on_exit': function('s:fim_on_exit'), + \ 'on_exit': function('s:fim_on_exit'), \ 'stdout_buffered': v:true, \ 'is_auto': a:is_auto \ }) + + " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line + if !a:is_auto + augroup llama_insert + autocmd! + augroup END + + if g:llama_config.auto_fim + call timer_start(0, {-> s:fim_auto_enable()}) + endif + endif endfunction function! llama#fim_accept() @@ -151,9 +162,16 @@ function! llama#fim_cancel() augroup llama_insert autocmd! - if g:llama_config.auto_fim - autocmd CursorMovedI * call s:fim_auto() - endif + augroup END + + if g:llama_config.auto_fim + call s:fim_auto_enable() + endif +endfunction + +function! s:fim_auto_enable() + augroup llama_insert + autocmd CursorMovedI * call s:fim_auto() augroup END endfunction @@ -176,6 +194,9 @@ endfunction function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") + if len(l:raw) == 0 + return + endif let s:can_accept = v:true let l:has_info = v:false @@ -195,13 +216,6 @@ function! s:fim_on_stdout(job_id, data, event) dict let s:can_accept = v:false endif - if s:can_accept && l:raw == "" - if !self.is_auto - call add(s:content, "<| empty response: is the server on? |>") - endif - let s:can_accept = v:false - endif - " get the generated suggestion if s:can_accept let l:response = json_decode(l:raw) @@ -232,7 +246,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict if len(s:content) == 0 if !self.is_auto - call add(s:content, "<| nothing to suggest |>") + call add(s:content, "<| EOT |>") endif let s:can_accept = v:false endif @@ -272,7 +286,7 @@ function! s:fim_on_stdout(job_id, data, event) dict call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], - \ 'virt_text_win_col': s:pos_x == len(s:line_cur) ? virtcol('.') : virtcol('.') - 1 + \ 'virt_text_win_col': virtcol('.') - 1 \ }) call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, { From 4b1bd81661142cb8c9f768e465befbd678f64278 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 10 Oct 2024 20:36:25 +0300 Subject: [PATCH 13/42] llama : simplify infill sampler --- common/common.h | 2 -- examples/llama.vim | 8 +++----- examples/server/server.cpp | 4 ---- src/llama-sampling.cpp | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/common/common.h b/common/common.h index 2fb92ae143c54..5ca8fd391ab74 100644 --- a/common/common.h +++ b/common/common.h @@ -117,8 +117,6 @@ struct common_sampler_params { float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - float infill_p = 0.80f; - float infill_p_eog = 0.01f; int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/examples/llama.vim b/examples/llama.vim index 3f747b3603e35..c89ddea65385b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -93,9 +93,7 @@ function! llama#fim(is_auto) abort "\ 'stop': g:llama_config.stop, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, - \ 'top_k': 5, - \ 'infill_p': 0.20, - \ 'infill_p_eog': 0.001, + \ 'top_k': 100, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], "\ 'cache_prompt': v:true, @@ -180,7 +178,7 @@ function! s:fim_auto() call jobstop(s:current_job) endif - if reltimefloat(reltime(s:t_fim_last)) < 0.001*250 + if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 if s:timer_fim != -1 call timer_stop(s:timer_fim) let s:timer_fim = -1 @@ -188,7 +186,7 @@ function! 
s:fim_auto() endif let s:t_fim_last = reltime() - let s:timer_fim = timer_start(250, {-> llama#fim(v:true)}) + let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e9621ba93c956..3992108e7f383 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -873,8 +873,6 @@ struct server_context { slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot.sparams.infill_p = json_value(data, "infill_p", default_sparams.infill_p); - slot.sparams.infill_p_eog = json_value(data, "infill_p_eog", default_sparams.infill_p_eog); slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n); @@ -1243,8 +1241,6 @@ struct server_context { {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, - {"infill_p", slot.sparams.infill_p}, - {"infill_p_eog", slot.sparams.infill_p_eog}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d71516153cf82..4a5b922c44a9d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1792,6 +1792,10 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } +<<<<<<< HEAD +======= + float p_max = 0.0f; +>>>>>>> af919ec1 (llama : simplify infill sampler) float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; @@ -1803,12 +1807,20 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } +<<<<<<< HEAD const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); if (3*p_eog_sum*cur_p->size > p_txt_sum) { LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); +======= + const float rat = p_txt_sum / p_eog_sum; + LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) { + LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum); +>>>>>>> af919ec1 (llama : simplify infill sampler) // keep just the EOG tokens const auto size_org = cur_p->size; @@ -1879,6 +1891,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } +<<<<<<< HEAD size_t n_non_eog = 0; size_t size_org = cur_p->size; @@ -1895,6 +1908,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ if (cur_p->data[i].p < thold && !is_eog) { continue; +======= + // mask non-EOG tokens with prob < 0.2 + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + cur_p->data[i].logit = -INFINITY; +>>>>>>> af919ec1 (llama : simplify infill sampler) } if (!is_eog) { From 865d9bc48a903287649784e15b4a9d48934a9ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 12:26:22 +0300 Subject: [PATCH 14/42] llama : clean-up ggml-ci --- examples/llama.vim | 111 +++++++++++++++++++++++++++++++---------- src/llama-sampling.cpp | 20 +------- 2 files changed, 85 insertions(+), 46 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index c89ddea65385b..99712d234b9ba 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -1,31 +1,72 @@ +" LLM-based text completion using llama.cpp +" +" requires: +" +" - neovim +" - curl +" - llama.cpp server instance +" - FIM-compatible model +" " sample config: " -" - Ctrl+F - trigger FIM completion manually +" - Tab - accept the current suggestion +" - Shift+Tab - accept just the first line of the segguestion +" - Ctrl+F - trigger FIM completion manually +" +" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim +" +" start the llama.cpp server with a FIM-compatible model. for example: +" +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 +" +" --batch-size [512, model max context] +" +" adjust the batch size to control how much of the provided context will be used during the inference +" lower values will use smaller part of the context around the cursor, which will result in faster processing " -" run this once to initialise the plugin: +" --ubatch-size [64, 2048] " -" :call llama#init() +" chunks the batch into smaller chunks for faster processing +" depends on the specific hardware. 
use llama-bench to profile and determine the best size +" +" run this once to initialise llama.vim: +" +" :call llama#init() " " color of the suggested text highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines to include in the prefix +" n_suffix: number of lines to include in the suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max alloted time for the text generation +" show_info: show extra info about the inference +" auto_fim: trigger FIM completion automatically on cursor movement let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, + \ 'n_prefix': 256, + \ 'n_suffix': 256, \ 'n_predict': 64, - \ 't_max_prompt_ms': 300, + \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 200, + \ 'show_info': v:true, \ 'auto_fim': v:true, - \ 'stop': ["\n"] \ } let g:llama_config = get(g:, 'llama_config', s:default_config) function! llama#init() - let s:pos_x = 0 + if !executable('curl') + echohl WarningMsg + echo 'llama.vim requires the "curl" command to be available' + echohl None + return + endif + + let s:pos_x = 0 " cursor position upon start of completion let s:pos_y = 0 let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case @@ -46,8 +87,8 @@ function! llama#init() augroup llama autocmd! - autocmd InsertEnter * inoremap :call llama#fim(v:false) - autocmd InsertLeave * call llama#fim_cancel() + autocmd InsertEnter * inoremap :call llama#fim(v:false) + autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call llama#fim_cancel() augroup END @@ -90,7 +131,6 @@ function! llama#fim(is_auto) abort \ 'prompt': "", \ 'input_prefix': l:prefix, \ 'input_suffix': l:suffix, - "\ 'stop': g:llama_config.stop, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, \ 'top_k': 100, @@ -126,16 +166,23 @@ function! llama#fim(is_auto) abort endif endfunction -function! llama#fim_accept() +" if first_line == v:true accept only the first line of the response +function! llama#fim_accept(first_line) " insert the suggestion at the cursor location if s:can_accept && len(s:content) > 0 call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) if len(s:content) > 1 - call append(s:pos_y, s:content[1:-1]) + if !a:first_line + call append(s:pos_y, s:content[1:-1]) + endif endif " move the cursor to the end of the accepted text - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + if !a:first_line + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + else + call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1) + endif endif call llama#fim_cancel() @@ -146,6 +193,11 @@ function! llama#fim_cancel() call jobstop(s:current_job) endif + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + " clear the virtual text let l:bufnr = bufnr('%') @@ -155,7 +207,9 @@ function! llama#fim_cancel() call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1) call nvim_buf_clear_namespace(l:bufnr, l:id_vt_info, 0, -1) + " remove the mappings silent! iunmap + silent! iunmap silent! iunmap augroup llama_insert @@ -173,6 +227,8 @@ function! s:fim_auto_enable() augroup END endfunction +" auto-start a fim job a short time after the cursor has moved +" if there is already a job queued - cancel it function! s:fim_auto() if s:current_job != v:null call jobstop(s:current_job) @@ -189,7 +245,7 @@ function! 
s:fim_auto() let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction - +" callback that processes the result from the server function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") if len(l:raw) == 0 @@ -199,6 +255,13 @@ function! s:fim_on_stdout(job_id, data, event) dict let s:can_accept = v:true let l:has_info = v:false + if s:can_accept && v:shell_error + if !self.is_auto + call add(s:content, "<| curl error: is the server on? |>") + endif + let s:can_accept = v:false + endif + let l:n_prompt = 0 let l:t_prompt_ms = 1.0 let l:s_prompt = 0 @@ -207,13 +270,6 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:t_predict_ms = 1.0 let l:s_predict = 0 - if s:can_accept && v:shell_error - if !self.is_auto - call add(s:content, "<| curl error: is the server on? |>") - endif - let s:can_accept = v:false - endif - " get the generated suggestion if s:can_accept let l:response = json_decode(l:raw) @@ -227,7 +283,7 @@ function! s:fim_on_stdout(job_id, data, event) dict call remove(s:content, -1) endwhile - " if response.timings + " if response.timings is available if len(get(l:response, 'timings', {})) > 0 let l:has_info = v:true let l:timings = get(l:response, 'timings', {}) @@ -264,8 +320,8 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_fim = nvim_create_namespace('vt_fim') let l:id_vt_info = nvim_create_namespace('vt_info') - " construct the info message: - if l:has_info + " construct the info message and display it to the right of the current line + if g:llama_config.show_info && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) @@ -282,6 +338,7 @@ function! s:fim_on_stdout(job_id, data, event) dict \ }) endif + " display the suggestion call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[s:content[0], 'llama_hl_hint']], \ 'virt_text_win_col': virtcol('.') - 1 @@ -293,8 +350,8 @@ function! s:fim_on_stdout(job_id, data, event) dict \ }) " setup accept/cancel events - inoremap :call llama#fim_accept() - inoremap :call llama#fim_cancel() + inoremap :call llama#fim_accept(v:false) + inoremap :call llama#fim_accept(v:true) augroup llama_insert autocmd! diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 4a5b922c44a9d..96a97901844bc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1791,11 +1791,8 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ for (size_t i = 0; i < cur_p->size; ++i) { LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } +#endif -<<<<<<< HEAD -======= - float p_max = 0.0f; ->>>>>>> af919ec1 (llama : simplify infill sampler) float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; @@ -1807,20 +1804,12 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ } } -<<<<<<< HEAD const float rat = p_eog_sum == 0.0 ? 
INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
 
     LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
 
     if (3*p_eog_sum*cur_p->size > p_txt_sum) {
         LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
-=======
-    const float rat = p_txt_sum / p_eog_sum;
-    LLAMA_LOG_DEBUG("infill: p_max = %.2f, p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", p_max, p_txt_sum, p_eog_sum, rat, cur_p->size);
-
-    if (p_max < 0.90f && p_eog_sum*cur_p->size > p_txt_sum) {
-        LLAMA_LOG_DEBUG("infill: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", p_txt_sum/p_eog_sum);
->>>>>>> af919ec1 (llama : simplify infill sampler)
 
         // keep just the EOG tokens
         const auto size_org = cur_p->size;
@@ -1891,7 +1880,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         }
     }
 
-<<<<<<< HEAD
     size_t n_non_eog = 0;
     size_t size_org = cur_p->size;
@@ -1908,12 +1896,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
         if (cur_p->data[i].p < thold && !is_eog) {
             continue;
-=======
-    // mask non-EOG tokens with prob < 0.2
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].p < 0.2 && !llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
-            cur_p->data[i].logit = -INFINITY;
->>>>>>> af919ec1 (llama : simplify infill sampler)
         }
 
         if (!is_eog) {

From c9a46f4bd7386804c127a4e3bbe0456e62edd06c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 11 Oct 2024 13:36:56 +0300
Subject: [PATCH 15/42] llama.vim : minor

[no ci]
---
 examples/llama.vim | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 99712d234b9ba..e23373f3b2064 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -325,7 +325,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %f.2 ms", + let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ l:prefix, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, From 5624e919df7e2937880773b12b5b3fbf16382694 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 19:39:44 +0300 Subject: [PATCH 16/42] llama.vim : fix docs [no ci] --- examples/llama.vim | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index e23373f3b2064..56a876b0de27e 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -39,10 +39,11 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f " endpoint: llama.cpp server endpoint -" n_prefix: number of lines to include in the prefix -" n_suffix: number of lines to include in the suffix +" n_prefix: number of lines before the cursor location to include in the prefix +" n_suffix: number of lines after the cursor location to include in the suffix " n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the text generation +" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference " auto_fim: trigger FIM completion automatically on cursor movement let s:default_config = { From 491f211b4caf36f90eb350ecf53d570029ce91ad Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 11 Oct 2024 21:14:47 +0300 Subject: [PATCH 17/42] llama : improve infill sampler ggml-ci --- src/llama-sampling.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 96a97901844bc..d71516153cf82 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1791,7 +1791,6 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_ for (size_t i = 0; i < cur_p->size; ++i) { LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } -#endif float p_txt_sum = 0.0f; float p_eog_sum = 0.0f; From 4f46e29b09d53722c7d73e447dd84fd02cb91abd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:42:16 +0300 Subject: [PATCH 18/42] llama : print more info about control tokens --- src/llama.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 80cc939314b3d..1813dd29be2b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6725,10 +6725,6 @@ static void llm_load_vocab( vocab.special_eog_ids.insert(vocab.special_eom_id); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); } - - if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) { - vocab.special_eog_ids.insert(vocab.special_fim_sep_id); - } } // build special tokens cache From b8890229b6b6910667ffd71f3e8a64f5b4960ffa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:42:56 +0300 Subject: [PATCH 19/42] llama.vim : add ring context from opened files and yanked text --- 
examples/llama.vim | 134 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 114 insertions(+), 20 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 56a876b0de27e..2818b754e6325 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -38,27 +38,49 @@ highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f -" endpoint: llama.cpp server endpoint -" n_prefix: number of lines before the cursor location to include in the prefix -" n_suffix: number of lines after the cursor location to include in the suffix -" n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) -" t_max_predict_ms: max alloted time for the prediction -" show_info: show extra info about the inference -" auto_fim: trigger FIM completion automatically on cursor movement +" general parameters: +" +" endpoint: llama.cpp server endpoint +" n_prefix: number of lines before the cursor location to include in the prefix +" n_suffix: number of lines after the cursor location to include in the suffix +" n_predict: max number of tokens to predict +" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_predict_ms: max alloted time for the prediction +" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) +" auto_fim: trigger FIM completion automatically on cursor movement +" +" ring buffer of chunks, accumulated with time upon: +" +" - completion request +" - yank +" - reading a file +" +" ring context parameters: +" +" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) +" ring_chunk_size: max size of the chunks (in number of lines) +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 256, - \ 'n_suffix': 256, + \ 'n_prefix': 128, + \ 'n_suffix': 128, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 200, - \ 'show_info': v:true, + \ 'show_info': 2, \ 'auto_fim': v:true, + \ 'ring_n_chunks': 32, + \ 'ring_chunk_size': 64, + \ 'ring_scope': 1024, \ } let g:llama_config = get(g:, 'llama_config', s:default_config) +function! s:rand(i0, i1) abort + return a:i0 + rand() % (a:i1 - a:i0 + 1) +endfunction + function! llama#init() if !executable('curl') echohl WarningMsg @@ -76,6 +98,9 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' + let s:ring_n_chunks = [] + + let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false @@ -91,12 +116,55 @@ function! llama#init() autocmd InsertEnter * inoremap :call llama#fim(v:false) autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call llama#fim_cancel() + + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) augroup END silent! call llama#fim_cancel() endfunction +function! 
s:pick_chunk(text, no_mod) + " do not pick chunks from buffers with pending changes or buffers that are not files + if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) + return + endif + + if g:llama_config.ring_n_chunks <= 0 + return + endif + + if len(a:text) + 1 < g:llama_config.ring_chunk_size + let l:chunk = join(a:text, "\n") + else + let l:l0 = s:rand(0, len(a:text) - g:llama_config.ring_chunk_size) + let l:l1 = l:l0 + g:llama_config.ring_chunk_size + + let l:chunk = join(a:text[l:l0:l:l1], "\n") + endif + + " check if this chunk is already added + let l:exist = v:false + for i in range(len(s:ring_n_chunks)) + if s:ring_n_chunks[i] == l:chunk + let l:exist = v:true + break + endif + endfor + + if l:exist + return + endif + + if len(s:ring_n_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_n_chunks, 0) + endif + + call add(s:ring_n_chunks, l:chunk) +endfunction + function! llama#fim(is_auto) abort let s:t_fim_start = reltime() @@ -128,6 +196,20 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . "\n" + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " only gather chunks if the cursor has moved a lot + if a:is_auto && l:delta_y > 32 + " pick a prefix chunk + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) + + "" pick a suffix chunk + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) + + let s:pos_y_pick = s:pos_y + endif + let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': l:prefix, @@ -137,7 +219,8 @@ function! llama#fim(is_auto) abort \ 'top_k': 100, \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], - "\ 'cache_prompt': v:true, + \ 'cache_prompt': v:true, + \ 'extra_context': s:ring_n_chunks, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) @@ -235,6 +318,7 @@ function! s:fim_auto() call jobstop(s:current_job) endif + " TODO: when job cancellation is implemented on the server, reduce these timeouts if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 if s:timer_fim != -1 call timer_stop(s:timer_fim) @@ -284,6 +368,11 @@ function! s:fim_on_stdout(job_id, data, event) dict call remove(s:content, -1) endwhile + let l:generation_settings = get(l:response, 'generation_settings', {}) + let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) + + let l:n_cached = get(l:response, 'tokens_cached', 0) + " if response.timings is available if len(get(l:response, 'timings', {})) > 0 let l:has_info = v:true @@ -322,21 +411,26 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_info = nvim_create_namespace('vt_info') " construct the info message and display it to the right of the current line - if g:llama_config.show_info && l:has_info + if g:llama_config.show_info > 0 && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", - \ l:prefix, + let l:info = printf("%s | context: %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + \ g:llama_config.show_info == 2 ? 
l:prefix : '', + \ l:n_cached, l:n_ctx, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) \ ) - call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { - \ 'virt_text': [[l:info, 'llama_hl_info']], - \ 'virt_text_pos': 'eol', - \ }) + if g:llama_config.show_info == 1 + let &statusline = l:info + elseif g:llama_config.show_info == 2 + call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { + \ 'virt_text': [[l:info, 'llama_hl_info']], + \ 'virt_text_pos': 'eol', + \ }) + endif endif " display the suggestion From 27bc11da0f6349eff044c5861bc30d23267281ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 13:57:19 +0300 Subject: [PATCH 20/42] llama.vim : update server command [no ci] --- examples/llama.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 2818b754e6325..130af3a2671f1 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512 " " --batch-size [512, model max context] " From f794549baedc4d78a5580fb40f646f80da8598e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 14:17:58 +0300 Subject: [PATCH 21/42] llama.vim : gather chunk on leaving buffer [no ci] --- examples/llama.vim | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 130af3a2671f1..8d85fb8621d02 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -120,7 +120,9 @@ function! llama#init() autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + " gather chunks upon entering/leaving a buffer autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true) augroup END silent! call llama#fim_cancel() @@ -146,6 +148,7 @@ function! s:pick_chunk(text, no_mod) endif " check if this chunk is already added + " TODO: smarter check for string similarity to evict old chunks that are very similart to the new one let l:exist = v:false for i in range(len(s:ring_n_chunks)) if s:ring_n_chunks[i] == l:chunk @@ -204,7 +207,7 @@ function! 
llama#fim(is_auto) abort " pick a prefix chunk call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) - "" pick a suffix chunk + " pick a suffix chunk call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) let s:pos_y_pick = s:pos_y From 27d53cb4ee92fe96dde9528c84738e3232810584 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 16:11:38 +0300 Subject: [PATCH 22/42] llama.vim : logic to evict old chunks that are similar to new one --- examples/llama.vim | 67 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 8d85fb8621d02..6e1840a548914 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -98,7 +98,8 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' - let s:ring_n_chunks = [] + let s:ring_chunks = [] + let s:ring_n_evict = 0 let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 @@ -128,6 +129,25 @@ function! llama#init() silent! call llama#fim_cancel() endfunction +" TODO: figure out something better +function! s:chunk_sim(c0, c1) + let l:lines0 = len(a:c0) + let l:lines1 = len(a:c1) + + let l:common = 0 + + for l:line0 in a:c0 + for l:line1 in a:c1 + if l:line0 == l:line1 + let l:common += 1 + break + endif + endfor + endfor + + return 2.0 * l:common / (l:lines0 + l:lines1) +endfunction + function! s:pick_chunk(text, no_mod) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) @@ -138,20 +158,25 @@ function! s:pick_chunk(text, no_mod) return endif + if len(a:text) < 3 + return + endif + if len(a:text) + 1 < g:llama_config.ring_chunk_size - let l:chunk = join(a:text, "\n") + let l:chunk = a:text else - let l:l0 = s:rand(0, len(a:text) - g:llama_config.ring_chunk_size) - let l:l1 = l:l0 + g:llama_config.ring_chunk_size + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size, len(a:text)]) - let l:chunk = join(a:text[l:l0:l:l1], "\n") + let l:chunk = a:text[l:l0:l:l1] endif + let l:chunk_str = join(l:chunk, "\n") + " check if this chunk is already added - " TODO: smarter check for string similarity to evict old chunks that are very similart to the new one let l:exist = v:false - for i in range(len(s:ring_n_chunks)) - if s:ring_n_chunks[i] == l:chunk + for i in range(len(s:ring_chunks)) + if s:ring_chunks[i].data == l:chunk let l:exist = v:true break endif @@ -161,11 +186,19 @@ function! s:pick_chunk(text, no_mod) return endif - if len(s:ring_n_chunks) == g:llama_config.ring_n_chunks - call remove(s:ring_n_chunks, 0) + " evict chunks that are very similar to the new one + for i in range(len(s:ring_chunks) - 1, 0, -1) + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + endif + endfor + + if len(s:ring_chunks) == g:llama_config.ring_n_chunks + call remove(s:ring_chunks, 0) endif - call add(s:ring_n_chunks, l:chunk) + call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) endfunction function! llama#fim(is_auto) abort @@ -213,6 +246,12 @@ function! 
llama#fim(is_auto) abort let s:pos_y_pick = s:pos_y endif + " array of strings + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, l:chunk.str) + endfor + let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': l:prefix, @@ -223,7 +262,7 @@ function! llama#fim(is_auto) abort \ 'stream': v:false, \ 'samplers': ["top_k", "infill"], \ 'cache_prompt': v:true, - \ 'extra_context': s:ring_n_chunks, + \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) @@ -418,9 +457,9 @@ function! s:fim_on_stdout(job_id, data, event) dict " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | context: %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : '', - \ l:n_cached, l:n_ctx, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From d81a0ac185fe9d9ca1e191ea19327582f6880aa2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 16:53:32 +0300 Subject: [PATCH 23/42] llama.vim : do not evict certain chunks [no ci] --- examples/llama.vim | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 6e1840a548914..3fe69e339751b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -71,7 +71,7 @@ let s:default_config = { \ 'show_info': 2, \ 'auto_fim': v:true, \ 'ring_n_chunks': 32, - \ 'ring_chunk_size': 64, + \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, \ } @@ -119,11 +119,11 @@ function! llama#init() autocmd CursorMoved * call llama#fim_cancel() - autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false) | endif + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer - autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true)}) - autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true) + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -148,7 +148,7 @@ function! s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction -function! s:pick_chunk(text, no_mod) +function! 
s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) return @@ -165,8 +165,8 @@ function! s:pick_chunk(text, no_mod) if len(a:text) + 1 < g:llama_config.ring_chunk_size let l:chunk = a:text else - let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size])) - let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size, len(a:text)]) + let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2])) + let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)]) let l:chunk = a:text[l:l0:l:l1] endif @@ -189,8 +189,12 @@ function! s:pick_chunk(text, no_mod) " evict chunks that are very similar to the new one for i in range(len(s:ring_chunks) - 1, 0, -1) if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 - call remove(s:ring_chunks, i) - let s:ring_n_evict += 1 + if a:do_evict + call remove(s:ring_chunks, i) + let s:ring_n_evict += 1 + else + return + endif endif endfor @@ -237,11 +241,12 @@ function! llama#fim(is_auto) abort " only gather chunks if the cursor has moved a lot if a:is_auto && l:delta_y > 32 - " pick a prefix chunk - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false) - - " pick a suffix chunk - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false) + " randomly pick a prefix or a suffix chunk + if s:rand(0, 1) + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + else + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) + endif let s:pos_y_pick = s:pos_y endif From 2960510153a45f384f075985d91cba957f69ef79 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 17:17:01 +0300 Subject: [PATCH 24/42] llama.vim : do not auto-fim when far from the end of the line [no ci] --- examples/llama.vim | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llama.vim b/examples/llama.vim index 3fe69e339751b..bf56a5e5f9e93 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -48,6 +48,7 @@ highlight llama_hl_info guifg=#77ff2f " t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) " auto_fim: trigger FIM completion automatically on cursor movement +" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor " " ring buffer of chunks, accumulated with time upon: " @@ -70,6 +71,7 @@ let s:default_config = { \ 't_max_predict_ms': 200, \ 'show_info': 2, \ 'auto_fim': v:true, + \ 'max_line_suffix': 8, \ 'ring_n_chunks': 32, \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, @@ -124,6 +126,9 @@ function! 
llama#init() " gather chunks upon entering/leaving a buffer autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + + " gather chunk upon saving the file + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -225,6 +230,10 @@ function! llama#fim(is_auto) abort let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix + return + endif + let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" From bc2857b88c69b913e42b09c2470acf13af37a640 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 18:23:22 +0300 Subject: [PATCH 25/42] llama.vim : async context processing ggml-ci --- examples/llama.vim | 94 +++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index bf56a5e5f9e93..bc61ea8ba6981 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 1024 --batch-size 2048 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512 " " --batch-size [512, model max context] " @@ -54,7 +54,9 @@ highlight llama_hl_info guifg=#77ff2f " " - completion request " - yank -" - reading a file +" - entering a buffer +" - leaving a buffer +" - writing a file " " ring context parameters: " @@ -208,6 +210,36 @@ function! s:pick_chunk(text, no_mod, do_evict) endif call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) + + " send asynchronous job with the new extra context so that it is ready for the next FIM + let l:extra_context = [] + for l:chunk in s:ring_chunks + call add(l:extra_context, l:chunk.str) + endfor + + let l:request = json_encode({ + \ 'prompt': "", + \ 'input_prefix': "", + \ 'input_suffix': "", + \ 'n_predict': 1, + \ 'penalty_last_n': 0, + \ 'top_k': 100, + \ 'stream': v:false, + \ 'samplers': ["top_k", "infill"], + \ 'cache_prompt': v:true, + \ 'extra_context': l:extra_context, + \ 't_max_prompt_ms': 1, + \ 't_max_predict_ms': 1 + \ }) + + let l:curl_command = printf( + \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s", + \ g:llama_config.endpoint, shellescape(l:request) + \ ) + + call jobstart(l:curl_command, { + \ 'on_exit': function('s:fim_on_exit') + \ }) endfunction function! llama#fim(is_auto) abort @@ -245,21 +277,6 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . 
"\n" - " TODO: per-file location - let l:delta_y = abs(s:pos_y - s:pos_y_pick) - - " only gather chunks if the cursor has moved a lot - if a:is_auto && l:delta_y > 32 - " randomly pick a prefix or a suffix chunk - if s:rand(0, 1) - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) - else - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) - endif - - let s:pos_y_pick = s:pos_y - endif - " array of strings let l:extra_context = [] for l:chunk in s:ring_chunks @@ -294,6 +311,21 @@ function! llama#fim(is_auto) abort \ 'is_auto': a:is_auto \ }) + " TODO: per-file location + let l:delta_y = abs(s:pos_y - s:pos_y_pick) + + " only gather chunks if the cursor has moved a lot + if a:is_auto && l:delta_y > 32 + " randomly pick a prefix or a suffix chunk + if s:rand(0, 1) + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + else + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) + endif + + let s:pos_y_pick = s:pos_y + endif + " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line if !a:is_auto augroup llama_insert @@ -427,7 +459,8 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:generation_settings = get(l:response, 'generation_settings', {}) let l:n_ctx = get(l:generation_settings, 'n_ctx', 0) - let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:n_cached = get(l:response, 'tokens_cached', 0) + let l:truncated = get(l:response, 'truncated', v:false) " if response.timings is available if len(get(l:response, 'timings', {})) > 0 @@ -466,22 +499,31 @@ function! s:fim_on_stdout(job_id, data, event) dict let l:id_vt_fim = nvim_create_namespace('vt_fim') let l:id_vt_info = nvim_create_namespace('vt_info') - " construct the info message and display it to the right of the current line + " construct the info message if g:llama_config.show_info > 0 && l:has_info " prefix the info string with whitespace in order to offset it to the right of the fim overlay let l:prefix = repeat(' ', len(s:content[0]) - len(s:line_cur_suffix) + 3) - let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", - \ g:llama_config.show_info == 2 ? l:prefix : '', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, - \ l:n_prompt, l:t_prompt_ms, l:s_prompt, - \ l:n_predict, l:t_predict_ms, l:s_predict, - \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) - \ ) + if l:truncated + let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks", + \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx + \ ) + else + let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + \ g:llama_config.show_info == 2 ? 
l:prefix : 'llama.vim', + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, + \ l:n_prompt, l:t_prompt_ms, l:s_prompt, + \ l:n_predict, l:t_predict_ms, l:s_predict, + \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) + \ ) + endif if g:llama_config.show_info == 1 + "" display it in the statusline let &statusline = l:info elseif g:llama_config.show_info == 2 + " display it to the right of the current line call nvim_buf_set_extmark(l:bufnr, l:id_vt_info, s:pos_y - 1, s:pos_x - 1, { \ 'virt_text': [[l:info, 'llama_hl_info']], \ 'virt_text_pos': 'eol', From 916c2ee3fd95976213f95922b5242d2c4834dec9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 18:50:36 +0300 Subject: [PATCH 26/42] llama : simplify infill sampler --- examples/llama.vim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index bc61ea8ba6981..5a2027021d1b4 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -70,11 +70,11 @@ let s:default_config = { \ 'n_suffix': 128, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 200, + \ 't_max_predict_ms': 500, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, - \ 'ring_n_chunks': 32, + \ 'ring_n_chunks': 16, \ 'ring_chunk_size': 128, \ 'ring_scope': 1024, \ } From ae76a092b850283121af2cf2eb83a764b214fb6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 21:36:02 +0300 Subject: [PATCH 27/42] llama.vim : pass filenames for each chunk ggml-ci --- examples/llama.vim | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 5a2027021d1b4..919055876d9f6 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -178,7 +178,7 @@ function! s:pick_chunk(text, no_mod, do_evict) let l:chunk = a:text[l:l0:l:l1] endif - let l:chunk_str = join(l:chunk, "\n") + let l:chunk_str = join(l:chunk, "\n") . "\n" " check if this chunk is already added let l:exist = v:false @@ -209,12 +209,16 @@ function! s:pick_chunk(text, no_mod, do_evict) call remove(s:ring_chunks, 0) endif - call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime()}) + call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) " send asynchronous job with the new extra context so that it is ready for the next FIM let l:extra_context = [] for l:chunk in s:ring_chunks - call add(l:extra_context, l:chunk.str) + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) endfor let l:request = json_encode({ @@ -277,10 +281,14 @@ function! llama#fim(is_auto) abort \ . join(l:lines_suffix, "\n") \ . "\n" - " array of strings + " prepare the extra context data let l:extra_context = [] for l:chunk in s:ring_chunks - call add(l:extra_context, l:chunk.str) + call add(l:extra_context, { + \ 'text': l:chunk.str, + \ 'time': l:chunk.time, + \ 'filename': l:chunk.filename + \ }) endfor let l:request = json_encode({ From 9f8fa900f68ece27185e6c0ac9690a2a5b251cbd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 13 Oct 2024 21:56:29 +0300 Subject: [PATCH 28/42] llama.vim : fix repetitions [no ci] --- examples/llama.vim | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/llama.vim b/examples/llama.vim index 919055876d9f6..0065d5e5a6431 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -499,6 +499,11 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let s:pos_dx = len(s:content[-1]) let s:content[-1] .= s:line_cur_suffix + " truncate the suggestion if it repeats the next line + if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) + let s:content = [s:content[0]] + endif + call llama#fim_cancel() " display virtual text with the suggestion From 25ecb35c4f37b009d6c5930049bdfe49942bde5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 14 Oct 2024 15:50:08 +0300 Subject: [PATCH 29/42] llama.vim : simplify job logic + improve robustness and responsivness --- examples/llama.vim | 129 ++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 78 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 0065d5e5a6431..90d08a8e5d62e 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -95,7 +95,6 @@ function! llama#init() let s:pos_x = 0 " cursor position upon start of completion let s:pos_y = 0 - let s:pos_x0 = 0 " pos_x corrected for end-of-line edge case let s:line_cur = '' @@ -105,32 +104,40 @@ function! llama#init() let s:ring_chunks = [] let s:ring_n_evict = 0 + let s:hint_shown = v:false let s:pos_y_pick = -9999 " last y where we picked a chunk let s:pos_dx = 0 let s:content = [] let s:can_accept = v:false - let s:timer_fim = -1 - let s:t_fim_last = reltime() - let s:t_fim_start = reltime() + let s:t_fim_start = reltime() " used to measure total FIM time let s:current_job = v:null augroup llama autocmd! - autocmd InsertEnter * inoremap :call llama#fim(v:false) - autocmd InsertLeavePre * call llama#fim_cancel() + autocmd InsertEnter * inoremap a + autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call llama#fim_cancel() + autocmd CompleteChanged * call llama#fim_cancel() - autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif + if g:llama_config.auto_fim + autocmd InsertEnter * call llama#fim(v:true, v:false) + autocmd CursorMovedI * call llama#fim(v:true, v:false) + autocmd CursorHoldI * call llama#fim(v:true, v:true) + else + autocmd CursorMovedI * call llama#fim_cancel() + endif + + autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer - autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) - autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)}) + autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) " gather chunk upon saving the file - autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true) + autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), 
v:true, v:true) augroup END silent! call llama#fim_cancel() @@ -241,18 +248,27 @@ function! s:pick_chunk(text, no_mod, do_evict) \ g:llama_config.endpoint, shellescape(l:request) \ ) - call jobstart(l:curl_command, { - \ 'on_exit': function('s:fim_on_exit') - \ }) + call jobstart(l:curl_command, {}) endfunction -function! llama#fim(is_auto) abort +function! llama#fim(is_auto, on_hold) abort + if a:on_hold && s:hint_shown + return + endif + + call llama#fim_cancel() + + if reltimefloat(reltime(s:t_fim_start)) < 0.5 + let s:t_fim_start = reltime() + return + endif + let s:t_fim_start = reltime() let s:content = [] let s:can_accept = v:false - let s:pos_x = col('.') + let s:pos_x = col('.') - 1 let s:pos_y = line('.') let l:max_y = line('$') @@ -261,10 +277,8 @@ function! llama#fim(is_auto) abort let s:line_cur = getline('.') - let s:pos_x0 = s:pos_x == len(s:line_cur) ? s:pos_x : s:pos_x - 1 - - let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x0) - let s:line_cur_suffix = strpart(s:line_cur, s:pos_x0) + let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x) + let s:line_cur_suffix = strpart(s:line_cur, s:pos_x) if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix return @@ -311,11 +325,17 @@ function! llama#fim(is_auto) abort \ g:llama_config.endpoint, shellescape(l:request) \ ) + if s:current_job != v:null + call jobstop(s:current_job) + endif + " send the request asynchronously let s:current_job = jobstart(l:curl_command, { \ 'on_stdout': function('s:fim_on_stdout'), \ 'on_exit': function('s:fim_on_exit'), \ 'stdout_buffered': v:true, + \ 'pos_x': s:pos_x, + \ 'pos_y': s:pos_y, \ 'is_auto': a:is_auto \ }) @@ -333,24 +353,13 @@ function! llama#fim(is_auto) abort let s:pos_y_pick = s:pos_y endif - - " this trick is needed to avoid the cursor shifting upon C-O when at the end of the line - if !a:is_auto - augroup llama_insert - autocmd! - augroup END - - if g:llama_config.auto_fim - call timer_start(0, {-> s:fim_auto_enable()}) - endif - endif endfunction " if first_line == v:true accept only the first line of the response function! llama#fim_accept(first_line) " insert the suggestion at the cursor location if s:can_accept && len(s:content) > 0 - call setline(s:pos_y, s:line_cur[:(s:pos_x0 - 1)] . s:content[0]) + call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0]) if len(s:content) > 1 if !a:first_line call append(s:pos_y, s:content[1:-1]) @@ -361,7 +370,7 @@ function! llama#fim_accept(first_line) if !a:first_line call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) else - call cursor(s:pos_y, s:pos_x + len(s:content[0]) - 1) + call cursor(s:pos_y, s:pos_x + len(s:content[0])) endif endif @@ -369,14 +378,7 @@ function! llama#fim_accept(first_line) endfunction function! llama#fim_cancel() - if s:current_job != v:null - call jobstop(s:current_job) - endif - - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif + let s:hint_shown = v:false " clear the virtual text let l:bufnr = bufnr('%') @@ -391,39 +393,6 @@ function! llama#fim_cancel() silent! iunmap silent! iunmap silent! iunmap - - augroup llama_insert - autocmd! - augroup END - - if g:llama_config.auto_fim - call s:fim_auto_enable() - endif -endfunction - -function! s:fim_auto_enable() - augroup llama_insert - autocmd CursorMovedI * call s:fim_auto() - augroup END -endfunction - -" auto-start a fim job a short time after the cursor has moved -" if there is already a job queued - cancel it -function! 
s:fim_auto() - if s:current_job != v:null - call jobstop(s:current_job) - endif - - " TODO: when job cancellation is implemented on the server, reduce these timeouts - if reltimefloat(reltime(s:t_fim_last)) < 500*0.001 - if s:timer_fim != -1 - call timer_stop(s:timer_fim) - let s:timer_fim = -1 - endif - endif - - let s:t_fim_last = reltime() - let s:timer_fim = timer_start(500, {-> llama#fim(v:true)}) endfunction " callback that processes the result from the server @@ -433,6 +402,13 @@ function! s:fim_on_stdout(job_id, data, event) dict return endif + if self.pos_x != col('.') - 1 || self.pos_y != line('.') + return + endif + + let s:pos_x = self.pos_x + let s:pos_y = self.pos_y + let s:can_accept = v:true let l:has_info = v:false @@ -559,10 +535,7 @@ function! s:fim_on_stdout(job_id, data, event) dict inoremap :call llama#fim_accept(v:false) inoremap :call llama#fim_accept(v:true) - augroup llama_insert - autocmd! - autocmd CursorMovedI * call llama#fim_cancel() - augroup END + let s:hint_shown = v:true endfunction function! s:fim_on_exit(job_id, exit_code, event) dict From e4be74b4b70b7431d17c40d04f3d087f9d592e0a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 09:34:26 +0300 Subject: [PATCH 30/42] llama.vim : add top_p + improve responsivness + fix edge cases --- examples/llama.vim | 64 +++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 90d08a8e5d62e..a80b5d5d024f2 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -66,16 +66,16 @@ highlight llama_hl_info guifg=#77ff2f " let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', - \ 'n_prefix': 128, - \ 'n_suffix': 128, + \ 'n_prefix': 256, + \ 'n_suffix': 8, \ 'n_predict': 64, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 500, + \ 't_max_predict_ms': 200, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, - \ 'ring_n_chunks': 16, - \ 'ring_chunk_size': 128, + \ 'ring_n_chunks': 64, + \ 'ring_chunk_size': 64, \ 'ring_scope': 1024, \ } @@ -110,13 +110,14 @@ function! llama#init() let s:content = [] let s:can_accept = v:false + let s:timer_fim = -1 let s:t_fim_start = reltime() " used to measure total FIM time let s:current_job = v:null augroup llama autocmd! - autocmd InsertEnter * inoremap a + autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call llama#fim_cancel() @@ -125,7 +126,7 @@ function! llama#init() if g:llama_config.auto_fim autocmd InsertEnter * call llama#fim(v:true, v:false) autocmd CursorMovedI * call llama#fim(v:true, v:false) - autocmd CursorHoldI * call llama#fim(v:true, v:true) + "autocmd CursorHoldI * call llama#fim(v:true, v:true) else autocmd CursorMovedI * call llama#fim_cancel() endif @@ -202,7 +203,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " evict chunks that are very similar to the new one for i in range(len(s:ring_chunks) - 1, 0, -1) - if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 if a:do_evict call remove(s:ring_chunks, i) let s:ring_n_evict += 1 @@ -234,9 +235,10 @@ function! 
s:pick_chunk(text, no_mod, do_evict) \ 'input_suffix': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, - \ 'top_k': 100, + \ 'top_k': 40, + \ 'top_p': 0.99, \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"], + \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, @@ -251,15 +253,27 @@ function! s:pick_chunk(text, no_mod, do_evict) call jobstart(l:curl_command, {}) endfunction +function! llama#fim_inline(is_auto, on_hold) abort + call llama#fim(a:is_auto, a:on_hold) + return '' +endfunction + function! llama#fim(is_auto, on_hold) abort - if a:on_hold && s:hint_shown + if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) return endif call llama#fim_cancel() - if reltimefloat(reltime(s:t_fim_start)) < 0.5 + " avoid sending repeated requests too fast + if reltimefloat(reltime(s:t_fim_start)) < 0.6 + if s:timer_fim != -1 + call timer_stop(s:timer_fim) + let s:timer_fim = -1 + endif + let s:t_fim_start = reltime() + let s:timer_fim = timer_start(600, {-> llama#fim(v:true, v:true)}) return endif @@ -287,6 +301,8 @@ function! llama#fim(is_auto, on_hold) abort let l:prefix = "" \ . join(l:lines_prefix, "\n") \ . "\n" + + let l:prompt = "" \ . s:line_cur_prefix let l:suffix = "" @@ -306,14 +322,15 @@ function! llama#fim(is_auto, on_hold) abort endfor let l:request = json_encode({ - \ 'prompt': "", \ 'input_prefix': l:prefix, + \ 'prompt': l:prompt, \ 'input_suffix': l:suffix, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, - \ 'top_k': 100, + \ 'top_k': 40, + \ 'top_p': 0.99, \ 'stream': v:false, - \ 'samplers': ["top_k", "infill"], + \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, @@ -343,13 +360,10 @@ function! llama#fim(is_auto, on_hold) abort let l:delta_y = abs(s:pos_y - s:pos_y_pick) " only gather chunks if the cursor has moved a lot + " TODO: something more clever? reranking? if a:is_auto && l:delta_y > 32 - " randomly pick a prefix or a suffix chunk - if s:rand(0, 1) - call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) - else - call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.ring_scope])), v:false, v:false) - endif + call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) let s:pos_y_pick = s:pos_y endif @@ -367,7 +381,7 @@ function! llama#fim_accept(first_line) endif " move the cursor to the end of the accepted text - if !a:first_line + if !a:first_line && len(s:content) > 1 call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) else call cursor(s:pos_y, s:pos_x + len(s:content[0])) @@ -462,9 +476,7 @@ function! s:fim_on_stdout(job_id, data, event) dict endif if len(s:content) == 0 - if !self.is_auto - call add(s:content, "<| EOT |>") - endif + call add(s:content, "") let s:can_accept = v:false endif @@ -475,7 +487,7 @@ function! 
s:fim_on_stdout(job_id, data, event) dict let s:pos_dx = len(s:content[-1]) let s:content[-1] .= s:line_cur_suffix - " truncate the suggestion if it repeats the next line + " truncate the suggestion if it repeats the following lines if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) let s:content = [s:content[0]] endif From 0c1f51b73e781b16b3593e716ff1e4eab0131ea8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 09:37:26 +0300 Subject: [PATCH 31/42] llama : improve infill sampler ggml-ci --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d71516153cf82..1a297aa3866e6 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1764,7 +1764,7 @@ struct llama_sampler * llama_sampler_init_logit_bias( // infill -//#define GGML_DEBUG_SAMPLER_INFILL +#define GGML_DEBUG_SAMPLER_INFILL struct llama_sampler_infill { const struct llama_vocab * vocab; From 42a9008b31a0a31ad206a43bc2136733da2e31bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 10:50:18 +0300 Subject: [PATCH 32/42] llama.vim : process extra chunks in the background [no ci] --- examples/llama.vim | 86 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index a80b5d5d024f2..b184faa7e5989 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,11 +17,11 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512 " " --batch-size [512, model max context] " -" adjust the batch size to control how much of the provided context will be used during the inference +" adjust the batch size to control how much of the provided local context will be used during the inference " lower values will use smaller part of the context around the cursor, which will result in faster processing " " --ubatch-size [64, 2048] @@ -58,11 +58,12 @@ highlight llama_hl_info guifg=#77ff2f " - leaving a buffer " - writing a file " -" ring context parameters: +" parameters for the ring-buffer with extra context: " " ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) " ring_chunk_size: max size of the chunks (in number of lines) " ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" ring_update_ms: how often to process queued chunks in normal mode " let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', @@ -77,6 +78,7 @@ let s:default_config = { \ 'ring_n_chunks': 64, \ 'ring_chunk_size': 64, \ 'ring_scope': 1024, + \ 'ring_update_ms': 1000, \ } let g:llama_config = get(g:, 'llama_config', s:default_config) @@ -101,7 +103,8 @@ function! llama#init() let s:line_cur_prefix = '' let s:line_cur_suffix = '' - let s:ring_chunks = [] + let s:ring_chunks = [] " current set of chunks used as extra context + let s:ring_queued = [] " chunks that are queued to be sent for processing let s:ring_n_evict = 0 let s:hint_shown = v:false @@ -112,6 +115,7 @@ function! llama#init() let s:timer_fim = -1 let s:t_fim_start = reltime() " used to measure total FIM time + let s:t_last_move = reltime() " last time the cursor moved let s:current_job = v:null @@ -120,15 +124,14 @@ function! 
llama#init() autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) autocmd InsertLeavePre * call llama#fim_cancel() - autocmd CursorMoved * call llama#fim_cancel() + autocmd CursorMoved * call s:on_move() + autocmd CursorMovedI * call s:on_move() autocmd CompleteChanged * call llama#fim_cancel() if g:llama_config.auto_fim autocmd InsertEnter * call llama#fim(v:true, v:false) autocmd CursorMovedI * call llama#fim(v:true, v:false) "autocmd CursorHoldI * call llama#fim(v:true, v:true) - else - autocmd CursorMovedI * call llama#fim_cancel() endif autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif @@ -142,6 +145,11 @@ function! llama#init() augroup END silent! call llama#fim_cancel() + + " init background update of the ring buffer + if g:llama_config.ring_n_chunks > 0 + call s:ring_update() + endif endfunction " TODO: figure out something better @@ -163,6 +171,7 @@ function! s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction +" pick a chunk from the provided text and queue it for processing function! s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) @@ -190,6 +199,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " check if this chunk is already added let l:exist = v:false + for i in range(len(s:ring_chunks)) if s:ring_chunks[i].data == l:chunk let l:exist = v:true @@ -197,11 +207,30 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + for i in range(len(s:ring_queued)) + if s:ring_queued[i].data == l:chunk + let l:exist = v:true + break + endif + endfor + if l:exist return endif " evict chunks that are very similar to the new one + for i in range(len(s:ring_queued) - 1, 0, -1) + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.5 + if a:do_evict + call remove(s:ring_queued, i) + let s:ring_n_evict += 1 + else + return + endif + endif + endfor + + " also from s:ring_chunks for i in range(len(s:ring_chunks) - 1, 0, -1) if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 if a:do_evict @@ -213,11 +242,36 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + if len(s:ring_queued) == 16 + call remove(s:ring_queued, 0) + endif + + call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + + "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) +endfunction + +" called every g:llama_config.ring_update_ms, processed chunks are moved to s:ring_chunks +function! s:ring_update() + call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) + + " update only if in normal mode or if the cursor hasn't moved for a while + if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0 + return + endif + + if len(s:ring_queued) == 0 + return + endif + + " move the first queued chunk to the ring buffer if len(s:ring_chunks) == g:llama_config.ring_n_chunks call remove(s:ring_chunks, 0) endif - call add(s:ring_chunks, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')}) + call add(s:ring_chunks, remove(s:ring_queued, 0)) + + "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) " send asynchronous job with the new extra context so that it is ready for the next FIM let l:extra_context = [] @@ -229,16 +283,16 @@ function! 
s:pick_chunk(text, no_mod, do_evict) \ }) endfor + " no samplers needed here let l:request = json_encode({ \ 'prompt': "", \ 'input_prefix': "", \ 'input_suffix': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, - \ 'top_k': 40, - \ 'top_p': 0.99, + \ 'temperature': 0.0, \ 'stream': v:false, - \ 'samplers': ["top_k", "top_p", "infill"], + \ 'samplers': ["temperature"], \ 'cache_prompt': v:true, \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, @@ -409,6 +463,12 @@ function! llama#fim_cancel() silent! iunmap endfunction +function! s:on_move() + let s:t_last_move = reltime() + + call llama#fim_cancel() +endfunction + " callback that processes the result from the server function! s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") @@ -511,9 +571,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | context: %d / %d / %d / %d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | context: %d / %d / r=%d / q=%d / e=%d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), len(s:ring_queued), s:ring_n_evict, \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From 060573f7e81b11dc02a6f0fd0a6f047937f3d1d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 11:34:32 +0300 Subject: [PATCH 33/42] llama.vim : add comments [no ci] --- examples/llama.vim | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index b184faa7e5989..6ae6d2b39ad88 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -62,7 +62,9 @@ highlight llama_hl_info guifg=#77ff2f " " ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable) " ring_chunk_size: max size of the chunks (in number of lines) -" ring_scope: the range around the cursor position (in number of lines) for gathering chunks +" note: adjust these numbers so that you don't overrun your context +" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context +" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM " ring_update_ms: how often to process queued chunks in normal mode " let s:default_config = { @@ -416,7 +418,10 @@ function! llama#fim(is_auto, on_hold) abort " only gather chunks if the cursor has moved a lot " TODO: something more clever? reranking? 
if a:is_auto && l:delta_y > 32 + " expand the prefix even further call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false) + + " pick a suffix chunk call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false) let s:pos_y_pick = s:pos_y From 847c8c023e03e4599a6d55c2dc4ad12a53de7123 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 11:49:20 +0300 Subject: [PATCH 34/42] llama.vim : update infill API params [no ci] --- examples/llama.vim | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 6ae6d2b39ad88..a09ecfe7c315b 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -287,16 +287,16 @@ function! s:ring_update() " no samplers needed here let l:request = json_encode({ - \ 'prompt': "", \ 'input_prefix': "", \ 'input_suffix': "", + \ 'input_extra': l:extra_context, + \ 'prompt': "", \ 'n_predict': 1, \ 'penalty_last_n': 0, \ 'temperature': 0.0, \ 'stream': v:false, \ 'samplers': ["temperature"], \ 'cache_prompt': v:true, - \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': 1, \ 't_max_predict_ms': 1 \ }) @@ -379,8 +379,9 @@ function! llama#fim(is_auto, on_hold) abort let l:request = json_encode({ \ 'input_prefix': l:prefix, - \ 'prompt': l:prompt, \ 'input_suffix': l:suffix, + \ 'input_extra': l:extra_context, + \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, \ 'penalty_last_n': 0, \ 'top_k': 40, @@ -388,7 +389,6 @@ function! llama#fim(is_auto, on_hold) abort \ 'stream': v:false, \ 'samplers': ["top_k", "top_p", "infill"], \ 'cache_prompt': v:true, - \ 'extra_context': l:extra_context, \ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms, \ 't_max_predict_ms': g:llama_config.t_max_predict_ms \ }) From 4583aef12bbfb7a49216bfa55a4741f953bb962b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 15 Oct 2024 17:18:32 +0300 Subject: [PATCH 35/42] llama.vim : final touches ggml-ci --- examples/llama.vim | 35 +++++++++++++++++++++++++++-------- src/llama-sampling.cpp | 2 +- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index a09ecfe7c315b..3b115c49ccb30 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 512 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 64 " " --batch-size [512, model max context] " @@ -33,8 +33,10 @@ " " :call llama#init() " +" more info: https://github.com/ggerganov/llama.cpp/pull/9787/files +" -" color of the suggested text +" colors (adjust to your liking) highlight llama_hl_hint guifg=#ff772f highlight llama_hl_info guifg=#77ff2f @@ -154,6 +156,8 @@ function! llama#init() endif endfunction +" compute how similar two chunks of text are +" 0 - no similarity, 1 - high similarity " TODO: figure out something better function! s:chunk_sim(c0, c1) let l:lines0 = len(a:c0) @@ -173,17 +177,23 @@ function! 
s:chunk_sim(c0, c1) return 2.0 * l:common / (l:lines0 + l:lines1) endfunction -" pick a chunk from the provided text and queue it for processing +" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing +" +" no_mod - do not pick chunks from buffers with pending changes +" do_evict - evict chunks that are very similar to the new one +" function! s:pick_chunk(text, no_mod, do_evict) " do not pick chunks from buffers with pending changes or buffers that are not files if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%'))) return endif + " if the extra context option is disabled - do nothing if g:llama_config.ring_n_chunks <= 0 return endif + " don't pick very small chunks if len(a:text) < 3 return endif @@ -220,9 +230,9 @@ function! s:pick_chunk(text, no_mod, do_evict) return endif - " evict chunks that are very similar to the new one + " evict queued chunks that are very similar to the new one for i in range(len(s:ring_queued) - 1, 0, -1) - if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.5 + if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9 if a:do_evict call remove(s:ring_queued, i) let s:ring_n_evict += 1 @@ -234,7 +244,7 @@ function! s:pick_chunk(text, no_mod, do_evict) " also from s:ring_chunks for i in range(len(s:ring_chunks) - 1, 0, -1) - if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.5 + if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9 if a:do_evict call remove(s:ring_chunks, i) let s:ring_n_evict += 1 @@ -244,6 +254,7 @@ function! s:pick_chunk(text, no_mod, do_evict) endif endfor + " TODO: become parameter ? if len(s:ring_queued) == 16 call remove(s:ring_queued, 0) endif @@ -253,7 +264,8 @@ function! s:pick_chunk(text, no_mod, do_evict) "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued) endfunction -" called every g:llama_config.ring_update_ms, processed chunks are moved to s:ring_chunks +" picks a queued chunk, sends it for processing and adds it to s:ring_chunks +" called every g:llama_config.ring_update_ms function! s:ring_update() call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()}) @@ -306,15 +318,21 @@ function! s:ring_update() \ g:llama_config.endpoint, shellescape(l:request) \ ) + " no callbacks because we don't need to process the response call jobstart(l:curl_command, {}) endfunction +" necessary for 'inoremap ' function! llama#fim_inline(is_auto, on_hold) abort call llama#fim(a:is_auto, a:on_hold) return '' endfunction +" the main FIM call +" takes local context around the cursor and sends it together with the extra context +" to the llama.cpp server for completion function! llama#fim(is_auto, on_hold) abort + " we already have a suggestion for the current cursor position if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) return endif @@ -415,6 +433,7 @@ function! llama#fim(is_auto, on_hold) abort " TODO: per-file location let l:delta_y = abs(s:pos_y - s:pos_y_pick) + " gather some extra context nearby and process it in the background " only gather chunks if the cursor has moved a lot " TODO: something more clever? reranking? if a:is_auto && l:delta_y > 32 @@ -474,7 +493,7 @@ function! s:on_move() call llama#fim_cancel() endfunction -" callback that processes the result from the server +" callback that processes the FIM result from the server and displays the suggestion function! 
s:fim_on_stdout(job_id, data, event) dict let l:raw = join(a:data, "\n") if len(l:raw) == 0 diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 1a297aa3866e6..d71516153cf82 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1764,7 +1764,7 @@ struct llama_sampler * llama_sampler_init_logit_bias( // infill -#define GGML_DEBUG_SAMPLER_INFILL +//#define GGML_DEBUG_SAMPLER_INFILL struct llama_sampler_infill { const struct llama_vocab * vocab; From d1b8b215d51df9c5e17fd5921eb3d05f419c3fae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 16:16:19 +0300 Subject: [PATCH 36/42] llama.vim : fix repetitions of existing text --- examples/llama.vim | 62 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 3b115c49ccb30..3d328556304ee 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -33,7 +33,7 @@ " " :call llama#init() " -" more info: https://github.com/ggerganov/llama.cpp/pull/9787/files +" more info: https://github.com/ggerganov/llama.cpp/pull/9787 " " colors (adjust to your liking) @@ -46,7 +46,7 @@ highlight llama_hl_info guifg=#77ff2f " n_prefix: number of lines before the cursor location to include in the prefix " n_suffix: number of lines after the cursor location to include in the suffix " n_predict: max number of tokens to predict -" t_max_prompt_ms: max alloted time for the prompt generation (TODO: not yet supported) +" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) " t_max_predict_ms: max alloted time for the prediction " show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline) " auto_fim: trigger FIM completion automatically on cursor movement @@ -99,8 +99,8 @@ function! llama#init() return endif - let s:pos_x = 0 " cursor position upon start of completion - let s:pos_y = 0 + let s:pos_x = 0 " cursor position upon start of completion + let s:pos_y = 0 let s:line_cur = '' @@ -329,8 +329,7 @@ function! llama#fim_inline(is_auto, on_hold) abort endfunction " the main FIM call -" takes local context around the cursor and sends it together with the extra context -" to the llama.cpp server for completion +" takes local context around the cursor and sends it together with the extra context to the server for completion function! llama#fim(is_auto, on_hold) abort " we already have a suggestion for the current cursor position if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) @@ -569,13 +568,50 @@ function! s:fim_on_stdout(job_id, data, event) dict endif let s:pos_dx = len(s:content[-1]) - let s:content[-1] .= s:line_cur_suffix - " truncate the suggestion if it repeats the following lines - if len(s:content) > 1 && s:content[1] == getline(s:pos_y + 1) - let s:content = [s:content[0]] + " NOTE: the following is logic for discarding predictions that repeat existing text + " the code is quite ugly and there is very likely a simpler and more canonical way to implement this + " + " still, I wonder if there is some better way that avoids having to do these special hacks? + " on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would + " start generating whatever we have given it via the extra context. 
but on the other hand, it's not very + " helpful to re-generate the same code that is already there + + " truncate the suggestion if the first line is empty + if s:content[0] == "" + let s:content = [""] + endif + + " truncate the suggestion if it repeats the suffix + if len(s:content) == 1 && s:content[0] == s:line_cur_suffix + let s:content = [""] endif + " find the first non-empty line (strip whitespace) + let l:cmp_y = s:pos_y + 1 + while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$' + let l:cmp_y += 1 + endwhile + + if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y) + " truncate the suggestion if it repeats the next line + if len(s:content) == 1 + let s:content = [""] + endif + + " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1 + if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1] + let s:content = [""] + endif + + " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1) + if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n") + let s:content = [""] + endif + endif + + let s:content[-1] .= s:line_cur_suffix + call llama#fim_cancel() " display virtual text with the suggestion @@ -595,9 +631,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | context: %d / %d / r=%d / q=%d / e=%d | prompt: %d (%.2f ms, %.2f t/s) | predict: %d (%.2f ms, %.2f t/s) | total: %.2f ms", + let l:info = printf("%s | c: %d / %d, r: %d, e: %d, q: %d | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), len(s:ring_queued), s:ring_n_evict, + \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, len(s:ring_queued), \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) @@ -627,7 +663,7 @@ function! s:fim_on_stdout(job_id, data, event) dict \ 'virt_text_win_col': virtcol('.') \ }) - " setup accept/cancel events + " setup accept shortcuts inoremap :call llama#fim_accept(v:false) inoremap :call llama#fim_accept(v:true) From 1600d846b6e4ea528384eb6fce8dbad63a056e4c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 17 Oct 2024 22:09:47 +0300 Subject: [PATCH 37/42] llama.vim : complete only whithin the local scope [no ci] --- examples/llama.vim | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 3d328556304ee..f5cbef624e6c9 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -73,9 +73,9 @@ let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', \ 'n_prefix': 256, \ 'n_suffix': 8, - \ 'n_predict': 64, + \ 'n_predict': 128, \ 't_max_prompt_ms': 500, - \ 't_max_predict_ms': 200, + \ 't_max_predict_ms': 1000, \ 'show_info': 2, \ 'auto_fim': v:true, \ 'max_line_suffix': 8, @@ -394,12 +394,16 @@ function! llama#fim(is_auto, on_hold) abort \ }) endfor + " the indentation of the current line + let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + let l:request = json_encode({ \ 'input_prefix': l:prefix, \ 'input_suffix': l:suffix, \ 'input_extra': l:extra_context, \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, + \ 'n_indent': l:indent, \ 'penalty_last_n': 0, \ 'top_k': 40, \ 'top_p': 0.99, @@ -567,8 +571,6 @@ function! 
s:fim_on_stdout(job_id, data, event) dict return endif - let s:pos_dx = len(s:content[-1]) - " NOTE: the following is logic for discarding predictions that repeat existing text " the code is quite ugly and there is very likely a simpler and more canonical way to implement this " @@ -578,7 +580,12 @@ function! s:fim_on_stdout(job_id, data, event) dict " helpful to re-generate the same code that is already there " truncate the suggestion if the first line is empty - if s:content[0] == "" + if len(s:content) == 1 && s:content[0] == "" + let s:content = [""] + endif + + " ... and the next lines are repeated + if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1) let s:content = [""] endif @@ -610,6 +617,17 @@ function! s:fim_on_stdout(job_id, data, event) dict endif endif + " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix + "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*')) + "for i in range(1, len(s:content) - 1) + " if strlen(matchstr(s:content[i], '^\s*')) < l:indent + " let s:content = s:content[:i - 1] + " break + " endif + "endfor + + let s:pos_dx = len(s:content[-1]) + let s:content[-1] .= s:line_cur_suffix call llama#fim_cancel() From 6bb6e6dd8094c426b3cdc3077fe34e239e1d1835 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 09:47:14 +0300 Subject: [PATCH 38/42] llama.vim : display ring capacity [no ci] --- examples/llama.vim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index f5cbef624e6c9..16434e570247c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -649,9 +649,9 @@ function! s:fim_on_stdout(job_id, data, event) dict \ l:n_cached, l:n_ctx \ ) else - let l:info = printf("%s | c: %d / %d, r: %d, e: %d, q: %d | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", + let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms", \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim', - \ l:n_cached, l:n_ctx, len(s:ring_chunks), s:ring_n_evict, len(s:ring_queued), + \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued), \ l:n_prompt, l:t_prompt_ms, l:s_prompt, \ l:n_predict, l:t_predict_ms, l:s_predict, \ 1000.0 * reltimefloat(reltime(s:t_fim_start)) From fe78c3939911975c9508653162efb4dbd1a33474 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 13:48:00 +0300 Subject: [PATCH 39/42] llama.vim : fix large chunk accept + comments [no ci] --- examples/llama.vim | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index 16434e570247c..e06cdff38a6b5 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 64 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 " " --batch-size [512, model max context] " @@ -29,6 +29,12 @@ " chunks the batch into smaller chunks for faster processing " depends on the specific hardware. 
use llama-bench to profile and determine the best size " +" --cache-reuse (ge:llama_config.n_predict, 1024] +" +" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict +" using non-zero value enables context reuse on the server side which dramatically improves the performance at +" large contexts. a value of 256 should be good for all cases +" " run this once to initialise llama.vim: " " :call llama#init() @@ -43,8 +49,8 @@ highlight llama_hl_info guifg=#77ff2f " general parameters: " " endpoint: llama.cpp server endpoint -" n_prefix: number of lines before the cursor location to include in the prefix -" n_suffix: number of lines after the cursor location to include in the suffix +" n_prefix: number of lines before the cursor location to include in the local prefix +" n_suffix: number of lines after the cursor location to include in the local suffix " n_predict: max number of tokens to predict " t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported) " t_max_predict_ms: max alloted time for the prediction @@ -72,7 +78,7 @@ highlight llama_hl_info guifg=#77ff2f let s:default_config = { \ 'endpoint': 'http://127.0.0.1:8012/infill', \ 'n_prefix': 256, - \ 'n_suffix': 8, + \ 'n_suffix': 64, \ 'n_predict': 128, \ 't_max_prompt_ms': 500, \ 't_max_predict_ms': 1000, @@ -463,7 +469,7 @@ function! llama#fim_accept(first_line) " move the cursor to the end of the accepted text if !a:first_line && len(s:content) > 1 - call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx) + call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1) else call cursor(s:pos_y, s:pos_x + len(s:content[0])) endif From b8efb0725de3b16bef35ac05761bcd07e7e0de46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 18 Oct 2024 22:45:23 +0300 Subject: [PATCH 40/42] llama.vim : minor [no ci] --- examples/llama.vim | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index e06cdff38a6b5..cf915ff4ec48c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -144,6 +144,7 @@ function! llama#init() "autocmd CursorHoldI * call llama#fim(v:true, v:true) endif + " gather chunks upon yanking autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif " gather chunks upon entering/leaving a buffer @@ -310,7 +311,6 @@ function! s:ring_update() \ 'input_extra': l:extra_context, \ 'prompt': "", \ 'n_predict': 1, - \ 'penalty_last_n': 0, \ 'temperature': 0.0, \ 'stream': v:false, \ 'samplers': ["temperature"], @@ -410,7 +410,6 @@ function! llama#fim(is_auto, on_hold) abort \ 'prompt': l:prompt, \ 'n_predict': g:llama_config.n_predict, \ 'n_indent': l:indent, - \ 'penalty_last_n': 0, \ 'top_k': 40, \ 'top_p': 0.99, \ 'stream': v:false, From 32927e68b7fbfd6dfa82e531d186f1b6b22612ae Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 21 Oct 2024 12:32:38 +0300 Subject: [PATCH 41/42] llama.vim : remove on-hold code + fixes [no ci] --- examples/llama.vim | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/llama.vim b/examples/llama.vim index cf915ff4ec48c..24e4a7cd18690 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -131,7 +131,7 @@ function! llama#init() augroup llama autocmd! 
- autocmd InsertEnter * inoremap llama#fim_inline(v:false, v:false) + autocmd InsertEnter * inoremap llama#fim_inline(v:false) autocmd InsertLeavePre * call llama#fim_cancel() autocmd CursorMoved * call s:on_move() @@ -139,9 +139,7 @@ function! llama#init() autocmd CompleteChanged * call llama#fim_cancel() if g:llama_config.auto_fim - autocmd InsertEnter * call llama#fim(v:true, v:false) - autocmd CursorMovedI * call llama#fim(v:true, v:false) - "autocmd CursorHoldI * call llama#fim(v:true, v:true) + autocmd CursorMovedI * call llama#fim(v:true) endif " gather chunks upon yanking @@ -329,16 +327,17 @@ function! s:ring_update() endfunction " necessary for 'inoremap ' -function! llama#fim_inline(is_auto, on_hold) abort - call llama#fim(a:is_auto, a:on_hold) +function! llama#fim_inline(is_auto) abort + call llama#fim(a:is_auto) return '' endfunction " the main FIM call " takes local context around the cursor and sends it together with the extra context to the server for completion -function! llama#fim(is_auto, on_hold) abort +function! llama#fim(is_auto) abort " we already have a suggestion for the current cursor position - if a:on_hold && (s:hint_shown || (s:pos_x == col('.') - 1 && s:pos_y == line('.'))) + if s:hint_shown && !a:is_auto + call llama#fim_cancel() return endif @@ -352,7 +351,7 @@ function! llama#fim(is_auto, on_hold) abort endif let s:t_fim_start = reltime() - let s:timer_fim = timer_start(600, {-> llama#fim(v:true, v:true)}) + let s:timer_fim = timer_start(600, {-> llama#fim(v:true)}) return endif @@ -512,6 +511,11 @@ function! s:fim_on_stdout(job_id, data, event) dict return endif + " show the suggestion only in insert mode + if mode() !=# 'i' + return + endif + let s:pos_x = self.pos_x let s:pos_y = self.pos_y From 8fb51545477f2e1ef636a7c10f47e345e4b5d985 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 21 Oct 2024 15:57:15 +0300 Subject: [PATCH 42/42] llama.vim : minor [no ci] --- examples/llama.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.vim b/examples/llama.vim index 24e4a7cd18690..e75872cae0e9c 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -11,7 +11,7 @@ " " - Tab - accept the current suggestion " - Shift+Tab - accept just the first line of the segguestion -" - Ctrl+F - trigger FIM completion manually +" - Ctrl+F - toggle FIM completion manually " " make symlink or copy this file to ~/.config/nvim/autoload/llama.vim "
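
Usage sketch (reference only, not part of the patch series): one way to wire up the plugin after applying the series. It assumes examples/llama.vim has been copied or symlinked to ~/.config/nvim/autoload/llama.vim as the plugin header describes, and that llama-server is reachable at the endpoint below; the g:llama_config values simply mirror the defaults introduced by these patches, so adjust them to your setup.

" example init.vim snippet (all values are the series' defaults; the endpoint/port are assumptions)
let g:llama_config = {
    \ 'endpoint':         'http://127.0.0.1:8012/infill',
    \ 'n_prefix':         256,
    \ 'n_suffix':         64,
    \ 'n_predict':        128,
    \ 't_max_prompt_ms':  500,
    \ 't_max_predict_ms': 1000,
    \ 'show_info':        2,
    \ 'auto_fim':         v:true,
    \ 'max_line_suffix':  8,
    \ 'ring_n_chunks':    64,
    \ 'ring_chunk_size':  64,
    \ 'ring_scope':       1024,
    \ 'ring_update_ms':   1000,
    \ }

" initialise the plugin once at startup
call llama#init()

With this in place, suggestions are requested automatically while typing (auto_fim), Tab accepts the current suggestion, Shift+Tab accepts only its first line, and Ctrl+F toggles a completion manually.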