diff --git a/examples/demo_cffi.py b/examples/demo_cffi.py index 00d500b..27a761f 100644 --- a/examples/demo_cffi.py +++ b/examples/demo_cffi.py @@ -1,24 +1,77 @@ +# import os +# import sys +# sys.path.append(os.path.abspath('.')) + import psutil -from llama.cffi import llama_generate, LlamaOptions - - -options = LlamaOptions( - no_display_prompt=True, - threads=psutil.cpu_count(logical=False), - # ctx_size=8192, - ctx_size=4 * 4096, - predict=512, - flash_attn=True, - cont_batching=True, - simple_io=True, - # log_disable=True, - hf_repo='bartowski/Phi-3.1-mini-128k-instruct-GGUF', - hf_file='Phi-3.1-mini-128k-instruct-Q4_K_M.gguf', - # hf_file='Phi-3.1-mini-128k-instruct-IQ2_M.gguf', - chat_template='chatml', - # prompt='<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n', - prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n', -) - -for chunk in llama_generate(options): - print(chunk, flush=True, end='') +from llama.llama_cli_cffi import llama_generate, Model, Options + +from demo_models import models + + +def demo1(): + options = Options( + no_display_prompt=True, + threads=psutil.cpu_count(logical=False), + ctx_size=8192, + predict=512, + flash_attn=True, + cont_batching=True, + simple_io=True, + log_disable=True, + hf_repo=models[0].hf_repo, + hf_file=models[0].hf_file, + prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n', + ) + + for chunk in llama_generate(options): + print(chunk, flush=True, end='') + + print() + + +def demo2(): + options = Options( + no_display_prompt=True, + threads=psutil.cpu_count(logical=False), + ctx_size=2048, + predict=-2, + flash_attn=True, + cont_batching=True, + simple_io=True, + log_disable=True, + hf_repo=models[1].hf_repo, + hf_file=models[1].hf_file, + prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n', + ) + + for chunk in llama_generate(options): + print(chunk, flush=True, end='') + + print() + + +def demo3(): + options = Options( + no_display_prompt=True, + threads=psutil.cpu_count(logical=False), + ctx_size=2048, + predict=-2, + flash_attn=True, + cont_batching=True, + simple_io=True, + log_disable=True, + hf_repo=models[2].hf_repo, + hf_file=models[2].hf_file, + prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n', + ) + + for chunk in llama_generate(options): + print(chunk, flush=True, end='') + + print() + + +if __name__ == '__main__': + demo1() + demo2() + demo3() diff --git a/examples/demo_ctypes.py b/examples/demo_ctypes.py index c759ef8..32ceb69 100644 --- a/examples/demo_ctypes.py +++ b/examples/demo_ctypes.py @@ -5,23 +5,7 @@ import psutil from llama.llama_cli_ctypes import llama_generate, Model, Options -models = [ - Model( - 'microsoft/Phi-3-mini-128k-instruct', - 'bartowski/Phi-3.1-mini-128k-instruct-GGUF', - 'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf', - ), - Model( - 'Qwen/Qwen2-1.5B-Instruct', - 'Qwen/Qwen2-1.5B-Instruct-GGUF', - 'qwen2-1_5b-instruct-q4_k_m.gguf', - ), - Model( - 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', - 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF', - 'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf', - ), -] +from demo_models import models def demo1(): diff --git a/examples/demo_models.py b/examples/demo_models.py new file mode 100644 index 0000000..dc712e8 --- /dev/null +++ b/examples/demo_models.py @@ -0,0 +1,19 @@ +from llama.llama_cli_model import Model + +models = [ + 
Model( + 'microsoft/Phi-3-mini-128k-instruct', + 'bartowski/Phi-3.1-mini-128k-instruct-GGUF', + 'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf', + ), + Model( + 'Qwen/Qwen2-1.5B-Instruct', + 'Qwen/Qwen2-1.5B-Instruct-GGUF', + 'qwen2-1_5b-instruct-q4_k_m.gguf', + ), + Model( + 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', + 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF', + 'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf', + ), +] \ No newline at end of file diff --git a/llama/llama_cli_cffi.py b/llama/llama_cli_cffi.py index f3389cf..dc48829 100644 --- a/llama/llama_cli_cffi.py +++ b/llama/llama_cli_cffi.py @@ -3,25 +3,41 @@ import json import ctypes from queue import Queue -from copy import deepcopy from typing import Iterator from threading import Thread from functools import partial from huggingface_hub import hf_hub_download +from .llama_cli_model import Model from .llama_cli_options import Options, convert_options_to_bytes from ._llama_cli import lib, ffi -FPRINTF_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p) -FFLUSH_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p) +_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p) +_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int) +def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None): + chunk = chunk.decode() + print(chunk, flush=True, end='') + + +def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int: + return 0 def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None): - r = lib.llama_cli_main(argc, argv) + _llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata)) + _llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata)) + + _llama_yield_token_address = ctypes.cast(_llama_yield_token, ctypes.c_void_p).value + _llama_should_stop_address = ctypes.cast(_llama_should_stop, ctypes.c_void_p).value + + cffi__llama_yield_token_callback = ffi.cast('void (*_llama_yield_token_t)(const char * token)', _llama_yield_token_address) + cffi__llama_should_stop_callback = ffi.cast('int (*_llama_should_stop_t)(void)', _llama_should_stop_address) + + r = lib._llama_cli_main(argc, argv, cffi__llama_yield_token_callback, cffi__llama_should_stop_callback, 1) assert r == 0 if queue is not None: @@ -48,32 +64,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None: else: queue = Queue() - # get bos, eos, and eot from metedata - metadata_options = deepcopy(options) - metadata_options.log_disable = True - metadata_argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(metadata_options) - metadata_argv = [ffi.new('char[]', n) for n in metadata_argv] - metadata_argc = len(metadata_argv) - - c_metadata: 'const char*' = lib.llama_get_metadata_as_json(metadata_argc, metadata_argv) - metadata: bytes = ffi.string(c_metadata) - lib.llama_free_metadata_as_json(c_metadata) - metadata: str = metadata.decode('utf-8') - metadata: dict = json.loads(metadata) - print(f'{metadata = }') - - # intercept token generation - fprintf = FPRINTF_FUNC(partial(fprintf_func, queue=queue, metadata=metadata)) - fflush = FFLUSH_FUNC(fflush_func) - - fprintf_address = ctypes.cast(fprintf, ctypes.c_void_p).value - fflush_address = ctypes.cast(fflush, ctypes.c_void_p).value - - cffi_fprintf_callback = ffi.cast('int (*func)(FILE*, const char* format, ...)', fprintf_address) - cffi_fflush_callback = ffi.cast('int (*func)(FILE*)', 
fflush_address) - - lib.llama_set_fprintf(cffi_fprintf_callback) - lib.llama_set_fflush(cffi_fflush_callback) + metadata: dict = {} argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options) argv = [ffi.new('char[]', n) for n in argv] diff --git a/llama/llama_cli_ctypes.py b/llama/llama_cli_ctypes.py index fa9b63c..d90972c 100644 --- a/llama/llama_cli_ctypes.py +++ b/llama/llama_cli_ctypes.py @@ -2,7 +2,7 @@ import os import json -from ctypes import * +import ctypes from queue import Queue from typing import Iterator from threading import Thread @@ -17,13 +17,13 @@ module_path = os.path.abspath(__file__) module_dir = os.path.dirname(module_path) llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so') -lib = CDLL(llama_cli_lib_path) +lib = ctypes.CDLL(llama_cli_lib_path) -_LLAMA_YIELD_TOKEN_T = CFUNCTYPE(None, c_char_p) -_LLAMA_SHOULD_STOP_T = CFUNCTYPE(c_int) +_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p) +_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int) -lib._llama_cli_main.argtypes = [c_int, POINTER(c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, c_int] -lib._llama_cli_main.restype = c_int +lib._llama_cli_main.argtypes = [ctypes.c_int, ctypes.POINTER(ctypes.c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, ctypes.c_int] +lib._llama_cli_main.restype = ctypes.c_int def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None): @@ -47,7 +47,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None): callback(None) -def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[str] | None: +def llama_generate(options: Options, callback=None) -> Iterator[str] | None: # check hf_repo, hf_file if options.hf_repo and options.hf_file: options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file) @@ -69,7 +69,7 @@ def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[s argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options) argc = len(argv) - argv = (c_char_p * argc)(*argv) + argv = (ctypes.c_char_p * argc)(*argv) if callback: _llama_cli_main(argc, argv, queue, callback, metadata) diff --git a/scripts/build.py b/scripts/build.py index 57512ca..06caa96 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -10,13 +10,9 @@ ffibuilder = FFI() ffibuilder.cdef(''' - void llama_set_stdout(FILE* f); - void llama_set_stderr(FILE* f); - void llama_set_fprintf(int (*func)(FILE*, const char* format, ...)); - void llama_set_fflush(int (*func)(FILE*)); - const char* llama_get_metadata_as_json(int argc, char ** argv); - void llama_free_metadata_as_json(const char * c_output); - int llama_cli_main(int argc, char ** argv); + typedef void (*_llama_yield_token_t)(const char * token); + typedef int (*_llama_should_stop_t)(void); + int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot); ''') ffibuilder.set_source( @@ -24,25 +20,42 @@ ''' #include - void llama_set_stdout(FILE* f); - void llama_set_stderr(FILE* f); - void llama_set_fprintf(int (*func)(FILE*, const char* format, ...)); - void llama_set_fflush(int (*func)(FILE*)); - const char* llama_get_metadata_as_json(int argc, char ** argv); - void llama_free_metadata_as_json(const char * c_output); - int llama_cli_main(int argc, char ** argv); + typedef void (*_llama_yield_token_t)(const char * token); + typedef int (*_llama_should_stop_t)(void); + int _llama_cli_main(int argc, char ** argv, 
_llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot); ''', libraries=['stdc++'], - extra_objects=['../llama.cpp/libllama-cli.a'], + extra_objects=['../llama.cpp/llama-cli.a'], ) def build(*args, **kwargs): - # subprocess.run(['rm', '-rf', 'llama.cpp'], check=True) - # subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True) - # subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_shared_library_1.patch'], check=True) - # subprocess.run(['patch', 'llama.cpp/Makefile', 'makefile_static_library_0.patch'], check=True) + env = os.environ.copy() + + subprocess.run(['git', 'clone', 'https://github.com/ggerganov/llama.cpp.git'], check=True) + subprocess.run(['patch', 'llama.cpp/examples/main/main.cpp', 'main_3.patch'], check=True) + subprocess.run(['patch', 'llama.cpp/Makefile', 'Makefile_3.patch'], check=True) + + if 'PYODIDE' in env and env['PYODIDE'] == '1': + env['CXXFLAGS'] += ' -msimd128 -fno-rtti -DNDEBUG -flto=full -s INITIAL_MEMORY=2GB -s MAXIMUM_MEMORY=4GB -s ALLOW_MEMORY_GROWTH ' + env['UNAME_M'] = 'wasm' + + subprocess.run(['make', '-C', 'llama.cpp', '-j', 'llama-cli-shared', 'llama-cli-static', 'GGML_NO_OPENMP=1', 'GGML_NO_LLAMAFILE=1'], check=True, env=env) + + # cffi + ffibuilder.compile(tmpdir='build', verbose=True) + + # ctypes + for file in glob.glob('build/*.so') + glob.glob('llama.cpp/*.so'): + shutil.move(file, 'llama/') + + for file in glob.glob('build/*.dll') + glob.glob('llama.cpp/*.dll'): + shutil.move(file, 'llama/') + for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'): + shutil.move(file, 'llama/') + + ''' # cffi env = os.environ.copy() env['CXXFLAGS'] = '-DSHARED_LIB' @@ -73,6 +86,7 @@ def build(*args, **kwargs): for file in glob.glob('build/*.dylib') + glob.glob('llama.cpp/*.dylib'): shutil.move(file, 'llama/') + ''' if __name__ == '__main__': diff --git a/scripts/clean.py b/scripts/clean.py index d5a7736..8a66cdc 100644 --- a/scripts/clean.py +++ b/scripts/clean.py @@ -3,9 +3,9 @@ def clean(): - files = glob.glob('llama/*.so') + files = glob.glob('llama/*.so') + glob.glob('llama/*.a') + glob.glob('llama/*.dylib') + glob.glob('llama/*.dll') subprocess.run(['rm', '-fv'] + files, check=True) subprocess.run(['rm', '-fr', 'build'], check=True) subprocess.run(['rm', '-fr', 'dist'], check=True) - # subprocess.run(['rm', '-fr', 'llama.cpp'], check=True) + subprocess.run(['rm', '-fr', 'llama.cpp'], check=True) subprocess.run(['rm', '-fr', 'wheelhouse'], check=True)
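
The demos in this diff only exercise the iterator form of `llama_generate`; both bindings also accept a `callback` argument, and the ctypes path signals end-of-stream with `callback(None)`. Below is a minimal sketch, not part of the diff, of driving `llama.llama_cli_ctypes.llama_generate` in callback mode, reusing the `demo_models.models` list and the options from `demo3`. It assumes callback mode runs synchronously and that the chunk payload reaching the callback may be either `str` or `bytes` (the cffi `_llama_yield_token_func` in this diff decodes bytes before printing, so the type is not pinned down), so the handler normalizes defensively.

```python
# Hedged sketch, not part of the diff: callback ("push") mode of llama_generate
# from the ctypes binding, next to the iterator ("pull") mode the demos use.
# Assumes the package is built (llama/llama-cli.so present) and that the
# callback receives generated chunks followed by None at end of stream,
# as read from _llama_cli_main() in llama_cli_ctypes.py.
import psutil

from llama.llama_cli_ctypes import llama_generate, Options

from demo_models import models


options = Options(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    ctx_size=2048,
    predict=-2,
    flash_attn=True,
    cont_batching=True,
    simple_io=True,
    log_disable=True,
    hf_repo=models[2].hf_repo,
    hf_file=models[2].hf_file,
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)


def on_chunk(chunk):
    # None marks the end of generation; chunks may arrive as str or bytes
    # depending on the binding, so normalize before printing.
    if chunk is None:
        print()
        return

    if isinstance(chunk, bytes):
        chunk = chunk.decode()

    print(chunk, flush=True, end='')


# Push mode: chunks are delivered to the callback as they are generated.
llama_generate(options, callback=on_chunk)

# Pull mode: the same options, consumed as an iterator (as in the demos).
for chunk in llama_generate(options):
    print(chunk, flush=True, end='')

print()
```

The iterator form stays the simpler fit for scripts; the callback form is what the `_llama_yield_token_t` / `_llama_should_stop_t` plumbing added to `_llama_cli_main` in this diff is designed to feed.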