Showing 8 changed files with 168 additions and 41 deletions.
@@ -0,0 +1,27 @@
# llama-cpp-cffi

## Build

```bash
#
# setup venv
#
python -m venv venv
source venv/bin/activate
pip install poetry

# x86_64
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch x86_64 .

# aarch64
docker run --rm --privileged linuxkit/binfmt:v0.8
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch aarch64 .

# pyodide, pyscript, wasm (NOTE: cannot be published to PyPI)
# poetry run cibuildwheel --output-dir wheelhouse --platform pyodide .

#
# publish
#
poetry publish --dist-dir wheelhouse
```
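A quick way to check a freshly built wheel from `wheelhouse/` is to install it into the venv and import one of the backends. A minimal smoke-test sketch, assuming the `llama.llama_cli_cffi_cpu` module layout described in the README (adjust for the backend you built):

```python
# smoke_test.py -- minimal post-build check (a sketch, assuming the
# llama.llama_cli_cffi_cpu module layout described in the README).
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options

# A successful import means the wheel installed and its bundled native
# library could be loaded; print the symbols as a trivial confirmation.
print(llama_generate, Model, Options)
```

Run it with something like `pip install wheelhouse/*.whl && python smoke_test.py` inside the venv.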
@@ -1,45 +1,73 @@
# llama-cpp-cffi

<!--
[![Build][build-image]]()
[![Status][status-image]][pypi-project-url]
[![Stable Version][stable-ver-image]][pypi-project-url]
[![Coverage][coverage-image]]()
[![Python][python-ver-image]][pypi-project-url]
[![License][mit-image]][mit-url]
-->
[![Downloads](https://img.shields.io/pypi/dm/llama-cli-cffi)](https://pypistats.org/packages/llama-cli-cffi)
[![Supported Versions](https://img.shields.io/pypi/pyversions/llama-cli-cffi)](https://pypi.org/project/llama-cli-cffi)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

**Python** binding for [llama.cpp](https://github.com/ggerganov/llama.cpp) using **cffi** and **ctypes**. Supports **CPU** and **CUDA 12.5** execution.

## Install

```bash
pip install llama-cli-cffi
```

## Example

```python
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options

from llama.formatter import get_config

model = Model(
    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
]

options = Options(
    ctx_size=config.max_position_embeddings,
    predict=-2,
    model=model,
    prompt=messages,
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')

# newline
print()
```

## Demos

```bash
#
# run demos
#
python -B examples/demo_cffi_cpu.py
python -B examples/demo_cffi_cuda_12_5.py

python -B examples/demo_ctypes_cpu.py
python -B examples/demo_ctypes_cuda_12_5.py

# python -m http.server -d examples/demo_pyonide -b "0.0.0.0" 5000
```
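The commented-out imports in the example show that the CPU and CUDA 12.5 builds expose the same `llama_generate`, `Model`, and `Options` names, so a script can pick a backend at import time. A minimal sketch; the try/except fallback is illustrative, not part of the package:

```python
# Prefer the CUDA 12.5 backend and fall back to the CPU backend when the
# CUDA wheel or driver is unavailable; both modules expose the same names.
try:
    from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
except Exception:  # ImportError, or a load failure from the native library
    from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
```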
@@ -0,0 +1,28 @@
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
from llama.formatter import get_config

model = Model(
    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
]

options = Options(
    ctx_size=config.max_position_embeddings,
    predict=-2,
    model=model,
    prompt=messages,
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')

# newline
print()
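Because `llama_generate(options)` yields text chunks, the completion can also be collected instead of streamed; a small variation on the demo above, using the same `options` object:

```python
# Join the streamed chunks into a single string rather than printing
# them as they arrive.
response = ''.join(llama_generate(options))
print(response)
```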
@@ -0,0 +1,39 @@
# import os
# import sys
# sys.path.append(os.path.abspath('.'))

from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
from llama.formatter import get_config

from demo_models import models


def demo_model(model: Model, messages: list[dict]):
    config = get_config(model.creator_hf_repo)

    options = Options(
        # Phi-3-mini-128k gets a 32k context; other models use their full context.
        ctx_size=32 * 1024 if model.creator_hf_repo == 'microsoft/Phi-3-mini-128k-instruct' else config.max_position_embeddings,
        predict=-2,
        # Offload 19 layers for Phi-3-mini-128k, otherwise all (99) layers.
        gpu_layers=19 if model.creator_hf_repo == 'microsoft/Phi-3-mini-128k-instruct' else 99,
        # log_disable=False,
        model=model,
        prompt=messages,
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


if __name__ == '__main__':
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
    ]

    for model in models:
        config = get_config(model.creator_hf_repo)
        print(f'{model = }, {config.max_position_embeddings = }')
        demo_model(model, messages)
        print('-' * 80)
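This demo imports `models` from a local `demo_models` module that is not shown in this commit. A hypothetical sketch of its shape, inferred from the `Model(...)` call in the CPU demo above; the actual file in the repository may differ:

```python
# demo_models.py -- hypothetical contents, inferred from the Model(...)
# call in demo_cffi_cpu.py; the real file in the repo may differ.
from llama.llama_cli_cffi_cpu import Model

models = [
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
```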