Commit 05dde60: cffi cpu/cuda 12.5

mtasic85 committed Jul 18, 2024
1 parent 86916bf

Showing 8 changed files with 168 additions and 41 deletions.
27 changes: 27 additions & 0 deletions BUILD.md
@@ -0,0 +1,27 @@
# llama-cpp-cffi

## Build

```bash
#
# setup venv
#
python -m venv venv
source venv/bin/activate
pip install poetry

# x86_64
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch x86_64 .

# aarch64
docker run --rm --privileged linuxkit/binfmt:v0.8
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch aarch64 .

# pyodide, pyscript, wasm (NOTE: cannot be published to PyPI)
# poetry run cibuildwheel --output-dir wheelhouse --platform pyodide .

#
# publish
#
poetry publish --dist-dir wheelhouse
```
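
For a quick sanity check after a build, the freshly built CPU wheel can be installed from `wheelhouse/` into the active venv and imported; a minimal sketch (the module and symbol names are taken from the examples in this commit, everything else is an assumption):

```python
# post-build smoke test: verifies that the CPU backend and its native extension load
# (assumes the wheel built above has been pip-installed into this venv)
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options

print('llama_cli_cffi_cpu loaded:', callable(llama_generate), Model.__name__, Options.__name__)
```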
4 changes: 2 additions & 2 deletions Makefile_3.patch
@@ -27,7 +27,7 @@
+ LIB_CXXFLAGS := -DLLAMA_LIB
+endif
+
+LIB_NAME := llama-cli.$(LIB_EXT)
+LIB_NAME := llama_cli.$(LIB_EXT)
+
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
@@ -44,7 +44,7 @@
+llama-cli-static: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llama-cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+ ar rcs llama_cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
84 changes: 56 additions & 28 deletions README.md
@@ -1,45 +1,73 @@
# llama-cpp-cffi

Python binding for llama.cpp using cffi
<!--
[![Build][build-image]]()
[![Status][status-image]][pypi-project-url]
[![Stable Version][stable-ver-image]][pypi-project-url]
[![Coverage][coverage-image]]()
[![Python][python-ver-image]][pypi-project-url]
[![License][mit-image]][mit-url]
-->
[![Downloads](https://img.shields.io/pypi/dm/llama-cli-cffi)](https://pypistats.org/packages/llama-cli-cffi)
[![Supported Versions](https://img.shields.io/pypi/pyversions/llama-cli-cffi)](https://pypi.org/project/llama-cli-cffi)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

## Build
**Python** binding for [llama.cpp](https://github.com/ggerganov/llama.cpp) using **cffi** and **ctypes**. Supports **CPU** and **CUDA 12.5** execution.

## Install

```bash
#
# setup venv
#
python -m venv venv
source venv/bin/activate
pip install poetry
pip install llama-cli-cffi
```

#
# build
#
## Example

# x86_64
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch x86_64 .
```python
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options

# aarch64
docker run --rm --privileged linuxkit/binfmt:v0.8
poetry run cibuildwheel --output-dir wheelhouse --platform linux --arch aarch64 .
from llama.formatter import get_config

# pyodide, pyscript, wasm (NOTE: cannot be published to PyPI)
# poetry run cibuildwheel --output-dir wheelhouse --platform pyodide .
model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

#
# publish
#
poetry publish --dist-dir wheelhouse
config = get_config(model.creator_hf_repo)

messages = [
{'role': 'system', 'content': 'You are a helpful assistant.'},
{'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
]

options = Options(
ctx_size=config.max_position_embeddings,
predict=-2,
model=model,
prompt=messages,
)

for chunk in llama_generate(options):
print(chunk, flush=True, end='')

# newline
print()
```

## Demos

```BASH
#
# run demos
#
python -B examples/demo_cffi.py
python -B examples/demo_cffi_cpu.py
python -B examples/demo_cffi_cuda_12_5.py

python -B examples/demo_ctypes_cpu.py
python -B examples/demo_ctypes_cuda_12_5.py
python -m http.server -d examples/demo_pyonide -b "0.0.0.0" 5000
```

```bash
make -j llama-cli-shared llama-cli-static GGML_NO_OPENMP=1 GGML_NO_LLAMAFILE=1
```
# python -m http.server -d examples/demo_pyonide -b "0.0.0.0" 5000
```
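
The CUDA 12.5 backend is used the same way; below is a sketch of the README example adapted to it, with `gpu_layers` borrowed from `examples/demo_cffi_cuda_12_5.py` in this commit (the value 99 offloads all layers, and the single-message prompt is illustrative):

```python
# sketch: the README example on the CUDA 12.5 backend; gpu_layers mirrors
# examples/demo_cffi_cuda_12_5.py, the single-message prompt is illustrative
from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
from llama.formatter import get_config

model = Model(
    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)

options = Options(
    ctx_size=config.max_position_embeddings,
    predict=-2,
    gpu_layers=99,  # offload all layers to the GPU
    model=model,
    prompt=[{'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'}],
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')

print()
```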
28 changes: 28 additions & 0 deletions examples/demo_0.py
@@ -0,0 +1,28 @@
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
from llama.formatter import get_config

model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)

messages = [
{'role': 'system', 'content': 'You are a helpful assistant.'},
{'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
]

options = Options(
ctx_size=config.max_position_embeddings,
predict=-2,
model=model,
prompt=messages,
)

for chunk in llama_generate(options):
print(chunk, flush=True, end='')

# newline
print()
39 changes: 39 additions & 0 deletions examples/demo_cffi_cuda_12_5.py
@@ -0,0 +1,39 @@
# import os
# import sys
# sys.path.append(os.path.abspath('.'))

from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
from llama.formatter import get_config

from demo_models import models


def demo_model(model: Model, messages: list[dict]):
config = get_config(model.creator_hf_repo)

options = Options(
ctx_size=32 * 1024 if model.creator_hf_repo == 'microsoft/Phi-3-mini-128k-instruct' else config.max_position_embeddings,
predict=-2,
gpu_layers=19 if model.creator_hf_repo == 'microsoft/Phi-3-mini-128k-instruct' else 99,
# log_disable=False,
model=model,
prompt=messages,
)

for chunk in llama_generate(options):
print(chunk, flush=True, end='')

print()


if __name__ == '__main__':
messages = [
{'role': 'system', 'content': 'You are a helpful assistant.'},
{'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
]

for model in models:
config = get_config(model.creator_hf_repo)
print(f'{model = }, {config.max_position_embeddings = }')
demo_model(model, messages)
print('-' * 80)
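
This demo imports `models` from `examples/demo_models.py`, which is not included in this commit; a plausible minimal stand-in, reusing the TinyLlama entry from `examples/demo_0.py` (the Phi-3 entry implied by the options above is left out):

```python
# examples/demo_models.py -- hypothetical minimal stand-in, not taken from this commit
from llama.llama_cli_cffi_cuda_12_5 import Model

models = [
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]
```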
2 changes: 1 addition & 1 deletion llama/llama_cli_ctypes_cpu.py
@@ -19,7 +19,7 @@

module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli-cpu.so')
llama_cli_lib_path = os.path.join(module_dir, 'llama_cli_cpu.so')
lib = ctypes.CDLL(llama_cli_lib_path)

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
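
Only the library filename changes in this file, but for context, here is a minimal sketch of how these `CFUNCTYPE` types are presumably wired to the `_llama_cli_main` entry point declared in the cffi cdef in `scripts/build.py`; the argv values, the `_LLAMA_SHOULD_STOP_T` name, and the trailing `stop_on_bos_eos_eot=1` argument are assumptions:

```python
# sketch only: drive llama_cli_cpu.so directly with ctypes callbacks
# (argv values, _LLAMA_SHOULD_STOP_T, and stop_on_bos_eos_eot=1 are hypothetical)
import ctypes

lib = ctypes.CDLL('llama/llama_cli_cpu.so')

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)  # as in the diff above
_LLAMA_SHOULD_STOP_T = ctypes.CFUNCTYPE(ctypes.c_int)           # assumed counterpart


@_LLAMA_YIELD_TOKEN_T
def _yield_token(token):
    print(token.decode(), end='', flush=True)


@_LLAMA_SHOULD_STOP_T
def _should_stop():
    return 0  # never ask the native generation loop to stop early


argv = [b'llama-cli', b'-m', b'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf', b'-p', b'Hello']
argv_c = (ctypes.c_char_p * len(argv))(*argv)

# int _llama_cli_main(int argc, char ** argv, yield_token, should_stop, stop_on_bos_eos_eot)
lib._llama_cli_main(len(argv), argv_c, _yield_token, _should_stop, 1)
```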
2 changes: 1 addition & 1 deletion llama/llama_cli_ctypes_cuda_12_5.py
@@ -19,7 +19,7 @@

module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli-cuda-12_5.so')
llama_cli_lib_path = os.path.join(module_dir, 'llama_cli_cuda_12_5.so')
lib = ctypes.CDLL(llama_cli_lib_path)

_LLAMA_YIELD_TOKEN_T = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
23 changes: 14 additions & 9 deletions scripts/build.py
@@ -37,7 +37,7 @@ def build_cpu(*args, **kwargs):
# 'GGML_OPENBLAS=1',
], check=True, env=env)

subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cpu.so'], check=True)
subprocess.run(['mv', 'llama.cpp/llama_cli.so', 'llama/llama_cli_cpu.so'], check=True)

#
# cffi
@@ -59,10 +59,8 @@ def build_cpu(*args, **kwargs):
typedef int (*_llama_should_stop_t)(void);
int _llama_cli_main(int argc, char ** argv, _llama_yield_token_t _llama_yield_token, _llama_should_stop_t _llama_should_stop, int stop_on_bos_eos_eot);
''',
libraries=[
'stdc++',
],
extra_objects=['../llama.cpp/llama-cli.a'],
libraries=['stdc++'],
extra_objects=['../llama.cpp/llama_cli.a'],
)

ffibuilder.compile(tmpdir='build', verbose=True)
@@ -147,7 +145,7 @@ def build_cuda_12_5(*args, **kwargs):
'GGML_CUDA=1',
], check=True, env=env)

subprocess.run(['mv', 'llama.cpp/llama-cli.so', 'llama/llama-cli-cuda-12_5.so'], check=True)
subprocess.run(['mv', 'llama.cpp/llama_cli.so', 'llama/llama_cli_cuda_12_5.so'], check=True)

#
# cffi
@@ -171,8 +169,14 @@ def build_cuda_12_5(*args, **kwargs):
''',
libraries=[
'stdc++',
'cuda',
'cublas',
'culibos',
'cudart',
'cublasLt',
],
extra_objects=['../llama.cpp/llama-cli.a'],
library_dirs=[f'{cuda_output_dir}/dist/lib64'],
extra_objects=['../llama.cpp/llama_cli.a'],
)

ffibuilder.compile(tmpdir='build', verbose=True)
Expand All @@ -197,8 +201,9 @@ def build(*args, **kwargs):
build_cpu(*args, **kwargs)

# cuda 12.5
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)
if os.environ['AUDITWHEEL_ARCH'] == 'x86_64':
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)


if __name__ == '__main__':
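
The cffi branches above compile an API-mode extension from the declared cdef plus `llama_cli.a`. A rough sketch of how the resulting module might be consumed; the import path `llama._llama_cli_cpu` and the argv are assumptions, only the `_llama_cli_main` signature comes from the cdef in this file:

```python
# hypothetical consumer of the compiled cffi extension; the module path is assumed
from llama._llama_cli_cpu import ffi, lib

chunks = []


@ffi.callback('void(const char *)')
def _yield_token(token):
    chunks.append(ffi.string(token).decode())


@ffi.callback('int(void)')
def _should_stop():
    return 0  # keep generating


# keep references to the char[] buffers so they are not garbage-collected
argv = [ffi.new('char[]', a) for a in (b'llama-cli', b'-m', b'model.gguf', b'-p', b'Hello')]
argv_c = ffi.new('char *[]', argv)

lib._llama_cli_main(len(argv), argv_c, _yield_token, _should_stop, 1)
print(''.join(chunks))
```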
