0.1.2: OpenAI © Chat Completions API, CUDA >=7.5 support
mtasic85 committed Jul 19, 2024
1 parent 872b517 commit b5e465f
Showing 10 changed files with 122 additions and 40 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,21 @@
# CHANGELOG

## v0.1.2

Added:
- OpenAI © compatible Chat Completions API server.

Fixed:
- `options` is deepcopy-ed when passed to `llama_generate(options)`, so it can be reused.

Changed:
- Build for manylinux_2_28 and musllinux_1_2

## v0.1.1

Changed:
- Updated: huggingface-hub = "^0.24.0", setuptools = "^71.0.3"

## v0.1.0

Added:
26 changes: 23 additions & 3 deletions README.md
@@ -16,12 +16,24 @@

## Install

Basic library install:

```bash
pip install llama-cpp-cffi
```

In case you want [Chat Completions API by OpenAI ©](https://platform.openai.com/docs/overview) compatible API:

```bash
pip install llama-cpp-cffi[openai]
```

## Example

### Library Usage

`examples/demo_0.py`

```python
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
@@ -31,9 +43,9 @@ from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
from llama.formatter import get_config

model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)
@@ -55,6 +67,14 @@ for chunk in llama_generate(options):

# newline
print()

```

### OpenAI © compatible Chat Completions

`examples/demo_1.py`

```python
```

## Demos
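The `examples/demo_1.py` listing above is still empty in this commit. Purely for orientation, here is a hypothetical sketch of calling an OpenAI-compatible Chat Completions server with the `openai` client (the `openai` extra added in this release); the base URL, API key, and model name are placeholders, not values taken from this repository:

```python
# Hypothetical client call against an OpenAI-compatible Chat Completions server.
# Base URL, API key, and model name are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(
    base_url='http://127.0.0.1:8000/v1',   # assumed local server address
    api_key='not-used-by-a-local-server',
)

response = client.chat.completions.create(
    model='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',  # placeholder model id
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
    ],
)

print(response.choices[0].message.content)
```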
10 changes: 7 additions & 3 deletions examples/demo_0.py
@@ -1,10 +1,14 @@
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options

from llama.formatter import get_config

model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)
2 changes: 2 additions & 0 deletions llama/llama_cli_cffi_cpu.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
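The same two-line change — importing `deepcopy` and copying `options` at the top of `llama_generate` — appears in all four backend modules in this commit. The copy matters because `llama_generate` later overwrites `options.model` with the local path returned by `hf_hub_download`; without it, the caller's `Options` object would be mutated and could not be reused for a second call. A simplified, self-contained sketch of that pattern (the class and values below are illustrative, not the real module):

```python
# Simplified illustration of the deepcopy fix; not the actual llama-cpp-cffi code.
from copy import deepcopy

class Options:
    def __init__(self, model, prompt):
        self.model = model      # in the real library this is a Model spec
        self.prompt = prompt

def llama_generate(options):
    # Work on a private copy so the caller's Options is never mutated.
    options = deepcopy(options)
    # The real function replaces the Model spec with a downloaded GGUF path here.
    options.model = '/local/path/to/model.gguf'
    yield f'generated "{options.prompt}" with {options.model}'

opts = Options(model='model-spec', prompt='hello')
for _ in range(2):                         # the same opts can now be reused
    print(list(llama_generate(opts)))
print(opts.model)                          # still 'model-spec'
```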
2 changes: 2 additions & 0 deletions llama/llama_cli_cffi_cuda_12_5.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
2 changes: 2 additions & 0 deletions llama/llama_cli_ctypes_cpu.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
2 changes: 2 additions & 0 deletions llama/llama_cli_ctypes_cuda_12_5.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
29 changes: 15 additions & 14 deletions poetry.lock


26 changes: 18 additions & 8 deletions pyproject.toml
@@ -1,10 +1,10 @@
[tool.poetry]
name = "llama-cpp-cffi"
version = "0.1.0"
version = "0.1.2"
description = "Python binding for llama.cpp using cffi"
homepage = "https://github.com/mtasic85/llama-cpp-cffi"
repository = "https://github.com/mtasic85/llama-cpp-cffi"
authors = ["Marko Tasic <mtasic85@gmail.com>"]
homepage = "https://github.com/tangledgroup/llama-cpp-cffi"
repository = "https://github.com/tangledgroup/llama-cpp-cffi"
authors = ["Marko Tasic <mtasic85@gmail.com>", "Tangled Group, Inc <info@tangledgroup.com>"]
license = "MIT"
readme = "README.md"
packages = [{include = "llama"}]
@@ -13,14 +13,21 @@ include = [{path = "llama/*.so"}]
[tool.poetry.dependencies]
python = "^3.10"
attrs = "^23.2.0"
huggingface-hub = "^0.23.4"
huggingface-hub = "^0.24.0"
cffi = "^1.16.0"
setuptools = "^70.2.0"
setuptools = "^71.0.3"
psutil = "^6.0.0"
transformers = "^4.42.4"
jinja2 = "^3.1.4"
sentencepiece = "^0.2.0"
protobuf = "^5.27.2"
openai = {version = "^1.35.15", optional = true}
aiohttp = {extras = ["speedups"], version = "^3.9.5", optional = true}
uvloop = {version = "^0.19.0", optional = true}

[tool.poetry.extras]
openai = ["openai", "aiohttp"]
uvloop = ["uvloop"]

[tool.poetry.group.dev.dependencies]
cibuildwheel = "^2.19.2"
@@ -35,11 +42,14 @@ script = "scripts/build.py"
[tool.cibuildwheel]
build-frontend = "build"
before-build = "pip install poetry"
# before-build = "pip install poetry; yum -y install wget"
skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp*", "*-win32", "*i686"]
manylinux-x86_64-image = "manylinux_2_28"
manylinux-aarch64-image = "manylinux_2_28"
musllinux-x86_64-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
build-verbosity=3
repair-wheel-command = ""
# environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:/project/cuda-12.5.1/dist/targets/x86_64-linux/lib:/project/cuda-12.5.1/dist/lib64/stubs:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}

[tool.cibuildwheel.pyodide]

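The new `[tool.poetry.extras]` groups make `openai`, `aiohttp`, and `uvloop` optional installs (e.g. `pip install llama-cpp-cffi[openai]`). Code that depends on such extras typically guards the import and degrades gracefully; the following is a generic sketch of that pattern and assumes nothing about how this package actually handles it:

```python
# Generic optional-dependency guard for a package extra such as "uvloop".
# This is a common pattern, not code taken from llama-cpp-cffi.
import asyncio

try:
    import uvloop
except ImportError:
    uvloop = None  # extra not installed; fall back to the default event loop

def install_event_loop_policy():
    """Use uvloop when available, otherwise keep asyncio's default policy."""
    if uvloop is not None:
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

install_event_loop_policy()
```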
47 changes: 35 additions & 12 deletions scripts/build.py
@@ -90,8 +90,16 @@ def build_cuda_12_5(*args, **kwargs):
cuda_output_dir = os.path.abspath('./cuda-12.5.1')
cuda_file_path = os.path.join(cuda_output_dir, cuda_file)

env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
env['PATH'] = f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
env['CUDA_DOCKER_ARCH'] = 'compute_75'
env['NVCCFLAGS'] = '\
-gencode arch=compute_75,code=sm_75 \
-gencode arch=compute_80,code=sm_80 \
-gencode arch=compute_86,code=sm_86 \
-gencode arch=compute_89,code=sm_89 \
-gencode arch=compute_90,code=sm_90 \
-gencode arch=compute_90,code=compute_90'

# download cuda file
if not os.path.exists(cuda_file_path):
@@ -107,7 +115,8 @@ def build_cuda_12_5(*args, **kwargs):
cmd = [
f'{cuda_output_dir}/{cuda_file}',
'--tar',
'mxvf',
# 'mxvf',
'mxf',
'--wildcards',
'./builds/cuda_cccl/*',
'./builds/cuda_cudart/*',
@@ -133,6 +142,15 @@ def build_cuda_12_5(*args, **kwargs):
cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

# cmd = 'pwd'
# subprocess.run(cmd, check=True)

# cmd = 'ls -l'
# subprocess.run(cmd, check=True, shell=True)

# cmd = f'ls -l {cuda_output_dir}/dist'
# subprocess.run(cmd, check=True, shell=True)

#
# build llama.cpp
#
@@ -176,13 +194,17 @@ def build_cuda_12_5(*args, **kwargs):
''',
libraries=[
'stdc++',
# 'cuda',
# 'cublas',
# 'culibos',
# 'cudart',
# 'cublasLt',
'cuda',
'cublas',
'culibos',
'cudart',
'cublasLt',
],
library_dirs=[
f'{cuda_output_dir}/dist/lib64',
f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
f'{cuda_output_dir}/dist/lib64/stubs',
],
library_dirs=[f'{cuda_output_dir}/dist/lib64'],
extra_objects=['../llama.cpp/llama_cli.a'],
)

@@ -203,14 +225,15 @@ def build(*args, **kwargs):
clean()
clone_llama_cpp()

# cuda 12.5
if os.environ.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and os.environ.get('AUDITWHEEL_ARCH') in ('x86_64', None):
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)

# cpu
clean_llama_cpp()
build_cpu(*args, **kwargs)

# cuda 12.5
if os.environ['AUDITWHEEL_POLICY'] == 'manylinux2014' and os.environ['AUDITWHEEL_ARCH'] == 'x86_64':
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)


if __name__ == '__main__':
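The new `NVCCFLAGS` value is what backs the "CUDA >=7.5 support" in the commit title: one `-gencode` pair per real architecture from compute capability 7.5 through 9.0, plus a final `code=compute_90` entry that embeds PTX for forward compatibility with newer GPUs. The snippet below simply rebuilds the same flag string from a list, as a compact restatement of the diff above:

```python
# Rebuild the NVCCFLAGS string from scripts/build.py out of a list of
# compute capabilities (SM 7.5 through 9.0) plus a PTX fallback entry.
archs = ['75', '80', '86', '89', '90']

flags = [f'-gencode arch=compute_{a},code=sm_{a}' for a in archs]
flags.append('-gencode arch=compute_90,code=compute_90')  # PTX for newer GPUs

nvccflags = ' '.join(flags)
print(nvccflags)
```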
