0.1.2: OpenAI © Chat Completions API, CUDA >=7.5 support
mtasic85 committed Jul 19, 2024
1 parent 872b517 commit b5e465f
Showing 10 changed files with 122 additions and 40 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,21 @@
# CHANGELOG

## v0.1.2

Added:
- OpenAI © compatible Chat Completions API server.

Fixed:
- `options` is deepcopy-ed when passed to `llama_generate(options)`, so it can be reused.

Changed:
- Build for manylinux_2_28 and musllinux_1_2

## v0.1.1

Changed:
- Updated: huggingface-hub = "^0.24.0", setuptools = "^71.0.3"

## v0.1.0

Added:
26 changes: 23 additions & 3 deletions README.md
@@ -16,12 +16,24 @@

## Install

Basic library install:

```bash
pip install llama-cpp-cffi
```

In case you want [Chat Completions API by OpenAI ©](https://platform.openai.com/docs/overview) compatible API:

```bash
pip install llama-cpp-cffi[openai]
```

## Example

### Library Usage

`examples/demo_0.py`

```python
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
@@ -31,9 +43,9 @@ from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
from llama.formatter import get_config

model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)
@@ -55,6 +67,14 @@ for chunk in llama_generate(options):

# newline
print()

```

### OpenAI © compatible Chat Completions

`examples/demo_1.py`

```python
```

## Demos
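The `examples/demo_1.py` listing above is still empty in this commit. Purely for orientation, here is a hypothetical sketch of calling an OpenAI-compatible Chat Completions server with the `openai` client (the `openai` extra added in this release); the base URL, API key, and model name are placeholders, not values taken from this repository:

```python
# Hypothetical client call against an OpenAI-compatible Chat Completions server.
# Base URL, API key, and model name are placeholders, not values from this commit.
from openai import OpenAI

client = OpenAI(
    base_url='http://127.0.0.1:8000/v1',   # assumed local server address
    api_key='not-used-by-a-local-server',
)

response = client.chat.completions.create(
    model='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',  # placeholder model id
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'Evaluate 1 + 2 in Python.'},
    ],
)

print(response.choices[0].message.content)
```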
10 changes: 7 additions & 3 deletions examples/demo_0.py
@@ -1,10 +1,14 @@
from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options

from llama.formatter import get_config

model = Model(
'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
)

config = get_config(model.creator_hf_repo)
2 changes: 2 additions & 0 deletions llama/llama_cli_cffi_cpu.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
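The same two-line change — importing `deepcopy` and copying `options` at the top of `llama_generate` — appears in all four backend modules in this commit. The copy matters because `llama_generate` later overwrites `options.model` with the local path returned by `hf_hub_download`; without it, the caller's `Options` object would be mutated and could not be reused for a second call. A simplified, self-contained sketch of that pattern (the class and values below are illustrative, not the real module):

```python
# Simplified illustration of the deepcopy fix; not the actual llama-cpp-cffi code.
from copy import deepcopy

class Options:
    def __init__(self, model, prompt):
        self.model = model      # in the real library this is a Model spec
        self.prompt = prompt

def llama_generate(options):
    # Work on a private copy so the caller's Options is never mutated.
    options = deepcopy(options)
    # The real function replaces the Model spec with a downloaded GGUF path here.
    options.model = '/local/path/to/model.gguf'
    yield f'generated "{options.prompt}" with {options.model}'

opts = Options(model='model-spec', prompt='hello')
for _ in range(2):                         # the same opts can now be reused
    print(list(llama_generate(opts)))
print(opts.model)                          # still 'model-spec'
```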
2 changes: 2 additions & 0 deletions llama/llama_cli_cffi_cuda_12_5.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
2 changes: 2 additions & 0 deletions llama/llama_cli_ctypes_cpu.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
2 changes: 2 additions & 0 deletions llama/llama_cli_ctypes_cuda_12_5.py
@@ -4,6 +4,7 @@
import json
import ctypes
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:

assert options.model and isinstance(options.model, Model)

options: Options = deepcopy(options)
model: Model = options.model
tokenizer = get_tokenizer(model.creator_hf_repo)
options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
29 changes: 15 additions & 14 deletions poetry.lock


26 changes: 18 additions & 8 deletions pyproject.toml
@@ -1,10 +1,10 @@
[tool.poetry]
name = "llama-cpp-cffi"
version = "0.1.0"
version = "0.1.2"
description = "Python binding for llama.cpp using cffi"
homepage = "https://github.com/mtasic85/llama-cpp-cffi"
repository = "https://github.com/mtasic85/llama-cpp-cffi"
authors = ["Marko Tasic <mtasic85@gmail.com>"]
homepage = "https://github.com/tangledgroup/llama-cpp-cffi"
repository = "https://github.com/tangledgroup/llama-cpp-cffi"
authors = ["Marko Tasic <mtasic85@gmail.com>", "Tangled Group, Inc <info@tangledgroup.com>"]
license = "MIT"
readme = "README.md"
packages = [{include = "llama"}]
@@ -13,14 +13,21 @@ include = [{path = "llama/*.so"}]
[tool.poetry.dependencies]
python = "^3.10"
attrs = "^23.2.0"
huggingface-hub = "^0.23.4"
huggingface-hub = "^0.24.0"
cffi = "^1.16.0"
setuptools = "^70.2.0"
setuptools = "^71.0.3"
psutil = "^6.0.0"
transformers = "^4.42.4"
jinja2 = "^3.1.4"
sentencepiece = "^0.2.0"
protobuf = "^5.27.2"
openai = {version = "^1.35.15", optional = true}
aiohttp = {extras = ["speedups"], version = "^3.9.5", optional = true}
uvloop = {version = "^0.19.0", optional = true}

[tool.poetry.extras]
openai = ["openai", "aiohttp"]
uvloop = ["uvloop"]

[tool.poetry.group.dev.dependencies]
cibuildwheel = "^2.19.2"
@@ -35,11 +42,14 @@ script = "scripts/build.py"
[tool.cibuildwheel]
build-frontend = "build"
before-build = "pip install poetry"
# before-build = "pip install poetry; yum -y install wget"
skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp*", "*-win32", "*i686"]
manylinux-x86_64-image = "manylinux_2_28"
manylinux-aarch64-image = "manylinux_2_28"
musllinux-x86_64-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
build-verbosity=3
repair-wheel-command = ""
# environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:/project/cuda-12.5.1/dist/targets/x86_64-linux/lib:/project/cuda-12.5.1/dist/lib64/stubs:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}

[tool.cibuildwheel.pyodide]

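The new `[tool.poetry.extras]` groups make `openai`, `aiohttp`, and `uvloop` optional installs (e.g. `pip install llama-cpp-cffi[openai]`). Code that depends on such extras typically guards the import and degrades gracefully; the following is a generic sketch of that pattern and assumes nothing about how this package actually handles it:

```python
# Generic optional-dependency guard for a package extra such as "uvloop".
# This is a common pattern, not code taken from llama-cpp-cffi.
import asyncio

try:
    import uvloop
except ImportError:
    uvloop = None  # extra not installed; fall back to the default event loop

def install_event_loop_policy():
    """Use uvloop when available, otherwise keep asyncio's default policy."""
    if uvloop is not None:
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

install_event_loop_policy()
```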
47 changes: 35 additions & 12 deletions scripts/build.py
@@ -90,8 +90,16 @@ def build_cuda_12_5(*args, **kwargs):
cuda_output_dir = os.path.abspath('./cuda-12.5.1')
cuda_file_path = os.path.join(cuda_output_dir, cuda_file)

env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
env['PATH'] = f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
env['CUDA_DOCKER_ARCH'] = 'compute_75'
env['NVCCFLAGS'] = '\
-gencode arch=compute_75,code=sm_75 \
-gencode arch=compute_80,code=sm_80 \
-gencode arch=compute_86,code=sm_86 \
-gencode arch=compute_89,code=sm_89 \
-gencode arch=compute_90,code=sm_90 \
-gencode arch=compute_90,code=compute_90'

# download cuda file
if not os.path.exists(cuda_file_path):
@@ -107,7 +115,8 @@ def build_cuda_12_5(*args, **kwargs):
cmd = [
f'{cuda_output_dir}/{cuda_file}',
'--tar',
'mxvf',
# 'mxvf',
'mxf',
'--wildcards',
'./builds/cuda_cccl/*',
'./builds/cuda_cudart/*',
@@ -133,6 +142,15 @@ def build_cuda_12_5(*args, **kwargs):
cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
subprocess.run(cmd, shell=True, check=True)

# cmd = 'pwd'
# subprocess.run(cmd, check=True)

# cmd = 'ls -l'
# subprocess.run(cmd, check=True, shell=True)

# cmd = f'ls -l {cuda_output_dir}/dist'
# subprocess.run(cmd, check=True, shell=True)

#
# build llama.cpp
#
@@ -176,13 +194,17 @@ def build_cuda_12_5(*args, **kwargs):
''',
libraries=[
'stdc++',
# 'cuda',
# 'cublas',
# 'culibos',
# 'cudart',
# 'cublasLt',
'cuda',
'cublas',
'culibos',
'cudart',
'cublasLt',
],
library_dirs=[
f'{cuda_output_dir}/dist/lib64',
f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
f'{cuda_output_dir}/dist/lib64/stubs',
],
library_dirs=[f'{cuda_output_dir}/dist/lib64'],
extra_objects=['../llama.cpp/llama_cli.a'],
)

@@ -203,14 +225,15 @@ def build(*args, **kwargs):
clean()
clone_llama_cpp()

# cuda 12.5
if os.environ.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and os.environ.get('AUDITWHEEL_ARCH') in ('x86_64', None):
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)

# cpu
clean_llama_cpp()
build_cpu(*args, **kwargs)

# cuda 12.5
if os.environ['AUDITWHEEL_POLICY'] == 'manylinux2014' and os.environ['AUDITWHEEL_ARCH'] == 'x86_64':
clean_llama_cpp()
build_cuda_12_5(*args, **kwargs)


if __name__ == '__main__':
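The new `NVCCFLAGS` value is what backs the "CUDA >=7.5 support" in the commit title: one `-gencode` pair per real architecture from compute capability 7.5 through 9.0, plus a final `code=compute_90` entry that embeds PTX for forward compatibility with newer GPUs. The snippet below simply rebuilds the same flag string from a list, as a compact restatement of the diff above:

```python
# Rebuild the NVCCFLAGS string from scripts/build.py out of a list of
# compute capabilities (SM 7.5 through 9.0) plus a PTX fallback entry.
archs = ['75', '80', '86', '89', '90']

flags = [f'-gencode arch=compute_{a},code=sm_{a}' for a in archs]
flags.append('-gencode arch=compute_90,code=compute_90')  # PTX for newer GPUs

nvccflags = ' '.join(flags)
print(nvccflags)
```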
