diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0dba636..5fbcca4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # CHANGELOG
 
+## v0.1.2
+
+Added:
+ - OpenAI © compatible Chat Completions API server.
+
+Fixed:
+ - `options` is deepcopy-ed when passed to `llama_generate(options)`, so it can be reused.
+
+Changed:
+ - Build for manylinux_2_28 and musllinux_1_2
+
+## v0.1.1
+
+Changed:
+ - Updated: huggingface-hub = "^0.24.0", setuptools = "^71.0.3"
+
 ## v0.1.0
 
 Added:
diff --git a/README.md b/README.md
index 58a81b1..eadb618 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,24 @@
 
 ## Install
 
+Basic library install:
+
 ```bash
 pip install llama-cpp-cffi
 ```
 
+In case you want [Chat Completions API by OpenAI ©](https://platform.openai.com/docs/overview) compatible API:
+
+```bash
+pip install llama-cpp-cffi[openai]
+```
+
 ## Example
 
+### Library Usage
+
+`examples/demo_0.py`
+
 ```python
 from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
 # from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
@@ -31,9 +43,9 @@ from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
 from llama.formatter import get_config
 
 model = Model(
-    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
-    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
-    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
+    creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
+    hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+    hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
 )
 
 config = get_config(model.creator_hf_repo)
@@ -55,6 +67,14 @@ for chunk in llama_generate(options):
 
 # newline
 print()
+
+```
+
+### OpenAI © compatible Chat Completions
+
+`examples/demo_1.py`
+
+```python
 ```
 
 ## Demos
diff --git a/examples/demo_0.py b/examples/demo_0.py
index a8a4b90..8ed4cf9 100644
--- a/examples/demo_0.py
+++ b/examples/demo_0.py
@@ -1,10 +1,14 @@
 from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
+# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
+# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
+# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options
+
 from llama.formatter import get_config
 
 model = Model(
-    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
-    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
-    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
+    creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
+    hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+    hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
 )
 
 config = get_config(model.creator_hf_repo)
diff --git a/llama/llama_cli_cffi_cpu.py b/llama/llama_cli_cffi_cpu.py
index dfb4f27..06b2a63 100644
--- a/llama/llama_cli_cffi_cpu.py
+++ b/llama/llama_cli_cffi_cpu.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_cffi_cuda_12_5.py b/llama/llama_cli_cffi_cuda_12_5.py
index ff6ec12..b4e100f 100644
--- a/llama/llama_cli_cffi_cuda_12_5.py
+++ b/llama/llama_cli_cffi_cuda_12_5.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_ctypes_cpu.py b/llama/llama_cli_ctypes_cpu.py
index b3908f9..cda5015 100644
--- a/llama/llama_cli_ctypes_cpu.py
+++ b/llama/llama_cli_ctypes_cpu.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_ctypes_cuda_12_5.py b/llama/llama_cli_ctypes_cuda_12_5.py
index 5022aac..e37acb7 100644
--- a/llama/llama_cli_ctypes_cuda_12_5.py
+++ b/llama/llama_cli_ctypes_cuda_12_5.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/poetry.lock b/poetry.lock
index 713eadf..14c991a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -311,13 +311,13 @@ tqdm = ["tqdm"]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.23.4"
+version = "0.24.0"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.23.4-py3-none-any.whl", hash = "sha256:3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037"},
-    {file = "huggingface_hub-0.23.4.tar.gz", hash = "sha256:35d99016433900e44ae7efe1c209164a5a81dbbcd53a52f99c281dcd7ce22431"},
+    {file = "huggingface_hub-0.24.0-py3-none-any.whl", hash = "sha256:7ad92edefb93d8145c061f6df8d99df2ff85f8379ba5fac8a95aca0642afa5d7"},
+    {file = "huggingface_hub-0.24.0.tar.gz", hash = "sha256:6c7092736b577d89d57b3cdfea026f1b0dc2234ae783fa0d59caf1bf7d52dfa7"},
 ]
 
 [package.dependencies]
@@ -330,17 +330,17 @@ tqdm = ">=4.42.1"
 typing-extensions = ">=3.7.4.3"
 
 [package.extras]
-all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 cli = ["InquirerPy (==0.3.4)"]
-dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
 hf-transfer = ["hf-transfer (>=0.1.4)"]
 inference = ["aiohttp", "minijinja (>=1.0)"]
-quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"]
+quality = ["mypy (==1.5.1)", "ruff (>=0.5.0)"]
 tensorflow = ["graphviz", "pydot", "tensorflow"]
 tensorflow-testing = ["keras (<3.0)", "tensorflow"]
-testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
-torch = ["safetensors", "torch"]
+testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
+torch = ["safetensors[torch]", "torch"]
 typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
 
 [[package]]
@@ -927,18 +927,19 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "70.3.0"
+version = "71.0.3"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"},
-    {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"},
+    {file = "setuptools-71.0.3-py3-none-any.whl", hash = "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207"},
+    {file = "setuptools-71.0.3.tar.gz", hash = "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d"},
 ]
 
 [package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "ordered-set (>=3.1.1)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (<7.4)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "tokenizers"
@@ -1187,4 +1188,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b92a8ef20b48c63d19eb909d928fd87be7769a5d016f34a3915ce0959c3f3175"
+content-hash = "22c69c6d8dc11d842a23c93528d6f3c621bffcce25e36d38389a9a133799aee2"
diff --git a/pyproject.toml b/pyproject.toml
index b4503e7..808cf02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
 [tool.poetry]
 name = "llama-cpp-cffi"
-version = "0.1.0"
+version = "0.1.2"
 description = "Python binding for llama.cpp using cffi"
-homepage = "https://github.com/mtasic85/llama-cpp-cffi"
-repository = "https://github.com/mtasic85/llama-cpp-cffi"
-authors = ["Marko Tasic "]
+homepage = "https://github.com/tangledgroup/llama-cpp-cffi"
+repository = "https://github.com/tangledgroup/llama-cpp-cffi"
+authors = ["Marko Tasic ", "Tangled Group, Inc "]
 license = "MIT"
 readme = "README.md"
 packages = [{include = "llama"}]
@@ -13,14 +13,21 @@ include = [{path = "llama/*.so"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 attrs = "^23.2.0"
-huggingface-hub = "^0.23.4"
+huggingface-hub = "^0.24.0"
 cffi = "^1.16.0"
-setuptools = "^70.2.0"
+setuptools = "^71.0.3"
 psutil = "^6.0.0"
 transformers = "^4.42.4"
 jinja2 = "^3.1.4"
 sentencepiece = "^0.2.0"
 protobuf = "^5.27.2"
+openai = {version = "^1.35.15", optional = true}
+aiohttp = {extras = ["speedups"], version = "^3.9.5", optional = true}
+uvloop = {version = "^0.19.0", optional = true}
+
+[tool.poetry.extras]
+openai = ["openai", "aiohttp"]
+uvloop = ["uvloop"]
 
 [tool.poetry.group.dev.dependencies]
 cibuildwheel = "^2.19.2"
@@ -35,11 +42,14 @@ script = "scripts/build.py"
 [tool.cibuildwheel]
 build-frontend = "build"
 before-build = "pip install poetry"
-# before-build = "pip install poetry; yum -y install wget"
 skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp*", "*-win32", "*i686"]
+manylinux-x86_64-image = "manylinux_2_28"
+manylinux-aarch64-image = "manylinux_2_28"
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
 build-verbosity=3
 repair-wheel-command = ""
-# environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
+environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:/project/cuda-12.5.1/dist/targets/x86_64-linux/lib:/project/cuda-12.5.1/dist/lib64/stubs:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
 
 
 [tool.cibuildwheel.pyodide]
diff --git a/scripts/build.py b/scripts/build.py
index 4fb7fe5..e96126d 100644
--- a/scripts/build.py
+++ b/scripts/build.py
@@ -90,8 +90,16 @@ def build_cuda_12_5(*args, **kwargs):
     cuda_output_dir = os.path.abspath('./cuda-12.5.1')
     cuda_file_path = os.path.join(cuda_output_dir, cuda_file)
 
-    env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
+    env['PATH'] = f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
     env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
+    env['CUDA_DOCKER_ARCH'] = 'compute_75'
+    env['NVCCFLAGS'] = '\
+        -gencode arch=compute_75,code=sm_75 \
+        -gencode arch=compute_80,code=sm_80 \
+        -gencode arch=compute_86,code=sm_86 \
+        -gencode arch=compute_89,code=sm_89 \
+        -gencode arch=compute_90,code=sm_90 \
+        -gencode arch=compute_90,code=compute_90'
 
     # download cuda file
     if not os.path.exists(cuda_file_path):
@@ -107,7 +115,8 @@ def build_cuda_12_5(*args, **kwargs):
     cmd = [
         f'{cuda_output_dir}/{cuda_file}',
         '--tar',
-        'mxvf',
+        # 'mxvf',
+        'mxf',
         '--wildcards',
         './builds/cuda_cccl/*',
         './builds/cuda_cudart/*',
@@ -133,6 +142,15 @@ def build_cuda_12_5(*args, **kwargs):
     cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
     subprocess.run(cmd, shell=True, check=True)
 
+    # cmd = 'pwd'
+    # subprocess.run(cmd, check=True)
+
+    # cmd = 'ls -l'
+    # subprocess.run(cmd, check=True, shell=True)
+
+    # cmd = f'ls -l {cuda_output_dir}/dist'
+    # subprocess.run(cmd, check=True, shell=True)
+
     #
     # build llama.cpp
     #
@@ -176,13 +194,17 @@ def build_cuda_12_5(*args, **kwargs):
         ''',
         libraries=[
             'stdc++',
-            # 'cuda',
-            # 'cublas',
-            # 'culibos',
-            # 'cudart',
-            # 'cublasLt',
+            'cuda',
+            'cublas',
+            'culibos',
+            'cudart',
+            'cublasLt',
+        ],
+        library_dirs=[
+            f'{cuda_output_dir}/dist/lib64',
+            f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
+            f'{cuda_output_dir}/dist/lib64/stubs',
         ],
-        library_dirs=[f'{cuda_output_dir}/dist/lib64'],
         extra_objects=['../llama.cpp/llama_cli.a'],
     )
 
@@ -203,14 +225,15 @@ def build(*args, **kwargs):
     clean()
     clone_llama_cpp()
 
+    # cuda 12.5
+    if os.environ.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and os.environ.get('AUDITWHEEL_ARCH') in ('x86_64', None):
+        clean_llama_cpp()
+        build_cuda_12_5(*args, **kwargs)
+
     # cpu
     clean_llama_cpp()
     build_cpu(*args, **kwargs)
 
-    # cuda 12.5
-    if os.environ['AUDITWHEEL_POLICY'] == 'manylinux2014' and os.environ['AUDITWHEEL_ARCH'] == 'x86_64':
-        clean_llama_cpp()
-        build_cuda_12_5(*args, **kwargs)
 
 
 if __name__ == '__main__':