diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0dba636..5fbcca4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # CHANGELOG
 
+## v0.1.2
+
+Added:
+ - OpenAI © compatible Chat Completions API server.
+
+Fixed:
+ - `options` is deepcopy-ed when passed to `llama_generate(options)`, so it can be reused.
+
+Changed:
+ - Build for manylinux_2_28 and musllinux_1_2
+
+## v0.1.1
+
+Changed:
+ - Updated: huggingface-hub = "^0.24.0", setuptools = "^71.0.3"
+
 ## v0.1.0
 
 Added:
diff --git a/README.md b/README.md
index 58a81b1..eadb618 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,24 @@
 
 ## Install
 
+Basic library install:
+
 ```bash
 pip install llama-cpp-cffi
 ```
 
+In case you want [Chat Completions API by OpenAI ©](https://platform.openai.com/docs/overview) compatible API:
+
+```bash
+pip install llama-cpp-cffi[openai]
+```
+
 ## Example
 
+### Library Usage
+
+`examples/demo_0.py`
+
 ```python
 from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
 # from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
@@ -31,9 +43,9 @@ from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
 from llama.formatter import get_config
 
 model = Model(
-    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
-    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
-    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
+    creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
+    hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+    hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
 )
 
 config = get_config(model.creator_hf_repo)
@@ -55,6 +67,14 @@ for chunk in llama_generate(options):
 
 # newline
 print()
+
+```
+
+### OpenAI © compatible Chat Completions
+
+`examples/demo_1.py`
+
+```python
 ```
 
 ## Demos
diff --git a/examples/demo_0.py b/examples/demo_0.py
index a8a4b90..8ed4cf9 100644
--- a/examples/demo_0.py
+++ b/examples/demo_0.py
@@ -1,10 +1,14 @@
 from llama.llama_cli_cffi_cpu import llama_generate, Model, Options
+# from llama.llama_cli_cffi_cuda_12_5 import llama_generate, Model, Options
+# from llama.llama_cli_ctypes_cuda import llama_generate, Model, Options
+# from llama.llama_cli_ctypes_cuda_12_5 import llama_generate, Model, Options
+
 from llama.formatter import get_config
 
 model = Model(
-    'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
-    'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
-    'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
+    creator_hf_repo='TinyLlama/TinyLlama-1.1B-Chat-v1.0',
+    hf_repo='TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
+    hf_file='tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
 )
 
 config = get_config(model.creator_hf_repo)
diff --git a/llama/llama_cli_cffi_cpu.py b/llama/llama_cli_cffi_cpu.py
index dfb4f27..06b2a63 100644
--- a/llama/llama_cli_cffi_cpu.py
+++ b/llama/llama_cli_cffi_cpu.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_cffi_cuda_12_5.py b/llama/llama_cli_cffi_cuda_12_5.py
index ff6ec12..b4e100f 100644
--- a/llama/llama_cli_cffi_cuda_12_5.py
+++ b/llama/llama_cli_cffi_cuda_12_5.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_ctypes_cpu.py b/llama/llama_cli_ctypes_cpu.py
index b3908f9..cda5015 100644
--- a/llama/llama_cli_ctypes_cpu.py
+++ b/llama/llama_cli_ctypes_cpu.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/llama/llama_cli_ctypes_cuda_12_5.py b/llama/llama_cli_ctypes_cuda_12_5.py
index 5022aac..e37acb7 100644
--- a/llama/llama_cli_ctypes_cuda_12_5.py
+++ b/llama/llama_cli_ctypes_cuda_12_5.py
@@ -4,6 +4,7 @@ import json
 import ctypes
 
 from queue import Queue
+from copy import deepcopy
 from typing import Iterator
 from threading import Thread
 from functools import partial
@@ -110,6 +111,7 @@ def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
 
     assert options.model and isinstance(options.model, Model)
 
+    options: Options = deepcopy(options)
     model: Model = options.model
     tokenizer = get_tokenizer(model.creator_hf_repo)
     options.model = hf_hub_download(repo_id=model.hf_repo, filename=model.hf_file)
diff --git a/poetry.lock b/poetry.lock
index 713eadf..14c991a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -311,13 +311,13 @@ tqdm = ["tqdm"]
 
 [[package]]
 name = "huggingface-hub"
-version = "0.23.4"
+version = "0.24.0"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.23.4-py3-none-any.whl", hash = "sha256:3a0b957aa87150addf0cc7bd71b4d954b78e749850e1e7fb29ebbd2db64ca037"},
-    {file = "huggingface_hub-0.23.4.tar.gz", hash = "sha256:35d99016433900e44ae7efe1c209164a5a81dbbcd53a52f99c281dcd7ce22431"},
+    {file = "huggingface_hub-0.24.0-py3-none-any.whl", hash = "sha256:7ad92edefb93d8145c061f6df8d99df2ff85f8379ba5fac8a95aca0642afa5d7"},
+    {file = "huggingface_hub-0.24.0.tar.gz", hash = "sha256:6c7092736b577d89d57b3cdfea026f1b0dc2234ae783fa0d59caf1bf7d52dfa7"},
 ]
 
 [package.dependencies]
@@ -330,17 +330,17 @@ tqdm = ">=4.42.1"
 typing-extensions = ">=3.7.4.3"
 
 [package.extras]
-all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 cli = ["InquirerPy (==0.3.4)"]
-dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
+dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
 fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
 hf-transfer = ["hf-transfer (>=0.1.4)"]
 inference = ["aiohttp", "minijinja (>=1.0)"]
-quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"]
+quality = ["mypy (==1.5.1)", "ruff (>=0.5.0)"]
 tensorflow = ["graphviz", "pydot", "tensorflow"]
 tensorflow-testing = ["keras (<3.0)", "tensorflow"]
-testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
-torch = ["safetensors", "torch"]
+testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
+torch = ["safetensors[torch]", "torch"]
 typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
 
 [[package]]
@@ -927,18 +927,19 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "70.3.0"
+version = "71.0.3"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"},
-    {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"},
+    {file = "setuptools-71.0.3-py3-none-any.whl", hash = "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207"},
+    {file = "setuptools-71.0.3.tar.gz", hash = "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d"},
 ]
 
 [package.extras]
-doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "ordered-set (>=3.1.1)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (<7.4)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "tokenizers"
@@ -1187,4 +1188,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b92a8ef20b48c63d19eb909d928fd87be7769a5d016f34a3915ce0959c3f3175"
+content-hash = "22c69c6d8dc11d842a23c93528d6f3c621bffcce25e36d38389a9a133799aee2"
diff --git a/pyproject.toml b/pyproject.toml
index b4503e7..808cf02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
 [tool.poetry]
 name = "llama-cpp-cffi"
-version = "0.1.0"
+version = "0.1.2"
 description = "Python binding for llama.cpp using cffi"
-homepage = "https://github.com/mtasic85/llama-cpp-cffi"
-repository = "https://github.com/mtasic85/llama-cpp-cffi"
-authors = ["Marko Tasic "]
+homepage = "https://github.com/tangledgroup/llama-cpp-cffi"
+repository = "https://github.com/tangledgroup/llama-cpp-cffi"
+authors = ["Marko Tasic ", "Tangled Group, Inc "]
 license = "MIT"
 readme = "README.md"
 packages = [{include = "llama"}]
@@ -13,14 +13,21 @@ include = [{path = "llama/*.so"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 attrs = "^23.2.0"
-huggingface-hub = "^0.23.4"
+huggingface-hub = "^0.24.0"
 cffi = "^1.16.0"
-setuptools = "^70.2.0"
+setuptools = "^71.0.3"
 psutil = "^6.0.0"
 transformers = "^4.42.4"
 jinja2 = "^3.1.4"
 sentencepiece = "^0.2.0"
 protobuf = "^5.27.2"
+openai = {version = "^1.35.15", optional = true}
+aiohttp = {extras = ["speedups"], version = "^3.9.5", optional = true}
+uvloop = {version = "^0.19.0", optional = true}
+
+[tool.poetry.extras]
+openai = ["openai", "aiohttp"]
+uvloop = ["uvloop"]
 
 [tool.poetry.group.dev.dependencies]
 cibuildwheel = "^2.19.2"
@@ -35,11 +42,14 @@ script = "scripts/build.py"
 [tool.cibuildwheel]
 build-frontend = "build"
 before-build = "pip install poetry"
-# before-build = "pip install poetry; yum -y install wget"
 skip = ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "pp*", "*-win32", "*i686"]
+manylinux-x86_64-image = "manylinux_2_28"
+manylinux-aarch64-image = "manylinux_2_28"
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
 build-verbosity=3
 repair-wheel-command = ""
-# environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
+environment = {"LD_LIBRARY_PATH" = "/project/cuda-12.5.1/dist/lib64:/project/cuda-12.5.1/dist/targets/x86_64-linux/lib:/project/cuda-12.5.1/dist/lib64/stubs:$LD_LIBRARY_PATH", "CUDA_HOME" = "/project/cuda-12.5.1/dist"}
 
 
 [tool.cibuildwheel.pyodide]
diff --git a/scripts/build.py b/scripts/build.py
index 4fb7fe5..e96126d 100644
--- a/scripts/build.py
+++ b/scripts/build.py
@@ -90,8 +90,16 @@ def build_cuda_12_5(*args, **kwargs):
     cuda_output_dir = os.path.abspath('./cuda-12.5.1')
     cuda_file_path = os.path.join(cuda_output_dir, cuda_file)
 
-    env['PATH'] = env['PATH'] + f':{cuda_output_dir}/dist/bin'
+    env['PATH'] = f'{cuda_output_dir}/dist/bin:{env["PATH"]}'
     env['CUDA_PATH'] = f'{cuda_output_dir}/dist'
+    env['CUDA_DOCKER_ARCH'] = 'compute_75'
+    env['NVCCFLAGS'] = '\
+        -gencode arch=compute_75,code=sm_75 \
+        -gencode arch=compute_80,code=sm_80 \
+        -gencode arch=compute_86,code=sm_86 \
+        -gencode arch=compute_89,code=sm_89 \
+        -gencode arch=compute_90,code=sm_90 \
+        -gencode arch=compute_90,code=compute_90'
 
     # download cuda file
     if not os.path.exists(cuda_file_path):
@@ -107,7 +115,8 @@ def build_cuda_12_5(*args, **kwargs):
     cmd = [
         f'{cuda_output_dir}/{cuda_file}',
         '--tar',
-        'mxvf',
+        # 'mxvf',
+        'mxf',
         '--wildcards',
         './builds/cuda_cccl/*',
         './builds/cuda_cudart/*',
@@ -133,6 +142,15 @@ def build_cuda_12_5(*args, **kwargs):
     cmd = f'cp -r {cuda_output_dir}/builds/libcublas/* {cuda_output_dir}/dist'
     subprocess.run(cmd, shell=True, check=True)
 
+    # cmd = 'pwd'
+    # subprocess.run(cmd, check=True)
+
+    # cmd = 'ls -l'
+    # subprocess.run(cmd, check=True, shell=True)
+
+    # cmd = f'ls -l {cuda_output_dir}/dist'
+    # subprocess.run(cmd, check=True, shell=True)
+
     #
     # build llama.cpp
     #
@@ -176,13 +194,17 @@ def build_cuda_12_5(*args, **kwargs):
         ''',
         libraries=[
             'stdc++',
-            # 'cuda',
-            # 'cublas',
-            # 'culibos',
-            # 'cudart',
-            # 'cublasLt',
+            'cuda',
+            'cublas',
+            'culibos',
+            'cudart',
+            'cublasLt',
+        ],
+        library_dirs=[
+            f'{cuda_output_dir}/dist/lib64',
+            f'{cuda_output_dir}/dist/targets/x86_64-linux/lib',
+            f'{cuda_output_dir}/dist/lib64/stubs',
         ],
-        library_dirs=[f'{cuda_output_dir}/dist/lib64'],
         extra_objects=['../llama.cpp/llama_cli.a'],
     )
 
@@ -203,14 +225,15 @@ def build(*args, **kwargs):
     clean()
     clone_llama_cpp()
 
+    # cuda 12.5
+    if os.environ.get('AUDITWHEEL_POLICY') in ('manylinux2014', 'manylinux_2_28', None) and os.environ.get('AUDITWHEEL_ARCH') in ('x86_64', None):
+        clean_llama_cpp()
+        build_cuda_12_5(*args, **kwargs)
+
     # cpu
     clean_llama_cpp()
     build_cpu(*args, **kwargs)
 
-    # cuda 12.5
-    if os.environ['AUDITWHEEL_POLICY'] == 'manylinux2014' and os.environ['AUDITWHEEL_ARCH'] == 'x86_64':
-        clean_llama_cpp()
-        build_cuda_12_5(*args, **kwargs)
 
 
 if __name__ == '__main__':