new patches for main.cpp and Makefile
mtasic85 committed Jul 11, 2024
1 parent 8b8eeaf commit 1a55881
Showing 10 changed files with 247 additions and 123 deletions.
51 changes: 51 additions & 0 deletions Makefile_3.patch
@@ -0,0 +1,51 @@
--- ../Makefile 2024-07-10 22:17:00.507999852 +0200
+++ Makefile 2024-07-10 22:40:13.002343655 +0200
@@ -425,6 +425,31 @@
MK_CXXFLAGS += -pg
endif

+#
+# llama-cpp-cffi
+# Set shared library extension and linker flags based on the platform
+#
+ifeq ($(UNAME_S), Linux)
+ LIB_EXT := so
+ LIB_LDFLAGS := -shared
+ LIB_CXXFLAGS := -fPIC -DLLAMA_LIB
+endif
+
+ifeq ($(UNAME_S), Darwin)
+ LIB_EXT := dylib
+ LIB_LDFLAGS := -dynamiclib
+ LIB_CXXFLAGS := -fPIC -DLLAMA_LIB
+endif
+
+# For Windows (assuming MinGW)
+ifeq ($(OS), Windows_NT)
+ LIB_EXT := dll
+ LIB_LDFLAGS := -shared
+ LIB_CXXFLAGS := -DLLAMA_LIB
+endif
+
+LIB_NAME := llama-cli.$(LIB_EXT)
+
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -1132,6 +1157,16 @@
@echo '==== Run ./llama-cli -h for help. ===='
@echo

+llama-cli-shared: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $(LIB_NAME) $(LIB_LDFLAGS) $(LDFLAGS)
+
+llama-cli-static: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llama-cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
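The llama-cli-shared target above links the usual llama-cli objects into llama-cli.$(LIB_EXT) (llama-cli.so on Linux), which the Python bindings later in this commit load with ctypes. A minimal smoke test for the build output, assuming the library was built as llama-cli.so in the current directory and exports the _llama_cli_main entry point that llama/llama_cli_ctypes.py binds to:

import ctypes
import os

# Hedged sketch: check that the library produced by `make llama-cli-shared`
# loads and exposes the entry point used by the ctypes bindings. The path and
# symbol name are assumptions based on LIB_NAME := llama-cli.$(LIB_EXT) in this
# patch and on lib._llama_cli_main in llama/llama_cli_ctypes.py.
lib_path = os.path.abspath('llama-cli.so')
lib = ctypes.CDLL(lib_path)

entry = getattr(lib, '_llama_cli_main', None)
print('loaded:', lib_path)
print('_llama_cli_main found:', entry is not None)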
114 changes: 92 additions & 22 deletions examples/demo_ctypes.py
@@ -1,23 +1,93 @@
import os
import sys
sys.path.append(os.path.abspath('.'))

import psutil
from llama.ctypes import llama_generate, LlamaOptions


options = LlamaOptions(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    ctx_size=8192,
    predict=512,
    flash_attn=True,
    cont_batching=True,
    simple_io=True,
    # log_disable=True,
    hf_repo='bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    hf_file='Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    # hf_file='Phi-3.1-mini-128k-instruct-IQ2_M.gguf',
    chat_template='chatml',
    # prompt='<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')
from llama.llama_cli_ctypes import llama_generate, Model, Options

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]


def demo1():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo2():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[1].hf_repo,
        hf_file=models[1].hf_file,
        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo3():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[2].hf_repo,
        hf_file=models[2].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


if __name__ == '__main__':
    demo1()
    demo2()
    demo3()
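llama_generate in both backends also accepts a callback; when one is passed, chunks are delivered to it and a trailing None marks the end of generation instead of values being yielded from an iterator. A hedged callback-style variant of demo1, reusing the imports and models list above, and assuming the None sentinel behaviour suggested by the callback(None) calls in llama/llama_cli_ctypes.py and that llama_generate runs to completion before returning when a callback is supplied:

def demo1_callback():
    # Sketch only: same options as demo1(), consumed through the callback
    # interface. Assumes the callback receives decoded str chunks followed by
    # a final None sentinel.
    chunks = []

    def on_chunk(chunk):
        if chunk is None:
            return
        chunks.append(chunk)
        print(chunk, flush=True, end='')

    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    llama_generate(options, callback=on_chunk)
    print()
    return ''.join(chunks)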
3 changes: 2 additions & 1 deletion llama/__init__.py
@@ -1 +1,2 @@
from .cffi import *
# from .cffi import *
# from .ctypes import *
2 changes: 0 additions & 2 deletions llama/cffi.py

This file was deleted.

2 changes: 0 additions & 2 deletions llama/ctypes.py

This file was deleted.

32 changes: 3 additions & 29 deletions llama/llama_cli_cffi.py
@@ -1,4 +1,4 @@
__all__ = ['llama_generate', 'LlamaOptions']
__all__ = ['llama_generate', 'Options']

import json
import ctypes
@@ -10,40 +10,14 @@

from huggingface_hub import hf_hub_download

from .llama_cli_options import LlamaOptions, convert_options_to_bytes
from .llama_cli_options import Options, convert_options_to_bytes
from ._llama_cli import lib, ffi


FPRINTF_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p)
FFLUSH_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)


def fprintf_func(file_obj, fmt, arg, queue=None, callback=None, metadata=None):
    content = arg.decode('utf-8')

    if metadata:
        eos = metadata.get('eos')
        eot = metadata.get('eot')

        if eos is not None and eos in content:
            content = content[:content.index(eos)]
        elif eot is not None and eot in content:
            content = content[:content.index(eot)]

    if queue is not None:
        if content:
            queue.put(content)
        else:
            queue.put(None)
    elif callback is not None:
        callback(content)

    size = len(content)
    return size


def fflush_func(file_obj):
    return 0


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
@@ -56,7 +30,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: LlamaOptions, callback=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
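The cffi backend keeps the same Python surface as the ctypes one: llama_generate(options, callback=None) plus the shared Options class from llama_cli_options. Callers should therefore be able to switch backends by changing a single import, as in this hedged sketch (the USE_CFFI switch is hypothetical, added only for illustration):

USE_CFFI = False  # hypothetical switch for this example

if USE_CFFI:
    from llama.llama_cli_cffi import llama_generate, Options
else:
    from llama.llama_cli_ctypes import llama_generate, Options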
84 changes: 21 additions & 63 deletions llama/llama_cli_ctypes.py
@@ -1,68 +1,44 @@
__all__ = ['llama_generate', 'LlamaOptions']
__all__ = ['llama_generate', 'Options']

import os
import json
from ctypes import *
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial

from huggingface_hub import hf_hub_download

from .llama_cli_options import LlamaOptions, convert_options_to_bytes
from .llama_cli_model import Model
from .llama_cli_options import Options, convert_options_to_bytes

current_module_path = os.path.abspath(__file__)
current_module_dir = os.path.dirname(current_module_path)
libllama_cli_path = os.path.join(current_module_dir, 'llama-cli.so')

lib = CDLL(libllama_cli_path)
module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
lib = CDLL(llama_cli_lib_path)

# lib.llama_cli_main.argtypes = [c_int, POINTER(c_char_p)]
# lib.llama_cli_main.restype = c_int
_LLAMA_YIELD_TOKEN_T = CFUNCTYPE(None, c_char_p)
_LLAMA_SHOULD_STOP_T = CFUNCTYPE(c_int)

# lib.llama_get_metadata_as_json.argtypes = [c_int, POINTER(c_char_p)]
# lib.llama_get_metadata_as_json.restype = c_void_p
lib._llama_cli_main.argtypes = [c_int, POINTER(c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, c_int]
lib._llama_cli_main.restype = c_int

# lib.llama_free_metadata_as_json.argtypes = [c_void_p]
# lib.llama_free_metadata_as_json.restype = None

# FPRINTF_FUNC = CFUNCTYPE(c_int, c_void_p, c_char_p, c_char_p)
# FFLUSH_FUNC = CFUNCTYPE(c_int, c_void_p)
def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
    chunk = chunk.decode()
    print(chunk, flush=True, end='')


'''
def fprintf_func(file_obj, fmt, arg, queue=None, callback=None, metadata=None):
    content = arg.decode('utf-8')
    if metadata:
        eos = metadata.get('eos')
        eot = metadata.get('eot')
        if eos is not None and eos in content:
            content = content[:content.index(eos)]
        elif eot is not None and eot in content:
            content = content[:content.index(eot)]
    if queue is not None:
        if content:
            queue.put(content)
        else:
            queue.put(None)
    elif callback is not None:
        callback(content)
    size = len(content)
    return size
def fflush_func(file_obj):
def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int:
    return 0
'''


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
    r = lib.llama_cli_main(argc, argv)
    _llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata))
    _llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata))
    r = lib._llama_cli_main(argc, argv, _llama_yield_token, _llama_should_stop, 1)
    assert r == 0

    if queue is not None:
@@ -71,7 +47,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: LlamaOptions, callback=None, metadata=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
@@ -89,25 +65,7 @@ def llama_generate(options: LlamaOptions, callback=None, metadata=None) -> Iterator[str] | None:
    else:
        queue = Queue()

    # get bos, eos, and eot from metedata
    metadata_options = deepcopy(options)
    metadata_options.log_disable = True
    metadata_argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(metadata_options)
    metadata_argc = len(metadata_argv)
    metadata_argv = (c_char_p * metadata_argc)(*metadata_argv)

    c_metadata: 'const char*' = lib.llama_get_metadata_as_json(metadata_argc, metadata_argv)
    metadata: str = string_at(c_metadata)
    lib.llama_free_metadata_as_json(c_metadata)
    metadata: dict = json.loads(metadata)
    print(f'{metadata = }')

    # intercept token generation
    fprintf = FPRINTF_FUNC(partial(fprintf_func, queue=queue, callback=callback, metadata=metadata))
    fflush = FFLUSH_FUNC(fflush_func)

    lib.llama_set_fprintf(fprintf)
    lib.llama_set_fflush(fflush)
    metadata: dict = {}

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argc = len(argv)
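When no callback is given, llama_generate creates a Queue, and the imports above (Thread, Queue, partial) point at the usual producer/consumer layout: a worker thread runs _llama_cli_main, the _llama_yield_token callback feeds decoded chunks to the queue, and a trailing None ends iteration. A self-contained toy sketch of that pattern, with a fake producer standing in for the C call; this illustrates the pattern only and is not the module's actual code:

from queue import Queue
from threading import Thread
from typing import Iterator


def _fake_producer(queue: Queue) -> None:
    # Stand-in for _llama_cli_main: push decoded chunks, then the None
    # sentinel that tells the consumer generation has finished.
    for chunk in ('1 + 2', ' = ', '3'):
        queue.put(chunk)
    queue.put(None)


def stream_tokens() -> Iterator[str]:
    # Hedged sketch of the queue-draining generator implied by the imports in
    # llama/llama_cli_ctypes.py; the real module would wire the queue into the
    # _llama_yield_token callback instead of a fake producer.
    queue: Queue = Queue()
    worker = Thread(target=_fake_producer, args=(queue,), daemon=True)
    worker.start()

    while True:
        chunk = queue.get()
        if chunk is None:
            break
        yield chunk

    worker.join()


if __name__ == '__main__':
    print(''.join(stream_tokens()))  # prints: 1 + 2 = 3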
10 changes: 10 additions & 0 deletions llama/llama_cli_model.py
@@ -0,0 +1,10 @@
__all__ = ['Model']

from attrs import define, field


@define
class Model:
    creator_hf_repo: str | None
    hf_repo: str
    hf_file: str
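Model is a plain attrs container; examples/demo_ctypes.py keeps a list of them and feeds hf_repo/hf_file straight into Options. A minimal usage sketch with the Phi-3 entry from that demo (the ctx_size and predict values here are arbitrary, chosen only for illustration):

from llama.llama_cli_model import Model
from llama.llama_cli_options import Options

# Hedged sketch: wiring a Model entry into Options, mirroring demo1() in
# examples/demo_ctypes.py.
phi3 = Model(
    'microsoft/Phi-3-mini-128k-instruct',
    'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
)

options = Options(
    hf_repo=phi3.hf_repo,
    hf_file=phi3.hf_file,
    ctx_size=2048,
    predict=512,
)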
8 changes: 4 additions & 4 deletions llama/llama_cli_options.py
@@ -1,10 +1,10 @@
__all__ = ['LlamaOptions', 'convert_options_to_bytes']
__all__ = ['Options', 'convert_options_to_bytes']

import attr


@attr.s
class LlamaOptions:
class Options:
    help = attr.ib(default=None, metadata={"description": "print usage and exit", "long_name": "--help", "alias": "-h"})
    version = attr.ib(default=None, metadata={"description": "show version and build info", "long_name": "--version"})
    verbose = attr.ib(default=None, metadata={"description": "print verbose information", "long_name": "--verbose", "alias": "-v"})
@@ -192,11 +192,11 @@ class LlamaOptions:
    method = attr.ib(default=None, metadata={"description": "dimensionality reduction method to be used", "long_name": "--method", "default": 'pca'})


def convert_options_to_bytes(options: LlamaOptions) -> list[bytes]:
def convert_options_to_bytes(options: Options) -> list[bytes]:
    result = []

    # Iterate over all attributes of the options class
    for field in attr.fields(LlamaOptions):
    for field in attr.fields(Options):
        value = getattr(options, field.name)
        if value is not None:
            long_name = field.metadata["long_name"]
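convert_options_to_bytes walks attr.fields(Options), skips attributes left at None, and emits each field's long_name from its metadata followed by the encoded value. A toy sketch of that pattern with a made-up two-field class; it illustrates the metadata-driven conversion only and is not the library's exact flag handling:

import attr


@attr.s
class ToyOptions:
    # Hypothetical miniature of Options, for illustration only.
    threads = attr.ib(default=None, metadata={'long_name': '--threads'})
    ctx_size = attr.ib(default=None, metadata={'long_name': '--ctx-size'})


def toy_convert(options: ToyOptions) -> list[bytes]:
    result = []

    for f in attr.fields(ToyOptions):
        value = getattr(options, f.name)

        if value is not None:
            result.append(f.metadata['long_name'].encode())
            result.append(str(value).encode())

    return result


print(toy_convert(ToyOptions(threads=4, ctx_size=2048)))
# [b'--threads', b'4', b'--ctx-size', b'2048']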
