new patches for main.cpp and Makefile
mtasic85 committed Jul 11, 2024
1 parent 8b8eeaf commit 1a55881
Showing 10 changed files with 247 additions and 123 deletions.
51 changes: 51 additions & 0 deletions Makefile_3.patch
@@ -0,0 +1,51 @@
--- ../Makefile 2024-07-10 22:17:00.507999852 +0200
+++ Makefile 2024-07-10 22:40:13.002343655 +0200
@@ -425,6 +425,31 @@
MK_CXXFLAGS += -pg
endif

+#
+# llama-cpp-cffi
+# Set shared library extension and linker flags based on the platform
+#
+ifeq ($(UNAME_S), Linux)
+ LIB_EXT := so
+ LIB_LDFLAGS := -shared
+ LIB_CXXFLAGS := -fPIC -DLLAMA_LIB
+endif
+
+ifeq ($(UNAME_S), Darwin)
+ LIB_EXT := dylib
+ LIB_LDFLAGS := -dynamiclib
+ LIB_CXXFLAGS := -fPIC -DLLAMA_LIB
+endif
+
+# For Windows (assuming MinGW)
+ifeq ($(OS), Windows_NT)
+ LIB_EXT := dll
+ LIB_LDFLAGS := -shared
+ LIB_CXXFLAGS := -DLLAMA_LIB
+endif
+
+LIB_NAME := llama-cli.$(LIB_EXT)
+
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -1132,6 +1157,16 @@
@echo '==== Run ./llama-cli -h for help. ===='
@echo

+llama-cli-shared: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $(LIB_NAME) $(LIB_LDFLAGS) $(LDFLAGS)
+
+llama-cli-static: examples/main/main.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) $(LIB_CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ ar rcs llama-cli.a $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<)
+
llama-infill: examples/infill/infill.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
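The llama-cli-shared target above links the usual llama-cli objects into llama-cli.$(LIB_EXT) (llama-cli.so on Linux), which the Python bindings later in this commit load with ctypes. A minimal smoke test for the build output, assuming the library was built as llama-cli.so in the current directory and exports the _llama_cli_main entry point that llama/llama_cli_ctypes.py binds to:

import ctypes
import os

# Hedged sketch: check that the library produced by `make llama-cli-shared`
# loads and exposes the entry point used by the ctypes bindings. The path and
# symbol name are assumptions based on LIB_NAME := llama-cli.$(LIB_EXT) in this
# patch and on lib._llama_cli_main in llama/llama_cli_ctypes.py.
lib_path = os.path.abspath('llama-cli.so')
lib = ctypes.CDLL(lib_path)

entry = getattr(lib, '_llama_cli_main', None)
print('loaded:', lib_path)
print('_llama_cli_main found:', entry is not None)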
114 changes: 92 additions & 22 deletions examples/demo_ctypes.py
@@ -1,23 +1,93 @@
import os
import sys
sys.path.append(os.path.abspath('.'))

import psutil
from llama.ctypes import llama_generate, LlamaOptions


options = LlamaOptions(
    no_display_prompt=True,
    threads=psutil.cpu_count(logical=False),
    ctx_size=8192,
    predict=512,
    flash_attn=True,
    cont_batching=True,
    simple_io=True,
    # log_disable=True,
    hf_repo='bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    hf_file='Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    # hf_file='Phi-3.1-mini-128k-instruct-IQ2_M.gguf',
    chat_template='chatml',
    # prompt='<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
)

for chunk in llama_generate(options):
    print(chunk, flush=True, end='')
from llama.llama_cli_ctypes import llama_generate, Model, Options

models = [
    Model(
        'microsoft/Phi-3-mini-128k-instruct',
        'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
        'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    ),
    Model(
        'Qwen/Qwen2-1.5B-Instruct',
        'Qwen/Qwen2-1.5B-Instruct-GGUF',
        'qwen2-1_5b-instruct-q4_k_m.gguf',
    ),
    Model(
        'TinyLlama/TinyLlama-1.1B-Chat-v1.0',
        'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
        'tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf',
    ),
]


def demo1():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo2():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[1].hf_repo,
        hf_file=models[1].hf_file,
        prompt='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nEvaluate 1 + 2.<|im_end|>\n<|im_start|>assistant\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


def demo3():
    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=2048,
        predict=-2,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[2].hf_repo,
        hf_file=models[2].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    for chunk in llama_generate(options):
        print(chunk, flush=True, end='')

    print()


if __name__ == '__main__':
    demo1()
    demo2()
    demo3()
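llama_generate in both backends also accepts a callback; when one is passed, chunks are delivered to it and a trailing None marks the end of generation instead of values being yielded from an iterator. A hedged callback-style variant of demo1, reusing the imports and models list above, and assuming the None sentinel behaviour suggested by the callback(None) calls in llama/llama_cli_ctypes.py and that llama_generate runs to completion before returning when a callback is supplied:

def demo1_callback():
    # Sketch only: same options as demo1(), consumed through the callback
    # interface. Assumes the callback receives decoded str chunks followed by
    # a final None sentinel.
    chunks = []

    def on_chunk(chunk):
        if chunk is None:
            return
        chunks.append(chunk)
        print(chunk, flush=True, end='')

    options = Options(
        no_display_prompt=True,
        threads=psutil.cpu_count(logical=False),
        ctx_size=8192,
        predict=512,
        flash_attn=True,
        cont_batching=True,
        simple_io=True,
        log_disable=True,
        hf_repo=models[0].hf_repo,
        hf_file=models[0].hf_file,
        prompt='<|system|>\nYou are a helpful assistant.<|end|><|user|>\nEvaluate 1 + 2.<|end|>\n<|assistant|>\n',
    )

    llama_generate(options, callback=on_chunk)
    print()
    return ''.join(chunks)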
3 changes: 2 additions & 1 deletion llama/__init__.py
@@ -1 +1,2 @@
from .cffi import *
# from .cffi import *
# from .ctypes import *
2 changes: 0 additions & 2 deletions llama/cffi.py

This file was deleted.

2 changes: 0 additions & 2 deletions llama/ctypes.py

This file was deleted.

32 changes: 3 additions & 29 deletions llama/llama_cli_cffi.py
@@ -1,4 +1,4 @@
__all__ = ['llama_generate', 'LlamaOptions']
__all__ = ['llama_generate', 'Options']

import json
import ctypes
@@ -10,40 +10,14 @@

from huggingface_hub import hf_hub_download

from .llama_cli_options import LlamaOptions, convert_options_to_bytes
from .llama_cli_options import Options, convert_options_to_bytes
from ._llama_cli import lib, ffi


FPRINTF_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p, ctypes.c_char_p, ctypes.c_char_p)
FFLUSH_FUNC = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)


def fprintf_func(file_obj, fmt, arg, queue=None, callback=None, metadata=None):
    content = arg.decode('utf-8')

    if metadata:
        eos = metadata.get('eos')
        eot = metadata.get('eot')

        if eos is not None and eos in content:
            content = content[:content.index(eos)]
        elif eot is not None and eot in content:
            content = content[:content.index(eot)]

    if queue is not None:
        if content:
            queue.put(content)
        else:
            queue.put(None)
    elif callback is not None:
        callback(content)

    size = len(content)
    return size


def fflush_func(file_obj):
    return 0


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
@@ -56,7 +30,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: LlamaOptions, callback=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
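The cffi backend keeps the same Python surface as the ctypes one: llama_generate(options, callback=None) plus the shared Options class from llama_cli_options. Callers should therefore be able to switch backends by changing a single import, as in this hedged sketch (the USE_CFFI switch is hypothetical, added only for illustration):

USE_CFFI = False  # hypothetical switch for this example

if USE_CFFI:
    from llama.llama_cli_cffi import llama_generate, Options
else:
    from llama.llama_cli_ctypes import llama_generate, Options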
84 changes: 21 additions & 63 deletions llama/llama_cli_ctypes.py
@@ -1,68 +1,44 @@
__all__ = ['llama_generate', 'LlamaOptions']
__all__ = ['llama_generate', 'Options']

import os
import json
from ctypes import *
from queue import Queue
from copy import deepcopy
from typing import Iterator
from threading import Thread
from functools import partial

from huggingface_hub import hf_hub_download

from .llama_cli_options import LlamaOptions, convert_options_to_bytes
from .llama_cli_model import Model
from .llama_cli_options import Options, convert_options_to_bytes

current_module_path = os.path.abspath(__file__)
current_module_dir = os.path.dirname(current_module_path)
libllama_cli_path = os.path.join(current_module_dir, 'llama-cli.so')

lib = CDLL(libllama_cli_path)
module_path = os.path.abspath(__file__)
module_dir = os.path.dirname(module_path)
llama_cli_lib_path = os.path.join(module_dir, 'llama-cli.so')
lib = CDLL(llama_cli_lib_path)

# lib.llama_cli_main.argtypes = [c_int, POINTER(c_char_p)]
# lib.llama_cli_main.restype = c_int
_LLAMA_YIELD_TOKEN_T = CFUNCTYPE(None, c_char_p)
_LLAMA_SHOULD_STOP_T = CFUNCTYPE(c_int)

# lib.llama_get_metadata_as_json.argtypes = [c_int, POINTER(c_char_p)]
# lib.llama_get_metadata_as_json.restype = c_void_p
lib._llama_cli_main.argtypes = [c_int, POINTER(c_char_p), _LLAMA_YIELD_TOKEN_T, _LLAMA_SHOULD_STOP_T, c_int]
lib._llama_cli_main.restype = c_int

# lib.llama_free_metadata_as_json.argtypes = [c_void_p]
# lib.llama_free_metadata_as_json.restype = None

# FPRINTF_FUNC = CFUNCTYPE(c_int, c_void_p, c_char_p, c_char_p)
# FFLUSH_FUNC = CFUNCTYPE(c_int, c_void_p)
def _llama_yield_token_func(chunk: bytes, queue=None, callback=None, metadata=None):
    chunk = chunk.decode()
    print(chunk, flush=True, end='')


'''
def fprintf_func(file_obj, fmt, arg, queue=None, callback=None, metadata=None):
    content = arg.decode('utf-8')
    if metadata:
        eos = metadata.get('eos')
        eot = metadata.get('eot')
        if eos is not None and eos in content:
            content = content[:content.index(eos)]
        elif eot is not None and eot in content:
            content = content[:content.index(eot)]
    if queue is not None:
        if content:
            queue.put(content)
        else:
            queue.put(None)
    elif callback is not None:
        callback(content)
    size = len(content)
    return size
def fflush_func(file_obj):
def _llama_should_stop_func(queue=None, callback=None, metadata=None) -> int:
    return 0
'''


def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
    r = lib.llama_cli_main(argc, argv)
    _llama_yield_token = _LLAMA_YIELD_TOKEN_T(partial(_llama_yield_token_func, queue=queue, callback=callback, metadata=metadata))
    _llama_should_stop = _LLAMA_SHOULD_STOP_T(partial(_llama_should_stop_func, queue=queue, callback=callback, metadata=metadata))
    r = lib._llama_cli_main(argc, argv, _llama_yield_token, _llama_should_stop, 1)
    assert r == 0

    if queue is not None:
@@ -71,7 +47,7 @@ def _llama_cli_main(argc, argv, queue=None, callback=None, metadata=None):
        callback(None)


def llama_generate(options: LlamaOptions, callback=None, metadata=None) -> Iterator[str] | None:
def llama_generate(options: Options, callback=None, metadata=None) -> Iterator[str] | None:
    # check hf_repo, hf_file
    if options.hf_repo and options.hf_file:
        options.model = hf_hub_download(repo_id=options.hf_repo, filename=options.hf_file)
@@ -89,25 +65,7 @@ def llama_generate(options: LlamaOptions, callback=None, metadata=None) -> Iterator[str] | None:
    else:
        queue = Queue()

    # get bos, eos, and eot from metedata
    metadata_options = deepcopy(options)
    metadata_options.log_disable = True
    metadata_argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(metadata_options)
    metadata_argc = len(metadata_argv)
    metadata_argv = (c_char_p * metadata_argc)(*metadata_argv)

    c_metadata: 'const char*' = lib.llama_get_metadata_as_json(metadata_argc, metadata_argv)
    metadata: str = string_at(c_metadata)
    lib.llama_free_metadata_as_json(c_metadata)
    metadata: dict = json.loads(metadata)
    print(f'{metadata = }')

    # intercept token generation
    fprintf = FPRINTF_FUNC(partial(fprintf_func, queue=queue, callback=callback, metadata=metadata))
    fflush = FFLUSH_FUNC(fflush_func)

    lib.llama_set_fprintf(fprintf)
    lib.llama_set_fflush(fflush)
    metadata: dict = {}

    argv: list[bytes] = [b'llama-cli'] + convert_options_to_bytes(options)
    argc = len(argv)
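When no callback is given, llama_generate creates a Queue, and the imports above (Thread, Queue, partial) point at the usual producer/consumer layout: a worker thread runs _llama_cli_main, the _llama_yield_token callback feeds decoded chunks to the queue, and a trailing None ends iteration. A self-contained toy sketch of that pattern, with a fake producer standing in for the C call; this illustrates the pattern only and is not the module's actual code:

from queue import Queue
from threading import Thread
from typing import Iterator


def _fake_producer(queue: Queue) -> None:
    # Stand-in for _llama_cli_main: push decoded chunks, then the None
    # sentinel that tells the consumer generation has finished.
    for chunk in ('1 + 2', ' = ', '3'):
        queue.put(chunk)
    queue.put(None)


def stream_tokens() -> Iterator[str]:
    # Hedged sketch of the queue-draining generator implied by the imports in
    # llama/llama_cli_ctypes.py; the real module would wire the queue into the
    # _llama_yield_token callback instead of a fake producer.
    queue: Queue = Queue()
    worker = Thread(target=_fake_producer, args=(queue,), daemon=True)
    worker.start()

    while True:
        chunk = queue.get()
        if chunk is None:
            break
        yield chunk

    worker.join()


if __name__ == '__main__':
    print(''.join(stream_tokens()))  # prints: 1 + 2 = 3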
10 changes: 10 additions & 0 deletions llama/llama_cli_model.py
@@ -0,0 +1,10 @@
__all__ = ['Model']

from attrs import define, field


@define
class Model:
    creator_hf_repo: str | None
    hf_repo: str
    hf_file: str
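Model is a plain attrs container; examples/demo_ctypes.py keeps a list of them and feeds hf_repo/hf_file straight into Options. A minimal usage sketch with the Phi-3 entry from that demo (the ctx_size and predict values here are arbitrary, chosen only for illustration):

from llama.llama_cli_model import Model
from llama.llama_cli_options import Options

# Hedged sketch: wiring a Model entry into Options, mirroring demo1() in
# examples/demo_ctypes.py.
phi3 = Model(
    'microsoft/Phi-3-mini-128k-instruct',
    'bartowski/Phi-3.1-mini-128k-instruct-GGUF',
    'Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
)

options = Options(
    hf_repo=phi3.hf_repo,
    hf_file=phi3.hf_file,
    ctx_size=2048,
    predict=512,
)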
8 changes: 4 additions & 4 deletions llama/llama_cli_options.py
@@ -1,10 +1,10 @@
__all__ = ['LlamaOptions', 'convert_options_to_bytes']
__all__ = ['Options', 'convert_options_to_bytes']

import attr


@attr.s
class LlamaOptions:
class Options:
    help = attr.ib(default=None, metadata={"description": "print usage and exit", "long_name": "--help", "alias": "-h"})
    version = attr.ib(default=None, metadata={"description": "show version and build info", "long_name": "--version"})
    verbose = attr.ib(default=None, metadata={"description": "print verbose information", "long_name": "--verbose", "alias": "-v"})
@@ -192,11 +192,11 @@ class LlamaOptions:
    method = attr.ib(default=None, metadata={"description": "dimensionality reduction method to be used", "long_name": "--method", "default": 'pca'})


def convert_options_to_bytes(options: LlamaOptions) -> list[bytes]:
def convert_options_to_bytes(options: Options) -> list[bytes]:
    result = []

    # Iterate over all attributes of the options class
    for field in attr.fields(LlamaOptions):
    for field in attr.fields(Options):
        value = getattr(options, field.name)
        if value is not None:
            long_name = field.metadata["long_name"]
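convert_options_to_bytes walks attr.fields(Options), skips attributes left at None, and emits each field's long_name from its metadata followed by the encoded value. A toy sketch of that pattern with a made-up two-field class; it illustrates the metadata-driven conversion only and is not the library's exact flag handling:

import attr


@attr.s
class ToyOptions:
    # Hypothetical miniature of Options, for illustration only.
    threads = attr.ib(default=None, metadata={'long_name': '--threads'})
    ctx_size = attr.ib(default=None, metadata={'long_name': '--ctx-size'})


def toy_convert(options: ToyOptions) -> list[bytes]:
    result = []

    for f in attr.fields(ToyOptions):
        value = getattr(options, f.name)

        if value is not None:
            result.append(f.metadata['long_name'].encode())
            result.append(str(value).encode())

    return result


print(toy_convert(ToyOptions(threads=4, ctx_size=2048)))
# [b'--threads', b'4', b'--ctx-size', b'2048']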
