Llamacpp with cpp backend #2527
src/examples/CMakeLists.txt
@@ -4,4 +4,29 @@ set(MNIST_SOURCE_FILES "")
list(APPEND MNIST_SOURCE_FILES ${MNIST_SRC_DIR}/mnist_handler.cc)
add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES})
target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR})
target_link_libraries(mnist_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})

set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llamacpp")
set(LLAMACPP_SRC_DIR "/home/ubuntu/llama.cpp")

Review comment: Good to avoid absolute paths. Is the file included in the PR? What is the license of llama.cpp? Do we need to include the license file?
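A minimal sketch of one way to avoid the hard-coded path (LLAMACPP_DIR is an illustrative cache-variable name, not part of this PR):

# Sketch only: make the llama.cpp location configurable instead of
# hard-coding /home/ubuntu/llama.cpp. LLAMACPP_DIR is an illustrative name.
set(LLAMACPP_DIR "" CACHE PATH "Path to a local llama.cpp checkout")
if(NOT LLAMACPP_DIR)
  message(FATAL_ERROR "Configure with -DLLAMACPP_DIR=/path/to/llama.cpp")
endif()
set(LLAMACPP_SRC_DIR "${LLAMACPP_DIR}")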

set(LLM_SOURCE_FILES "")
list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llamacpp_handler.cc)
add_library(llamacpp_handler SHARED ${LLM_SOURCE_FILES})
target_include_directories(llamacpp_handler PUBLIC ${LLM_SRC_DIR})
target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR})
target_link_libraries(llamacpp_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})

set(MY_OBJECT_FILES

Review comment: Where are the src files to these obj files?

  ${LLAMACPP_SRC_DIR}/ggml.o
  ${LLAMACPP_SRC_DIR}/llama.o
  ${LLAMACPP_SRC_DIR}/common.o
  ${LLAMACPP_SRC_DIR}/ggml-quants.o
  ${LLAMACPP_SRC_DIR}/ggml-alloc.o
  ${LLAMACPP_SRC_DIR}/grammar-parser.o
  ${LLAMACPP_SRC_DIR}/console.o
  ${LLAMACPP_SRC_DIR}/build-info.o
  ${LLAMACPP_SRC_DIR}/ggml-backend.o
)

target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES})

config.json

@@ -0,0 +1,5 @@
{
  "checkpoint_path" : "/home/ubuntu/llama-2-7b-chat.Q4_0.gguf"

Review comment: Ditto; also, how big is this file?

}

src/examples/llamacpp/llamacpp_handler.cc

@@ -0,0 +1,303 @@
#include "src/examples/llamacpp/llamacpp_handler.hh"

#include <torch/script.h>
#include <torch/torch.h>

#include <typeinfo>

namespace llm {

void LlamacppHandler::initialize_context() {
  llama_ctx = llama_new_context_with_model(llamamodel, ctx_params);

Review comment: Where is this defined?

  if (llama_ctx == nullptr) {
    std::cerr << "Failed to initialize llama context" << std::endl;
  } else {
    std::cout << "Context initialized successfully" << std::endl;
  }
}
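For readers of this diff: llama_ctx, llamamodel, ctx_params, model_params, params, and max_context_size are not defined in this file; they are presumably data members declared in llamacpp_handler.hh, which is not part of the diff. A rough, illustrative sketch of what that declaration might look like (names follow this file; the 512 value is a placeholder):

// Illustrative sketch only; the real declarations live in llamacpp_handler.hh.
class LlamacppHandler : public torchserve::torchscripted::BaseHandler {
 private:
  gpt_params params;                   // llama.cpp loading/generation parameters
  llama_model_params model_params;     // model-loading parameters
  llama_context_params ctx_params;     // context-creation parameters
  llama_model* llamamodel = nullptr;   // GGUF model handle
  llama_context* llama_ctx = nullptr;  // llama.cpp inference context
  const int max_context_size = 512;    // placeholder value; actual value is in the header
  void initialize_context();
  // ... LoadModel / Preprocess / Inference / Postprocess overrides as in this file
};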

std::pair<std::shared_ptr<torch::jit::script::Module>,
          std::shared_ptr<torch::Device>>
LlamacppHandler::LoadModel(
    std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) {
  try {
    auto device = GetTorchDevice(load_model_request);
    // Load dummy model
    auto module = std::make_shared<torch::jit::script::Module>(
        torch::jit::load(fmt::format("{}/{}", load_model_request->model_dir,
                                     manifest_->GetModel().serialized_file),
                         *device));

    const std::string configFilePath =
        fmt::format("{}/{}", load_model_request->model_dir, "config.json");
    std::string jsonContent;
    if (!folly::readFile(configFilePath.c_str(), jsonContent)) {
      std::cerr << "config.json not found at: " << configFilePath << std::endl;
      throw;
    }
    folly::dynamic json;
    json = folly::parseJson(jsonContent);

    std::string checkpoint_path;
    if (json.find("checkpoint_path") != json.items().end()) {
      checkpoint_path = json["checkpoint_path"].asString();
    } else {
      std::cerr << "Required field 'checkpoint_path' not found in JSON."
                << std::endl;
      throw;
    }
    params.model = checkpoint_path;
    params.main_gpu = 0;
    params.n_gpu_layers = 35;

    llama_backend_init(params.numa);

Review comment: Was this parameter initialized?

    ctx_params = llama_context_default_params();
    model_params = llama_model_default_params();
    llamamodel = llama_load_model_from_file(params.model.c_str(), model_params);

    return std::make_pair(module, device);
  } catch (const c10::Error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.msg());
    throw e;
  } catch (const std::runtime_error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.what());
    throw e;
  }
}

std::vector<torch::jit::IValue> LlamacppHandler::Preprocess(
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  initialize_context();

Review comment: Would the constructor be a better place for this?
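A minimal sketch of that suggestion (not part of the PR): the constructor itself would be too early, since the checkpoint path is only known after config.json is parsed, but the context could be created once at the end of LoadModel instead of on every Preprocess call:

    // Sketch only: tail of LoadModel with one-time context creation per worker.
    llama_backend_init(params.numa);
    ctx_params = llama_context_default_params();
    model_params = llama_model_default_params();
    llamamodel = llama_load_model_from_file(params.model.c_str(), model_params);
    initialize_context();  // create llama_ctx here instead of in Preprocess

    return std::make_pair(module, device);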

  std::vector<torch::jit::IValue> batch_ivalue;
  std::vector<torch::Tensor> batch_tensors;
  uint8_t idx = 0;
  for (auto& request : *request_batch) {
    try {
      (*response_batch)[request.request_id] =
          std::make_shared<torchserve::InferenceResponse>(request.request_id);
      idx_to_req_id.first += idx_to_req_id.first.empty()
                                 ? request.request_id
                                 : "," + request.request_id;

      auto data_it = request.parameters.find(
          torchserve::PayloadType::kPARAMETER_NAME_DATA);
      auto dtype_it =
          request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE);
      if (data_it == request.parameters.end()) {
        data_it = request.parameters.find(
            torchserve::PayloadType::kPARAMETER_NAME_BODY);
        dtype_it = request.headers.find(
            torchserve::PayloadType::kHEADER_NAME_BODY_TYPE);
      }

      if (data_it == request.parameters.end() ||
          dtype_it == request.headers.end()) {
        TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id);
        (*response_batch)[request.request_id]->SetResponse(
            500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
            "Empty payload");
        continue;
      }

      std::string msg = torchserve::Converter::VectorToStr(data_it->second);

      // tokenization

      std::vector<llama_token> tokens_list;
      tokens_list = ::llama_tokenize(llama_ctx, msg, true);

      // const int max_context_size = llama_n_ctx(ctx);
      const int max_tokens_list_size = max_context_size - 4;

Review comment: Always good to give the magic numbers a name to clarify the purpose?
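One way to name that constant, as a sketch only (the name and the stated purpose of the reserved slots are illustrative, not taken from the PR):

      // Sketch only: give the bare "4" a descriptive name.
      constexpr int kTokensReservedForSpecial = 4;  // headroom for BOS/EOS-style tokens
      const int max_tokens_list_size = max_context_size - kTokensReservedForSpecial;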

      if ((int)tokens_list.size() > max_tokens_list_size) {
        std::cout << __func__ << ": error: prompt too long ("
                  << tokens_list.size() << " tokens, max "
                  << max_tokens_list_size << ")\n";
      }

      // Print the tokens from the prompt :
      std::vector<torch::Tensor> tensor_vector;
      for (auto id : tokens_list) {
        torch::Tensor tensor = torch::tensor(id, torch::kInt64);
        tensor_vector.push_back(tensor);
      }

      torch::Tensor stacked_tensor = torch::stack(tensor_vector);
      batch_ivalue.push_back(stacked_tensor);
      idx_to_req_id.second[idx++] = request.request_id;

    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              request.request_id, e.what());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to load tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}",
              request.request_id, e.msg());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to load tensor");
    }
  }

  return batch_ivalue;
}

torch::Tensor LlamacppHandler::Inference(
    std::shared_ptr<torch::jit::script::Module> model,
    std::vector<torch::jit::IValue>& inputs,
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  torch::InferenceMode guard;
  std::vector<torch::Tensor> batch_output_vector;
  for (const torch::jit::IValue& input : inputs) {
    torch::Tensor tokens_list_tensor = input.toTensor();

    int64_t num_elements = tokens_list_tensor.numel();

    int64_t* data_ptr = tokens_list_tensor.data_ptr<int64_t>();
    std::vector<llama_token> tokens_list;

    for (int64_t i = 0; i < num_elements; ++i) {
      tokens_list.push_back(data_ptr[i]);
    }
    const int n_gen = std::min(32, max_context_size);

    long pos = 0;
    while (pos < n_gen) {
      // evaluate the transformer

      if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()),
                     llama_get_kv_cache_token_count(llama_ctx))) {
        std::cout << "Failed to eval\n" << __func__ << std::endl;
        break;
      }

      tokens_list.clear();

      // sample the next token

      llama_token new_token_id = 0;

      auto logits = llama_get_logits(llama_ctx);
      auto n_vocab = llama_n_vocab(llamamodel);

      std::vector<llama_token_data> candidates;
      candidates.reserve(n_vocab);

      for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(
            llama_token_data{token_id, logits[token_id], 0.0f});
      }

      llama_token_data_array candidates_p = {candidates.data(),
                                             candidates.size(), false};

      new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p);

      // is it an end of stream ?
      if (new_token_id == llama_token_eos(llamamodel)) {
        std::cout << "Reached [end of text]\n";
        break;
      }

      // print the new token :
      std::cout << "New Token: "
                << llama_token_to_piece(llama_ctx, new_token_id) << std::endl;

      // push this new token for next evaluation
      tokens_list.push_back(new_token_id);
      pos += 1;
    }

    std::vector<torch::Tensor> tensor_vector;
    for (auto id : tokens_list) {
      torch::Tensor tensor = torch::tensor(id, torch::kLong);
      tensor_vector.push_back(tensor);
    }

    torch::Tensor stacked_tensor = torch::stack(tensor_vector);
    batch_output_vector.push_back(stacked_tensor);
  }

  llama_print_timings(llama_ctx);
  return torch::stack(batch_output_vector);
}

void LlamacppHandler::Postprocess(
    const torch::Tensor& data,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  for (const auto& kv : idx_to_req_id.second) {
    try {
      int64_t num_elements = data.numel();

      // Convert the tensor to a vector of long values
      std::stringstream generated_text_stream;

      auto data_ptr = data.data_ptr<int64_t>();
      for (int64_t i = 0; i < num_elements; ++i) {
        generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]);
      }

      std::string generated_text_str = generated_text_stream.str();
      std::cout << "Generated Text Str: " << generated_text_str << std::endl;

      auto response = (*response_batch)[kv.second];

      response->SetResponse(200, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            generated_text_str);
    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              kv.second, e.what());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to postprocess tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR,
              "Failed to postprocess tensor for request id: {}, error: {}",
              kv.second, e.msg());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to postprocess tensor");
    }
  }
}

LlamacppHandler::~LlamacppHandler() noexcept {
  llama_free(llama_ctx);
  llama_free_model(llamamodel);
  llama_backend_free();
}

}  // namespace llm

#if defined(__linux__) || defined(__APPLE__)
extern "C" {
torchserve::torchscripted::BaseHandler* allocatorLlamacppHandler() {
  return new llm::LlamacppHandler();
}

void deleterLlamacppHandler(torchserve::torchscripted::BaseHandler* p) {
  if (p != nullptr) {
    delete static_cast<llm::LlamacppHandler*>(p);
  }
}
}
#endif

Review comment: Would be good to create a CMakeLists.txt in the llamacpp directory and use add_subdirectory() in the main file, to avoid the main one getting too crowded.
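A minimal sketch of that suggestion (file layout and the use of llama.cpp's own library targets are assumptions, not part of this PR): give the example its own src/examples/llamacpp/CMakeLists.txt, build llama.cpp via add_subdirectory(), and link its targets instead of hand-listed .o files; the main examples CMakeLists.txt then only needs a single add_subdirectory(llamacpp) line.

# src/examples/llamacpp/CMakeLists.txt -- sketch only
set(LLAMACPP_DIR "" CACHE PATH "Path to a llama.cpp checkout")
add_subdirectory(${LLAMACPP_DIR} ${CMAKE_CURRENT_BINARY_DIR}/llama.cpp)

add_library(llamacpp_handler SHARED llamacpp_handler.cc)
target_include_directories(llamacpp_handler PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(llamacpp_handler PRIVATE
    ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES}
    llama common)  # llama.cpp's library targets instead of pre-built .o files

# In src/examples/CMakeLists.txt:
# add_subdirectory(llamacpp)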