Works on EXT4/NTFS
JohannesGaessler committed May 17, 2023
1 parent af005ce commit a272e71
Showing 5 changed files with 95 additions and 27 deletions.
Makefile (2 changes: 1 addition & 1 deletion)
@@ -125,7 +125,7 @@ endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lcufile -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ggml-cuda.cu (8 changes: 6 additions & 2 deletions)
@@ -7,6 +7,7 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cufile.h>

#include "ggml-cuda.h"
#include "ggml.h"
@@ -372,7 +373,7 @@ struct cuda_buffer {
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);

for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -391,7 +392,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr;
}

static void ggml_cuda_pool_free(void * ptr, size_t size) {
void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);

for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -431,6 +432,9 @@ void ggml_init_cublas() {

// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

// initialize cuFile for loading model parameters directly to VRAM
CUFILE_CHECK(cuFileDriverOpen());
}
}

ggml-cuda.h (10 changes: 10 additions & 0 deletions)
@@ -1,5 +1,14 @@
#include "ggml.h"

#define CUFILE_CHECK(status) \
do { \
CUfileError_t status_ = (status); \
if (status_.err != CU_FILE_SUCCESS) { \
fprintf(stderr, "cuFile error %d at %s:%d\n", status_.err, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)

#ifdef __cplusplus
extern "C" {
#endif
@@ -12,6 +21,7 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

// TODO: export these with GGML_API
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);

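Note that the CUFILE_CHECK macro added above only fits cuFile calls that return a CUfileError_t, such as cuFileDriverOpen or cuFileHandleRegister. cuFileRead instead returns a byte count as ssize_t, so reads have to be checked separately. Below is a minimal sketch of both patterns, assuming a handle and device buffer obtained elsewhere; the cuFileBufRegister/cuFileBufDeregister calls are optional and are not something this commit does.

#include <cstdio>
#include <cstdlib>
#include <sys/types.h>
#include <cufile.h>
#include "ggml-cuda.h" // for CUFILE_CHECK

// Read `nbytes` at `file_off` into the device buffer `dev_ptr`.
static void read_into_vram(CUfileHandle_t handle, void * dev_ptr, size_t nbytes, off_t file_off) {
    // Calls returning CUfileError_t can go through the macro.
    CUFILE_CHECK(cuFileBufRegister(dev_ptr, nbytes, 0)); // optional pinning, not done in this commit

    // cuFileRead returns the number of bytes read, or a negative value on error.
    ssize_t n = cuFileRead(handle, dev_ptr, nbytes, file_off, 0);
    if (n < 0 || (size_t) n != nbytes) {
        fprintf(stderr, "cuFileRead: expected %zu bytes, got %zd\n", nbytes, n);
        exit(1);
    }

    CUFILE_CHECK(cuFileBufDeregister(dev_ptr));
}
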
llama-util.h (21 changes: 20 additions & 1 deletion)
@@ -11,11 +11,17 @@
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <fcntl.h>

#include <string>
#include <vector>
#include <stdexcept>

#ifdef GGML_USE_CUBLAS
#include <cufile.h>
#include "ggml-cuda.h"
#endif // GGML_USE_CUBLAS

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
@@ -71,6 +77,9 @@ struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
#ifdef GGML_USE_CUBLAS
CUfileHandle_t cf_handle;
#endif // GGML_USE_CUBLAS

llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
@@ -80,6 +89,17 @@
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);

#ifdef GGML_USE_CUBLAS
CUfileDescr_t cf_descr;
memset((void *)&cf_descr, 0, sizeof(CUfileDescr_t));
int cf_fd = open(fname, O_RDONLY|O_DIRECT, 0644);
cf_descr.handle.fd = cf_fd;
cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;

CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
CUFILE_CHECK(status);
#endif // GGML_USE_CUBLAS
}

size_t tell() const {
@@ -422,7 +442,6 @@ struct llama_buffer {
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
llama.cpp (81 changes: 58 additions & 23 deletions)
@@ -1,6 +1,7 @@
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include <cstddef>
#include <cstdint>
#include <cstdio>
#endif
@@ -11,6 +12,7 @@
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#include <cufile.h>
#endif

#include <array>
@@ -641,7 +643,7 @@ struct llama_model_loader {
}
}

struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
auto it = tensors_map.name_to_idx.find(name);
if (it == tensors_map.name_to_idx.end()) {
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +654,10 @@ struct llama_model_loader {
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
}

return get_tensor_for(lt);
return get_tensor_for(lt, backend);
}

struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
struct ggml_tensor * tensor;
if (lt.ne.size() == 2) {
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +667,7 @@ struct llama_model_loader {
}
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
tensor->backend = backend;
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
return tensor;
@@ -683,7 +686,7 @@ struct llama_model_loader {
}

if (use_mmap) {
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
if (!lmlock) {
// Don't call the callback since the actual loading will be lazy
// and we can't measure it.
@@ -701,12 +704,28 @@ }
}
LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
lt.data = (uint8_t *) lt.ggml_tensor->data;
load_data_for(lt);
lt.ggml_tensor->data = lt.data;
done_size += lt.size;
if (use_mmap && lmlock) {
lmlock->grow_to(done_size);

switch (lt.ggml_tensor->backend) {
case GGML_BACKEND_CPU:
{
load_data_for(lt);
done_size += lt.size;
if (use_mmap && lmlock) {
lmlock->grow_to(done_size);
}
} break;
#ifdef GGML_USE_CUBLAS
case GGML_BACKEND_CUDA:
{
cuda_load_data_for(lt);
done_size += lt.size;
} break;
#endif // GGML_USE_CUBLAS
default:
GGML_ASSERT(false);
}

lt.ggml_tensor->data = lt.data;
}
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
@@ -759,6 +778,18 @@ struct llama_model_loader {
}
}

#ifdef GGML_USE_CUBLAS
void cuda_load_data_for(llama_load_tensor & lt) {
LLAMA_ASSERT(lt.shards.size() == 1);
llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
size_t offset = lt.shards.at(0).file_off;
size_t actual_size;
void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
cuFileRead(file.cf_handle, buf, lt.size, offset, 0);
lt.data = (uint8_t *) buf;
}
#endif // GGML_USE_CUBLAS

static void print_checksum(llama_load_tensor & lt) {
uint32_t sum = 0;
for (size_t i = 0; i < lt.size; i++) {
@@ -992,28 +1023,30 @@ static void llama_model_load_internal(

ml->ggml_ctx = ctx;

model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
model.norm = ml->get_tensor("norm.weight", {n_embd});
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);

model.layers.resize(n_layer);
const int i_gpu_start = n_layer - n_gpu_layers;
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i];
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_CUDA;

std::string layers_i = "layers." + std::to_string(i);

layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
}
}

@@ -1029,7 +1062,8 @@ static void llama_model_load_internal(
model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
// const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
const int n_gpu = 0;

fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);

@@ -2395,7 +2429,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
base_t = model_loader->get_tensor(
base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
lt.ggml_tensor->data = lt.data;
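Taken together, the changes above amount to the following flow: open the cuFile driver once at init, register each model file through an O_DIRECT descriptor, then read tensor data straight into VRAM at its file offset. The sketch below is a standalone illustration, not the commit's code: the file name, size and offset are placeholders, the destination buffer comes from plain cudaMalloc rather than ggml_cuda_pool_malloc, and the teardown calls at the end are not part of the commit.

#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for O_DIRECT
#endif
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <cufile.h>

int main() {
    const char * fname  = "model.bin"; // placeholder path
    const size_t nbytes = 1u << 20;    // placeholder read size
    const off_t  offset = 0;           // placeholder file offset of the tensor data

    // 1. Open the cuFile driver once per process (the commit does this in ggml_init_cublas).
    CUfileError_t status = cuFileDriverOpen();
    if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "cuFileDriverOpen failed\n"); return 1; }

    // 2. Register the file through an O_DIRECT descriptor, as in the llama_file constructor.
    int fd = open(fname, O_RDONLY | O_DIRECT);
    if (fd < 0) { perror("open"); return 1; }
    CUfileDescr_t descr;
    memset(&descr, 0, sizeof(descr));
    descr.handle.fd = fd;
    descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileHandle_t handle;
    status = cuFileHandleRegister(&handle, &descr);
    if (status.err != CU_FILE_SUCCESS) { fprintf(stderr, "cuFileHandleRegister failed\n"); return 1; }

    // 3. Read straight into device memory, as cuda_load_data_for does.
    void * dev_buf = nullptr;
    if (cudaMalloc(&dev_buf, nbytes) != cudaSuccess) { fprintf(stderr, "cudaMalloc failed\n"); return 1; }
    ssize_t n = cuFileRead(handle, dev_buf, nbytes, offset, 0 /* device pointer offset */);
    if (n < 0) { fprintf(stderr, "cuFileRead failed\n"); return 1; }
    printf("read %zd bytes into VRAM\n", n);

    // 4. Teardown (not shown in the commit).
    cudaFree(dev_buf);
    cuFileHandleDeregister(handle);
    close(fd);
    cuFileDriverClose();
    return 0;
}

Building such a program needs the -lcufile flag that the Makefile change adds, next to the usual CUDA runtime libraries.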
