Implement prototype for instant mmap() loading
This change uses a custom malloc() implementation to transactionally
capture to a file dynamic memory created during the loading process.
That includes (1) the malloc() allocation for mem_buffer and (2) all
the C++ STL objects. On my $1000 personal computer, this change lets
me run ./main to generate a single token (-n 1) using the float16 7B
model (~12gb size) in one second. In order to do that, there's a one
time cost where a 13gb file needs to be generated. This change rocks
but it shouldn't be necessary to do something this heroic. We should
instead change the file format, so that tensors don't need reshaping
and realignment in order to be loaded.
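The sketch below is not part of the patch; it is a minimal, POSIX-only illustration of the primitive the change builds on: an arena backed by a file that is mmap()'d at a fixed address, so a later run can map the same bytes back and any pointers stored inside them remain valid. The names arena.dat, ARENA_ADDR, and ARENA_SIZE are invented for the example.

#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define ARENA_PATH "arena.dat"
#define ARENA_ADDR ((char *)0x330000000000)
#define ARENA_SIZE 2097152

int main() {
    struct stat st;
    int fd = open(ARENA_PATH, O_RDWR | O_CREAT, 0644);
    if (fd == -1) { perror("open"); return 1; }
    if (fstat(fd, &st) == -1) { perror("fstat"); return 1; }
    bool fresh = st.st_size == 0;                   // empty file: nothing captured yet
    if (fresh && ftruncate(fd, ARENA_SIZE) == -1) { perror("ftruncate"); return 1; }
    // MAP_FIXED pins the arena to the same address on every run, so pointers
    // written into the file by the first run still point at valid data later.
    char *arena = (char *)mmap(ARENA_ADDR, ARENA_SIZE, PROT_READ | PROT_WRITE,
                               MAP_SHARED | MAP_FIXED, fd, 0);
    if (arena == MAP_FAILED) { perror("mmap"); return 1; }
    if (fresh) {
        strcpy(arena, "expensive-to-build state");  // stand-in for real loading work
        msync(arena, ARENA_SIZE, MS_ASYNC);         // flush the capture to disk
        printf("captured: %s\n", arena);
    } else {
        printf("restored: %s\n", arena);            // the state comes straight off disk
    }
    return close(fd);
}

Compiled and run twice, the first invocation pays the cost of writing the file and prints "captured"; every later invocation prints "restored" because the kernel simply pages the bytes back in, which is the same reason ./main can start generating in about a second once magic.dat exists.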
jart committed Mar 17, 2023
1 parent 2788f37 commit 5b8023d
Showing 2 changed files with 207 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -18,6 +18,7 @@ models/*

/main
/quantize
/magic.dat

arm_neon.h
compile_commands.json
225 changes: 206 additions & 19 deletions main.cpp
@@ -3,19 +3,32 @@
#include "utils.h"

#include <cassert>
#include <cerrno>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <atomic>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif

#define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
#define IS2POW(X) (!((X) & ((X)-1)))

#define MAGIC_PATH "magic.dat"
#define MAGIC_ADDR (char *)0x330000000000
#define MAGIC_GRAN 2097152
#define MAGIC_ALGN (sizeof(size_t) * 2)

#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -83,6 +96,173 @@ struct llama_model {
std::map<std::string, struct ggml_tensor *> tensors;
};

// Header stored at the very start of the file-backed arena: the bump
// allocator's bookkeeping plus the root pointers (vocab, model) a later
// run needs in order to pick up the captured state.
struct magic {
uint32_t magic;
std::atomic<unsigned> lock;
int fd;
size_t commit;
size_t offset;
size_t capacity;
gpt_vocab *vocab;
llama_model *model;
};

static struct magic *mag;

static inline void spin_lock(std::atomic<unsigned> &lock) {
// spin until the previous value was 0, i.e. until this thread takes the lock
while (lock.exchange(1, std::memory_order_acquire));
}

static inline void spin_unlock(std::atomic<unsigned> &lock) {
lock.store(0, std::memory_order_release);
}

static void *Mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
void *res;
res = mmap(addr, length, prot, flags, fd, offset);
if (res != MAP_FAILED) return res;
perror("mmap");
exit(77);
}

// Seal the capture: remember how much of the arena is committed, stamp the
// magic number so later runs know the file is complete, and schedule a flush
// of the whole region to disk.
static void magic_commit(void) {
mag->offset = mag->capacity;
mag->commit = mag->capacity;
mag->magic = 0xFEEDABEE;
msync(mag, mag->commit, MS_ASYNC);
}

// Map magic.dat at a fixed address. If the file already holds a committed
// capture, map it back (privately, so this run cannot disturb it) and return;
// otherwise create or reset the file and map it shared, so that everything
// allocated while loading is captured into it.
static void magic_init(void) {
int fd;
size_t n;
struct stat st;
if (mag) return;
n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
fstat(fd, &st);
if (st.st_size >= n) {
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0);
if (mag->magic == 0xFEEDABEE) {
mag = (struct magic *)Mmap(MAGIC_ADDR, mag->capacity,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_FIXED, fd, 0);
madvise(MAGIC_ADDR, mag->capacity, MADV_WILLNEED);
ftruncate(fd, mag->commit);
mag->offset = mag->commit;
mag->capacity = mag->commit;
mag->fd = -1;
return;
}
}
ftruncate(fd, 0);
} else if ((fd = open(MAGIC_PATH, O_RDWR | O_CREAT | O_TRUNC, 0644)) == -1) {
perror(MAGIC_PATH);
exit(77);
}
ftruncate(fd, n);
mag = (struct magic *)Mmap(MAGIC_ADDR, n,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, fd, 0);
mag->offset = MAGIC_GRAN;
mag->fd = fd;
}

// memalign() and the allocation functions below replace the libc allocator,
// so dynamic memory created while loading (including the C++ STL's) is carved
// out of the arena by a simple bump allocator. The mapping grows on demand:
// file-backed while capturing, anonymous once a capture has been committed.
// The requested size is stashed just below the returned pointer, and free()
// is a deliberate no-op.
void *memalign(size_t a, size_t n) {
void *p;
size_t i, j, k, m;
static int count;
magic_init();
if (a < MAGIC_ALGN) a = MAGIC_ALGN;
while (!IS2POW(a)) ++a;
m = n ? n : 1;
spin_lock(mag->lock);
i = mag->offset;
i = i + sizeof(size_t);
i = ROUNDUP(i, a);
j = ROUNDUP(i + m, MAGIC_GRAN);
if (j > mag->capacity) {
if (!mag->magic) {
ftruncate(mag->fd, j);
p = mmap(MAGIC_ADDR + mag->capacity,
j - mag->capacity, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
} else {
p = mmap(MAGIC_ADDR + mag->capacity,
j - mag->capacity, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
}
if (p != MAP_FAILED) {
mag->capacity = j;
} else {
spin_unlock(mag->lock);
return 0;
}
}
mag->offset = i + m;
spin_unlock(mag->lock);
p = MAGIC_ADDR + i;
((size_t *)p)[-1] = n;
return p;
}

int posix_memalign(void **pp, size_t a, size_t n) {
int e;
void *m;
size_t q, r;
q = a / sizeof(void *);
r = a % sizeof(void *);
if (!r && q && IS2POW(q)) {
e = errno;
m = memalign(a, n);
if (m) {
*pp = m;
return 0;
} else {
errno = e;
return ENOMEM;
}
} else {
return EINVAL;
}
}

void *malloc(size_t n) {
return memalign(MAGIC_ALGN, n);
}

size_t malloc_usable_size(const void *p) {
return ((const size_t *)p)[-1];
}

void *calloc(size_t n, size_t z) {
void *p;
if ((p = malloc((n *= z)))) {
memset(p, 0, n);
}
return p;
}

void free(void *p) {
// do nothing
}

void *realloc(void *p, size_t n) {
void *q;
if (!p) {
return malloc(n);
}
if (!n) {
free(p);
return 0;
}
if ((q = malloc(n))) {
memcpy(q, p, ((const size_t *)p)[-1]);
}
return q;
}

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -786,6 +966,8 @@ const char * llama_print_system_info(void) {
}

int main(int argc, char ** argv) {
magic_init();

ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();

@@ -812,19 +994,24 @@ int main(int argc, char ** argv) {

int64_t t_load_us = 0;

gpt_vocab vocab;
llama_model model;

// load the model
{
gpt_vocab *vocab;
llama_model *model;
if (!mag->magic) {
vocab = new gpt_vocab;
model = new llama_model;
const int64_t t_start_us = ggml_time_us();

if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
if (!llama_model_load(params.model, *model, *vocab, 512)) { // TODO: set context from user input ??
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}

t_load_us = ggml_time_us() - t_start_us;
mag->vocab = vocab;
mag->model = model;
magic_commit();
} else {
vocab = mag->vocab;
model = mag->model;
}

// print system information
@@ -842,18 +1029,18 @@ int main(int argc, char ** argv) {
std::vector<float> logits;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(*vocab, params.prompt, true);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
params.n_predict = std::min(params.n_predict, model->hparams.n_ctx - (int) embd_inp.size());

// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(*vocab, params.antiprompt, false);

fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab->id_to_token.at(embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
if (params.interactive) {
@@ -871,7 +1058,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab->id_to_token.at(antiprompt_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
@@ -883,7 +1070,7 @@ int main(int argc, char ** argv) {

// determine the required inference memory per token:
size_t mem_per_token = 0;
llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
llama_eval(*model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);

int last_n_size = params.repeat_last_n;
std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
@@ -918,7 +1105,7 @@ int main(int argc, char ** argv) {
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();

if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
if (!llama_eval(*model, params.n_threads, n_past, embd, logits, mem_per_token)) {
fprintf(stderr, "Failed to predict\n");
return 1;
}
@@ -936,14 +1123,14 @@ int main(int argc, char ** argv) {
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;

const int n_vocab = model.hparams.n_vocab;
const int n_vocab = model->hparams.n_vocab;

gpt_vocab::id id = 0;

{
const int64_t t_start_sample_us = ggml_time_us();

id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
id = llama_sample_top_p_top_k(*vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);

last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
@@ -980,7 +1167,7 @@ int main(int argc, char ** argv) {
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
printf("%s", vocab->id_to_token[id].c_str());
}
fflush(stdout);
}
@@ -1018,7 +1205,7 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}

std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(*vocab, buf, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

remaining_tokens -= line_inp.size();
@@ -1050,7 +1237,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

ggml_free(model.ctx);
ggml_free(model->ctx);

return 0;
}
