Merge pull request #2 from arthw/refactor_dev
Refactor device management and usage api
arthw authored Aug 1, 2024
2 parents e661170 + f1bc5ad commit c16f01b
Showing 12 changed files with 592 additions and 509 deletions.
2 changes: 2 additions & 0 deletions ggml/include/ggml-sycl.h
@@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
 GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+
 // SYCL doesn't support registering host memory, keep here for reference
 // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
 // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
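
The header now exposes a single-device switch next to the existing device-index helpers. A minimal sketch of how a caller might use it before creating the backend — the call site, device id, and init sequence below are illustrative assumptions, not part of this diff:

    // Hypothetical call site: pin the SYCL backend to one GPU before any
    // other backend call. Only ggml_backend_sycl_set_single_device_mode()
    // is new in this commit; the rest is standard ggml-backend usage.
    #include "ggml-sycl.h"

    int main() {
        const int main_gpu_id = 0;  // assumed device id
        ggml_backend_sycl_set_single_device_mode(main_gpu_id);
        ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
        if (backend == nullptr) {
            return 1;               // device init failed
        }
        // ... build and run a graph on the single device ...
        ggml_backend_free(backend);
        return 0;
    }
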
63 changes: 32 additions & 31 deletions ggml/src/ggml-sycl.cpp
@@ -38,6 +38,7 @@
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
+#include "ggml-sycl/sycl_device.hpp"
 
 
 void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -48,7 +49,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
 bool ggml_backend_is_sycl(ggml_backend_t backend);
 int ggml_backend_sycl_get_device(ggml_backend_t backend);
 static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
-
+static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);
 
 void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
@@ -2279,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         int id = ggml_backend_sycl_get_device_id(i);
         if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
-            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
-                min_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().infos[id].cc;
             }
-            if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
-                max_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (max_compute_capability < ggml_sycl_info().infos[id].cc) {
+                max_compute_capability = ggml_sycl_info().infos[id].cc;
             }
         }
     }
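
This hunk is the visible part of the rename from devices[id] to infos[id]: per-device state appears to live in an info table indexed by raw device id, alongside an id list for iteration. A sketch of the shape the diff implies — only cc, ids, device_count, and qptrs are visible in this commit; every other name, and the stream-count value, are assumptions for illustration:

    // Sketch only; not the actual contents of ggml-sycl/sycl_device.hpp.
    #include <sycl/sycl.hpp>
    #include <vector>

    using queue_ptr = sycl::queue *;         // matches the diff's queue_ptr usage
    constexpr int MAX_STREAMS = 8;           // assumed stream count

    struct sycl_device_entry {
        int cc = 0;                          // compute capability, read as infos[id].cc
        std::vector<queue_ptr> qptrs;        // per-device queues; qptrs[0] is the default
    };

    struct sycl_device_infos {
        int device_count = 0;                // number of usable devices
        std::vector<int> ids;                // raw device ids: for (auto & id : ids)
        std::vector<sycl_device_entry> infos; // indexed by raw device id, not position
    };
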
@@ -2680,17 +2681,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
     }
 
 #ifdef NDEBUG
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto &id: ggml_sycl_info().ids) {
         SYCL_CHECK(ggml_sycl_set_device(id));
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto &id: ggml_sycl_info().ids) {
         SYCL_CHECK(ggml_sycl_set_device(id));
 
-        for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
-            int id_other = ggml_backend_sycl_get_device_id(i_other);
+        for (auto &id_other: ggml_sycl_info().ids) {
             if (id == id_other) {
                 continue;
             }
@@ -2818,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         }
     }
 
-    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto & id: ggml_sycl_info().ids) {
         if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
             continue;
         }
@@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         } else {
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
-
         if (convert_src1_to_q8_1) {
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
 
@@ -2885,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
                 continue;
             }
@@ -3028,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
         is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;
 
         ggml_sycl_set_device(ctx.device);
-        for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             if (dev[id].row_low == dev[id].row_high) {
                 continue;
             }
@@ -3165,8 +3159,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s
 
 static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
+    // log_tensor_with_cnt(ctx, "log/src0", src0, -1);
+    // log_tensor_with_cnt(ctx, "log/src1", src1, -1);
+    // log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
+    // log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
     GGML_SYCL_DEBUG("call %s done\n", __func__);
+    // exit(1);
 }
 
 static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -3417,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
                 continue;
             }
 
-            if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
-                min_compute_capability = ggml_sycl_info().devices[id].cc;
+            if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
+                min_compute_capability = ggml_sycl_info().infos[id].cc;
             }
         }
     } else {
-        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().infos[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4332,7 +4331,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
-
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
 
     check_allow_device_id(device_id);
@@ -4342,10 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             auto & device = dpct::dev_mgr::instance().get_device(id);
-            queue_ptr stream = &(device.default_queue());
+            queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0];
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
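
Note the stream change in this hunk: instead of asking dpct for a device's default queue, the buffer type now reuses qptrs[0], the queue presumably created once per device during discovery, so buffer types and backend contexts share the same stream object. A sketch of that assumed create-once-and-cache pattern — the helper name is hypothetical, and while ggml-sycl uses in-order queues elsewhere, this exact code is not in the commit:

    #include <sycl/sycl.hpp>
    #include <vector>

    using queue_ptr = sycl::queue *;

    struct device_entry {
        sycl::device dev;
        std::vector<queue_ptr> qptrs; // filled lazily; qptrs[0] is the default stream
    };

    // Assumed init-time caching that makes infos[id].qptrs[0] valid later.
    queue_ptr default_stream(device_entry & e) {
        if (e.qptrs.empty()) {
            // in-order queue mirrors the CUDA-stream semantics used by ggml-sycl
            e.qptrs.push_back(new sycl::queue(e.dev, sycl::property::queue::in_order{}));
        }
        return e.qptrs[0];
    }
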
@@ -4366,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
     static bool ggml_backend_sycl_buffer_type_initialized = false;
 
     if (!ggml_backend_sycl_buffer_type_initialized) {
-        for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-            int id = ggml_backend_sycl_get_device_id(i);
+        for (auto & id: ggml_sycl_info().ids) {
             ggml_backend_sycl_buffer_types[id] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
@@ -4396,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
 struct ggml_backend_sycl_split_buffer_context {
     ~ggml_backend_sycl_split_buffer_context() try {
         for (ggml_tensor_extra_gpu * extra : tensor_extras) {
-            for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
-                int id = ggml_backend_sycl_get_device_id(i);
+            for (auto & id: ggml_sycl_info().ids) {
                 for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
                     if (extra->events[id][is] != nullptr) {
                         /*
@@ -5148,6 +5143,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
     return ggml_sycl_info().device_count;
 }
 
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+
+    GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    ggml_sycl_info(main_gpu_id);
+}
+
 GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
     ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
     return sycl_backend;
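
Passing main_gpu_id into ggml_sycl_info() suggests the device table is a construct-on-first-use singleton, so the argument only takes effect if this call runs before anything else queries the backend. A sketch of that first-call-wins mechanism — the real implementation lives in sycl_device.hpp and may differ; the names here are hypothetical:

    #include <vector>

    struct sycl_device_infos {
        int device_count = 0;
        std::vector<int> ids;
        explicit sycl_device_infos(int single_id) {
            if (single_id >= 0) {
                ids = {single_id};   // single-device mode: only the chosen GPU
            } else {
                // ... enumerate all usable SYCL devices ...
            }
            device_count = (int) ids.size();
        }
    };

    // Meyers singleton: the argument is only read on the *first* call, which is
    // why set_single_device_mode must run before anything else touches the backend.
    sycl_device_infos & ggml_sycl_info_sketch(int main_gpu_id = -1) {
        static sycl_device_infos info(main_gpu_id);
        return info;
    }
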
@@ -5159,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
     assert(ggml_sycl_info().device_count>0);
-    for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-        int id = ggml_backend_sycl_get_device_id(i);
+    for (auto & id: ggml_sycl_info().ids) {
         char name[128];
         snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
         ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);