diff --git a/examples/common.cpp b/examples/common.cpp
index fed24e027d8a8..fd164eceb366e 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -304,7 +304,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             params.main_gpu = std::stoi(argv[i]);
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
@@ -314,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -334,7 +334,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             params.low_vram = true;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -414,7 +414,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         exit(1);
     }
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
         fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
         exit(1);
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c0984aadb92ba..8aaf103395faa 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -560,7 +560,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -583,7 +583,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         }
         else if (arg == "--low-vram" || arg == "-lv")
         {
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             params.low_vram = true;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
@@ -594,7 +594,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
             params.main_gpu = std::stoi(argv[i]);
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
diff --git a/ggml.c b/ggml.c
index 7e972ebfb4e0c..ce53c9b96f83d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -161,7 +161,7 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS) | defined(GGML_USE_HIPBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -4116,7 +4116,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
@@ -14875,7 +14875,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
     if (skip_cpu) {
         return;
@@ -16362,7 +16362,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
@@ -18637,7 +18637,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
diff --git a/llama-util.h b/llama-util.h
index 3d5d9e3792a1a..c3c891937da49 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -441,7 +441,7 @@ struct llama_buffer {
     llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
diff --git a/llama.cpp b/llama.cpp
index 27d3d4a0a9a8d..0a177dec4c51a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10,7 +10,7 @@
 #include "llama.h"
 
 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
@@ -175,7 +175,7 @@ struct llama_kv_cache {
             ggml_free(ctx);
         }
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         ggml_cuda_free_data(k);
         ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
@@ -220,7 +220,7 @@ struct llama_model {
             ggml_free(ctx);
         }
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         for (size_t i = 0; i < tensors_by_name.size(); ++i) {
             ggml_cuda_free_data(tensors_by_name[i].second);
         }
@@ -791,7 +791,7 @@ struct llama_model_loader {
                         lmlock->grow_to(lock_size);
                     }
                     break;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
                     ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -911,7 +911,7 @@ static bool kv_cache_init(
     ggml_set_name(cache.v, "cache_v");
 
     (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (n_gpu_layers > n_layer + 1) {
         ggml_cuda_assign_buffers_no_scratch(cache.v);
     }
@@ -1141,7 +1141,7 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@@ -1252,7 +1252,7 @@ static void llama_model_load_internal(
 
     (void) vram_scratch;
     (void) n_batch;
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (low_vram) {
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
@@ -1265,7 +1265,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1305,7 +1305,7 @@ static void llama_model_load_internal(
     }
 
     (void) tensor_split;
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     {
         ggml_cuda_set_tensor_split(tensor_split);
     }
@@ -1425,7 +1425,7 @@ static bool llama_eval_internal(
     offload_func_t offload_func_kq = llama_nop;
     offload_func_t offload_func_v = llama_nop;
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1440,7 +1440,7 @@ static bool llama_eval_internal(
     for (int il = 0; il < n_layer; ++il) {
         offload_func_t offload_func = llama_nop;
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
         if (il >= i_gpu_start) {
             offload_func = ggml_cuda_assign_buffers;
         }
diff --git a/llama.h b/llama.h
index 0de530d456932..14e892ac1c66b 100644
--- a/llama.h
+++ b/llama.h
@@ -2,7 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
 #else
@@ -38,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c
index cb7d5626b34a4..d3772c5faa8a0 100644
--- a/otherarch/ggml_v2.c
+++ b/otherarch/ggml_v2.c
@@ -139,7 +139,7 @@ inline static void* ggml_v2_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
@@ -3894,7 +3894,7 @@ struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) {
             GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
         ggml_v2_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         if(quants_unshuffled)
@@ -9448,7 +9448,7 @@ static void ggml_v2_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9642,7 +9642,7 @@ static void ggml_v2_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -9881,7 +9881,7 @@ static void ggml_v2_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
     if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) {
             ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
@@ -14061,7 +14061,7 @@ void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph *
 
                         size_t cur = 0;
 
-#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
                         if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
@@ -15559,7 +15559,7 @@ int ggml_v2_cpu_has_wasm_simd(void) {
 }
 
 int ggml_v2_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h
index 00aedf8e64ecd..9f65eb0d2c39e 100644
--- a/otherarch/llama_v2-util.h
+++ b/otherarch/llama_v2-util.h
@@ -415,7 +415,7 @@ struct llama_v2_buffer {
     llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
 };
 
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml_v2-cuda.h"
 struct llama_v2_ctx_buffer {
     uint8_t * addr = NULL;
diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp
index 167f3e9c39291..65178897ec191 100644
--- a/otherarch/llama_v2.cpp
+++ b/otherarch/llama_v2.cpp
@@ -9,7 +9,7 @@
 #include "llama_v2.h"
 
 #include "ggml_v2.h"
-#ifdef GGML_USE_CUBLAS
+#if defined GGML_USE_CUBLAS || defined GGML_USE_HIPBLAS
 #include "ggml_v2-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml_v2-opencl.h"
@@ -3088,4 +3088,4 @@ std::vector llama_v2_tokenize(struct llama_v2_context * ctx, const
     res.resize(n);
 
     return res;
-}
\ No newline at end of file
+}
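
The whole patch applies one preprocessor pattern: every cuBLAS-only guard is widened so that a hipBLAS (ROCm) build takes the same GPU code path as a cuBLAS build. Below is a minimal standalone sketch of that pattern, not part of the patched sources; the file name guard_demo.c and the GPU_BACKEND_NAME macro are hypothetical and exist only for illustration.

// guard_demo.c -- hypothetical illustration of the widened guard.
// Build examples:
//   cc guard_demo.c                      (CPU-only path)
//   cc -DGGML_USE_CUBLAS guard_demo.c    (CUDA path)
//   cc -DGGML_USE_HIPBLAS guard_demo.c   (ROCm path, same branch as CUDA)
#include <stdio.h>

#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
#define GPU_BACKEND_NAME "cuBLAS/hipBLAS code path"
#else
#define GPU_BACKEND_NAME "CPU-only code path"
#endif

int main(void) {
    // Prints which branch the preprocessor selected at compile time.
    printf("%s\n", GPU_BACKEND_NAME);
    return 0;
}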