diff --git a/src/gpu/intel/compute/kernel_ctx.cpp b/src/gpu/intel/compute/kernel_ctx.cpp new file mode 100644 index 00000000000..8e3926a03ba --- /dev/null +++ b/src/gpu/intel/compute/kernel_ctx.cpp @@ -0,0 +1,228 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "gpu/intel/compute/kernel_ctx.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "common/bit_cast.hpp" +#include "gpu/intel/gpu_primitive_attr.hpp" +#include "gpu/intel/utils.hpp" + +namespace dnnl { +namespace impl { +namespace gpu { +namespace intel { +namespace compute { + +std::ostream &operator<<(std::ostream &out, const kernel_ctx_t::option_t &opt) { + switch (opt.kind) { + case kernel_ctx_t::option_kind_t::general: out << opt.name; break; + case kernel_ctx_t::option_kind_t::macro_int: + out << "-D" << opt.name << "=" << opt.value.i_value; + if (opt.value.i_value > INT_MAX || opt.value.i_value < INT_MIN) + out << "L"; + break; + case kernel_ctx_t::option_kind_t::macro_float: + out << "-D" << opt.name << "=as_float(0x" << std::hex + << utils::bit_cast(opt.value.f_value) << ")"; + break; + case kernel_ctx_t::option_kind_t::macro_string: + out << "-D" << opt.name << "=" << opt.value.s_value; + break; + default: assert(!"Unknown kind"); + } + return out; +} + +kernel_ctx_t::kernel_ctx_t(const primitive_attr_t *attr) { + set_default_options(attr); + set_default_macros(attr); +} + +std::string kernel_ctx_t::options() const { + std::ostringstream oss; + bool is_first = true; + for (auto &kv : options_) { + if (!is_first) oss << " "; + oss << kv.second; + is_first = false; + } + if (use_int32_offset_) { + oss << " -DUSE_INT32_OFFSET"; + } else { + // TODO: Determine if specialization for buffers between 2GB and 4GB + // is worthwhile + oss << " -cl-intel-greater-than-4GB-buffer-required"; + } + return oss.str(); +} + +void kernel_ctx_t::register_buffer_size(size_t size) { + if (size > INT_MAX) use_int32_offset(false); +} + +void kernel_ctx_t::use_int32_offset(bool value) { + use_int32_offset_ = value; +} + +void kernel_ctx_t::define_int(const char *name, int64_t value) { + add_option(option_t(name, value)); +} + +void kernel_ctx_t::define_int(const std::string &name, int64_t value) { + add_option(option_t(name, value)); +} + +void kernel_ctx_t::define_float(const char *name, float value) { + add_option(option_t(name, value)); +} + +void kernel_ctx_t::add_option(const char *option) { + add_option(std::string(option)); +} + +void kernel_ctx_t::add_option(const std::string &option) { + auto parts = gpu_utils::split(option); + for (auto &p : parts) { + if (p.empty()) continue; + add_option(option_t(p)); + } +} + +void kernel_ctx_t::add_option(const option_t &option) { + auto it = options_.find(option.name); + if (it != options_.end()) { + if (it->second != option) { + std::cout << "Error: option " << option.name + << " is already set to a different value.\n"; + std::cout << " Old option:" << it->second << "\n"; + std::cout << " New option:" << option << "\n"; + abort(); + } + return; + } + options_[option.name] = option; +} + +bool kernel_ctx_t::has_macro(const char *name) const { + return options_.count(name) != 0; +} + +bool kernel_ctx_t::has_macro(const std::string &name) const { + return has_macro(name.c_str()); +} + +void kernel_ctx_t::set_data_type(data_type_t dt) { + switch (dt) { + case data_type::bf16: define_int("DT_BF16", 1); break; + case data_type::f16: define_int("DT_F16", 1); break; + case data_type::f32: define_int("DT_F32", 1); break; + case data_type::f64: define_int("DT_F64", 1); break; + case data_type::s8: define_int("DT_S8", 1); break; + case data_type::u8: define_int("DT_U8", 1); break; + case data_type::f8_e4m3: define_int("DT_HF8", 1); break; + case data_type::f8_e5m2: define_int("DT_BF8", 1); break; + case data_type::f4_e2m1: define_int("DT_F4_E2M1", 1); break; + case data_type::s32: define_int("DT_S32", 1); break; + default: assert(!"unknown data type"); break; + } +} + +std::string kernel_ctx_t::data_type() const { + if (has_macro("DT_F16")) return "f16"; + if (has_macro("DT_F32")) return "f32"; + if (has_macro("DT_F64")) return "f64"; + if (has_macro("DT_S8")) return "s8"; + return ""; +} + +void kernel_ctx_t::add_custom_header( + const std::string &header_name, std::string &&source) { + custom_headers_[header_name] = std::move(source); +} + +const char *kernel_ctx_t::get_custom_header( + const std::string &header_name) const { + auto iter = custom_headers_.find(header_name); + if (iter != custom_headers_.end()) return iter->second.c_str(); + return nullptr; +} + +bool kernel_ctx_t::has_custom_headers() const { + return !custom_headers_.empty(); +} + +void kernel_ctx_t::set_default_options(const primitive_attr_t *attr) { + // By default fp32 division and sqrt are not IEEE-compliant + add_option("-cl-fp32-correctly-rounded-divide-sqrt"); + + if (attr && attr->gpu_attr_) { + auto *gpu_attr = utils::downcast( + attr->gpu_attr_.get()); + if (gpu_attr->threads_per_eu() == 4) { + add_option("-cl-intel-256-GRF-per-thread"); + } + } + + // Set override flag for checking compiler assumptions + if (gpu_utils::dev_getenv("enable_check_assumptions", 0)) { + add_option("-DENABLE_CHECK_ASSUMPTIONS"); + } + + if (gpu_utils::dev_getenv("ocl_debug", 0)) { add_option("-DOCL_DEBUG"); } +} + +void kernel_ctx_t::set_default_macros(const primitive_attr_t *attr) { + if (attr) define_int("DETERMINISTIC", attr->deterministic_); +} + +kernel_ctx_t::option_t::option_t(const std::string &s) { + auto d_pos = s.find("-D"); + bool is_macro = (d_pos != std::string::npos); + if (!is_macro) { + name = s; + return; + } + name = s.substr(d_pos + 2); + auto eq_pos = name.find("="); + bool has_value = (eq_pos != std::string::npos); + if (has_value) { + value = name.substr(eq_pos + 1); + name = name.substr(0, eq_pos); + kind = option_kind_t::macro_string; + } else { + kind = option_kind_t::macro; + } + return; +} + +bool kernel_ctx_t::option_t::operator==(const option_t &other) const { + return (kind == other.kind) && (name == other.name) + && (value == other.value); +} + +} // namespace compute +} // namespace intel +} // namespace gpu +} // namespace impl +} // namespace dnnl diff --git a/src/gpu/intel/compute/kernel_ctx.hpp b/src/gpu/intel/compute/kernel_ctx.hpp index 164ed4f22d3..8d4eaf89327 100644 --- a/src/gpu/intel/compute/kernel_ctx.hpp +++ b/src/gpu/intel/compute/kernel_ctx.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,17 +17,9 @@ #ifndef GPU_INTEL_COMPUTE_KERNEL_CTX_HPP #define GPU_INTEL_COMPUTE_KERNEL_CTX_HPP -#include #include -#include -#include -#include -#include -#include -#include "common/bit_cast.hpp" #include "gpu/intel/gpu_primitive_attr.hpp" -#include "gpu/intel/utils.hpp" namespace dnnl { namespace impl { @@ -37,161 +29,90 @@ namespace compute { class kernel_ctx_t { public: - kernel_ctx_t(const primitive_attr_t *attr = nullptr) { - set_default_options(attr); - set_default_macros(attr); - } - - std::string options() const { - std::ostringstream oss; - for (auto &opt : option_set_) - oss << " " << opt; - - if (use_int32_offset_) { - oss << " -DUSE_INT32_OFFSET"; - } else { - // TODO: Determine if specialization for buffers between 2GB and 4GB - // is worthwhile - oss << " -cl-intel-greater-than-4GB-buffer-required"; + enum class option_kind_t { + general, // + macro, // -D + macro_int, // -D= + macro_float, // -D=as_float( in hex) + macro_string // -D= + }; + + struct option_value_t { + int64_t i_value = 0; + float f_value = 0; + std::string s_value; + + option_value_t() = default; + option_value_t(int64_t value) : i_value(value) {} + option_value_t(float value) : f_value(value) {} + option_value_t(const std::string &value) : s_value(value) {} + bool operator==(const option_value_t &other) const { + return (i_value == other.i_value) && (f_value == other.f_value) + && (s_value == other.s_value); } - - for (auto &int_var : int_var_map_) { - oss << " -D" << int_var.first << "=" << int_var.second; - if (int_var.second > INT_MAX || int_var.second < INT_MIN) - oss << "L"; + bool operator!=(const option_value_t &other) const { + return !operator==(other); } - - for (auto &float_var : float_var_map_) { - oss << " -D" << float_var.first << "=as_float(0x" << std::hex - << utils::bit_cast(float_var.second) << ")"; + }; + + struct option_t { + option_kind_t kind = option_kind_t::general; + std::string name; + option_value_t value; + + option_t() = default; + option_t(const std::string &s); + option_t(const std::string &name, int64_t value) + : kind(option_kind_t::macro_int), name(name), value(value) {} + option_t(const std::string &name, float value) + : kind(option_kind_t::macro_float), name(name), value(value) {} + option_t(const std::string &name, const std::string &value) + : kind(option_kind_t::macro_string), name(name), value(value) {} + bool operator==(const option_t &other) const; + bool operator!=(const option_t &other) const { + return !operator==(other); } - return oss.str(); - } + }; - void register_buffer_size(size_t size) { - if (size > INT_MAX) use_int32_offset(false); - } + kernel_ctx_t(const primitive_attr_t *attr = nullptr); + std::string options() const; + void register_buffer_size(size_t size); // Enable various optimizations when all buffers are < 2GB in size. In this // case, int32_t types can be used for data offsets and avoid int64_t // operations when native 64-bit operations are unsupported. - void use_int32_offset(bool value) { use_int32_offset_ = value; } + void use_int32_offset(bool value); - void define_int(const char *variable, int64_t value) { - int_var_map_.insert({variable, value}); - } - - void define_int(const std::string &variable, int64_t value) { - define_int(variable.c_str(), value); - } + void define_int(const char *variable, int64_t value); + void define_int(const std::string &variable, int64_t value); // TODO: should be removed, any float values should be passed in // kernel parameters - void define_float(const char *variable, float value) { - float_var_map_.insert({variable, value}); - } - - void add_option(const char *option) { option_set_.insert(option); } - void add_option(const std::string &option) { add_option(option.c_str()); } - - bool has_macro(const char *name) const { - std::string opt_start = std::string("-D") + name + "="; - for (auto &opt : option_set_) - if (opt.find(opt_start) != std::string::npos) return true; - - return int_var_map_.count(name) != 0 || float_var_map_.count(name) != 0; - } - bool has_macro(const std::string &name) const { - return has_macro(name.c_str()); - } - - void set_data_type(data_type_t dt) { - switch (dt) { - case data_type::bf16: define_int("DT_BF16", 1); break; - case data_type::f16: define_int("DT_F16", 1); break; - case data_type::f32: define_int("DT_F32", 1); break; - case data_type::f64: define_int("DT_F64", 1); break; - case data_type::s8: define_int("DT_S8", 1); break; - case data_type::u8: define_int("DT_U8", 1); break; - case data_type::f8_e4m3: define_int("DT_HF8", 1); break; - case data_type::f8_e5m2: define_int("DT_BF8", 1); break; - case data_type::f4_e2m1: define_int("DT_F4_E2M1", 1); break; - case data_type::s32: define_int("DT_S32", 1); break; - default: assert(!"unknown data type"); break; - } - } - - template - T get_scalar(const std::string &s) const { - UNUSED(s); - static_assert(!std::is_same::value, "not expected"); - return {}; - } - - std::string data_type() const { - if (int_var_map_.count("DT_F16") != 0) return "f16"; - - if (int_var_map_.count("DT_F32") != 0) return "f32"; - - if (int_var_map_.count("DT_F64") != 0) return "f64"; + void define_float(const char *variable, float value); - if (int_var_map_.count("DT_S8") != 0) return "s8"; + void add_option(const char *option); + void add_option(const std::string &option); + void add_option(const option_t &option); + bool has_macro(const char *name) const; + bool has_macro(const std::string &name) const; - return ""; - } + void set_data_type(data_type_t dt); + std::string data_type() const; void add_custom_header( - const std::string &header_name, std::string &&source) { - custom_headers_[header_name] = std::move(source); - } - - const char *get_custom_header(const std::string &header_name) const { - auto iter = custom_headers_.find(header_name); - if (iter != custom_headers_.end()) return iter->second.c_str(); - return nullptr; - } - - bool has_custom_headers() const { return !custom_headers_.empty(); } + const std::string &header_name, std::string &&source); + const char *get_custom_header(const std::string &header_name) const; + bool has_custom_headers() const; private: - void set_default_options(const primitive_attr_t *attr) { - // By default fp32 division and sqrt are not IEEE-compliant - add_option("-cl-fp32-correctly-rounded-divide-sqrt"); - - if (attr && attr->gpu_attr_) { - auto *gpu_attr = utils::downcast( - attr->gpu_attr_.get()); - if (gpu_attr->threads_per_eu() == 4) { - add_option("-cl-intel-256-GRF-per-thread"); - } - } + void set_default_options(const primitive_attr_t *attr); + void set_default_macros(const primitive_attr_t *attr); - // Set override flag for checking compiler assumptions - if (gpu_utils::dev_getenv("enable_check_assumptions", 0)) { - add_option("-DENABLE_CHECK_ASSUMPTIONS"); - } - - if (gpu_utils::dev_getenv("ocl_debug", 0)) { - add_option("-DOCL_DEBUG"); - } - } - void set_default_macros(const primitive_attr_t *attr) { - if (attr) { define_int("DETERMINISTIC", attr->deterministic_); } - } - - std::map int_var_map_; - std::map float_var_map_; - std::set option_set_; - std::unordered_map custom_headers_; + std::map options_; + std::map custom_headers_; bool use_int32_offset_ = true; }; -template <> -inline int64_t kernel_ctx_t::get_scalar(const std::string &name) const { - assert(int_var_map_.count(name) != 0 && "not expected"); - return int_var_map_.at(name); -} - } // namespace compute } // namespace intel } // namespace gpu