
[cherry-pick][XPU] Xpu 2.11 #9090

Merged · 13 commits · Jun 8, 2022
19 changes: 12 additions & 7 deletions cmake/backends/xpu.cmake
@@ -39,15 +39,21 @@ if (NOT XPU_SDK_ENV)
endif ()
endif ()

-macro (prepare_xpu_sdk sdk)
-set (xpu_${sdk}_url "${XPU_SDK_URL}/${sdk}-${XPU_SDK_ENV}.tar.gz")
-message (STATUS "xpu_${sdk}_url: ${xpu_${sdk}_url}")
+if (NOT XPU_XDNN_URL)
+set (XPU_XDNN_URL "${XPU_SDK_URL}/xdnn-${XPU_SDK_ENV}.tar.gz")
+endif ()
+message (STATUS "XPU_XDNN_URL: ${XPU_XDNN_URL}")
+if (NOT XPU_XRE_URL)
+set (XPU_XRE_URL "${XPU_SDK_URL}/xre-${XPU_SDK_ENV}.tar.gz")
+endif ()
+message (STATUS "XPU_XRE_URL: ${XPU_XRE_URL}")

+macro (prepare_xpu_sdk sdk sdk_url)
ExternalProject_Add (
extern_xpu_${sdk}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
-DOWNLOAD_COMMAND wget --no-check-certificate -c -q ${xpu_${sdk}_url} && tar xf ${sdk}-${XPU_SDK_ENV}.tar.gz
+DOWNLOAD_COMMAND wget --no-check-certificate -c -q ${sdk_url} -O ${sdk}.tar.gz && tar xf ${sdk}.tar.gz
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
@@ -56,7 +62,6 @@ macro (prepare_xpu_sdk sdk)

set (xpu_${sdk}_root "${XPU_INSTALL_DIR}/xpu/${sdk}" CACHE PATH "xpu ${sdk} include directory" FORCE)
set (xpu_${sdk}_include_dir "${xpu_${sdk}_root}/include" CACHE PATH "xpu ${sdk} include directory" FORCE)

include_directories (${xpu_${sdk}_include_dir})

foreach (lib ${ARGN})
@@ -68,8 +73,8 @@ macro (prepare_xpu_sdk sdk)
endmacro ()

if (NOT XPU_SDK_ROOT)
-prepare_xpu_sdk (xdnn xpuapi)
-prepare_xpu_sdk (xre xpurt)
+prepare_xpu_sdk (xdnn ${XPU_XDNN_URL} xpuapi)
+prepare_xpu_sdk (xre ${XPU_XRE_URL} xpurt)
set (xpu_builder_libs xpuapi CACHE INTERNAL "xpu builder libs")
set (xpu_runtime_libs xpurt CACHE INTERNAL "xpu runtime libs")
return ()
10 changes: 10 additions & 0 deletions lite/backends/xpu/target_wrapper.h
@@ -91,6 +91,16 @@ class TargetWrapper<TARGET(kXPU)> {
CHECK(xpu_stream_.get());
}
tls_raw_ctx_.get()->xpu_stream = xpu_stream_.get();
+if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU1) {
+LOG(INFO) << "running in KunLun1";
+} else if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU2) {
+LOG(INFO) << "running in KunLun2";
+} else if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU3) {
+LOG(INFO) << "running in KunLun3";
+} else {
+LOG(FATAL) << "running in unknown XPU device: "
+<< static_cast<int>(tls_raw_ctx_.get()->dev().type());
+}
LOG(INFO) << "thread 0x" << std::hex << std::this_thread::get_id()
<< " set context xpu stream: " << xpu_stream_.get();
if (l3_planner_ == nullptr) {
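The hunk above makes TargetWrapper log which KunLun generation the raw context reports and abort on anything unrecognized. A minimal standalone sketch of that dispatch, using a hypothetical enum in place of xdnn's real kXPU1/kXPU2/kXPU3 constants:

    #include <cstdlib>
    #include <iostream>

    // Hypothetical stand-in for xdnn's device-type enum; only the three
    // KunLun generations named in the diff are treated as valid.
    enum class DevType { kXPU1, kXPU2, kXPU3, kOther };

    const char* KunLunName(DevType t) {
      switch (t) {
        case DevType::kXPU1: return "KunLun1";
        case DevType::kXPU2: return "KunLun2";
        case DevType::kXPU3: return "KunLun3";
        default: return nullptr;
      }
    }

    int main() {
      DevType t = DevType::kXPU2;  // assume the context reports a KunLun2 device
      if (const char* name = KunLunName(t)) {
        std::cout << "running in " << name << "\n";  // mirrors the LOG(INFO) branch
        return EXIT_SUCCESS;
      }
      std::cerr << "running in unknown XPU device\n";  // mirrors the LOG(FATAL) branch
      return EXIT_FAILURE;
    }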
3 changes: 1 addition & 2 deletions lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc
@@ -135,8 +135,7 @@ class RefactorBackwardGRUv1 : public FuseBase {
auto* seq_rev_in_node = graph->NewArgumentNode(seq_rev_in_name);
seq_rev_in_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* seq_rev_in_tensor =
-scope->MutableParent()->NewTensor(seq_rev_in_name);
+auto* seq_rev_in_tensor = scope->NewTensor(seq_rev_in_name);
seq_rev_in_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
seq_rev_in_tensor->set_persistable(true);

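This hunk, and the matching ones in the fuse passes below, swap scope->MutableParent()->NewTensor(name) for scope->NewTensor(name), so the tensor backing a new argument node is created in the current scope instead of its parent. A self-contained sketch of what that difference means, with mock Scope/Tensor types (the real Paddle-Lite Scope has far more machinery):

    #include <map>
    #include <memory>
    #include <string>

    struct Tensor {};  // stand-in for lite::Tensor

    // Mock scope: owns tensors by name and knows its parent.
    class Scope {
     public:
      explicit Scope(Scope* parent = nullptr) : parent_(parent) {}
      Scope* MutableParent() { return parent_ ? parent_ : this; }
      Tensor* NewTensor(const std::string& name) {
        auto& slot = tensors_[name];
        if (!slot) slot.reset(new Tensor);
        return slot.get();
      }

     private:
      Scope* parent_;
      std::map<std::string, std::unique_ptr<Tensor>> tensors_;
    };

    int main() {
      Scope root;
      Scope sub(&root);
      // Before this PR: the tensor landed in the parent (root) scope.
      Tensor* in_parent = sub.MutableParent()->NewTensor("seq_rev_in");
      // After this PR: the tensor lands in the current scope.
      Tensor* in_current = sub.NewTensor("seq_rev_in");
      return in_parent != in_current ? 0 : 1;  // distinct owners, distinct tensors
    }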
@@ -273,8 +273,7 @@ class XPUConv2dAffineChannelFuser : public FuseBase {
auto* max_output_node = graph->NewArgumentNode(max_output_name);
max_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* max_output_tensor =
-scope->MutableParent()->NewTensor(max_output_name);
+auto* max_output_tensor = scope->NewTensor(max_output_name);
max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
max_output_tensor->set_persistable(true);
op_desc.SetOutput("OutputMax", {max_output_name});
3 changes: 1 addition & 2 deletions lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc
@@ -486,8 +486,7 @@ class XPUConv2dFuser : public FuseBase {
auto* max_output_node = graph->NewArgumentNode(max_output_name);
max_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* max_output_tensor =
-scope->MutableParent()->NewTensor(max_output_name);
+auto* max_output_tensor = scope->NewTensor(max_output_name);
max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
max_output_tensor->set_persistable(true);
op_desc.SetOutput("OutputMax", {max_output_name});
3 changes: 1 addition & 2 deletions lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc
@@ -147,8 +147,7 @@ class XPUFcFuser : public FuseBase {
auto* max_output_node = graph->NewArgumentNode(max_output_name);
max_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* max_output_tensor =
-scope->MutableParent()->NewTensor(max_output_name);
+auto* max_output_tensor = scope->NewTensor(max_output_name);
max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
max_output_tensor->set_persistable(true);
op_desc.SetOutput("OutputMax", {max_output_name});
@@ -60,14 +60,18 @@ namespace fusion {

class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase {
public:
+explicit XPUMultiEncoderAdaptiveSeqlenFuser(
+const std::string& matmul_type = "matmul")
+: matmul_type_(matmul_type) {}
+
void BuildPattern() override {
auto* mask = VarNode("mask")
->assert_is_op_input("matmul", "X")
->assert_is_op_input("matmul", "Y");
auto* matmul = OpNode("matmul", "matmul")->AsIntermediate();
->assert_is_op_input(matmul_type_, "X")
->assert_is_op_input(matmul_type_, "Y");
auto* matmul = OpNode("matmul", matmul_type_)->AsIntermediate();
auto* matmul_out = VarNode("matmul_out")
->assert_is_op_input("scale", "X")
->assert_is_op_output("matmul", "Out")
->assert_is_op_output(matmul_type_, "Out")
->AsIntermediate();
auto* scale = OpNode("scale", "scale")->AsIntermediate();
auto* scale_out = VarNode("scale_out")
@@ -115,15 +119,15 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase {
graph->NewArgumentNode(embedding_seq_lod_name);
embedding_seq_lod_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW));
-scope->MutableParent()->NewTensor(embedding_seq_lod_name);
+scope->NewTensor(embedding_seq_lod_name);
// add new arg pad_seq_len
std::string embedding_pad_seq_len_name =
embedding_out_name + "_pad_seq_len";
auto* embedding_pad_seq_len_node =
graph->NewArgumentNode(embedding_pad_seq_len_name);
embedding_pad_seq_len_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW));
-scope->MutableParent()->NewTensor(embedding_pad_seq_len_name);
+scope->NewTensor(embedding_pad_seq_len_name);

embedding_op_desc.SetOutput("SeqLod", {embedding_seq_lod_name});
embedding_op_desc.SetOutput("PadSeqLen", {embedding_pad_seq_len_name});
@@ -140,15 +144,21 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase {
DirectedLink(embedding_seq_lod_node, matched.at("xpu_encoder"));
DirectedLink(embedding_pad_seq_len_node, matched.at("xpu_encoder"));
}

+private:
+std::string matmul_type_;
};

} // namespace fusion

class XPUMultiEncoderAdaptiveSeqlenFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
-fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser;
-fuser(graph.get());
+std::vector<std::string> matmul_types{"matmul", "matmul_v2"};
+for (auto& matmul_type : matmul_types) {
+fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type);
+fuser(graph.get());
+}
}
};

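The fuser is now parameterized on the matmul op type, and the pass instantiates it once for "matmul" and once for "matmul_v2" so one pattern covers both graph variants. A reduced sketch of that run-the-same-fuser-per-op-type pattern (the class below is illustrative, not the real FuseBase interface):

    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative fuser: stores the op type its pattern should be rooted at.
    class AdaptiveSeqlenFuser {
     public:
      explicit AdaptiveSeqlenFuser(const std::string& matmul_type = "matmul")
          : matmul_type_(matmul_type) {}
      void operator()(/* SSAGraph* graph */) {
        std::cout << "match pattern with op type: " << matmul_type_ << "\n";
      }

     private:
      std::string matmul_type_;
    };

    int main() {
      const std::vector<std::string> matmul_types{"matmul", "matmul_v2"};
      for (const auto& type : matmul_types) {
        AdaptiveSeqlenFuser fuser(type);
        fuser();  // in the pass this is fuser(graph.get())
      }
    }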
131 changes: 91 additions & 40 deletions lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
@@ -508,43 +508,15 @@ class XPUSingleEncoderFuser : public FuseBase {
CHECK_EQ(q_mul_y_shape[0], qkv_mul_y_shape[1]);
CHECK_EQ(q_mul_y_shape[1], qkv_mul_y_shape[0]);
CHECK_GT(hidden_dim, 0) << "invalid hidden_dim: " << hidden_dim;
+// mul input_max, output_max (6 pairs) + matmul x_max, y_max, output_max (2 triples)
+std::vector<float> fc_input_max;
+set_quant_info(matched, &fc_input_max);
+// mul & matmul input/output max
+op_desc.SetAttr<std::vector<float>>("fc_input_max", fc_input_max);

if (q_mul_op_info->HasAttr("enable_int8") &&
q_mul_op_info->GetAttr<bool>("enable_int8")) {
op_desc.SetAttr<bool>("enable_int8", true);
-op_desc.SetAttr<std::vector<float>>(
-"X0_max",
-{
-127 *
-matched.at("q_mul")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-127 *
-matched.at("k_mul")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-127 *
-matched.at("v_mul")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-127 *
-matched.at("qkv_mul")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-127 *
-matched.at("qkv_mul_3")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-127 *
-matched.at("qkv_mul_4")
-->stmt()
-->op_info()
-->GetAttr<std::vector<float>>("X0_scale")[0],
-});
op_desc.SetAttr<std::vector<float>>(
"Y0_max",
{
@@ -579,7 +551,7 @@ class XPUSingleEncoderFuser : public FuseBase {
->op_info()
->GetAttr<std::vector<float>>("Y0_scale")[0],
});
VLOG(3) << "q/k/v 127*y0_scale: "
VLOG(3) << "q/k/v weight_max: "
<< 127 *
matched.at("q_mul")
->stmt()
@@ -672,6 +644,82 @@ class XPUSingleEncoderFuser : public FuseBase {
std::string mul_type_;
bool with_q_scale_;
bool norm_before_;
+// quant_info layout: mul input_max, output_max (6 pairs) + matmul x_max, y_max, output_max (2 triples)
+void set_quant_info(const key2nodes_t& matched,
+std::vector<float>* quant_info) {
+const std::vector<std::string> quant_mul_ops = {
+"q_mul", "k_mul", "v_mul", "qkv_mul", "qkv_mul_3", "qkv_mul_4"};
+const std::vector<std::string> mul_add_ops = {
+"q_add", "k_add", "v_add", "qkv_add", "qkv_add_3", "qkv_add_4"};
+const std::vector<std::string> matmul_ops = {"qk_matmul", "qkv_matmul"};
+
+auto* q_mul_op_info = matched.at("q_mul")->stmt()->op_info();
+const bool mul_quant = q_mul_op_info->HasAttr("enable_int8") &&
+q_mul_op_info->GetAttr<bool>("enable_int8");
+auto* qk_matmul_op_info = matched.at("qk_matmul")->stmt()->op_info();
+const bool matmul_quant = qk_matmul_op_info->HasAttr("enable_int8") &&
+qk_matmul_op_info->GetAttr<bool>("enable_int8");
+if (!mul_quant && !matmul_quant) {
+VLOG(3) << "no quantized op";
+return;
+} else {
+VLOG(3) << "mul quantized: " << mul_quant
+<< ", matmul quantized: " << matmul_quant;
+}
+for (int i = 0; mul_quant && (i < quant_mul_ops.size()); ++i) {
+auto& quant_mul = quant_mul_ops[i];
+quant_info->push_back(
+127 *
+matched.at(quant_mul)->stmt()->op_info()->GetAttr<std::vector<float>>(
+"X0_scale")[0]);
+// ew_add out_threshold for output quant
+auto& quant_ew = mul_add_ops[i];
+quant_info->push_back(
+matched.at(quant_ew)->stmt()->op_info()->GetAttr<float>(
+"out_threshold"));
+VLOG(3) << quant_mul << " input_max: " << (*quant_info)[i * 2]
+<< ", output_max(ew_add): " << (*quant_info)[i * 2 + 1];
+}
+CHECK_EQ(quant_info->size(), 12);
+float max_qkv_input = std::max((*quant_info)[0], (*quant_info)[2]);
+max_qkv_input = std::max(max_qkv_input, (*quant_info)[4]);
+(*quant_info)[0] = max_qkv_input;
+(*quant_info)[2] = max_qkv_input;
+(*quant_info)[4] = max_qkv_input;
+float max_qkv_output = std::max((*quant_info)[1], (*quant_info)[3]);
+max_qkv_output = std::max(max_qkv_output, (*quant_info)[5]);
+(*quant_info)[1] = max_qkv_output;
+(*quant_info)[3] = max_qkv_output;
+(*quant_info)[5] = max_qkv_output;
+VLOG(3) << "max_qkv_input: " << max_qkv_input
+<< ", max_qkv_output: " << max_qkv_output;
+
+if (matmul_quant) {
+auto* qkv_matmul_op_info = matched.at("qkv_matmul")->stmt()->op_info();
+CHECK(qkv_matmul_op_info->HasAttr("X0_scale") == true);
+float softmax_out_threshold = matched.at("qk_softmax")
+->stmt()
+->op_info()
+->GetAttr<float>("out_threshold");
+VLOG(3) << "qkv_matmul X max: " << softmax_out_threshold
+<< ", qkv_matmul Out max: " << (*quant_info)[6];
+CHECK_LT(std::abs(softmax_out_threshold -
+qkv_matmul_op_info->GetAttr<std::vector<float>>(
+"X0_scale")[0] *
+127),
+1e-5);
+CHECK(qk_matmul_op_info->HasAttr("X0_scale") == true);
+quant_info->push_back(max_qkv_output);
+quant_info->push_back(max_qkv_output);
+quant_info->push_back(softmax_out_threshold);
+// qkv_matmul X max
+quant_info->push_back(softmax_out_threshold);
+quant_info->push_back(max_qkv_output);
+quant_info->push_back((*quant_info)[6]);
+CHECK_EQ(quant_info->size(), 18);
+}
+}
};

class XPUMultiEncoderFuser {
@@ -745,13 +793,13 @@ class XPUMultiEncoderFuser {
Node* cur_encoder = all_encoders[i];
auto* op_info = cur_encoder->stmt()->op_info();
if (enable_int8) {
-CHECK(
-op_info->HasAttr("enable_int8") && op_info->HasAttr("Y0_max") &&
-op_info->HasAttr("X0_max") /* && op_info->HasAttr("Out0_max")*/);
+CHECK(op_info->HasAttr("enable_int8")) << "no enable_int8 attr";
+CHECK(op_info->HasAttr("Y0_max")) << "no Y0_max attr";
+CHECK(op_info->HasAttr("fc_input_max")) << "no fc_input_max attr";
for (auto y0 : op_info->GetAttr<std::vector<float>>("Y0_max")) {
fc_weight_max.push_back(y0);
}
-for (auto x0 : op_info->GetAttr<std::vector<float>>("X0_max")) {
+for (auto x0 : op_info->GetAttr<std::vector<float>>("fc_input_max")) {
fc_input_max.push_back(x0);
}
}
@@ -802,8 +850,11 @@ class XPUMultiEncoderFuser {
op_desc.SetAttr<bool>("enable_int8", enable_int8);
if (enable_int8) {
CHECK_EQ(fc_precision_, "int8");
-CHECK_EQ(fc_input_max.size(), all_encoders.size() * 6);
CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6);
+CHECK((fc_input_max.size() == all_encoders.size() * 12) ||
+(fc_input_max.size() == all_encoders.size() * 18))
+<< fc_input_max.size()
+<< ", all_encoders.size:" << all_encoders.size();
for (int i = 0; i < fc_weight_max.size(); i += 6) {
CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5)
<< " quanted ernie's q/k weight scale should be euqal: "
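Per the new set_quant_info, fc_input_max carries six (input_max, output_max) pairs for the mul ops (12 floats) plus, when the matmuls are also quantized, six more entries for qk_matmul and qkv_matmul (18 total); entries 0/2/4 and 1/3/5 are forced to a shared q/k/v input max and output max. A small sketch of that unification step on invented scale values:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // Invented values: entries 0/2/4 are q/k/v mul input maxima,
      // entries 1/3/5 the corresponding (ew_add) output maxima.
      std::vector<float> quant_info = {1.0f, 2.0f, 1.5f, 2.5f, 0.5f, 3.0f,
                                       4.0f, 4.5f, 5.0f, 5.5f, 6.0f, 6.5f};
      assert(quant_info.size() == 12);
      const float max_qkv_in = std::max({quant_info[0], quant_info[2], quant_info[4]});
      const float max_qkv_out = std::max({quant_info[1], quant_info[3], quant_info[5]});
      quant_info[0] = quant_info[2] = quant_info[4] = max_qkv_in;   // shared input max
      quant_info[1] = quant_info[3] = quant_info[5] = max_qkv_out;  // shared output max
      return (quant_info[0] == 1.5f && quant_info[1] == 3.0f) ? 0 : 1;
    }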
@@ -184,10 +184,9 @@ class XPUMultiSliceSoftmaxFuser {
"__xpu__multi_softmax_concat_output_" + in_name;
CHECK(graph->RetrieveArgument(concat_output_name) == nullptr);
auto* concat_output_node = graph->NewArgumentNode(concat_output_name);
-concat_output_node->arg()->is_weight = true;
concat_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-scope->MutableParent()->NewTensor(concat_output_name);
+scope->NewTensor(concat_output_name);
op_desc.SetOutput("ConcatOut", {concat_output_name});

auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
@@ -225,8 +225,7 @@ class XPUSqueezeExcitationFuser_DEPREC : public FuseBase {
auto* max_output_node = graph->NewArgumentNode(max_output_name);
max_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* max_output_tensor =
-scope->MutableParent()->NewTensor(max_output_name);
+auto* max_output_tensor = scope->NewTensor(max_output_name);
max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
max_output_tensor->set_persistable(true);
op_desc.SetOutput("OutputMax", {max_output_name});
@@ -537,8 +536,7 @@ class XPUSqueezeExcitationFuser : public FuseBase {
auto* max_output_node = graph->NewArgumentNode(max_output_name);
max_output_node->arg()->type = LiteType::GetTensorTy(
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
-auto* max_output_tensor =
-scope->MutableParent()->NewTensor(max_output_name);
+auto* max_output_tensor = scope->NewTensor(max_output_name);
max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat);
max_output_tensor->set_persistable(true);
op_desc.SetOutput("OutputMax", {max_output_name});
3 changes: 3 additions & 0 deletions lite/kernels/xpu/__xpu__conv2d_compute.cc
@@ -53,6 +53,9 @@ bool QuantFilter<int8_t>(const float* filter_on_host,
template <typename T, PrecisionType PType>
void XPUConv2dCompute<T, PType>::PrepareForRun() {
auto& param = this->template Param<param_t>();
+auto& ctx = this->ctx_->template As<XPUContext>();
+int max_ptr_size = ctx.GetRawContext()->max_ptr_size();
+param.output_max->Resize({max_ptr_size});
auto filter_ptr = param.filter->template data<float>();
auto filter_dims = param.filter->dims();

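PrepareForRun now asks the runtime context for max_ptr_size() and resizes the kernel's output_max buffer to match, rather than assuming one fixed length; presumably the size differs across KunLun generations. A mock of just the resize step (types invented for illustration; ctx.GetRawContext()->max_ptr_size() is the real call in the diff):

    #include <cstdint>
    #include <vector>

    struct MockRawContext {
      int max_ptr_size() const { return 6; }  // invented value
    };

    struct MockTensor {
      std::vector<int64_t> dims;
      void Resize(std::vector<int64_t> d) { dims = std::move(d); }
    };

    int main() {
      MockRawContext ctx;
      MockTensor output_max;
      output_max.Resize({ctx.max_ptr_size()});  // buffer sized from the device
      return (output_max.dims.size() == 1 && output_max.dims[0] == 6) ? 0 : 1;
    }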