From 002b0d746f2c4c1af21ef8871380005864fedf29 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Tue, 13 Aug 2024 19:30:40 +0800
Subject: [PATCH 01/13] [OpenCL] Do not init the OpenCL runtime for an ARM-only
 model when LITE_WITH_OPENCL is enabled

test=develop
---
 lite/backends/opencl/cl_image_converter.cc    | 66 +++++++++----------
 lite/backends/opencl/cl_image_converter.h     |  6 +-
 lite/core/context.h                           |  2 +-
 lite/core/kernel.h                            | 10 ++-
 lite/core/program.h                           | 54 ++++++++++-----
 lite/kernels/opencl/conv_image_compute.cc     |  8 +--
 lite/kernels/opencl/io_copy_buffer_compute.cc |  8 +--
 lite/kernels/opencl/layout_image_compute.cc   | 26 ++++----
 lite/kernels/opencl/matmul_image_compute.cc   |  8 +--
 9 files changed, 108 insertions(+), 80 deletions(-)

diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc
index 5ffb0135b6a..4fd00f5f8a6 100644
--- a/lite/backends/opencl/cl_image_converter.cc
+++ b/lite/backends/opencl/cl_image_converter.cc
@@ -71,13 +71,13 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
           if (c < C) {
             // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
             // (c % 4);
-            fp16_support_ ? image_fp16[i2] = Float2Half(*p) : image_fp32[i2] =
-                                                                  *p;
+            fp16_support() ? image_fp16[i2] = Float2Half(*p) : image_fp32[i2] =
+                                                                   *p;
             i2 += 4;
             p++;
           } else {
-            fp16_support_ ? image_fp16[i2] = Float2Half(0.f) : image_fp32[i2] =
-                                                                   0.f;
+            fp16_support() ? image_fp16[i2] = Float2Half(0.f) : image_fp32[i2] =
+                                                                    0.f;
             i2 += 4;
           }
         }
@@ -115,7 +115,7 @@ void CLImageConverterDefault::ImageToNCHW(void *image,
       for (size_t h = 0; h < H; h++) {
         size_t i2 = (i1 << 2) + c % 4;
         for (size_t w = 0; w < W; w++) {
-          *p = fp16_support_ ? Half2Float(image_fp16[i2]) : image_fp32[i2];
+          *p = fp16_support() ? Half2Float(image_fp16[i2]) : image_fp32[i2];
           i2 += 4;
           p++;
         }
@@ -196,7 +196,7 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
     for (size_t h = 0; h < tdim[0]; h++) {
       for (size_t w = 0; w < width * 4; w++) {
         if (w < tdim[1]) {
-          if (fp16_support_) {
+          if (fp16_support()) {
             image_fp16[(h * width + w / 4) * 4 + (w % 4)] =
                 Float2Half(tensor[h * tdim[1] + w]);
           } else {
@@ -204,7 +204,7 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
                 tensor[h * tdim[1] + w];
           }
         } else {
-          if (fp16_support_) {
+          if (fp16_support()) {
             image_fp16[(h * width + w / 4) * 4 + (w % 4)] = Float2Half(0.f);
           } else {
             image_fp32[(h * width + w / 4) * 4 + (w % 4)] = 0.f;
@@ -241,7 +241,7 @@ void CLImageConverterFolder::ImageToNCHW(void *image,
     for (size_t h = 0; h < H; h++) {
       for (size_t w = 0; w < W; w++) {
         p[h * W + w] =
-            fp16_support_
+            fp16_support()
                 ? Half2Float(image_fp16[(h * width + w / 4) * 4 + (w % 4)])
                 : image_fp32[(h * width + w / 4) * 4 + (w % 4)];
       }
@@ -286,14 +286,14 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
           size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                          w * 4 + n % 4;
           if (n < N) {
-            if (fp16_support_) {
+            if (fp16_support()) {
               image_fp16[index] = Float2Half(*p);
             } else {
               image_fp32[index] = *p;
             }
             p++;
           } else {
-            if (fp16_support_) {
+            if (fp16_support()) {
               image_fp16[index] = Float2Half(0.f);
             } else {
               image_fp32[index] = 0.f;
@@ -330,8 +330,8 @@ void CLImageConverterNWBlock::ImageToNCHW(void *image,
         for (size_t w = 0; w < W; ++w) {
           size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                          w * 4 + n % 4;
-          *p =
-              fp16_support_ ? Half2Float(image_fp16[index]) : image_fp32[index];
+          *p = fp16_support() ? Half2Float(image_fp16[index])
+                              : image_fp32[index];
           p++;
           if (index >= (width * height * 4)) {
             LOG(INFO) << " index out of range ";
@@ -393,7 +393,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
           if (c < C) {
             // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
             // (c % 4);
-            if (fp16_support_) {
+            if (fp16_support()) {
               image_fp16[i2] = Float2Half(*p);
             } else {
               image_fp32[i2] = *p;
@@ -401,7 +401,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
             i2 += 4;
             p++;
           } else {
-            if (fp16_support_) {
+            if (fp16_support()) {
               image_fp16[i2] = Float2Half(0.f);
             } else {
               image_fp32[i2] = 0.f;
@@ -437,7 +437,7 @@ void CLImageConverterDWBlock::ImageToNCHW(void *image,
       for (size_t h = 0; h < H; h++) {
         size_t i2 = (i1 << 2) + c % 4;
         for (size_t w = 0; w < W; w++) {
-          *p = fp16_support_ ? Half2Float(image_fp16[i2]) : image_fp32[i2];
+          *p = fp16_support() ? Half2Float(image_fp16[i2]) : image_fp32[i2];
           i2 += 4;
           p++;
         }
@@ -540,7 +540,7 @@ void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
   float *image_fp32 = static_cast<float *>(image);
   half_t *image_fp16 = static_cast<half_t *>(image);
   // auto weight_dest_data = static_cast<half_t *>(image);
-  if (fp16_support_) {
+  if (fp16_support()) {
     memset(image_fp16, 0, num_count * sizeof(half_t));
   } else {
     memset(image_fp32, 0, num_count * sizeof(float));
@@ -573,7 +573,7 @@ void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
       auto dstSz_fp16 = dstOz_fp16 + szC4 * 16 + unitCo * my;
       auto dstSz_fp32 = dstOz_fp32 + szC4 * 16 + unitCo * my;
       for (int i = 0; i < 16; ++i) {
-        if (fp16_support_) {
+        if (fp16_support()) {
           *(dstSz_fp16 + i * ((co + 3) / 4) * ((ci + 3) / 4) * 4 * 4) =
               Float2Half(K_Transform.data()[i]);
         } else {
@@ -648,12 +648,12 @@ void CLImageConverterNBlock::NCHWToImage(float *nchw,
           size_t img_idx =
               (((n / 4) * W * H + h * W + w) * c_block4 + c) * 4 + n % 4;
           if (n < N && c < C) {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(*p)
-                          : image_fp32[img_idx] = *p;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(*p)
+                           : image_fp32[img_idx] = *p;
             p++;
           } else {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(0.f)
-                          : image_fp32[img_idx] = 0.f;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(0.f)
+                           : image_fp32[img_idx] = 0.f;
           }
         }
       }
@@ -697,12 +697,12 @@ void CLImageConverterNBlockGroup::NCHWToImage(float *nchw,
               (((n / 4) * W * H + h * W + w) * c_block4 + c) * 4 + n % 4;
           size_t remain = n % ((N / groups + 3) / 4 * 4);
           if (remain < (N / groups) && c < C) {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(*p)
-                          : image_fp32[img_idx] = *p;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(*p)
+                           : image_fp32[img_idx] = *p;
             p++;
           } else {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(0.f)
-                          : image_fp32[img_idx] = 0.f;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(0.f)
+                           : image_fp32[img_idx] = 0.f;
           }
         }
       }
@@ -760,12 +760,12 @@ void CLImageConverterN2Block::NCHWToImage(float *nchw,
                            (c / 4) * 32 + ((n % 8) / 4) * 16 + (c % 4) * 4 +
                            (n % 8) % 4;
           if (n < N && c < C) {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(*p)
-                          : image_fp32[img_idx] = *p;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(*p)
+                           : image_fp32[img_idx] = *p;
             p++;
           } else {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(0.f)
-                          : image_fp32[img_idx] = 0.f;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(0.f)
+                           : image_fp32[img_idx] = 0.f;
           }
         }
       }
@@ -819,12 +819,12 @@ void CLImageConverterDWFilter::NCHWToImage(float *nchw,
         for (size_t w = 0; w < W; w++) {
           size_t img_idx = (((n / 4) * W * H + h * W + w) * C + c) * 4 + n % 4;
           if (n < N) {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(*p)
-                          : image_fp32[img_idx] = *p;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(*p)
+                           : image_fp32[img_idx] = *p;
             p++;
           } else {
-            fp16_support_ ? image_fp16[img_idx] = Float2Half(0.f)
-                          : image_fp32[img_idx] = 0.f;
+            fp16_support() ? image_fp16[img_idx] = Float2Half(0.f)
+                           : image_fp32[img_idx] = 0.f;
           }
         }
       }
diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h
index af04e9a4936..61df71c89aa 100644
--- a/lite/backends/opencl/cl_image_converter.h
+++ b/lite/backends/opencl/cl_image_converter.h
@@ -36,8 +36,10 @@ class CLImageConverterBase {
                            const DDim &tensor_dim) = 0;
   virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
 
-  bool fp16_support_{paddle::lite::CLRuntime::Global()->get_precision() ==
-                     lite_api::CL_PRECISION_FP16};
+  static bool fp16_support() {  // evaluated on each call, not cached
+    return paddle::lite::CLRuntime::Global()->get_precision() ==
+           lite_api::CL_PRECISION_FP16;
+  }
 };
 
 class CLImageConverterDefault : public CLImageConverterBase {
diff --git a/lite/core/context.h b/lite/core/context.h
index 4474aae2cb9..5f96f4b41e7 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -369,7 +369,7 @@ class Context<TargetType::kOpenCL> {
   CLContext* cl_context() { return cl_context_.get(); }
 
   void InitOnce() {
-    if (CLRuntime::Global()->IsInitSuccess() == false) {
+    if (!CLRuntime::Global()->IsInitSuccess()) {
       // gpu is not support , can use cpu instead . do not fatal..
       LOG(ERROR) << "OpenCL runtime init failed";
     }
diff --git a/lite/core/kernel.h b/lite/core/kernel.h
index abbf3fe731d..ee82a1f4d0a 100644
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -215,8 +215,14 @@ class KernelBase {
   cl::Event event_;
   cl::Event event_1;
   cl::Event event_2;
-  bool fp16_support_{paddle::lite::CLRuntime::Global()->get_precision() ==
-                     lite_api::CL_PRECISION_FP16};
+
+  static bool fp16_support() {
+    // Queried on every call instead of cached in a member. No logging here:
+    // kernels call this inside per-element packing loops.
+    return paddle::lite::CLRuntime::Global()->get_precision() ==
+           lite_api::CL_PRECISION_FP16;
+  }
+
 #endif
 };
 
diff --git a/lite/core/program.h b/lite/core/program.h
index 7f828d2834c..61a5fbd5f00 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -201,9 +201,12 @@ class LITE_API RuntimeProgram {
   bool use_precision_low_ = false;
   ~RuntimeProgram() {
 #ifdef LITE_WITH_OPENCL
-    // save program kernel cache & tuned params
-    CLRuntime::Global()->SaveProgram();
-    CLRuntime::Global()->SaveTuned();
+    // when has opencl kernel try to save params
+    if (has_opencl_kernel_) {
+      // save program kernel cache & tuned params
+      CLRuntime::Global()->SaveProgram();
+      CLRuntime::Global()->SaveTuned();
+    }
 #endif  // LITE_WITH_OPENCL
 #ifdef LITE_WITH_PROFILE
     // exclude data of first epoch
@@ -222,30 +225,42 @@ class LITE_API RuntimeProgram {
     set_profiler();
 #endif
 
-#ifdef LITE_WITH_OPENCL
-    bool opencl_valid = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&
-                        paddle::lite::CLWrapper::Global()->DlsymSuccess() &&
-                        CLRuntime::Global()->OpenCLAvaliableForDevice();
-    using OpenCLContext = Context<TargetType::kOpenCL>;
-    std::unique_ptr<KernelContext> unique_opencl_ctx(new KernelContext());
-    if (opencl_valid) {
-      unique_opencl_ctx->As<OpenCLContext>().InitOnce();
-    }
-#endif
-
     for (auto& inst : instructions_[kRootBlockIdx]) {
       KernelBase* kernel = inst.mutable_kernel();
       if (kernel->target() == TARGET(kOpenCL)) {
 #if defined(LITE_WITH_OPENCL)
-        if (opencl_valid) {
+        // mark has kernel that is opencl
+        has_opencl_kernel_ = true;
+        // init opencl runtime when first find opencl kernel.
+        // when unique_opencl_ctx_ not init. init it
+        if (!unique_opencl_ctx_) {
+          // check opencl env valid.
+          opencl_valid_ = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&
+                          paddle::lite::CLWrapper::Global()->DlsymSuccess() &&
+                          CLRuntime::Global()->OpenCLAvaliableForDevice();
+          // check opencl env valid.
+          if (opencl_valid_) {
+            // init opencl context
+            std::unique_ptr<KernelContext> unique_opencl_ctx(
+                new KernelContext());
+            unique_opencl_ctx_ = std::move(unique_opencl_ctx);
+            (*unique_opencl_ctx_).As<OpenCLContext>().InitOnce();
+          } else {
+            LOG(FATAL) << "check opencl env failed. opencl_valid:"
+                       << opencl_valid_;
+          }
+        }
+
+        // check valid and copy shared context to kernel.
+        if (opencl_valid_) {
           std::unique_ptr<KernelContext> ctx(new KernelContext());
-          (*unique_opencl_ctx)
+          (*unique_opencl_ctx_)
               .As<OpenCLContext>()
               .CopySharedTo(&ctx->As<OpenCLContext>());
           kernel->SetContext(std::move(ctx));
         } else {
           // if gpu not support , fatal when user init gpu model.
-          LOG(FATAL) << "opencl_valid:" << opencl_valid;
+          LOG(FATAL) << "opencl_valid:" << opencl_valid_;
         }
 #endif
       } else if (kernel->target() == TARGET(kMetal)) {
@@ -312,6 +327,11 @@ class LITE_API RuntimeProgram {
   Scope* exec_scope_{};
   int64_t version_{0};
 
+#ifdef LITE_WITH_OPENCL
+  bool opencl_valid_{false};
+  bool has_opencl_kernel_{false};
+  std::unique_ptr<KernelContext> unique_opencl_ctx_;
+#endif
 #ifdef LITE_WITH_METAL
   std::unique_ptr<KernelContext> metal_ctx_{nullptr};
 #endif
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index f94a92bfa88..2538363d681 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -1684,14 +1684,14 @@ void ConvImageCompute::NCHW2IMG4(
       size_t img_idx = (c / 4) * oc_block * 4 + n;
       if (n < oc && c < ic) {
         if ((c % 4) == index) {
-          fp16_support_ ? dst_fp16[img_idx] = Float2Half(*p)
-                        : dst_fp32[img_idx] = *p;
+          fp16_support() ? dst_fp16[img_idx] = Float2Half(*p)
+                         : dst_fp32[img_idx] = *p;
         }
         p++;
       } else {
         if ((c % 4) == index) {
-          fp16_support_ ? dst_fp16[img_idx] = Float2Half(0.f)
-                        : dst_fp32[img_idx] = 0.f;
+          fp16_support() ? dst_fp16[img_idx] = Float2Half(0.f)
+                         : dst_fp32[img_idx] = 0.f;
         }
       }
     }
diff --git a/lite/kernels/opencl/io_copy_buffer_compute.cc b/lite/kernels/opencl/io_copy_buffer_compute.cc
index 8a30c433306..9c725211bff 100644
--- a/lite/kernels/opencl/io_copy_buffer_compute.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute.cc
@@ -90,7 +90,7 @@ class IoCopyHostToOpenCLCompute
 #endif
   void PrepareForRun() override {
     auto& param = Param<param_t>();
-    if (fp16_support_ && param.process_type != 2) {
+    if (fp16_support() && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -114,7 +114,7 @@ class IoCopyHostToOpenCLCompute
     VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
     VLOG(2) << "param.y->dims():" << param.y->dims();
 #endif
-    if (fp16_support_ && param.x->precision() == PRECISION(kFloat) &&
+    if (fp16_support() && param.x->precision() == PRECISION(kFloat) &&
         param.process_type != 2) {
       std::unique_ptr<Tensor> precision_cast_t =
           std::unique_ptr<Tensor>(new Tensor);
@@ -205,7 +205,7 @@ class IoCopykOpenCLToHostCompute
 #endif
   void PrepareForRun() override {
     auto& param = Param<param_t>();
-    if (fp16_support_ && param.process_type != 2) {
+    if (fp16_support() && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -243,7 +243,7 @@ class IoCopykOpenCLToHostCompute
     VLOG(4) << "param.process_type:" << param.process_type;
     VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
 #endif
-    if (fp16_support_ && param.x->precision() != PRECISION(kInt64) &&
+    if (fp16_support() && param.x->precision() != PRECISION(kInt64) &&
         param.x->precision() != PRECISION(kInt32) && param.process_type != 2) {
       mem_size = param.x->dims().production() * sizeof(float);
       std::unique_ptr<Tensor> precision_cast_t =
diff --git a/lite/kernels/opencl/layout_image_compute.cc b/lite/kernels/opencl/layout_image_compute.cc
index ecdd61c4586..94e264f4288 100644
--- a/lite/kernels/opencl/layout_image_compute.cc
+++ b/lite/kernels/opencl/layout_image_compute.cc
@@ -46,7 +46,7 @@ class LayoutComputeBufferChwToImageDefault
       kernel_func_name_ = "buffer_to_image2d_with_pre255";
     }
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    if (param.process_type != 2 && fp16_support_) {
+    if (param.process_type != 2 && fp16_support()) {
       build_options_ += " -DMUTABLE_TYPE=half ";
     } else {
       build_options_ += " -DMUTABLE_TYPE=float ";
@@ -79,7 +79,7 @@ class LayoutComputeBufferChwToImageDefault
     auto* y_data = MUTABLE_DATA_GPU(
         param.y, image_shape["width"], image_shape["height"], nullptr);
     auto y_dims = param.y->dims();
-    if (fp16_support_)
+    if (fp16_support())
       param.y->set_precision(PRECISION(kFP16));
     else
       param.y->set_precision(PRECISION(kFloat));
@@ -189,7 +189,7 @@ class LayoutComputeImageDefaultToBufferChw
     if (param.process_type == 1) {
       kernel_func_name_ = "image2d_to_buffer_with_post255";
     }
-    if (param.process_type != 2 && fp16_support_) {
+    if (param.process_type != 2 && fp16_support()) {
       build_options_ += " -DMUTABLE_TYPE=half ";
     } else {
       build_options_ += " -DMUTABLE_TYPE=float ";
@@ -217,10 +217,10 @@ class LayoutComputeImageDefaultToBufferChw
       y_data = param.y->mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
       param.y->set_precision(PRECISION(kInt8));
     } else {
-      y_data = (fp16_support_ && param.process_type != 2)
+      y_data = (fp16_support() && param.process_type != 2)
                    ? param.y->mutable_data<half_t, cl::Buffer>(TARGET(kOpenCL))
                    : param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-      if (fp16_support_ && param.process_type != 2)
+      if (fp16_support() && param.process_type != 2)
         param.y->set_precision(PRECISION(kFP16));
       else
         param.y->set_precision(PRECISION(kFloat));
@@ -328,7 +328,7 @@ class LayoutComputeBufferChwToImage2DNw
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
     build_options_ +=
-        fp16_support_ ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
+        fp16_support() ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
     context.cl_context()->AddKernel(kernel_func_name_,
                                     "buffer/layout_kernel.cl",
                                     build_options_,
@@ -445,7 +445,7 @@ class LayoutComputeImageDefaultToImageFolder
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
     auto& context = ctx_->As<OpenCLContext>();
     build_options_ +=
-        fp16_support_ ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
+        fp16_support() ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
     context.cl_context()->AddKernel(kernel_func_name_,
                                     "image/layout_kernel.cl",
                                     build_options_,
@@ -541,7 +541,7 @@ class LayoutComputeImageFolderToImageDefault
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
     auto& context = ctx_->As<OpenCLContext>();
     build_options_ +=
-        fp16_support_ ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
+        fp16_support() ? " -DMUTABLE_TYPE=half " : " -DMUTABLE_TYPE=float ";
     context.cl_context()->AddKernel(kernel_func_name_,
                                     "image/layout_kernel.cl",
                                     build_options_,
@@ -637,7 +637,7 @@ class LayoutComputeImageFolderToBufferChw
     if (x_dims.size() > 2) {
       kernel_func_name_ = "image2d_to_buffer";
     }
-    if (param.process_type != 2 && fp16_support_) {
+    if (param.process_type != 2 && fp16_support()) {
       build_options_ += " -DMUTABLE_TYPE=half ";
     } else {
       build_options_ += " -DMUTABLE_TYPE=float ";
@@ -672,11 +672,11 @@ class LayoutComputeImageFolderToBufferChw
       x_image_shape = folder_converter.InitImageDimInfoWith(x_dims);
     }
     auto* y_data =
-        (fp16_support_ && param.process_type != 2)
+        (fp16_support() && param.process_type != 2)
             ? param.y->mutable_data<half_t, cl::Buffer>(TARGET(kOpenCL))
             : param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
     auto* x_data = GET_DATA_GPU(param.x);
-    if (fp16_support_ && param.process_type != 2)
+    if (fp16_support() && param.process_type != 2)
       param.y->set_precision(PRECISION(kFP16));
     else
       param.y->set_precision(PRECISION(kFloat));
@@ -787,7 +787,7 @@ class LayoutComputeBufferChwToImageFolder
     if (x_dims.size() > 2) {
       kernel_func_name_ = "buffer_to_image2d";
     }
-    if (param.process_type != 2 && fp16_support_) {
+    if (param.process_type != 2 && fp16_support()) {
       build_options_ += " -DMUTABLE_TYPE=half ";
     } else {
       build_options_ += " -DMUTABLE_TYPE=float ";
@@ -815,7 +815,7 @@ class LayoutComputeBufferChwToImageFolder
     auto* y_data =
         MUTABLE_DATA_GPU(param.y, image_shape[0], image_shape[1], nullptr);
     auto* x_data = GET_BUFFER_GPU(param.x);
-    if (fp16_support_)
+    if (fp16_support())
       param.y->set_precision(PRECISION(kFP16));
     else
       param.y->set_precision(PRECISION(kFloat));
diff --git a/lite/kernels/opencl/matmul_image_compute.cc b/lite/kernels/opencl/matmul_image_compute.cc
index a7d0d1c7968..ae50b9cc7c4 100644
--- a/lite/kernels/opencl/matmul_image_compute.cc
+++ b/lite/kernels/opencl/matmul_image_compute.cc
@@ -601,12 +601,12 @@ class MatMulV2ImageCompute : public KernelLite<TARGET(kOpenCL),
           size_t i2 = (i1 << 2) + c % 4;
           for (size_t w = 0; w < W; w++) {
             if (c < C) {
-              fp16_support_ ? image_fp16[i2] = Float2Half(nchw[index++])
-                            : image_fp32[i2] = nchw[index++];
+              fp16_support() ? image_fp16[i2] = Float2Half(nchw[index++])
+                             : image_fp32[i2] = nchw[index++];
               i2 += 4;
             } else {
-              fp16_support_ ? image_fp16[i2] = Float2Half(0.f)
-                            : image_fp32[i2] = 0.f;
+              fp16_support() ? image_fp16[i2] = Float2Half(0.f)
+                             : image_fp32[i2] = 0.f;
               i2 += 4;
             }
           }

From 0b231af012d04e4b68701caf029de30acfc9f68c Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Thu, 15 Aug 2024 09:43:56 +0800
Subject: [PATCH 02/13] [OpenCL] Fix OpenCL init bugs and reduce memory use
 for non-OpenCL models. 1. Add a way for the user to disable OpenCL, to
 reduce memory. 2. Create the OpenCL runtime and context on first use instead
 of at static-load time. 3. For an ARM model, do not create the OpenCL runtime
 or context. 4. Create OpenCL when an OpenCL-target kernel is found.
 test=develop

---
 lite/api/paddle_api.cc                  |  29 ++-----
 lite/backends/opencl/cl_context.h       |   3 +
 lite/backends/opencl/cl_global_config.h | 105 ++++++++++++++++++++++++
 lite/backends/opencl/cl_runtime.cc      |   7 +-
 lite/backends/opencl/cl_runtime.h       |   8 +-
 lite/core/context.h                     |  28 +++++++
 lite/core/program.h                     |  14 +++-
 7 files changed, 160 insertions(+), 34 deletions(-)
 create mode 100644 lite/backends/opencl/cl_global_config.h

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 3c0402152ca..b49e93df266 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -29,6 +29,7 @@
 #endif
 
 #ifdef LITE_WITH_OPENCL
+#include "lite/backends/opencl/cl_global_config.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
 
@@ -43,35 +44,19 @@ bool IsOpenCLBackendValid(bool check_fp16_valid) {
 #ifdef LITE_WITH_LOG
   LOG(INFO) << "need to check fp16 valid:" << check_fp16_valid;
 #endif
-  bool opencl_valid = false;
-
 #ifdef LITE_WITH_OPENCL
-  bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
-#ifdef LITE_WITH_LOG
-  LOG(INFO) << "Found opencl library:" << opencl_lib_found;
-#endif
-  if (opencl_lib_found == false) return false;
-
-  bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
-#ifdef LITE_WITH_LOG
-  LOG(INFO) << "dlsym_success:" << dlsym_success;
-#endif
-  if (dlsym_success == false) return false;
-  opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(
+  return paddle::lite::ClGlobalDelegate::Global().IsOpenCLBackendValid(
       check_fp16_valid);
-
-#ifdef LITE_WITH_LOG
-  LOG(INFO) << "opencl_valid:" << opencl_valid;
-#endif
 #endif
-  return opencl_valid;
+  return false;
 }
 
 int GetOpenCLDeviceType() {
+#ifdef LITE_WITH_LOG
+  LOG(INFO) << "GetOpenCLDeviceType";
+#endif
 #ifdef LITE_WITH_OPENCL
-  if (IsOpenCLBackendValid()) {
-    return paddle::lite::CLRuntime::Global()->GetGpuType();
-  }
+  return paddle::lite::ClGlobalDelegate::Global().GetOpenCLDeviceType();
 #endif
   return -1;
 }
diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h
index a19ea4b46eb..5c6b3b4ad5a 100644
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -28,6 +28,9 @@ namespace lite {
 class CLContext {
  public:
   ~CLContext() {
+#ifdef LITE_WITH_LOG
+    VLOG(4) << "CLContext destructor";
+#endif
     GetCommandQueue().finish();
     for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
       // Note(ysh329): Don't need `clReleaseKernel`
diff --git a/lite/backends/opencl/cl_global_config.h b/lite/backends/opencl/cl_global_config.h
new file mode 100644
index 00000000000..edcf665ba60
--- /dev/null
+++ b/lite/backends/opencl/cl_global_config.h
@@ -0,0 +1,105 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace lite {
+/**
+ * When LITE_WITH_OPENCL is enabled, Paddle-Lite interacts with the OpenCL
+ * environment directly through CLRuntime and CLWrapper; CLContext is part of
+ * a Kernel and carries the clKernel built for each runtime Kernel. Using
+ * paddle_api or a program initializes the OpenCL environment even when the
+ * model is not an OpenCL model, which wastes memory, especially where there is
+ * a clear intention to avoid initializing OpenCL. This proxy isolates it: a
+ * singleton soft switch (enabled by default) plus validity queries that return
+ * before touching CLWrapper/CLRuntime when the switch is off. The framework is
+ * meant to go through this proxy uniformly.
+ *
+ * NOTE(review): this header uses LOG, CLWrapper and CLRuntime but includes no
+ * headers itself, so it depends on the includer's include order -- confirm.
+ */
+class ClGlobalDelegate {
+ public:
+  static ClGlobalDelegate& Global() {
+    static ClGlobalDelegate x;
+    return x;
+  }
+  /**
+   * @brief Set the soft OpenCL switch and log the new state.
+   * @param use_opencl true to enable OpenCL use, false to disable it.
+   */
+  void SetUseOpenCL(bool use_opencl) {
+    use_opencl_ = use_opencl;
+    LOG(INFO) << "set opencl softly : opencl "
+              << (use_opencl_ ? "enable" : "disable");
+  }
+  /**
+   * @brief Read the soft OpenCL switch.
+   * @return The last value passed to SetUseOpenCL(), or true if never set.
+   */
+  bool UseOpenCL() const { return use_opencl_; }
+
+  /**
+   * @brief Check the switch, then library load, symbol lookup and the device.
+   * @param check_fp16_valid Forwarded to CLRuntime::OpenCLAvaliableForDevice.
+   * @return false if switched off or a check fails; else the device result.
+   */
+  bool IsOpenCLBackendValid(bool check_fp16_valid) const {
+    LOG(INFO) << "delegete opencl valid check";
+    if (!UseOpenCL()) {
+      LOG(INFO) << "opencl disable due to softly close";
+      return false;
+    }
+    bool opencl_valid = false;
+
+#ifdef LITE_WITH_OPENCL
+    bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
+#ifdef LITE_WITH_LOG
+    LOG(INFO) << "Found opencl library:" << opencl_lib_found;
+#endif
+    if (!opencl_lib_found) return false;
+
+    bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
+#ifdef LITE_WITH_LOG
+    LOG(INFO) << "dlsym_success:" << dlsym_success;
+#endif
+    if (!dlsym_success) return false;
+    opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(
+        check_fp16_valid);
+#ifdef LITE_WITH_LOG
+    LOG(INFO) << "opencl_valid:" << opencl_valid;
+#endif
+#endif
+    return opencl_valid;
+  }
+
+  /**
+   * @brief Query the GPU type via CLRuntime::GetGpuType().
+   * @return The GPU type, or -1 when IsOpenCLBackendValid(false) is false.
+   */
+  int GetOpenCLDeviceType() const {
+    if (this->IsOpenCLBackendValid(false)) {
+      return paddle::lite::CLRuntime::Global()->GetGpuType();
+    }
+    return -1;
+  }
+
+ private:
+  ClGlobalDelegate() = default;
+  // Soft switch; defaults to true, i.e. the old behavior, if never set.
+  bool use_opencl_{true};
+};
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index b0cc212b765..24f5adbda1f 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -25,12 +25,9 @@ limitations under the License. */
 namespace paddle {
 namespace lite {
 
-CLRuntime CLRuntime::instance_;
-
 CLRuntime* CLRuntime::Global() {
-  static std::once_flag init_flag;
-  std::call_once(init_flag, []() { instance_.Init(); });
-  return &instance_;
+  static CLRuntime instance;
+  return &instance;
 }
 
 void CLRuntime::Flush(const int index) {
diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h
index f9734c215d4..d40311299f5 100644
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
@@ -221,14 +221,14 @@ class CLRuntime {
   void SetTunedLocalWorkSizeMap(const std::string& key,
                                 const std::vector<int>& tune_vct);
 
- private:
-  static CLRuntime instance_;
-
-  CLRuntime() { Init(); }
   CLRuntime(const CLRuntime&) = delete;
   CLRuntime(const CLRuntime&&) = delete;
   CLRuntime& operator=(const CLRuntime&) = delete;
   CLRuntime& operator=(const CLRuntime&&) = delete;
+
+ private:
+  CLRuntime() { Init(); }
+
   ~CLRuntime();
 
   bool InitializePlatform();
diff --git a/lite/core/context.h b/lite/core/context.h
index 5f96f4b41e7..bf62c0ab826 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -21,6 +21,7 @@
 #endif
 #ifdef LITE_WITH_OPENCL
 #include "lite/backends/opencl/cl_context.h"
+#include "lite/backends/opencl/cl_global_config.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
 #ifdef LITE_WITH_XPU
@@ -369,6 +370,23 @@ class Context<TargetType::kOpenCL> {
   CLContext* cl_context() { return cl_context_.get(); }
 
   void InitOnce() {
+#ifdef LITE_WITH_LOG
+    VLOG(4) << "OpenCL InitOnce ";
+#endif
+    AttemptCreateClContext();
+  }
+
+  void AttemptCreateClContext() {
+    if (!ClGlobalDelegate::Global().UseOpenCL()) {
+      LOG(INFO) << "force close opencl,  so return";
+      return;
+    }
+
+    if (cl_context_ != nullptr) {
+      LOG(INFO) << "cl_context_ already created";
+      return;
+    }
+
     if (!CLRuntime::Global()->IsInitSuccess()) {
       // gpu is not support , can use cpu instead . do not fatal..
       LOG(ERROR) << "OpenCL runtime init failed";
@@ -377,6 +395,10 @@ class Context<TargetType::kOpenCL> {
   }
 
   void CopySharedTo(OpenCLContext* ctx) {
+    if (ctx && (cl_context_ == nullptr)) {
+      LOG(INFO) << "cl_context_ == nullptr, attem to create it";
+      AttemptCreateClContext();
+    }
     if (ctx && cl_context_) {
       ctx->cl_context_ = cl_context_;
     }
@@ -486,6 +508,9 @@ class ContextScheduler {
  private:
   template <TargetType Type, typename ContextT>
   void InitContext() {
+#ifdef LITE_WITH_LOG
+    VLOG(4) << "context init kernel ...";
+#endif
     kernel_contexts_[Type].As<ContextT>().InitOnce();
   }
 
@@ -498,6 +523,9 @@ class ContextScheduler {
     InitContext<TargetType::kARM, ARMContext>();
 #endif
 #ifdef LITE_WITH_OPENCL
+#ifdef LITE_WITH_LOG
+    VLOG(4) << "ContextScheduler init opencl context ";
+#endif
     InitContext<TargetType::kOpenCL, OpenCLContext>();
 #endif
 #ifdef LITE_WITH_METAL
diff --git a/lite/core/program.h b/lite/core/program.h
index 61a5fbd5f00..83173ce0f69 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -224,17 +224,25 @@ class LITE_API RuntimeProgram {
 #ifdef LITE_WITH_PROFILE
     set_profiler();
 #endif
-
+#ifdef LITE_WITH_LOG
+    VLOG(4) << "program loop insts ......";
+#endif
     for (auto& inst : instructions_[kRootBlockIdx]) {
       KernelBase* kernel = inst.mutable_kernel();
       if (kernel->target() == TARGET(kOpenCL)) {
 #if defined(LITE_WITH_OPENCL)
-        // mark has kernel that is opencl
+        // mark has opencl kernel
         has_opencl_kernel_ = true;
+
+        // auto enable when opencl kernel is found.
+        ClGlobalDelegate::Global().SetUseOpenCL(true);
         // init opencl runtime when first find opencl kernel.
         // when unique_opencl_ctx_ not init. init it
         if (!unique_opencl_ctx_) {
-          // check opencl env valid.
+// check opencl env valid.
+#ifdef LITE_WITH_LOG
+          VLOG(4) << "INIT OPENCL ON KERNEL";
+#endif
           opencl_valid_ = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&
                           paddle::lite::CLWrapper::Global()->DlsymSuccess() &&
                           CLRuntime::Global()->OpenCLAvaliableForDevice();

From 8d6ed0fadcd9faa06a99fe418944d32fbf1d5908 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Thu, 15 Aug 2024 11:38:42 +0800
Subject: [PATCH 03/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config.

1. Create the OpenCL runtime and context lazily on first use, instead of
   during static initialization.
2. If an ARM-only model is used, do not create the OpenCL runtime / context.
3. When a kernel targeting OpenCL is found, create the OpenCL runtime / context.
4. When the OpenCL environment check is invoked, enable OpenCL.
test=develop
---
 lite/api/paddle_api.cc                               |  2 +-
 .../opencl/{cl_global_config.h => cl_global.h}       | 12 +++++-------
 lite/core/context.h                                  |  2 +-
 3 files changed, 7 insertions(+), 9 deletions(-)
 rename lite/backends/opencl/{cl_global_config.h => cl_global.h} (93%)

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index b49e93df266..85515697d82 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -29,7 +29,7 @@
 #endif
 
 #ifdef LITE_WITH_OPENCL
-#include "lite/backends/opencl/cl_global_config.h"
+#include "lite/backends/opencl/cl_global.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
 
diff --git a/lite/backends/opencl/cl_global_config.h b/lite/backends/opencl/cl_global.h
similarity index 93%
rename from lite/backends/opencl/cl_global_config.h
rename to lite/backends/opencl/cl_global.h
index edcf665ba60..b9540e2bc29 100644
--- a/lite/backends/opencl/cl_global_config.h
+++ b/lite/backends/opencl/cl_global.h
@@ -56,12 +56,10 @@ class ClGlobalDelegate {
    * @param check_fp16_valid
    * @return
    */
-  bool IsOpenCLBackendValid(bool check_fp16_valid) const {
+  bool IsOpenCLBackendValid(bool check_fp16_valid) {
     LOG(INFO) << "delegete opencl valid check";
-    if (!UseOpenCL()) {
-      LOG(INFO) << "opencl disable due to softly close";
-      return false;
-    }
+    // use attempt to use opencl , enable it.
+    SetUseOpenCL(true);
     bool opencl_valid = false;
 
 #ifdef LITE_WITH_OPENCL
@@ -89,7 +87,7 @@ class ClGlobalDelegate {
    * @brief get opencl device type
    * @return
    */
-  int GetOpenCLDeviceType() const {
+  int GetOpenCLDeviceType() {
     if (this->IsOpenCLBackendValid(false)) {
       return paddle::lite::CLRuntime::Global()->GetGpuType();
     }
@@ -99,7 +97,7 @@ class ClGlobalDelegate {
  private:
   ClGlobalDelegate() = default;
   // if user do not set this flag, as old ways.
-  bool use_opencl_{true};
+  bool use_opencl_{false};
 };
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/core/context.h b/lite/core/context.h
index bf62c0ab826..fb9d83d6fb7 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -21,7 +21,7 @@
 #endif
 #ifdef LITE_WITH_OPENCL
 #include "lite/backends/opencl/cl_context.h"
-#include "lite/backends/opencl/cl_global_config.h"
+#include "lite/backends/opencl/cl_global.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
 #ifdef LITE_WITH_XPU

From f41748289381600bd24ef3c3c01e7e3a5be0fda3 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Thu, 15 Aug 2024 15:31:25 +0800
Subject: [PATCH 04/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 5. default make option on, to let user feel
 nothing. add api for Professional users test=develop

---
 lite/api/paddle_api.cc           | 9 +++++++++
 lite/api/paddle_api.h            | 3 +++
 lite/backends/opencl/cl_global.h | 2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 85515697d82..6fe59e4178d 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -40,6 +40,15 @@
 namespace paddle {
 namespace lite_api {
 
+bool SetOpenCLEnable(bool enable) {
+  LOG(INFO) << "External SetOpenCLEnable : " << enable;
+#ifdef LITE_WITH_OPENCL
+  paddle::lite::ClGlobalDelegate::Global().SetUseOpenCL(enable);
+  return enable;
+#endif
+  return enable;
+}
+
 bool IsOpenCLBackendValid(bool check_fp16_valid) {
 #ifdef LITE_WITH_LOG
   LOG(INFO) << "need to check fp16 valid:" << check_fp16_valid;
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 0fb575c3bc4..4ef1a27fa75 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -43,6 +43,9 @@ enum class L3CacheSetMethod {
   // kAutoGrow = 3,   // Not supported yet, least memory consumption.
 };
 
+// return true if current device supports OpenCL model
+LITE_API bool SetOpenCLEnable(bool enable);
+
 // return true if current device supports OpenCL model
 LITE_API bool IsOpenCLBackendValid(bool check_fp16_valid = false);
 
diff --git a/lite/backends/opencl/cl_global.h b/lite/backends/opencl/cl_global.h
index b9540e2bc29..434356e4086 100644
--- a/lite/backends/opencl/cl_global.h
+++ b/lite/backends/opencl/cl_global.h
@@ -97,7 +97,7 @@ class ClGlobalDelegate {
  private:
   ClGlobalDelegate() = default;
   // if user do not set this flag, as old ways.
-  bool use_opencl_{false};
+  bool use_opencl_{true};
 };
 }  // namespace lite
 }  // namespace paddle

From 680513b65ec92f9aef67be2affa56b2fd9d1688d Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Thu, 15 Aug 2024 15:55:15 +0800
Subject: [PATCH 05/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 5. only reset flag once test=develop

---
 lite/core/program.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lite/core/program.h b/lite/core/program.h
index 83173ce0f69..ae8de98e8e1 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -234,8 +234,6 @@ class LITE_API RuntimeProgram {
         // mark has opencl kernel
         has_opencl_kernel_ = true;
 
-        // auto enable when opencl kernel is found.
-        ClGlobalDelegate::Global().SetUseOpenCL(true);
         // init opencl runtime when first find opencl kernel.
         // when unique_opencl_ctx_ not init. init it
         if (!unique_opencl_ctx_) {
@@ -243,6 +241,8 @@ class LITE_API RuntimeProgram {
 #ifdef LITE_WITH_LOG
           VLOG(4) << "INIT OPENCL ON KERNEL";
 #endif
+          // auto enable when opencl kernel is found.
+          ClGlobalDelegate::Global().SetUseOpenCL(true);
           opencl_valid_ = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&
                           paddle::lite::CLWrapper::Global()->DlsymSuccess() &&
                           CLRuntime::Global()->OpenCLAvaliableForDevice();

From e018d47de6240f4b4a89830eb1bcf34cdbdb5b18 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:02:36 +0800
Subject: [PATCH 06/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/api/paddle_api.cc            | 4 ++--
 lite/api/paddle_api.h             | 2 +-
 lite/backends/opencl/cl_context.h | 4 +---
 lite/backends/opencl/cl_global.h  | 3 ++-
 lite/core/context.h               | 7 +------
 lite/core/program.h               | 7 -------
 6 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 6fe59e4178d..759a3bdefe8 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -40,8 +40,8 @@
 namespace paddle {
 namespace lite_api {
 
-bool SetOpenCLEnable(bool enable) {
-  LOG(INFO) << "External SetOpenCLEnable : " << enable;
+bool EnableOpenCLBackend(bool enable) {
+  VLOG(4) << "External EnableOpenCLBackend : " << enable;
 #ifdef LITE_WITH_OPENCL
   paddle::lite::ClGlobalDelegate::Global().SetUseOpenCL(enable);
   return enable;
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 4ef1a27fa75..21868818177 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -44,7 +44,7 @@ enum class L3CacheSetMethod {
 };
 
-// return true if current device supports OpenCL model
-LITE_API bool SetOpenCLEnable(bool enable);
+// enable or disable use of the OpenCL backend; returns the value passed in
+LITE_API bool EnableOpenCLBackend(bool enable);
 
 // return true if current device supports OpenCL model
 LITE_API bool IsOpenCLBackendValid(bool check_fp16_valid = false);
diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h
index 5c6b3b4ad5a..a3ecd808f12 100644
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -28,9 +28,7 @@ namespace lite {
 class CLContext {
  public:
   ~CLContext() {
-#ifdef LITE_WITH_LOG
-    VLOG(4) << "CLContext destructor";
-#endif
+    VLOG(4) << "Call Of CLContext destructor";
     GetCommandQueue().finish();
     for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
       // Note(ysh329): Don't need `clReleaseKernel`
diff --git a/lite/backends/opencl/cl_global.h b/lite/backends/opencl/cl_global.h
index 434356e4086..8a6e0538393 100644
--- a/lite/backends/opencl/cl_global.h
+++ b/lite/backends/opencl/cl_global.h
@@ -57,7 +57,8 @@ class ClGlobalDelegate {
    * @return
    */
   bool IsOpenCLBackendValid(bool check_fp16_valid) {
-    LOG(INFO) << "delegete opencl valid check";
+    VLOG(3) << "Delegete opencl valid check, check_fp16_valid: "
+            << check_fp16_valid << ", use_opencl_:" << use_opencl_;
     // use attempt to use opencl , enable it.
     SetUseOpenCL(true);
     bool opencl_valid = false;
diff --git a/lite/core/context.h b/lite/core/context.h
index fb9d83d6fb7..fd8ed37554c 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -369,12 +369,7 @@ class Context<TargetType::kOpenCL> {
  public:
   CLContext* cl_context() { return cl_context_.get(); }
 
-  void InitOnce() {
-#ifdef LITE_WITH_LOG
-    VLOG(4) << "OpenCL InitOnce ";
-#endif
-    AttemptCreateClContext();
-  }
+  void InitOnce() { AttemptCreateClContext(); }
 
   void AttemptCreateClContext() {
     if (!ClGlobalDelegate::Global().UseOpenCL()) {
diff --git a/lite/core/program.h b/lite/core/program.h
index ae8de98e8e1..fca2ee5a0fd 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -223,9 +223,6 @@ class LITE_API RuntimeProgram {
     }
 #ifdef LITE_WITH_PROFILE
     set_profiler();
-#endif
-#ifdef LITE_WITH_LOG
-    VLOG(4) << "program loop insts ......";
 #endif
     for (auto& inst : instructions_[kRootBlockIdx]) {
       KernelBase* kernel = inst.mutable_kernel();
@@ -237,10 +234,6 @@ class LITE_API RuntimeProgram {
         // init opencl runtime when first find opencl kernel.
         // when unique_opencl_ctx_ not init. init it
         if (!unique_opencl_ctx_) {
-// check opencl env valid.
-#ifdef LITE_WITH_LOG
-          VLOG(4) << "INIT OPENCL ON KERNEL";
-#endif
           // auto enable when opencl kernel is found.
           ClGlobalDelegate::Global().SetUseOpenCL(true);
           opencl_valid_ = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&

From f4068f7e30e8114e3f3fb06dda70042c510639e0 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:03:08 +0800
Subject: [PATCH 07/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/api/paddle_api.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 759a3bdefe8..09a57d82900 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -61,9 +61,6 @@ bool IsOpenCLBackendValid(bool check_fp16_valid) {
 }
 
 int GetOpenCLDeviceType() {
-#ifdef LITE_WITH_LOG
-  LOG(INFO) << "GetOpenCLDeviceType";
-#endif
 #ifdef LITE_WITH_OPENCL
   return paddle::lite::ClGlobalDelegate::Global().GetOpenCLDeviceType();
 #endif

From 5f18e09b4229a2322acd03722569db8a2387cffd Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:03:31 +0800
Subject: [PATCH 08/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/core/context.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lite/core/context.h b/lite/core/context.h
index fd8ed37554c..f95d7ff5117 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -518,9 +518,7 @@ class ContextScheduler {
     InitContext<TargetType::kARM, ARMContext>();
 #endif
 #ifdef LITE_WITH_OPENCL
-#ifdef LITE_WITH_LOG
     VLOG(4) << "ContextScheduler init opencl context ";
-#endif
     InitContext<TargetType::kOpenCL, OpenCLContext>();
 #endif
 #ifdef LITE_WITH_METAL

From 2e7c070e0efac4b49e5872fc3beee799c3ee3697 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:06:36 +0800
Subject: [PATCH 09/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/backends/opencl/cl_global.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lite/backends/opencl/cl_global.h b/lite/backends/opencl/cl_global.h
index 8a6e0538393..defddaaea1f 100644
--- a/lite/backends/opencl/cl_global.h
+++ b/lite/backends/opencl/cl_global.h
@@ -42,8 +42,8 @@ class ClGlobalDelegate {
    */
   void SetUseOpenCL(bool use_opencl) {
     use_opencl_ = use_opencl;
-    LOG(INFO) << "set opencl softly : opencl "
-              << (use_opencl_ ? "enable" : "disable");
+    VLOG(4) << "Set opencl softly , use_opencl: "
+            << (use_opencl_ ? "enable" : "disable");
   }
   /**
    * @brief get use opencl

From 51aa8f4a6e67f074069667ecec4ac64ec6007531 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:09:52 +0800
Subject: [PATCH 10/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/backends/opencl/cl_global.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lite/backends/opencl/cl_global.h b/lite/backends/opencl/cl_global.h
index defddaaea1f..252f76ad674 100644
--- a/lite/backends/opencl/cl_global.h
+++ b/lite/backends/opencl/cl_global.h
@@ -65,21 +65,15 @@ class ClGlobalDelegate {
 
 #ifdef LITE_WITH_OPENCL
     bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
-#ifdef LITE_WITH_LOG
     LOG(INFO) << "Found opencl library:" << opencl_lib_found;
-#endif
     if (!opencl_lib_found) return false;
 
     bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
-#ifdef LITE_WITH_LOG
     LOG(INFO) << "dlsym_success:" << dlsym_success;
-#endif
     if (!dlsym_success) return false;
     opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(
         check_fp16_valid);
-#ifdef LITE_WITH_LOG
     LOG(INFO) << "opencl_valid:" << opencl_valid;
-#endif
 #endif
     return opencl_valid;
   }

From d7152d20ef16c5742e2677c39461290be4a2462f Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:11:37 +0800
Subject: [PATCH 11/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/core/context.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lite/core/context.h b/lite/core/context.h
index f95d7ff5117..109f72e2df4 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -373,12 +373,12 @@ class Context<TargetType::kOpenCL> {
 
   void AttemptCreateClContext() {
     if (!ClGlobalDelegate::Global().UseOpenCL()) {
-      LOG(INFO) << "force close opencl,  so return";
+      LOG(INFO) << "Force close opencl,  so return";
       return;
     }
 
     if (cl_context_ != nullptr) {
-      LOG(INFO) << "cl_context_ already created";
+      LOG(INFO) << "Cl_context has already created, so return";
       return;
     }
 

From 76999dc969878b635ccd1464efcf456cbfd0aa5b Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:17:10 +0800
Subject: [PATCH 12/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/backends/opencl/cl_global.h | 6 +++---
 lite/core/program.h              | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lite/backends/opencl/cl_global.h b/lite/backends/opencl/cl_global.h
index 252f76ad674..6159f495ea2 100644
--- a/lite/backends/opencl/cl_global.h
+++ b/lite/backends/opencl/cl_global.h
@@ -65,15 +65,15 @@ class ClGlobalDelegate {
 
 #ifdef LITE_WITH_OPENCL
     bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
-    LOG(INFO) << "Found opencl library:" << opencl_lib_found;
+    LOG(INFO) << "Found opencl library: " << opencl_lib_found;
     if (!opencl_lib_found) return false;
 
     bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
-    LOG(INFO) << "dlsym_success:" << dlsym_success;
+    LOG(INFO) << "Dlsym Success: " << dlsym_success;
     if (!dlsym_success) return false;
     opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(
         check_fp16_valid);
-    LOG(INFO) << "opencl_valid:" << opencl_valid;
+    LOG(INFO) << "Opencl Valid: " << opencl_valid;
 #endif
     return opencl_valid;
   }
diff --git a/lite/core/program.h b/lite/core/program.h
index fca2ee5a0fd..c99d3cbed82 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -219,7 +219,7 @@ class LITE_API RuntimeProgram {
 
   void Init() {
     if (instructions_.empty()) {
-      LOG(FATAL) << "no instructions";
+      LOG(FATAL) << "No instructions";
     }
 #ifdef LITE_WITH_PROFILE
     set_profiler();
@@ -247,7 +247,7 @@ class LITE_API RuntimeProgram {
             unique_opencl_ctx_ = std::move(unique_opencl_ctx);
             (*unique_opencl_ctx_).As<OpenCLContext>().InitOnce();
           } else {
-            LOG(FATAL) << "check opencl env failed. opencl_valid:"
+            LOG(FATAL) << "Check opencl env failed. opencl_valid:"
                        << opencl_valid_;
           }
         }
@@ -261,7 +261,7 @@ class LITE_API RuntimeProgram {
           kernel->SetContext(std::move(ctx));
         } else {
           // if gpu not support , fatal when user init gpu model.
-          LOG(FATAL) << "opencl_valid:" << opencl_valid_;
+          LOG(FATAL) << "OpenCl Valid: " << opencl_valid_;
         }
 #endif
       } else if (kernel->target() == TARGET(kMetal)) {

From 3035babb616d1fb787e7cafdf6af641d0cc267f2 Mon Sep 17 00:00:00 2001
From: xiebaiyuan <xiebaiyuan@139.com>
Date: Mon, 19 Aug 2024 16:21:54 +0800
Subject: [PATCH 13/13] [OpenCL] fix opencl init bugs. do not create opencl
 when user do not use Use opencl. auto enable opencl when opencl model load
 and opencl check or config. 6.fix pr review test=develop

---
 lite/api/paddle_api.cc | 3 ---
 lite/core/context.h    | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index 09a57d82900..fbe5ddbcfc3 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -50,9 +50,6 @@ bool EnableOpenCLBackend(bool enable) {
 }
 
 bool IsOpenCLBackendValid(bool check_fp16_valid) {
-#ifdef LITE_WITH_LOG
-  LOG(INFO) << "need to check fp16 valid:" << check_fp16_valid;
-#endif
 #ifdef LITE_WITH_OPENCL
   return paddle::lite::ClGlobalDelegate::Global().IsOpenCLBackendValid(
       check_fp16_valid);
diff --git a/lite/core/context.h b/lite/core/context.h
index 109f72e2df4..dcbd7162b38 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -503,9 +503,6 @@ class ContextScheduler {
  private:
   template <TargetType Type, typename ContextT>
   void InitContext() {
-#ifdef LITE_WITH_LOG
-    VLOG(4) << "context init kernel ...";
-#endif
     kernel_contexts_[Type].As<ContextT>().InitOnce();
   }