
Commit 8a051f8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into concat
Patrick-Star125 committed Oct 19, 2023
2 parents fac7400 + 4dbd3f7 commit 8a051f8
Showing 888 changed files with 46,986 additions and 8,404 deletions.
8 changes: 4 additions & 4 deletions .clang-tidy
@@ -20,7 +20,7 @@ bugprone-integer-division,
bugprone-misplaced-widening-cast,
-bugprone-move-forwarding-reference,
-bugprone-multiple-statement-macro,
-bugprone-narrowing-conversions,
bugprone-narrowing-conversions,
-bugprone-not-null-terminated-result,
-bugprone-parent-virtual-call,
-bugprone-posix-return,
@@ -155,7 +155,7 @@ cppcoreguidelines-avoid-c-arrays,
-cppcoreguidelines-avoid-goto,
cppcoreguidelines-c-copy-assignment-signature,
cppcoreguidelines-explicit-virtual-functions,
-cppcoreguidelines-init-variables,
cppcoreguidelines-init-variables,
cppcoreguidelines-narrowing-conversions,
cppcoreguidelines-no-malloc,
-cppcoreguidelines-pro-type-const-cast,
@@ -189,12 +189,12 @@ modernize-use-override,
modernize-use-transparent-functors,
-modernize-use-uncaught-exceptions,
performance-faster-string-find,
-performance-for-range-copy,
performance-for-range-copy,
-performance-implicit-conversion-in-loop,
-performance-inefficient-algorithm,
performance-inefficient-string-concatenation,
-performance-inefficient-vector-operation,
-performance-move-const-arg,
performance-move-const-arg,
-performance-move-constructor-init,
-performance-no-automatic-move,
performance-noexcept-move-constructor,
3 changes: 3 additions & 0 deletions .flake8
@@ -26,6 +26,9 @@ per-file-ignores =
# These files need tabs for testing.
test/dygraph_to_static/test_error.py:E101,W191

# Ignore comparisons with True in sot unittests
test/sot/test_dup_top.py:E712

# temp ignore base directory
python/paddle/base/*:
E712,
4 changes: 4 additions & 0 deletions .gitmodules
@@ -106,3 +106,7 @@
path = third_party/jitify
url = https://github.com/NVIDIA/jitify.git
ignore = dirty
[submodule "third_party/cccl"]
path = third_party/cccl
url = https://github.com/NVIDIA/cccl.git
ignore = dirty
31 changes: 31 additions & 0 deletions cmake/external/cccl.cmake
@@ -0,0 +1,31 @@
include(ExternalProject)

set(CCCL_PATH
"${THIRD_PARTY_PATH}/cccl"
CACHE STRING "A path setting for external_cccl path.")
set(CCCL_PREFIX_DIR ${CCCL_PATH})
set(CCCL_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cccl)

# The latest commit has bugs on Windows, so we pin a fixed commit.
set(CCCL_TAG 1f6e4bcae0fbf1bbed87f88544d8d2161c490fc1)
execute_process(COMMAND git --git-dir=${CCCL_SOURCE_DIR}/.git
--work-tree=${CCCL_SOURCE_DIR} checkout ${CCCL_TAG})

set(CCCL_INCLUDE_DIR ${CCCL_SOURCE_DIR})
message("CCCL_INCLUDE_DIR is ${CCCL_INCLUDE_DIR}")
include_directories(${CCCL_INCLUDE_DIR})

ExternalProject_Add(
extern_cccl
${EXTERNAL_PROJECT_LOG_ARGS}
SOURCE_DIR ${CCCL_SOURCE_DIR}
PREFIX ${CCCL_PREFIX_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND "")

add_library(cccl INTERFACE)

add_dependencies(cccl extern_cccl)
14 changes: 9 additions & 5 deletions cmake/external/openblas.cmake
@@ -19,12 +19,16 @@ set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
set(CBLAS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/openblas)
set(CBLAS_TAG v0.3.7)

# OpenBLAS supports Raptor Lake from v0.3.22
if(UNIX
AND NOT APPLE
AND NOT WITH_ROCM
# Why use v0.3.18? The IDG business line encountered a random openblas error,
# which was resolved by upgrading openblas.
# And why compile only when gcc > 8.2? Please refer to
# https://github.com/spack/spack/issues/19932#issuecomment-733452619
# v0.3.18 only supports gcc>=8.3 or gcc>=7.4
if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.2
AND NOT WITH_XPU)
set(CBLAS_TAG v0.3.23)
# We only compile with openblas 0.3.18 when gcc >= 8.3
set(CBLAS_TAG v0.3.18)
endif()

if(APPLE AND WITH_ARM)
3 changes: 3 additions & 0 deletions cmake/generic.cmake
@@ -1345,6 +1345,9 @@ function(math_library TARGET)
if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
list(APPEND math_common_deps cccl)
else()
list(APPEND math_common_deps)
endif()
12 changes: 12 additions & 0 deletions cmake/third_party.cmake
@@ -247,6 +247,14 @@ if(NOT DEFINED WITH_MKLDNN)
endif()
endif()

if(WIN32)
if(MSVC)
if(MSVC_VERSION LESS 1920)
set(WITH_MKLDNN OFF)
endif()
endif()
endif()

if(WIN32
OR APPLE
OR NOT WITH_GPU
@@ -375,6 +383,10 @@ if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
elseif(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 12.0
OR ${CMAKE_CUDA_COMPILER_VERSION} GREATER 12.0)
include(external/cccl)
list(APPEND third_party_deps extern_cccl)
endif()
set(URL
"https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz"
2 changes: 2 additions & 0 deletions paddle/cinn/backends/compiler.cc
@@ -304,6 +304,8 @@ void Compiler::CompileCudaModule(const Module& module,
auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name);
CHECK(fn_kernel);

fn_ptr_.push_back(reinterpret_cast<void*>(fn_kernel));

symbols.RegisterVar(kernel_fn_name + "_ptr_",
reinterpret_cast<void*>(fn_kernel));
}
3 changes: 3 additions & 0 deletions paddle/cinn/backends/compiler.h
@@ -121,6 +121,8 @@ class Compiler final {
*/
void* Lookup(absl::string_view fn_name);

std::vector<void*> GetFnPtr() const { return fn_ptr_; }

private:
void CompileCudaModule(const ir::Module& module,
const std::string& code = "");
@@ -136,6 +138,7 @@
Target target_;
std::unique_ptr<ExecutionEngine> engine_;

std::vector<void*> fn_ptr_;
#ifdef CINN_WITH_CUDA
std::unique_ptr<runtime::cuda::CUDAModule> cuda_module_;
#endif
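The two compiler.h hunks above cache each compiled kernel handle in fn_ptr_ and expose it through GetFnPtr(). A minimal consumer sketch follows; hedged: the Create and Build signatures are assumptions based on the surrounding CINN backend API, and the lowering that produces the module is elided.

#include "paddle/cinn/backends/compiler.h"

// Sketch only: retrieve the raw CUDA kernel handles that
// CompileCudaModule() now pushes into fn_ptr_ (see compiler.cc above).
std::vector<void*> CollectKernelPointers(const cinn::ir::Module& module,
                                         const cinn::common::Target& target) {
  auto compiler = cinn::backends::Compiler::Create(target);
  compiler->Build(module, /*code=*/"");
  // One entry per compiled kernel; each pointer is the same handle that
  // was registered in the symbol table under "<fn_name>_ptr_".
  return compiler->GetFnPtr();
}

Note that GetFnPtr() returns the vector by value, so callers receive a copy of the pointer list rather than a view into the compiler's internals.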
1 change: 1 addition & 0 deletions paddle/cinn/hlir/dialect/operator/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(ir)
add_subdirectory(transforms)
1 change: 1 addition & 0 deletions paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt
@@ -35,6 +35,7 @@ if(NOT CINN_ONLY)
COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_op_dir}
COMMAND ${PYTHON_EXECUTABLE} ${cinn_op_gen_parsed_yaml_file} --op_yaml_path
${cinn_op_yaml_file} --output_path ${cinn_op_parsed_yaml_file}
DEPENDS ${cinn_op_gen_parsed_yaml_file} ${cinn_op_yaml_file}
VERBATIM)

add_custom_command(
22 changes: 22 additions & 0 deletions paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h
@@ -77,5 +77,27 @@ struct GroupInfoAttributeStorage : public pir::AttributeStorage {
ParamKey data_;
};

struct JITInfoAttributeStorage : public pir::AttributeStorage {
using ParamKey = cinn::hlir::framework::newir::CUDAJITInfo;
explicit JITInfoAttributeStorage(const ParamKey& key) : data_(key) {}

static JITInfoAttributeStorage* Construct(const ParamKey& key) {
return new JITInfoAttributeStorage(key);
}

static std::size_t HashValue(const ParamKey& key) {
return std::hash<int64_t>()(*(reinterpret_cast<int64_t*>(key.fn_ptr)));
}

bool operator==(const ParamKey& key) const {
return data_.fn_ptr == key.fn_ptr;
}

const ParamKey& GetAsKey() const { return data_; }

private:
ParamKey data_;
};

} // namespace dialect
} // namespace cinn
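The hash/equality pair in JITInfoAttributeStorage is worth noting: operator== compares the fn_ptr pointer itself, while HashValue hashes the first eight bytes pointed to by fn_ptr. A self-contained sketch of that contract; the struct below is a simplified stand-in for cinn::hlir::framework::newir::CUDAJITInfo, which carries more fields.

#include <cstdint>
#include <functional>

// Simplified stand-in for the real CUDAJITInfo.
struct CUDAJITInfo {
  void* fn_ptr;
};

// Mirrors JITInfoAttributeStorage::HashValue: the hash comes from the
// eight bytes stored *at* fn_ptr, not from the pointer value itself.
std::size_t HashValue(const CUDAJITInfo& key) {
  return std::hash<int64_t>()(*reinterpret_cast<int64_t*>(key.fn_ptr));
}

// Mirrors JITInfoAttributeStorage::operator==: equality is pointer identity.
bool Equal(const CUDAJITInfo& lhs, const CUDAJITInfo& rhs) {
  return lhs.fn_ptr == rhs.fn_ptr;
}

Two keys with distinct fn_ptr values can therefore land in the same hash bucket whenever the bytes they point at coincide; the pointer-identity operator== then tells them apart.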
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
@@ -44,7 +44,7 @@ std::vector<pir::Operation *> GroupOp::ops() {
inner_block->end());
}

void GroupOp::Verify() {}
void GroupOp::VerifySig() {}

void GroupOp::Print(pir::IrPrinter &printer) {
auto &os = printer.os;
2 changes: 1 addition & 1 deletion paddle/cinn/hlir/dialect/operator/ir/manual_op.h
@@ -36,7 +36,7 @@ class GroupOp : public pir::Op<GroupOp> {
pir::Block *block();
std::vector<pir::Operation *> ops();

void Verify();
void VerifySig();
void Print(pir::IrPrinter &printer); // NOLINT
};

6 changes: 6 additions & 0 deletions paddle/cinn/hlir/dialect/operator/ir/op_attribute.cc
@@ -19,7 +19,13 @@ namespace dialect {
const GroupInfo &GroupInfoAttribute::data() const {
return storage()->GetAsKey();
}

const cinn::hlir::framework::newir::CUDAJITInfo &CUDAJITInfoAttribute::data()
const {
return storage()->GetAsKey();
}
} // namespace dialect
} // namespace cinn

IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::GroupInfoAttribute)
IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::CUDAJITInfoAttribute)
15 changes: 15 additions & 0 deletions paddle/cinn/hlir/dialect/operator/ir/op_attribute.h
@@ -33,7 +33,22 @@ class GroupInfoAttribute : public pir::Attribute {
const GroupInfo& data() const;
};

class CUDAJITInfoAttribute : public pir::Attribute {
public:
using Attribute::Attribute;

DECLARE_ATTRIBUTE_UTILITY_FUNCTOR(CUDAJITInfoAttribute,
JITInfoAttributeStorage);

bool operator<(const CUDAJITInfoAttribute& right) const {
return storage() < right.storage();
}

const cinn::hlir::framework::newir::CUDAJITInfo& data() const;
};

} // namespace dialect
} // namespace cinn

IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::GroupInfoAttribute)
IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::CUDAJITInfoAttribute)
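Reading the new attribute back follows the usual pir isa/dyn_cast pattern. A hedged sketch: the attribute key "jit_info" and the helper name are hypothetical illustrations, not part of this commit.

#include "paddle/cinn/hlir/dialect/operator/ir/op_attribute.h"
#include "paddle/pir/core/operation.h"

// Sketch: fetch the cached CUDA kernel handle off an operation, mirroring
// the dispatch that OperatorDialect::PrintAttribute performs below.
void* LookupKernel(pir::Operation* op) {
  pir::Attribute attr = op->attribute("jit_info");  // hypothetical key
  if (attr && attr.isa<cinn::dialect::CUDAJITInfoAttribute>()) {
    auto jit_attr = attr.dyn_cast<cinn::dialect::CUDAJITInfoAttribute>();
    return jit_attr.data().fn_ptr;
  }
  return nullptr;
}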
25 changes: 18 additions & 7 deletions paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc
@@ -39,20 +39,31 @@ void OperatorDialect::initialize() {
>();
RegisterOp<GroupOp>();
RegisterAttribute<GroupInfoAttribute>();
RegisterAttribute<CUDAJITInfoAttribute>();
}

void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const {}

void OperatorDialect::PrintAttribute(pir::Attribute attr,
std::ostream &os) const {
os << "(" << attr.dialect().name();
os << '.';
if (auto group_info_attr = attr.dyn_cast<GroupInfoAttribute>()) {
const GroupInfo &data = group_info_attr.data();
os << "GroupInfo)"
<< "[" << data.fn_name << "]";
if (attr.isa<GroupInfoAttribute>()) {
os << "(" << attr.dialect().name();
os << '.';
if (auto group_info_attr = attr.dyn_cast<GroupInfoAttribute>()) {
const GroupInfo &data = group_info_attr.data();
os << "GroupInfo)"
<< "[" << data.fn_name << "]";
}
{ os << "<#AttrNotImplemented>"; }
} else if (attr.isa<CUDAJITInfoAttribute>()) {
auto cuda_jit_info = attr.dyn_cast<CUDAJITInfoAttribute>();

os << "(" << cuda_jit_info.data().fn_ptr;
os << ')';
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"cinn dialect only support GrupInfo and CUDAJITInfo"));
}
{ os << "<#AttrNotImplemented>"; }
}

void OperatorDialect::PrintOperation(pir::Operation *op,
27 changes: 22 additions & 5 deletions paddle/cinn/hlir/dialect/operator/ir/ops.yaml
@@ -1,8 +1,25 @@
- op : add
args : (Tensor x, Tensor y)
- op : broadcast
args : (Tensor x, int64_t[] broadcast_axes, int64_t[] out_shape)
output : Tensor(out)
infer_meta :
func : ElementwiseInferMeta
func : CINNBroadcastInferMeta
param : [x, broadcast_axes, out_shape]
kernel :
func : add
inplace : (x -> out)
func : expand
param : [x, broadcast_axes]

- op : reduce_max
args : (Tensor x, int64_t[] axis, bool keep_dim)
output : Tensor(out)
infer_meta :
func : ReduceInferMeta
kernel :
func : frobenius_norm

- op : reduce_sum
args : (Tensor x, int64_t[] axis, bool keep_dim)
output : Tensor(out)
infer_meta :
func : ReduceInferMeta
kernel :
func : frobenius_norm
10 changes: 10 additions & 0 deletions paddle/cinn/hlir/dialect/operator/transforms/CMakeLists.txt
@@ -0,0 +1,10 @@
if(NOT CINN_ONLY)
cinn_cc_library(
op_with_group_merge_pass
SRCS
group_with_group_merge_pass.cc
op_with_group_merge_pass.cc
tensor_node.cc
DEPS
pd_op_dialect)
endif()