Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into mul_quant; test=develop
PaddlePaddle · Sep 26, 2021 · fac0af2 · fac0af2
2 parents a827668 + ae305a6
commit fac0af2
Show file tree

Hide file tree

Showing 579 changed files with 17,509 additions and 9,881 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -29,7 +29,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 include(system)
 include(functions)
-include(cross_compiling/preproject)
+include(os/common)
 
 project(paddle CXX C)
 message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
@@ -194,15 +194,15 @@ if (LITE_WITH_PYTHON)
 endif()
 
 if(LITE_WITH_RKNPU)
-   include(device/rknpu)
+   include(backends/rknpu)
 endif()
 
 if(LITE_WITH_IMAGINATION_NNA)
-	include(device/imagination_nna)
+	include(backends/imagination_nna)
 endif()
 
 if(LITE_WITH_INTEL_FPGA)
-	include(device/intel_fpga)
+	include(backends/intel_fpga)
 endif()
 
 # flatbuffer module for loading model
@@ -234,11 +234,11 @@ endif()
 # for mobile
 if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
     message(STATUS "Building the mobile framework")
-    include(cross_compiling/postproject)
-    include(device/npu) # check and prepare NPU DDK
-    include(device/xpu) # check and prepare XPU SDK
-    include(device/apu) # check and prepare APU SDK 
-    include(device/huawei_ascend_npu)  # check and prepare Ascend NPU SDK 
+    include(os/postproject)
+    include(backends/npu) # check and prepare NPU DDK
+    include(backends/xpu) # check and prepare XPU SDK
+    include(backends/apu) # check and prepare APU SDK 
+    include(backends/huawei_ascend_npu)  # check and prepare Ascend NPU SDK 
 
     # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
     # So the following third party dependencies are not needed.
@@ -273,15 +273,15 @@ endif()
 ########################################################################################
 
 if(LITE_WITH_XPU)
-    include(device/xpu)
+    include(backends/xpu)
 endif()
 
 if(LITE_WITH_MLU)
     include(mlu)
 endif()
 
 if(LITE_WITH_HUAWEI_ASCEND_NPU)
-    include(device/huawei_ascend_npu)
+    include(backends/huawei_ascend_npu)
 endif()
 
 include(coveralls)

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle-Lite.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/Paddle-Lite)  [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/develop/)  [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Lite.svg)](https://github.com/PaddlePaddle/Paddle-Lite/releases)  [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
-Paddle Lite是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架，定位支持包括移动端、嵌入式以及服务器端在内的多硬件平台。
+Paddle Lite是一个高性能、轻量级、灵活性强且易于扩展的深度学习推理框架，定位于支持包括移动端、嵌入式以及服务器端在内的多硬件平台。
 
 当前Paddle Lite不仅在百度内部业务中得到全面应用，也成功支持了众多外部用户和企业的生产任务。
 
@@ -55,7 +55,7 @@ Paddle Lite提供了C++、Java、Python三种API，并且提供了相应API的
 ## 主要特性
 
 - **多硬件支持：**
-	- Paddle Lite架构已经验证和完整支持从 Mobile 到 Server [多种硬件平台](https://paddle-lite.readthedocs.io/zh/latest/introduction/support_hardware.html)，包括 ARM CPU、Mali GPU、Adreno GPU、华为 NPU，以及 FPGA 等，且正在不断增加更多新硬件支持。
+	- Paddle Lite架构已经验证和完整支持从 Mobile 到 Server [多种硬件平台](https://paddle-lite.readthedocs.io/zh/latest/introduction/support_hardware.html)，包括 ARM CPU、Mali GPU、Adreno GPU、英伟达 GPU、苹果 GPU、华为 NPU，以及 FPGA 等，且正在不断增加更多新硬件支持。
 	- 各个硬件平台的 Kernel 在代码层和执行层互不干扰，用户不仅可以自由插拔任何硬件，还支持任意系统可见硬件之间的[混合调度](https://paddle-lite.readthedocs.io/zh/latest/introduction/tech_highlights.html#id7)。
 - **轻量级部署**：
 	- Paddle Lite在设计上对图优化模块和执行引擎实现了良好的解耦拆分，移动端可以直接部署执行阶段，无任何第三方依赖。
@@ -67,7 +67,7 @@ Paddle Lite提供了C++、Java、Python三种API，并且提供了相应API的
 	- Paddle Lite和PaddlePaddle训练框架的OP对齐，提供广泛的模型支持能力。
 	- 目前已严格验证24个模型200个OP的精度和性能，对视觉类模型做到了较为充分的支持，覆盖分类、检测和定位，包含了特色的OCR模型的支持，并在不断丰富中。具体请参考[支持OP](https://paddle-lite.readthedocs.io/zh/latest/introduction/support_operation_list.html)。
 - **强大的图分析和优化能力**：
-	- 不同于常规的移动端预测引擎基于 Python 脚本工具转化模型， Lite 架构上有完整基于 C++ 开发的 IR 及相应 Pass 集合，以支持操作熔合，计算剪枝，存储优化，量化计算等多类计算图优化。更多的优化策略可以简单通过 [新增 Pass](https://paddle-lite.readthedocs.io/zh/latest/develop_guides/add_new_pass.html) 的方式模块化支持。
+	- 不同于常规的移动端预测引擎基于 Python 脚本工具转化模型， Lite 架构上有完整基于 C++ 开发的 IR 及相应 Pass 集合，以支持操作融合，计算剪枝，存储优化，量化计算等多类计算图优化。更多的优化策略可以简单通过 [新增 Pass](https://paddle-lite.readthedocs.io/zh/latest/develop_guides/add_new_pass.html) 的方式模块化支持。
 
 ## 持续集成
 
@@ -76,6 +76,7 @@ Paddle Lite提供了C++、Java、Python三种API，并且提供了相应API的
 | CPU(32bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
 | CPU(64bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
 | OpenCL | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
+| Metal | - | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
 | FPGA | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
 | 华为NPU | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
 | 百度 XPU | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |

diff --git a/README_en.md b/README_en.md
@@ -31,7 +31,7 @@ The latest benchmark is located at [benchmark](https://paddlepaddle.github.io/Pa
 
 ### High Compatibility
 
-Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain.
+Hardware compatibility: Paddle Lite supports a diverse range of hardware — ARM CPU, Mali GPU, Adreno GPU, Nvidia GPU, Apple GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain.
 
 Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models.
 
@@ -43,15 +43,13 @@ Paddle Lite is designed to support a wide range of hardwares and devices, and it
 
 ![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
 
-As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment.
+As shown in the figure above, the analysis phase includes the Machine IR module, which enables optimizations such as Op fusion and redundant computation pruning. Besides, the execution phase only involves Kernel execution, so it can be deployed on its own to ensure a maximally lightweight deployment.
 
 ## Key Info about the Update
 
-The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile. For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md).
+The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardware platforms, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, Inc., many product lines have been using Paddle-Mobile.
 
-As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite.
-
-For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained.
+As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite).
 
 ## Special Thanks
 

diff --git a/cmake/device/apu.cmake → cmake/backends/apu.cmake b/cmake/device/apu.cmake → cmake/backends/apu.cmake
diff --git a/cmake/device/huawei_ascend_npu.cmake → cmake/backends/huawei_ascend_npu.cmake b/cmake/device/huawei_ascend_npu.cmake → cmake/backends/huawei_ascend_npu.cmake
diff --git a/cmake/device/imagination_nna.cmake → cmake/backends/imagination_nna.cmake b/cmake/device/imagination_nna.cmake → cmake/backends/imagination_nna.cmake
diff --git a/cmake/device/intel_fpga.cmake → cmake/backends/intel_fpga.cmake b/cmake/device/intel_fpga.cmake → cmake/backends/intel_fpga.cmake
diff --git a/cmake/device/npu.cmake → cmake/backends/npu.cmake b/cmake/device/npu.cmake → cmake/backends/npu.cmake
diff --git a/cmake/device/rknpu.cmake → cmake/backends/rknpu.cmake b/cmake/device/rknpu.cmake → cmake/backends/rknpu.cmake
diff --git a/cmake/device/xpu.cmake → cmake/backends/xpu.cmake b/cmake/device/xpu.cmake → cmake/backends/xpu.cmake
diff --git a/cmake/functions.cmake b/cmake/functions.cmake
@@ -75,13 +75,14 @@ function(add_kernel TARGET device level)
         get_filename_component(filename ${src} NAME_WE) # conv_compute.cc => conv_compute
         set(kernel_tailor_src_dir "${CMAKE_BINARY_DIR}/kernel_tailor_src_dir")
         set(suffix "for_strip")
-        set(dst_file ${dst_file} "${kernel_tailor_src_dir}/${filename}_${device_name}_${suffix}.cc") # conv_compute_arm.cc
+        set(src_file "${kernel_tailor_src_dir}/${filename}_${device_name}_${suffix}.cc") # conv_compute_arm.cc
         if("${device}" STREQUAL "METAL")
-          set(dst_file ${dst_file} "${kernel_tailor_src_dir}/${filename}_${device_name}_${suffix}.mm") # conv_compute_apple_metal_for_strip.mm
+          set(src_file "${kernel_tailor_src_dir}/${filename}_${device_name}_${suffix}.mm") # conv_compute_apple_metal_for_strip.mm
         endif()
-        if(NOT EXISTS ${dst_file})
+        if(NOT EXISTS ${src_file})
           return()
         endif()
+        set(dst_file ${dst_file} "${src_file}")
       endforeach()
       file(APPEND ${kernels_src_list} "${dst_file}\n")
       set(KERNELS_SRC ${KERNELS_SRC} "${dst_file}" CACHE INTERNAL "kernels source")
@@ -196,6 +197,24 @@ function(lite_cc_test TARGET)
       add_dependencies(lite_compile_deps ${TARGET})
   endif()
 
+  # link to dynamic runtime lib
+  if(LITE_WITH_RKNPU)
+      target_link_libraries(${TARGET} ${rknpu_runtime_libs})
+  endif()
+  if(LITE_WITH_IMAGINATION_NNA)
+      target_link_libraries(${TARGET} ${imagination_nna_builder_libs} ${imagination_nna_runtime_libs})
+  endif()
+  if(LITE_WITH_HUAWEI_ASCEND_NPU)
+      target_link_libraries(${TARGET} ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
+  endif()
+  if(LITE_WITH_NPU)
+      target_link_libraries(${TARGET} ${npu_builder_libs} ${npu_runtime_libs})
+  endif()
+  if(LITE_WITH_CUDA)
+      get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
+      target_link_libraries(${TARGET} ${cuda_deps})
+  endif()
+
   common_link(${TARGET})
   add_test(NAME ${TARGET}
           COMMAND ${TARGET} ${args_ARGS}

diff --git a/cmake/lite.cmake b/cmake/lite.cmake
@@ -261,12 +261,34 @@ function(lite_cc_binary TARGET)
     # link to paddle-lite static lib automatically
     add_dependencies(${TARGET} bundle_full_api)
 
+
+
     if(NOT WIN32)
       target_link_libraries(${TARGET} ${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a)
       target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
     else()
       target_link_libraries(${TARGET} ${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib)
     endif()
+
+
+    # link to dynamic runtime lib
+    if(LITE_WITH_RKNPU)
+        target_link_libraries(${TARGET} ${rknpu_runtime_libs})
+    endif()
+    if(LITE_WITH_IMAGINATION_NNA)
+        target_link_libraries(${TARGET} ${imagination_nna_builder_libs} ${imagination_nna_runtime_libs})
+    endif()
+    if(LITE_WITH_HUAWEI_ASCEND_NPU)
+        target_link_libraries(${TARGET} ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
+    endif()
+    if(LITE_WITH_NPU)
+        target_link_libraries(${TARGET} ${npu_builder_libs} ${npu_runtime_libs})
+    endif()
+    if(LITE_WITH_CUDA)
+        get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
+        target_link_libraries(${TARGET} ${cuda_deps})
+    endif()
+
     if (NOT APPLE AND NOT WIN32)
         # strip binary target to reduce size
         if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -282,23 +304,6 @@ function(lite_cc_binary TARGET)
     endif()
 endfunction()
 
-
-# file to record subgraph bridges for new hardware
-set(subgraph_bridges_src_list "${CMAKE_BINARY_DIR}/subgraph_bridges_src_list.txt")
-file(WRITE ${subgraph_bridges_src_list} "") # clean
-
-# add a subgraph bridge for some new hardware which support some op by subgraph
-# device: such as npu, rknpu, apu, huawei_ascend_npu, imagination_nna, nnadapter
-function(add_subgraph_bridge)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS)
-  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  foreach(src ${args_SRCS})
-    file(APPEND ${subgraph_bridges_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
-  endforeach()
-endfunction(add_subgraph_bridge)
-
 #only for windows 
 function(create_static_lib TARGET_NAME)
   set(libs ${ARGN})
@@ -382,6 +387,9 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
     return()
   endif()
 
+  add_custom_target(${fake_target})
+  add_dependencies(${fake_target} ${tgt_name})
+
   if(NOT IOS AND NOT APPLE)
     file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
       "CREATE ${bundled_tgt_full_name}\n" )
@@ -404,8 +412,9 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
     endif()
 
     add_custom_command(
+      TARGET ${fake_target} PRE_BUILD
+      COMMAND rm -f ${bundled_tgt_full_name}
       COMMAND ${ar_tool} -M < ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
-      OUTPUT ${bundled_tgt_full_name}
       COMMENT "Bundling ${bundled_tgt_name}"
       DEPENDS ${tgt_name}
       VERBATIM)
@@ -414,15 +423,13 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
       set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
     add_custom_command(
+      TARGET ${fake_target} PRE_BUILD
+      COMMAND rm -f ${bundled_tgt_full_name}
       COMMAND /usr/bin/libtool -static -o ${bundled_tgt_full_name} ${libfiles}
       DEPENDS ${tgt_name}
-      OUTPUT ${bundled_tgt_full_name}
     )
   endif()
 
-  add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_full_name})
-  add_dependencies(${fake_target} ${tgt_name})
-
   add_library(${bundled_tgt_name} STATIC IMPORTED)
   set_target_properties(${bundled_tgt_name}
     PROPERTIES

diff --git a/cmake/cross_compiling/android.cmake → cmake/os/android.cmake b/cmake/cross_compiling/android.cmake → cmake/os/android.cmake
diff --git a/cmake/cross_compiling/armlinux.cmake → cmake/os/armlinux.cmake b/cmake/cross_compiling/armlinux.cmake → cmake/os/armlinux.cmake
diff --git a/cmake/cross_compiling/armmacos.cmake → cmake/os/armmacos.cmake b/cmake/cross_compiling/armmacos.cmake → cmake/os/armmacos.cmake
diff --git a/cmake/cross_compiling/preproject.cmake → cmake/os/common.cmake b/cmake/cross_compiling/preproject.cmake → cmake/os/common.cmake
@@ -55,18 +55,18 @@ message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
 message(STATUS "CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}")
 
 if(ARM_TARGET_OS STREQUAL "android")
-  include(cross_compiling/android)
+  include(os/android)
 endif()
 if(ARM_TARGET_OS STREQUAL "armlinux")
-  include(cross_compiling/armlinux)
+  include(os/armlinux)
 endif()
 if(ARM_TARGET_OS STREQUAL "ios" OR ARM_TARGET_OS STREQUAL "ios64")
-  include(cross_compiling/ios)
+  include(os/ios)
 endif()
 if(ARM_TARGET_OS STREQUAL "armmacos")
-  include(cross_compiling/armmacos)
+  include(os/armmacos)
 endif()
-include(cross_compiling/host)
+include(os/host)
 
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default use Release in android" FORCE)

diff --git a/cmake/cross_compiling/findar.cmake → cmake/os/findar.cmake b/cmake/cross_compiling/findar.cmake → cmake/os/findar.cmake
diff --git a/cmake/cross_compiling/host.cmake → cmake/os/host.cmake b/cmake/cross_compiling/host.cmake → cmake/os/host.cmake
diff --git a/cmake/cross_compiling/ios.cmake → cmake/os/ios.cmake b/cmake/cross_compiling/ios.cmake → cmake/os/ios.cmake
diff --git a/cmake/cross_compiling/postproject.cmake → cmake/os/postproject.cmake b/cmake/cross_compiling/postproject.cmake → cmake/os/postproject.cmake
@@ -17,7 +17,7 @@ if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
 endif()
 include(CheckCXXCompilerFlag)
 if(ANDROID)
-    include(cross_compiling/findar)
+    include(os/findar)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
     if(LITE_WITH_ARM82_FP16)

diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md
@@ -464,6 +464,32 @@ std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>
 
 返回类型：`int`
 
+
+
+### `set_metal_lib_path(path)`
+
+用于iOS设备上使用Metal进行GPU预测时，配置metallib加载路径。
+
+参数：
+
+- `path(str)` - metallib库文件路径
+
+返回类型：`void`
+
+
+
+### `set_metal_use_mps(flag)`
+
+设置iOS设备上使用Metal进行GPU预测时，是否启用[Metal Performance Shaders](https://developer.apple.com/documentation/metalperformanceshaders)。若不设置，默认不使用（建议启用）。
+
+参数：
+
+- `flag(bool)` - 是否使用MPS
+
+返回：是否使用Metal Performance Shaders
+
+返回类型：`bool`
+
 ## PaddlePredictor
 
 ```c++