Fix QBits actshuf buffer overflow under large batch sizes (#1473)

Co-authored-by: changwangss <chang1.wang@intel.com>
intel · Apr 15, 2024 · a6f3ab3 · a6f3ab3
1 parent 0ec83b1
commit a6f3ab3
Show file tree

Hide file tree

Showing 28 changed files with 39 additions and 34 deletions.
diff --git a/.github/workflows/script/formatScan/pylint.sh b/.github/workflows/script/formatScan/pylint.sh
@@ -41,7 +41,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \
     --max-line-length=120 \
     --extension-pkg-whitelist=numpy,nltk \
     --ignored-classes=TensorProto,NodeProto \
-    --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,cv2,PIL.Image \
+    --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,cv2,PIL.Image \
     /intel-extension-for-transformers/intel_extension_for_transformers >${log_dir}/pylint.json
 exit_code1=$?
 
@@ -51,7 +51,7 @@ python -m pylint -f json --disable=R,C,W,E1129 \
     --disable=no-name-in-module,import-error,no-member,undefined-variable,no-value-for-parameter,unexpected-keyword-arg,not-callable,no-self-argument,too-many-format-args,invalid-unary-operand-type,too-many-function-args \
     --extension-pkg-whitelist=numpy,nltk \
     --ignored-classes=TensorProto,NodeProto \
-    --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,cv2,PIL.Image \
+    --ignored-modules=tensorflow,torch,torch.quantization,torch.tensor,torchvision,mxnet,onnx,onnxruntime,neural_compressor,neural_compressor.benchmark,intel_extension_for_transformers.neural_engine_py,intel_extension_for_transformers.qbits,cv2,PIL.Image \
     /intel-extension-for-transformers/intel_extension_for_transformers >> ${log_dir}/pylint.json
 exit_code2=$?
 

diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation.py
@@ -436,7 +436,7 @@
     if args.sq:
         config.save_pretrained(args.output_dir)
         user_model.save(args.output_dir)
-    elif args.mixed_precision or args.woq:
+    elif args.mixed_precision or (args.woq and not args.use_neural_speed):
         # user_model will be changed.
         user_model.save_pretrained(args.output_dir)
         # loading saved woq model

diff --git a/...sformers/llm/operator/csrc/CMakeLists.txt → ...ion_for_transformers/qbits/CMakeLists.txt b/...sformers/llm/operator/csrc/CMakeLists.txt → ...ion_for_transformers/qbits/CMakeLists.txt
@@ -12,7 +12,7 @@
 ##  See the License for the specific language governing permissions and
 ##  limitations under the License.
 cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
-project(qbits LANGUAGES C CXX)
+project(qbits_py LANGUAGES C CXX)
 
 
 set(QBITS_TORCH_PATH "" CACHE STRING "Torch install path")
@@ -31,17 +31,20 @@ endif()
 find_package(Torch REQUIRED
 PATHS ${torch_path}
 NO_DEFAULT_PATH)
+
+if(NOT WIN32)
 find_package(PythonLibs 3 REQUIRED)
+endif()
 
 include(FindOpenMP)
 add_subdirectory(dispatcher)
-add_subdirectory(../../../runtime/third_party/pybind11 pybind11)
+add_subdirectory(../transformers/runtime/third_party/pybind11 pybind11)
 
 file(GLOB HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp)
 file(GLOB qbits_src ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
 
 # Link against LibTorch
-pybind11_add_module(qbits ${qbits_src})
-target_compile_features(qbits PRIVATE cxx_std_14)
-target_link_directories(qbits PRIVATE ${torch_path}/lib)
-target_link_libraries(qbits PRIVATE bestla_dispatcher torch_python)
+pybind11_add_module(qbits_py ${qbits_src})
+target_compile_features(qbits_py PRIVATE cxx_std_14)
+target_link_directories(qbits_py PRIVATE ${torch_path}/lib)
+target_link_libraries(qbits_py PRIVATE bestla_dispatcher torch_python)
diff --git a/intel_extension_for_transformers/qbits/__init__.py b/intel_extension_for_transformers/qbits/__init__.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from intel_extension_for_transformers.qbits_py import * # pylint: disable=E0401, E0611
diff --git a/...m/operator/csrc/dispatcher/CMakeLists.txt → ...nsformers/qbits/dispatcher/CMakeLists.txt b/...m/operator/csrc/dispatcher/CMakeLists.txt → ...nsformers/qbits/dispatcher/CMakeLists.txt
@@ -35,5 +35,5 @@ endif()
 
 set_target_properties(bestla_dispatcher PROPERTIES POSITION_INDEPENDENTBTLA_CODE ON)
 set_target_properties(bestla_dispatcher PROPERTIES LINKER_LANGUAGE CXX)
-target_link_libraries(bestla_dispatcher OpenMP::OpenMP_CXX OpenMP::OpenMP_C "${TORCH_LIBRARIES}" torch_python bestla::bestla)
+target_link_libraries(bestla_dispatcher OpenMP::OpenMP_CXX OpenMP::OpenMP_C "${TORCH_LIBRARIES}" bestla::bestla)
 set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
diff --git a/...rc/dispatcher/include/bestla_customop.hpp → ...ts/dispatcher/include/bestla_customop.hpp b/...rc/dispatcher/include/bestla_customop.hpp → ...ts/dispatcher/include/bestla_customop.hpp
diff --git a/...atcher/include/bestla_gemm_dispatcher.hpp → ...atcher/include/bestla_gemm_dispatcher.hpp b/...atcher/include/bestla_gemm_dispatcher.hpp → ...atcher/include/bestla_gemm_dispatcher.hpp
diff --git a/.../dispatcher/include/bestla_packq_impl.hpp → .../dispatcher/include/bestla_packq_impl.hpp b/.../dispatcher/include/bestla_packq_impl.hpp → .../dispatcher/include/bestla_packq_impl.hpp
diff --git a/.../include/bestla_weightonly_dispatcher.hpp → .../include/bestla_weightonly_dispatcher.hpp b/.../include/bestla_weightonly_dispatcher.hpp → .../include/bestla_weightonly_dispatcher.hpp
diff --git a/...c/dispatcher/include/dispatcher_utils.hpp → ...s/dispatcher/include/dispatcher_utils.hpp b/...c/dispatcher/include/dispatcher_utils.hpp → ...s/dispatcher/include/dispatcher_utils.hpp
diff --git a/...erator/csrc/dispatcher/neural_speed.cmake → ...rmers/qbits/dispatcher/neural_speed.cmake b/...erator/csrc/dispatcher/neural_speed.cmake → ...rmers/qbits/dispatcher/neural_speed.cmake
diff --git a/...dispatcher/src/bestla_gemm_dispatcher.cpp → ...dispatcher/src/bestla_gemm_dispatcher.cpp b/...dispatcher/src/bestla_gemm_dispatcher.cpp → ...dispatcher/src/bestla_gemm_dispatcher.cpp
diff --git a/...csrc/dispatcher/src/bestla_packq_impl.cpp → ...bits/dispatcher/src/bestla_packq_impl.cpp b/...csrc/dispatcher/src/bestla_packq_impl.cpp → ...bits/dispatcher/src/bestla_packq_impl.cpp
diff --git a/...cher/src/bestla_weightonly_dispatcher.cpp → ...cher/src/bestla_weightonly_dispatcher.cpp b/...cher/src/bestla_weightonly_dispatcher.cpp → ...cher/src/bestla_weightonly_dispatcher.cpp
@@ -106,7 +106,7 @@ void quantize_to_packed_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
   }
 }
 
-void* get_workspace(int need_size) {
+void* get_workspace(size_t need_size) {
   void* tmpbuf = NULL;
   void* workspace = woq_workspace == nullptr ? NULL : woq_workspace;
   if (workspace != NULL) {
@@ -126,7 +126,7 @@ void do_compute(woq_config_param* p, woq_runtime_ctx* ctx, ParamA param_a) {
   EpiParam param_epi = {ctx->output->data_ptr(), ctx->bias->data_ptr(), ctx->ldo, 0, ctx->alpha, ctx->beta};
   using GemmCore = typename Launcher::GemmCore;
   using StorageWeight = typename Launcher::PrologueB::StorageWeight;
-  int asym_size = 0, shuf_size = 0;
+  size_t asym_size = 0, shuf_size = 0;
   int8_t* tmpbuf = nullptr;
   if constexpr (GemmCore::ISA == BTLA_ISA::AMX_INT8 || GemmCore::ISA == BTLA_ISA::AVX512_VNNI ||
                 GemmCore::ISA == BTLA_ISA::AVX_VNNI) {

diff --git a/.../csrc/dispatcher/src/dispatcher_utils.cpp → ...qbits/dispatcher/src/dispatcher_utils.cpp b/.../csrc/dispatcher/src/dispatcher_utils.cpp → ...qbits/dispatcher/src/dispatcher_utils.cpp
diff --git a/...ers/llm/operator/csrc/include/dropout.hpp → ...or_transformers/qbits/include/dropout.hpp b/...ers/llm/operator/csrc/include/dropout.hpp → ...or_transformers/qbits/include/dropout.hpp
diff --git a/.../transformers/llm/operator/csrc/qbits.cpp → ...xtension_for_transformers/qbits/qbits.cpp b/.../transformers/llm/operator/csrc/qbits.cpp → ...xtension_for_transformers/qbits/qbits.cpp
@@ -170,7 +170,7 @@ static bool check_isa_supported(std::string isa) {
   return false;
 }
 
-PYBIND11_MODULE(qbits, m) {
+PYBIND11_MODULE(qbits_py, m) {
   m.def("quantize_to_packed_weight", &quantize_to_packed_weight);
   m.def("woq_linear", &woq_linear);
   m.def("dequantize_packed_weight", &dequantize_packed_weight);

diff --git a/...lm/operator/csrc/qbits_ut/test_dropout.py → ...ansformers/qbits/qbits_ut/test_dropout.py b/...lm/operator/csrc/qbits_ut/test_dropout.py → ...ansformers/qbits/qbits_ut/test_dropout.py
diff --git a/...llm/operator/csrc/qbits_ut/test_matmul.py → ...ransformers/qbits/qbits_ut/test_matmul.py b/...llm/operator/csrc/qbits_ut/test_matmul.py → ...ransformers/qbits/qbits_ut/test_matmul.py
diff --git a/.../llm/operator/csrc/qbits_ut/test_packq.py → ...transformers/qbits/qbits_ut/test_packq.py b/.../llm/operator/csrc/qbits_ut/test_packq.py → ...transformers/qbits/qbits_ut/test_packq.py
diff --git a/...operator/csrc/qbits_ut/test_weightonly.py → ...formers/qbits/qbits_ut/test_weightonly.py b/...operator/csrc/qbits_ut/test_weightonly.py → ...formers/qbits/qbits_ut/test_weightonly.py
diff --git a/...rs/llm/operator/csrc/qbits_ut/ut_utils.py → ...r_transformers/qbits/qbits_ut/ut_utils.py b/...rs/llm/operator/csrc/qbits_ut/ut_utils.py → ...r_transformers/qbits/qbits_ut/ut_utils.py
diff --git a/...ansformers/llm/operator/csrc/run_build.sh → ...nsion_for_transformers/qbits/run_build.sh b/...ansformers/llm/operator/csrc/run_build.sh → ...nsion_for_transformers/qbits/run_build.sh
diff --git a/...formers/llm/operator/csrc/src/dropout.cpp → ...on_for_transformers/qbits/src/dropout.cpp b/...formers/llm/operator/csrc/src/dropout.cpp → ...on_for_transformers/qbits/src/dropout.cpp
diff --git a/intel_extension_for_transformers/transformers/llm/operator/.clang-format b/intel_extension_for_transformers/transformers/llm/operator/.clang-format
diff --git a/intel_extension_for_transformers/transformers/llm/operator/CMakeLists.txt b/intel_extension_for_transformers/transformers/llm/operator/CMakeLists.txt
diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
@@ -434,6 +434,7 @@ def post_init_runtime(self):
         runtime_supported_weight_dtype = [
             "int4",
             "int4_clip",  # int4_clip will merge to int4 in next release.
+            "int4_fullrange", # int4_fullrange will merge to int4 in next release.
             "int8",
             "fp8",
             "fp8_e5m2",
@@ -467,6 +468,8 @@ def post_init_runtime(self):
             self.weight_dtype = "int4"
         elif self.weight_dtype == "int4_clip":
             self.weight_dtype == "int4"
+        elif self.weight_dtype == "int4_fullrange":
+            self.weight_dtype == "int4"
         elif self.weight_dtype == "fp8":
             self.weight_dtype == "fp8_e4m3"
         elif self.weight_dtype == "fp8":

diff --git a/setup.py b/setup.py
@@ -262,7 +262,7 @@ def check_submodules():
         ext_modules = []
     else:
         ext_modules = [CMakeExtension(
-            "intel_extension_for_transformers.qbits", 'intel_extension_for_transformers/transformers/llm/operator/csrc/')]
+            "intel_extension_for_transformers.qbits_py", 'intel_extension_for_transformers/qbits/')]
         if SKIP_RUNTIME:
             subprocess.check_call(
                 ["git", "submodule", "update", "--init", "intel_extension_for_transformers/transformers/runtime/third_party/pybind11"], cwd=cwd)