From d2e6452042b1bad5ae9b18f90863cbf6f13aab29 Mon Sep 17 00:00:00 2001 From: kpuatamazon <56725192+kpuatamazon@users.noreply.github.com> Date: Wed, 16 Sep 2020 17:41:35 +0100 Subject: [PATCH] [1.x] Backport of intgemm #17559 (#19099) * cherry-pick intgemm from master, fix build * Fix test to conform to 1.x * Makefile supporting intgemm compilation * Stricter dependencies on git checkout of intgemm * Operators depend on mkldnn * Don't compile intgemm with gcc older than 5 * Fix intgemm test for windows on 1.x by not using pytest * Update intgemm to use template arguments for integer immediates * Try to fix clang3.6 * Ban gcc < 5 in cmake * Update intgemm with gcc 5.5 debug workaround --- CMakeLists.txt | 30 ++ LICENSE | 2 + Makefile | 66 ++++ include/mxnet/base.h | 2 +- .../intgemm/intgemm_fully_connected_op.cc | 328 ++++++++++++++++++ .../contrib/intgemm/max_absolute_op.cc | 119 +++++++ .../contrib/intgemm/prepare_data_op.cc | 134 +++++++ .../contrib/intgemm/prepare_weight_op.cc | 180 ++++++++++ .../contrib/intgemm/take_weight_op.cc | 146 ++++++++ src/storage/cpu_device_storage.h | 2 +- tests/python/unittest/test_contrib_intgemm.py | 221 ++++++++++++ 11 files changed, 1228 insertions(+), 2 deletions(-) create mode 100644 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc create mode 100644 src/operator/contrib/intgemm/max_absolute_op.cc create mode 100644 src/operator/contrib/intgemm/prepare_data_op.cc create mode 100644 src/operator/contrib/intgemm/prepare_weight_op.cc create mode 100644 src/operator/contrib/intgemm/take_weight_op.cc create mode 100644 tests/python/unittest/test_contrib_intgemm.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c7cfe1a2a78..ee4369a758c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,11 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PR else() option(USE_MKLDNN "Build with MKL-DNN support" OFF) endif() +if ((CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64) AND ((NOT CMAKE_COMPILER_IS_GNUCC) OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 5.0))) + option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision multiplication" ON) +else() + option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision multiplication" OFF) +endif() if(NOT MSVC) option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) else() @@ -306,6 +311,22 @@ if(USE_CPP_PACKAGE) add_definitions(-DMXNET_USE_CPP_PACKAGE=1) endif() +if(USE_INTGEMM) + message(STATUS "Using intgemm") + include(FetchContent) + FetchContent_Declare( + intgemm + GIT_REPOSITORY https://github.com/kpu/intgemm.git + GIT_TAG 4172dcc209e6793dd920dec9cf9c9fc81605bd9d + ) + FetchContent_GetProperties(intgemm) + if(NOT intgemm_POPULATED) + FetchContent_Populate(intgemm) + endif() + add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL) + add_definitions(-DMXNET_USE_INTGEMM=1) +endif() + # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -497,6 +518,11 @@ endif() FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") +if(NOT USE_INTGEMM) + FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE "src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h") + list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE}) +endif() + # add nnvm to source FILE(GLOB_RECURSE NNVMSOURCE 3rdparty/tvm/nnvm/src/c_api/*.cc @@ -791,6 +817,10 @@ 
if(USE_MKLDNN) ${CMAKE_BINARY_DIR}/3rdparty/mkldnn/include/dnnl_version.h ${CMAKE_SOURCE_DIR}/include/mkldnn/) endif() +if(USE_INTGEMM) + target_link_libraries(mxnet_static PRIVATE intgemm) +endif() + function(BuildTVMOP) # scope the variables in BuildTVM.cmake to avoid conflict include(cmake/BuildTVM.cmake) diff --git a/LICENSE b/LICENSE index 9aa20d166394..4a8f8dd5e6e8 100644 --- a/LICENSE +++ b/LICENSE @@ -309,6 +309,8 @@ Licensed MIT © Zeno Rocha 11. mx-theme - For details, see docs/python_docs/themes/mx-theme/LICENSE Copyright (c) 2016 myyasuda + 12. intgemm - Refer to 3rdparty/intgemm/LICENSE + Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation ======================================================================================= diff --git a/Makefile b/Makefile index 4ee71c9478b1..aa4120743075 100644 --- a/Makefile +++ b/Makefile @@ -86,6 +86,25 @@ ifeq ($(USE_MKLDNN), 1) MKLDNNROOT = $(ROOTDIR)/3rdparty/mkldnn/build/install endif +ifndef USE_INTGEMM +ifeq ($(UNAME_P), x86_64) + COMPILER := $(shell $(CXX) --version |head -n 1 |cut -d " " -f 1) + COMPILER_VERSION := $(shell $(CXX) -dumpversion |cut -d . -f 1) + ifeq ($(COMPILER), clang) + USE_INTGEMM=1 + endif + ifeq ($(COMPILER), Apple) + USE_INTGEMM=1 + endif + # If it's not clang and not Apple clang, it's probably gcc and we need at least 5. + # gcc --version gives the name of the program it was called with, which makes it hard to detect. + COMPILER_VERSION_GE_5 := $(shell expr $(COMPILER_VERSION) \>= 5) + ifeq ($(COMPILER_VERSION_GE_5), 1) + USE_INTGEMM=1 + endif +endif +endif + include $(TPARTYDIR)/mshadow/make/mshadow.mk include $(DMLC_CORE)/make/dmlc.mk @@ -463,6 +482,46 @@ endif all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages extension_libs SRC = $(wildcard src/*/*/*/*.cc src/*/*/*.cc src/*/*.cc src/*.cc) + +ifeq ($(USE_INTGEMM), 1) + ifndef INTGEMM_PATH + INTGEMM_PATH = build/3rdparty/intgemm + endif + CFLAGS += -DMXNET_USE_INTGEMM=1 + LIB_DEP += $(INTGEMM_PATH)/libintgemm.a + +# Download intgemm if it isn't already +$(INTGEMM_PATH): + @mkdir -p $(INTGEMM_PATH) + rm -rf $(INTGEMM_PATH) + git clone https://github.com/kpu/intgemm $(INTGEMM_PATH) + cd $(INTGEMM_PATH) && git checkout -q 4172dcc209e6793dd920dec9cf9c9fc81605bd9d + +$(INTGEMM_PATH)/compile_test_avx512bw.cc: $(INTGEMM_PATH) + @ +$(INTGEMM_PATH)/compile_test_avx512vnni.cc: $(INTGEMM_PATH) + @ +$(INTGEMM_PATH)/intgemm/intgemm.cc: $(INTGEMM_PATH) + @ + +# Compiler tests for AVX512BW and AVX512VNNI. 
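+# Each test tries to compile a small intgemm source file; the matching #define is only appended to intgemm_config.h when the compiler accepts it.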
+$(INTGEMM_PATH)/intgemm/intgemm_config.h: $(INTGEMM_PATH)/compile_test_avx512bw.cc $(INTGEMM_PATH)/compile_test_avx512vnni.cc + echo '#pragma once' >$(INTGEMM_PATH)/intgemm/intgemm_config.h + $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512bw.cc 2>/dev/null && echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512BW >>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing AVX512BW support + $(CXX) $(CFLAGS) $(INTGEMM_PATH)/compile_test_avx512vnni.cc 2>/dev/null && echo \#define INTGEMM_COMPILER_SUPPORTS_AVX512VNNI >>$(INTGEMM_PATH)/intgemm/intgemm_config.h || echo Your compiler is missing AVX512VNNI support + +$(INTGEMM_PATH)/intgemm/intgemm.o: $(INTGEMM_PATH)/intgemm/intgemm_config.h $(INTGEMM_PATH)/intgemm/intgemm.cc $(wildcard $(INTGEMM_PATH)/intgemm/*.h $(INTGEMM_PATH)/intgemm/*/*.h) + $(CXX) $(CFLAGS) -I$(INTGEMM_PATH) -std=c++11 -c $(INTGEMM_PATH)/intgemm/intgemm.cc -o $@ + +$(INTGEMM_PATH)/libintgemm.a: $(INTGEMM_PATH)/intgemm/intgemm.o + @mkdir -p $(@D) + ar crv $@ $(filter %.o, $?) +else + #If we're not using intgemm, remove the operators from src. + INTGEMM_OPS := $(wildcard src/operator/contrib/intgemm/*.cc) + SRC := $(filter-out $(INTGEMM_OPS),$(SRC)) +endif + OBJ = $(patsubst %.cc, build/%.o, $(SRC)) CUSRC = $(wildcard src/*/*/*/*.cu src/*/*/*.cu src/*/*.cu src/*.cu) CUOBJ = $(patsubst %.cu, build/%_gpu.o, $(CUSRC)) @@ -560,6 +619,13 @@ endif # For quick compile test, used smaller subset ALLX_DEP= $(ALL_DEP) +ifeq ($(USE_INTGEMM), 1) +# Enforce a dependency on $(INTGEMM_PATH)/intgemm/intgemm_config.h which is a generated header based on compiler support. +build/src/operator/contrib/intgemm/%.o: src/operator/contrib/intgemm/%.cc $(INTGEMM_PATH)/intgemm/intgemm_config.h | mkldnn + @mkdir -p $(@D) + $(CXX) -std=c++11 -c $(CFLAGS) -MMD -I$(INTGEMM_PATH) -Isrc/operator -c $< -o $@ +endif + build/src/%.o: src/%.cc | mkldnn @mkdir -p $(@D) $(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@ diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 1980ca5d24d3..c2f463839856 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -551,7 +551,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__) -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 constexpr size_t kMKLDNNAlign = 64; #endif diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc new file mode 100644 index 000000000000..216f5ce47ecc --- /dev/null +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file intgemm_fully_connected_op.cc + * \brief Operator wrapping intgemm's Multiply routine + */ + +#include +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +struct IntgemmFullyConnectedParam : public dmlc::Parameter { + int out_type; + int num_hidden; + bool no_bias; + bool flatten; + DMLC_DECLARE_PARAMETER(IntgemmFullyConnectedParam) { + // This part os a copy of the FullyConnected parameters. + DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1) + .describe("Number of hidden nodes of the output."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(flatten).set_default(true) + .describe("Whether to collapse all but the first axis of the input data tensor."); + + DMLC_DECLARE_FIELD(out_type) + .add_enum("float32", mshadow::kFloat32) + .add_enum("int32", mshadow::kInt32) + .set_default(mshadow::kFloat32) + .describe("Output data type."); + } +}; +DMLC_REGISTER_PARAMETER(IntgemmFullyConnectedParam); + +namespace { +// Parse the above fields into indices for parameters. +// The order is: data weight [scaling] [bias]. +struct ParameterIndices { + explicit ParameterIndices(const IntgemmFullyConnectedParam& param) : + data(0), + weight(1), + scaling(param.out_type == mshadow::kFloat32 ? 2 : kInvalid), + bias(param.no_bias ? kInvalid : (HaveScaling() ? 3 : 2)), + count(2U + HaveScaling() + HaveBias()) {} + bool HaveScaling() const { return scaling != kInvalid; } + bool HaveBias() const { return bias != kInvalid; } + const unsigned int data; + const unsigned int weight; + const unsigned int scaling; + const unsigned int bias; + const unsigned int count; + static const unsigned int kInvalid = std::numeric_limits::max(); +}; +template ParameterIndices Sanity(const nnvm::NodeAttrs& attrs, + T* in, + T* out) { + // 3-4 parameters: A, B, scaling, and optional bias + ParameterIndices ret(nnvm::get(attrs.parsed)); + CHECK_EQ(in->size(), ret.count); + CHECK_EQ(out->size(), 1U); + return ret; +} +} // namespace + +inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const ParameterIndices indices(Sanity(attrs, in_shape, out_shape)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + // This follows FullyConnectedShape except for scaling. 
+ using namespace mshadow; + mxnet::TShape dshape = (*in_shape)[indices.data]; + mxnet::TShape oshape = (*out_shape)[0]; + // require data to be known + if (!mxnet::ndim_is_known(dshape)) return false; + + index_t num_input; + if (!param.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } + SHAPE_ASSIGN_CHECK(*in_shape, indices.weight, Shape2(param.num_hidden, num_input)); + if (indices.HaveScaling()) { + SHAPE_ASSIGN_CHECK(*in_shape, indices.scaling, mxnet::TShape(1, 1)); + } + if (indices.HaveBias()) { + if (!shape_assign(&(*in_shape)[indices.bias], Shape1(param.num_hidden)) && + !shape_assign(&(*in_shape)[indices.bias], Shape2(param.num_hidden, 1))) { + LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[indices.bias]; + } + } + + if (!param.flatten) { + mxnet::TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); + } + if (oshape.ndim() > 0) { + dshape[0] = oshape[0]; + SHAPE_ASSIGN_CHECK(*in_shape, indices.data, dshape); + } + return true; +} + +bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const ParameterIndices indices(Sanity(attrs, in_attrs, out_attrs)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + + // Match the configuration for output. + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type); + if (indices.HaveBias()) { + // Bias has same type as output. + TYPE_ASSIGN_CHECK(*in_attrs, indices.bias, (*out_attrs)[0]); + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[indices.bias]); + } + // Scaling is float32. + if (indices.HaveScaling()) { + TYPE_ASSIGN_CHECK(*in_attrs, indices.scaling, mshadow::kFloat32); + } + // Users have to prepare B. It wasn't intended to be efficient. + TYPE_ASSIGN_CHECK(*in_attrs, indices.weight, mshadow::kInt8); + // A can be a float (in which case it is automatically quantized) or int8. 
+ if (type_is_none((*in_attrs)[indices.data])) { + return false; + } + return ((*in_attrs)[indices.data] == mshadow::kInt8 || + (*in_attrs)[indices.data] == mshadow::kFloat32); +} + +void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ParameterIndices indices(Sanity(attrs, &inputs, &outputs)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm."; + + const TBlob &A = inputs[indices.data], &B = inputs[indices.weight], &C = outputs[0]; + + CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32); + CHECK_EQ(B.type_flag_, mshadow::kInt8); + CHECK(C.type_flag_ == mshadow::kInt32 || C.type_flag_ == mshadow::kFloat32); + CHECK(A.CheckContiguous()); + CHECK(B.CheckContiguous()); + CHECK(C.CheckContiguous()); + CHECK_GE(A.shape_.ndim(), 1); + CHECK_GE(B.shape_.ndim(), 2); + size_t A_rows = A.shape_.ProdShape(0, A.shape_.ndim() - 1); + size_t inner = A.shape_[A.shape_.ndim() - 1]; + CHECK_EQ(B.shape_[B.shape_.ndim() - 1], inner); + size_t B_cols = B.shape_.ProdShape(0, B.shape_.ndim() - 1); + + CHECK_EQ(C.shape_.Size(), A_rows * B_cols); + + bool bias = !param.no_bias; + if (bias) { + CHECK_EQ(inputs[indices.bias].type_flag_, C.type_flag_); + CHECK_EQ(inputs[indices.bias].shape_.Size(), param.num_hidden); + } + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires B have a multiple of " << ::intgemm::Int8::tile_info.b_cols << + " columns in the equation C = AB."; + + float out_float_multiplier; + if (indices.HaveScaling()) { + out_float_multiplier = *inputs[indices.scaling].dptr(); + } else { + out_float_multiplier = 0.0; // Unused; stop compiler from complaining. + } + + int8_t *A_quant; + mshadow::Tensor A_quant_store; + if (A.type_flag_ == mshadow::kFloat32) { + const float *A_raw = A.dptr(); + // Quantize A for the user. + // Future: allow scale to be passed in? Should the induced scale be an output? 
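+      // The largest magnitude in A maps to 127; the output multiplier is divided by this scale so float32 results stay on the scale of the unquantized product.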
+ float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size()); + out_float_multiplier /= scale; + A_quant_store = ctx.requested[0].get_space_typed( + mshadow::Shape1(A.shape_.Size()), + ctx.get_stream()); + A_quant = A_quant_store.dptr_; + ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner); + } else { + CHECK_EQ(A.type_flag_, mshadow::kInt8); + A_quant = A.dptr(); + } + const int8_t *B_quant = B.dptr(); + CHECK_EQ(reinterpret_cast(A_quant) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + CHECK_EQ(reinterpret_cast(B_quant) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + if (C.type_flag_ == mshadow::kFloat32) { + CHECK_EQ(reinterpret_cast(C.dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + } else { + CHECK_EQ(reinterpret_cast(C.dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + } + + if (bias) { + if (C.type_flag_ == mshadow::kFloat32) { + CHECK_EQ(reinterpret_cast(inputs[indices.bias].dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb( + out_float_multiplier, + inputs[indices.bias].dptr(), + C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } else { + // int32 + CHECK_EQ(reinterpret_cast(inputs[indices.bias].dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::callbacks::AddBiasAndWrite cb( + inputs[indices.bias].dptr(), + C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } + } else { + if (C.type_flag_ == mshadow::kFloat32) { + ::intgemm::callbacks::UnquantizeAndWrite cb(out_float_multiplier, C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } else { + // int32 + ::intgemm::callbacks::Write cb(C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } + } +} + +NNVM_REGISTER_OP(_contrib_intgemm_fully_connected) +.add_alias("_npx_intgemm_fully_connected") +.describe(R"code(Multiply matrices using 8-bit integers. data * weight. + +Input tensor arguments are: data weight [scaling] [bias] + +data: either float32 or prepared using intgemm_prepare_data (in which case it is int8). + +weight: must be prepared using intgemm_prepare_weight. + +scaling: present if and only if out_type is float32. If so this is multiplied by the result before adding bias. Typically: +scaling = (max passed to intgemm_prepare_weight)/127.0 if data is in float32 +scaling = (max_passed to intgemm_prepare_data)/127.0 * (max passed to intgemm_prepare_weight)/127.0 if data is in int8 + +bias: present if and only if !no_bias. This is added to the output after scaling and has the same number of columns as the output. + +out_type: type of the output. 
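+
+Example (an illustrative sketch; the shapes are chosen to satisfy the multiple-of-64 inner dimension and multiple-of-8 output dimension requirements):
+
+  weight = mx.nd.random_uniform(low=-1.0, high=1.0, shape=(8, 64))  # (num_hidden, inner)
+  data = mx.nd.random_uniform(low=-1.0, high=1.0, shape=(4, 64))
+  w_max = mx.nd.contrib.intgemm_maxabsolute(weight)
+  w_prep = mx.nd.contrib.intgemm_prepare_weight(weight, w_max)
+  # data is float32 here, so scaling = (max passed to intgemm_prepare_weight) / 127.0
+  out = mx.nd.contrib.intgemm_fully_connected(data, w_prep, w_max / 127.0,
+                                              no_bias=True, flatten=False,
+                                              out_type='float32', num_hidden=8)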
+)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + return ParameterIndices(nnvm::get(attrs.parsed)).count; +}) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + std::vector ret{"data", "weight"}; + ParameterIndices indices(nnvm::get(attrs.parsed)); + if (indices.HaveScaling()) { + ret.emplace_back("scaling"); + } + if (indices.HaveBias()) { + ret.emplace_back("bias"); + } + return ret; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FInferShape", IntgemmFullyConnectedOpShape) +.set_attr("FInferType", IntgemmFullyConnectedOpType) +.set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) +.add_argument( + "data", + "NDArray-or-Symbol", + "First argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from " + "intgemm_prepare_data. If you use a different quantizer, be sure to ban -128. The last " + "dimension must be a multiple of 64.") +.add_argument( + "weight", + "NDArray-or-Symbol", + "Second argument to multiplication. Tensor of int8 from intgemm_prepare_weight. The last " + "dimension must be a multiple of 64. The product of non-last dimensions must be a multiple " + "of 8.") +.add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.") +.add_argument("bias", "NDArray-or-Symbol", "Bias term.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(IntgemmFullyConnectedParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc new file mode 100644 index 000000000000..01e10b0f9908 --- /dev/null +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file max_absolute_op.cc + * \brief Computes maximum absolute value of a tensor using intgemm + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out. 
+ CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); + return shape_is_known(in_attrs->at(0)); +} + +inline bool MaxAbsoluteOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const TBlob &in = inputs.front(), &out = outputs.front(); + CHECK_EQ(in.type_flag_, mshadow::kFloat32); + CHECK_EQ(out.type_flag_, mshadow::kFloat32); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + + const std::size_t size = in.shape_.Size(); + + const float *data = in.dptr(); + // To maintain alignment, be a multiple of AVX512 register size. + const std::size_t kMultiple = 512 / 8; + CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) + << "Data must be aligned to " << kMultiple << " bytes."; + + float result = ::intgemm::MaxAbsolute(data, data + size); + KERNEL_ASSIGN(*out.dptr(), req[0], result); +} + +NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) +.add_alias("_npx_intgemm_maxabsolute") +.describe(R"code(Compute the maximum absolute value in a tensor of float32 fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to a multiple of 64 bytes. +mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max() +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", MaxAbsoluteOpShape) +.set_attr("FInferType", MaxAbsoluteOpType) +.set_attr("FInferStorageType", MaxAbsoluteOpStorageType) +.set_attr("FCompute", MaxAbsoluteOpForwardCPU) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute value of"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc new file mode 100644 index 000000000000..1d5719de36d2 --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_data_op.cc @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_data_op.cc + * \brief Converts data aka A matrices (typically activations) to intgemm's + * representation for A in C=AB. This just quantizes to int8 and bans -128. + * The only difference from Quantize/QuantizeV2 is that it bans -128. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +bool PrepareDataOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // data and maximum + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1)); + + return shape_is_known(out_attrs->at(0)); +} + +bool PrepareDataOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + // This routine converts from float to int8 with a scaling factor + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + return true; +} + +bool PrepareDataOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage); + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + return true; +} + +void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; + const TBlob &in = inputs[0], &out = outputs[0]; + + CHECK_EQ(in.type_flag_, mshadow::kFloat32); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + + const float *A = in.dptr(); + int8_t *quantA = out.dptr(); + CHECK_EQ(reinterpret_cast(A) % 64, 0); + CHECK_EQ(reinterpret_cast(quantA) % 64, 0); + const float multiplier = 127.0 / *inputs[1].dptr(); + ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size()); +} + +NNVM_REGISTER_OP(_contrib_intgemm_prepare_data) +.add_alias("_npx_intgemm_prepare_data") +.describe(R"code(This operator converts quantizes float32 to int8 while also banning -128. + +It it suitable for preparing an data matrix for use by intgemm's C=data * weights operation. + +The float32 values are scaled such that maxabs maps to 127. Typically maxabs = maxabsolute(A). 
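+
+Example (illustrative): quantize a tensor using its own maximum absolute value,
+
+  maxabs = mx.nd.contrib.intgemm_maxabsolute(data)
+  quantized = mx.nd.contrib.intgemm_prepare_data(data, maxabs)
+
+which is approximately round(data * 127.0 / maxabs) cast to int8, with -128 replaced by -127.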
+)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "maxabs"}; + }) +.set_attr("FInferShape", PrepareDataOpShape) +.set_attr("FInferType", PrepareDataOpType) +.set_attr("FInferStorageType", PrepareDataOpStorageType) +.set_attr("FCompute", PrepareDataOpForwardCPU) +.add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value to be used for scaling. (The values will be multiplied by 127.0 / " + "maxabs.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc new file mode 100644 index 000000000000..ad106ebca00b --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_weight_op.cc + * \brief Converts weight matrices to intgemm's representation. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +struct PrepareWeightParam : public dmlc::Parameter { + bool already_quantized; + DMLC_DECLARE_PARAMETER(PrepareWeightParam) { + DMLC_DECLARE_FIELD(already_quantized).set_default(false) + .describe("Is the weight matrix already quantized?"); + } +}; +DMLC_REGISTER_PARAMETER(PrepareWeightParam); + +bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // Optimal maximum parameter. 
+ CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + if (in_attrs->size() == 2U) { + SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1)); + } + return shape_is_known(out_attrs->at(0)); +} + +bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + if (in_attrs->size() == 1U) { + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + } else if (in_attrs->size() == 2U) { + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + } + return true; +} + +bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + CHECK_EQ(out_attrs->size(), 1U); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage); + if (in_attrs->size() == 2U) { + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage); + } + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + return true; +} + +void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), params.already_quantized ? 
1U : 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; + + const TBlob &in = inputs.front(); + const TBlob &out = outputs.front(); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); + size_t inner = in.shape_[in.shape_.ndim() - 1]; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires the output dimension (the product of all but the last dimension of the " + "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << "."; + + int8_t *quantB = out.dptr(); + CHECK_EQ(reinterpret_cast(quantB) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) << + "Expected either 32-bit values to be quantized or 8-bit values to rearrange."; + if (in.type_flag_ == mshadow::kInt8) { + const int8_t *B = in.dptr(); + CHECK_EQ(reinterpret_cast(B) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols); + } else if (in.type_flag_ == mshadow::kFloat32) { + const float *B = in.dptr(); + CHECK_EQ(reinterpret_cast(B) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::Int8::PrepareBTransposed( + B, + quantB, + 127.0 / *inputs[1].dptr(), + inner, + B_cols); + } +} + +NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight) +.add_alias("_npx_intgemm_prepare_weight") +.describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices. MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model. + +In other words, it prepares weight for the operation C = data * weight^T. + +If the provided weight matrix is float32, it will be quantized first. The quantization function is (int8_t)(127.0 / max * weight) where multiplier is provided as argument 1 (the weight matrix is argument 0). Then the matrix will be rearranged into the CPU-dependent format. + +If the provided weight matrix is already int8, the matrix will only be rearranged into the CPU-dependent format. This way one can quantize with intgemm_prepare_data (which just quantizes), store to disk in a consistent format, then at load time convert to CPU-dependent format with intgemm_prepare_weight. + +The internal representation depends on register length. So AVX512, AVX2, and SSSE3 have different formats. AVX512BW and AVX512VNNI have the same representation. +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + return params.already_quantized ? 1 : 2; +}) +.set_num_outputs(1) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + return params.already_quantized ? 
+ std::vector{"weight"} : std::vector{"weight", "maxabs"}; +}) +.set_attr("FInferShape", PrepareWeightOpShape) +.set_attr("FInferType", PrepareWeightOpType) +.set_attr("FInferStorageType", PrepareWeightOpStorageType) +.set_attr("FCompute", PrepareWeightOpForwardCPU) +.add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value for scaling. The weights will be multipled by 127.0 / maxabs.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(PrepareWeightParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc new file mode 100644 index 000000000000..09e320e47327 --- /dev/null +++ b/src/operator/contrib/intgemm/take_weight_op.cc @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file select_weight_op.cc + * \brief Takes from the all-but-last dimension of a tensor stored in + * intgemm's weight format. This is particularly useful for output matrices where + * some outputs are excluded. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +inline bool TakeWeightOpShape(const nnvm::NodeAttrs& shape, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + // 0 is weight, 1 is indices. + CHECK_EQ(in_shape->size(), 2U); + CHECK_EQ(out_shape->size(), 1U); + + mxnet::TShape &weight = (*in_shape)[0]; + mxnet::TShape &indices = (*in_shape)[1]; + mxnet::TShape &out = (*out_shape)[0]; + + // weight matrices should be 2-dimensional by now. + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape(2, -1)); + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape(2, -1)); + // indices are 1-dimensional. 
+ SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape(1, -1)); + + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({indices[0], weight[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape({-1, out[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape({out[0]})); + + return shape_is_known(weight) && shape_is_known(indices) && shape_is_known(out); +} + +inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32); + return true; +} + +inline bool TakeWeightOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write"; + const TBlob &weight = inputs.front(), &indices = inputs[1], &out = outputs.front(); + CHECK_EQ(weight.type_flag_, mshadow::kInt8); + CHECK_EQ(indices.type_flag_, mshadow::kInt32); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(weight.CheckContiguous()); + CHECK(indices.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t B_cols = indices.shape_[0]; + size_t inner = weight.shape_[weight.shape_.ndim() - 1]; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "For efficiency, intgemm requires there to be a multiple of " << + ::intgemm::Int8::tile_info.b_cols << " indices."; + // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the + // same. Ideally this should be static. + assert(sizeof(int32_t) == sizeof(::intgemm::Index)); + const ::intgemm::Index *index = + reinterpret_cast(indices.dptr()); + + ::intgemm::Int8::SelectColumnsB( + weight.dptr(), + out.dptr(), + inner, + index, + index + B_cols); +} + +NNVM_REGISTER_OP(_contrib_intgemm_take_weight) +.add_alias("_npx_intgemm_take_weight") +.describe(R"code(Index a weight matrix stored in intgemm's weight format. +The indices select the outputs of matrix multiplication, not the inner dot product dimension. 
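+
+Example (illustrative, starting from an int8 weight matrix and int32 indices):
+
+  prepared = mx.nd.contrib.intgemm_prepare_weight(weight, already_quantized=True)
+  taken = mx.nd.contrib.intgemm_take_weight(prepared, indices)
+
+gives the same result as taking first and preparing afterwards:
+
+  mx.nd.contrib.intgemm_prepare_weight(weight.take(indices, axis=0), already_quantized=True)
+
+The number of indices must be a multiple of 8 and the inner (last) dimension a multiple of 64.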
+)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"weight", "indices"}; + }) +.set_attr("FInferShape", TakeWeightOpShape) +.set_attr("FInferType", TakeWeightOpType) +.set_attr("FInferStorageType", TakeWeightOpStorageType) +.set_attr("FCompute", TakeWeightOpForwardCPU) +.add_argument( + "weight", + "NDArray-or-Symbol", + "Tensor already in intgemm weight format to select from") +.add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th dimension of weight"); + +} // namespace op +} // namespace mxnet diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index f6b296a9643f..50cc4ab97787 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -53,7 +53,7 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 // MKLDNN requires special alignment. 64 is used by the MKLDNN library in // memory allocation. static constexpr size_t alignment_ = kMKLDNNAlign; diff --git a/tests/python/unittest/test_contrib_intgemm.py b/tests/python/unittest/test_contrib_intgemm.py new file mode 100644 index 000000000000..69fa5e0eefda --- /dev/null +++ b/tests/python/unittest/test_contrib_intgemm.py @@ -0,0 +1,221 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import np, npx +from mxnet.test_utils import same, use_np, assert_almost_equal +from common import with_seed +import random +from itertools import product + + +# with_seed() from MXNet 1.x breaks @pytest.mark.parametrize so all randomized +# tests use a for loop over a Cartesian product of parameters. + +@use_np +@with_seed() +def test_contrib_intgemm_maxabsolute(): + if "intgemm_maxabsolute" not in dir(mx.nd.contrib): + return + for shape in ([(3, 2), (9,17), (2, 7, 1, 8)] + [(i,) for i in range(1,65)]): + # mx.nd API + m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape) + fast = mx.nd.contrib.intgemm_maxabsolute(m) + slow = mx.nd.max(mx.nd.abs(m)) + assert same(fast, slow) + # np API + m = np.random.uniform(low=-100.0, high=100.0, size=shape) + fast = npx.intgemm_maxabsolute(m).reshape(()) + slow = np.max(np.abs(m)) + assert same(fast, slow) + +@use_np +@with_seed() +def test_contrib_intgemm_prepare_data(): + if "intgemm_prepare_data" not in dir(mx.nd.contrib): + return + for shape, max_quant in product([(i,) for i in range(1, 67)] + [(2,3), (130, 12)], [2.0, 2.5]): + m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape) + scaled = m * 127.0 / max_quant + # Rounding 0.5 can go up or down. Move values away from 0.5. 
+ too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45 + m += max_quant / 127.0 * 0.05 * too_close + + # Reference: scale and round + ref = mx.nd.round(m * 127.0 / max_quant) + # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127. + ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0])) + ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0])) + # Reference: cast to int8 + ref = mx.nd.cast(ref, dtype='int8') + # Reference: ban -128 + ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8')) + + test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant])) + assert same(test, ref) + test = npx.intgemm_prepare_data(m.as_np_ndarray(), np.array([max_quant])) + assert same(test, ref.as_np_ndarray()) + +@use_np +@with_seed() +def test_contrib_intgemm_weight_consistent(): + # The weight format is actually CPU-dependent so we don't directly test the + # output, but indirectly test that it works. + if "intgemm_prepare_weight" not in dir(mx.nd.contrib): + return + for shape, max_quant, api in product( + [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)], + [0.2, 3.0], + [(mx.nd.contrib, mx.nd), (npx, np)]): + contrib, top = api + max_array = top.array([max_quant]) + if top == mx.nd: + m = top.random_uniform(low=-3.0, high=3.0, shape=shape) + else: + m = np.random.uniform(size=shape) + direct = contrib.intgemm_prepare_weight(m, max_array) + quant = contrib.intgemm_prepare_data(m, max_array) + indirect = contrib.intgemm_prepare_weight(quant, already_quantized=True) + # Should get the same data from direct call and already_quantized version. + assert same(direct, indirect) + +@use_np +@with_seed() +def test_contrib_intgemm_take_weight(): + if "intgemm_take_weight" not in dir(mx.nd.contrib): + return + test_indices = [ + [0,1,2,3,4,5,6,7], + [1,2,1,2,1,2,1,2], + [7,6,5,4,3,2,1,0], + [3,1,4,1,5,9,2,6], + # Since random_uniform doesn't support int8, use python + [random.randint(0,15) for i in range(8)], + [random.randint(0,15) for i in range(16)], + [random.randint(0,15) for i in range(24)] + ] + for indices, api in product(test_indices, [(mx.nd.contrib, mx.nd), (npx, np)]): + contrib, top = api + m = top.array([random.randint(-127,127) for i in range(16 * 64)], dtype='int8') + m = m.reshape((16, 64)) + indices = top.array(indices, dtype='int32') + # Prepare weight then take. + test = contrib.intgemm_prepare_weight(m, already_quantized=True) + test = contrib.intgemm_take_weight(test, indices) + # Take then prepare. + ref = m.take(indices, axis=0) + ref = contrib.intgemm_prepare_weight(ref, already_quantized=True) + assert same(test, ref) + +@use_np +def test_contrib_intgemm_multiply(): + if "intgemm_fully_connected" not in dir(mx.nd.contrib): + return + apis = [(mx.nd.contrib, mx.nd, mx.nd.FullyConnected, mx.nd.cast), (npx, np, npx.fully_connected, npx.cast)] + for data_rows, inner, weight_cols, api in product(range(1, 5), + range(64, 256, 64), + range(8, 24, 8), + apis): + contrib, top, fully_connected, cast = api + #The multiplication routine has approximations so everything is tested + #deterministically to ensure bounds are met. + random.seed(1) + + # Don't use full range (-127, 127) to avoid saturation. 
+ data = [random.randint(-64, 64) for i in range(data_rows * inner)] + data = top.array(data, dtype='int8').reshape((data_rows, inner)) + weight = [random.randint(-64, 64) for i in range(inner * weight_cols)] + weight = top.array(weight, dtype='int8').reshape((weight_cols, inner)) + weight_prepared = contrib.intgemm_prepare_weight(weight, already_quantized=True) + + # int32 output, no bias + test = contrib.intgemm_fully_connected(data, + weight_prepared, + no_bias=True, + flatten=False, + out_type='int32', + num_hidden=weight_cols) + ref = fully_connected(cast(data, dtype='float32'), + cast(weight, dtype='float32'), + no_bias=True, + flatten=False, + num_hidden=weight_cols) + assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(), ref.as_nd_ndarray(), rtol=0.01, atol=0.01) + + # float32 output, no bias + scale = 3.0 + test = contrib.intgemm_fully_connected(data, + weight_prepared, + top.array([scale]), + no_bias=True, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(test.as_nd_ndarray(), (ref * scale).as_nd_ndarray(), rtol=0.01, atol=0.01) + + # int32 output, bias + bias = top.array([random.randint(-60000, 60000) for i in range(weight_cols)], dtype = 'int32') + test = contrib.intgemm_fully_connected(data, + weight_prepared, + bias, + no_bias=False, + flatten=False, + out_type='int32', + num_hidden=weight_cols) + ref = fully_connected(cast(data, dtype='float32'), + cast(weight, dtype='float32'), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + num_hidden=weight_cols) + assert_almost_equal(cast(test, dtype='float32').as_nd_ndarray(), ref.as_nd_ndarray(), rtol=0.01, atol=0.01) + + # float32 output, bias + # Scaling is applied before bias (and bias is not scaled). So to make the + # reference comparison easy, just scale the bias beforehand. + test = contrib.intgemm_fully_connected(data, + weight_prepared, + top.array([scale]), + cast(bias, dtype='float32') * scale, + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(test.as_nd_ndarray(), (ref * scale).as_nd_ndarray(), rtol=0.01, atol=0.01) + + # float32 input should work the same as manually prepared int8 input. + data_float = top.array([random.uniform(-3.14, 3.14) for i in range(data_rows * inner)]) + data_float = data_float.reshape(data_rows, inner) + direct = contrib.intgemm_fully_connected(data_float, + weight_prepared, + top.array([scale]), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + maxabs = contrib.intgemm_maxabsolute(data_float) + data_prepared = contrib.intgemm_prepare_data(data_float, maxabs) + cooked = contrib.intgemm_fully_connected(data_prepared, + weight_prepared, + top.array(scale * maxabs / 127.0), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(direct.as_nd_ndarray(), cooked.as_nd_ndarray(), rtol=0.01, atol=0.01)