PR #63 involved considerable effort to implement block reduction with TensorCore, which is expected to improve the performance of continuous batching. However, performance is still worse than the trtllm batch dequantize kernel, and a comprehensive study is required to understand the gap.
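For background, the block-reduction hint is essentially a split-K style scheme: with the small M of continuous batching, a single output tile exposes too little parallelism along M and N, so the K dimension is also split into blocks whose partial results are reduced afterwards. Below is a minimal NumPy sketch of that reduction pattern (illustrative only; it does not reflect the actual BitBLAS TensorCore schedule, and splitk_matmul is a made-up helper name):

import numpy as np

def splitk_matmul(A, W, num_k_blocks=2):
    # Illustrative split-K / block-reduction pattern: each K block produces a
    # partial GEMM result, and the partials are reduced at the end.
    # Shapes: A is (M, K), W is (N, K), output is (M, N).
    M, K = A.shape
    N, _ = W.shape
    assert K % num_k_blocks == 0
    kb = K // num_k_blocks
    # One partial accumulator per K block; on a GPU these would come from
    # different thread blocks or block-reduction stages.
    partials = np.zeros((num_k_blocks, M, N), dtype=np.float32)
    for b in range(num_k_blocks):
        partials[b] = A[:, b * kb:(b + 1) * kb] @ W[:, b * kb:(b + 1) * kb].T
    return partials.sum(axis=0)

# Reference check on a small fp32 problem.
A = np.random.randn(16, 128).astype(np.float32)
W = np.random.randn(64, 128).astype(np.float32)
assert np.allclose(splitk_matmul(A, W), A @ W.T, atol=1e-3)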
code to reproduce:
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from bitblas.utils.target_detector import auto_detect_nvidia_target
from bitblas import Matmul, MatmulConfig
import argparse
import bitblas
import tvm
from bitblas.base.roller.policy import TensorCorePolicy, DefaultPolicy
from bitblas.base.roller.arch import CUDA
from bitblas.gpu.matmul_analysis import get_tensorized_func_and_tags
from bitblas.base.utils import apply_and_build
import time
from tvm import te, tir

bitblas.set_log_level("DEBUG")
# Initialize the parser
parser = argparse.ArgumentParser(
    description="Benchmark BitBLAS int4 on a specific target."
)

# Add arguments to the parser
parser.add_argument(
    "--target",
    type=str,
    default=auto_detect_nvidia_target(),
    help="Specify the target device for benchmarking."
)
parser.add_argument(
    "--group_size",
    type=int,
    default=None,
    help="Group size for grouped quantization."
)
parser.add_argument(
    "--A_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "float64", "int32", "int8"],  # Assuming these are the valid choices
    help="Data type of activation A."
)
parser.add_argument(
    "--W_dtype",
    type=str,
    default="uint4",
    help="Data type of weight W."
)
parser.add_argument(
    "--accum_dtype",
    type=str,
    default="float16",
    help="Data type for accumulation."
)
parser.add_argument(
    "--out_dtype",
    type=str,
    default="float16",
    choices=["float16", "float32", "int32", "int8"],  # Assuming these are the valid choices
    help="Data type for output."
)
parser.add_argument(
    "--layout",
    type=str,
    default="nt",
    choices=["nt", "nn"],  # Assuming these are the valid choices
    help="Matrix layout, 'nt' for non-transpose A and transpose W."
)
parser.add_argument(
    "--with_bias",
    action="store_true",
    help="Include bias in the benchmark."
)
parser.add_argument(
    "--with_scaling",
    action="store_true",
    help="Include scaling factor in the quantization."
)
parser.add_argument(
    "--with_zeros",
    action="store_true",
    help="Include zeros in the quantization."
)
parser.add_argument(
    "--zeros_mode",
    type=str,
    default=None,
    choices=["original", "rescale", "quantized"],  # Replace with actual modes if applicable
    help="Specify the mode for calculating zeros."
)
parser.add_argument(
    "--propagate_a",
    type=str,
    default=True,
    help="Whether to apply layout propagation to activation A."
)
parser.add_argument(
    "--propagate_b",
    type=str,
    default=True,
    help="Whether to apply layout propagation to weight W."
)
# Parse the arguments
args = parser.parse_args()

# Assign arguments to variables
target = args.target
group_size = args.group_size
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
propagate_a = args.propagate_a
propagate_b = args.propagate_b

test_shapes = [
    (MatmulConfig, Matmul, (16, 16384, 16384, A_dtype, W_dtype, out_dtype, accum_dtype, layout, with_bias, group_size, with_scaling, with_zeros, zeros_mode)),
]

benchmark_sets = []
benchmark_sets.extend(test_shapes)

# fmt: on
benchmark_results = {}
for config, operator, input_args in benchmark_sets:
    matmul_config = config(*input_args, propagate_a=True, propagate_b=True, fast_decoding=True)
    matmul = operator(matmul_config, target=target, enable_tuning=False)
    func = matmul.prim_func
    intrin_info = bitblas.base.roller.hint.IntrinInfo(
        in_dtype="float16",
        out_dtype="float16",
        trans_b=True,
        input_transform_kind=2,
        weight_transform_kind=2,
    )

    # Schedule without block reduction
    sch_normal = bitblas.gpu.MatmulTensorizationMMAWithDequantizeInfo().sch_shared_memory_prefetch_with_config(
        func,
        bitblas.base.roller.hint.Hint().from_dict({
            "warp": [16, 16],
            "block": [16, 64],
            "rstep": [128],
            "pipeline_stage": 2,
            "use_async": True,
            "intrin_info": intrin_info,
            "shared_scope": "shared",
            "vectorize": {
                "A": 8,
                "B": 8,
            },
            "rasterization_plan": bitblas.base.roller.Rasterization2DColumn(10)
        })
    )
    with tvm.transform.PassContext(config={"tir.use_async_copy": True, "tir.merge_static_smem": False, "cuda.kernels_output_dir": "./debug/bitblas_fp16xint4_fp16_pb_noscale_with_default"}):
        rt_mod = tvm.build(sch_normal.mod, target=matmul.target)
    time_evaluator = rt_mod.time_evaluator(rt_mod.entry_name, tvm.cuda(), number=10)
    profile_tensors = matmul.get_profile_tensors()
    latency = time_evaluator(*profile_tensors).mean * 1e3
    # print(rt_mod.imported_modules[0].get_source())
    print(f"Time cost is: {latency:.3f} ms")
    # Schedule with block reduction (block_reduction_depth = 2)
    sch_reduce = bitblas.gpu.MatmulTensorizationMMAWithDequantizeInfo().sch_shared_memory_prefetch_with_config(
        func,
        bitblas.base.roller.hint.Hint().from_dict({
            "warp": [16, 16],
            "block": [16, 64],
            "rstep": [128],
            "pipeline_stage": 2,
            "use_async": True,
            "intrin_info": intrin_info,
            "shared_scope": "shared",
            "vectorize": {
                "A": 8,
                "B": 8,
            },
            "block_reduction_depth": 2,
            "rasterization_plan": bitblas.base.roller.Rasterization2DColumn(10)
        })
    )
    with tvm.transform.PassContext(config={"tir.use_async_copy": True, "tir.merge_static_smem": False, "cuda.kernels_output_dir": "./debug/bitblas_fp16xint4_fp16_pb_noscale_with_default"}):
        rt_mod = tvm.build(sch_reduce.mod, target=matmul.target)
    time_evaluator = rt_mod.time_evaluator(rt_mod.entry_name, tvm.cuda(), number=10)
    latency = time_evaluator(*profile_tensors).mean * 1e3
    # print(rt_mod.imported_modules[0].get_source())
    print(f"Time cost is: {latency:.3f} ms")
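If the snippet is saved as, say, benchmark_block_reduce.py (a hypothetical filename), it can be run with the defaults or with explicit dtypes, e.g. python benchmark_block_reduce.py --A_dtype float16 --W_dtype uint4. The two printed latencies give a direct A/B comparison between the plain schedule and the block-reduction schedule for the 16x16384x16384 fp16 x uint4 GEMM configured above.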
Marlin appears to outperform both CUTLASS and BitBLAS on small int4 shapes. Marlin's CUDA kernel design fixes the grid size to the number of streaming multiprocessors (e.g., 108), whereas CUTLASS and BitBLAS set it according to the tile count (e.g., 128 blocks), which results in around 50% wave waste.
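A rough wave-quantization estimate illustrates the effect (the 108-SM and 128-block numbers come from the comment above; one resident block per SM per wave is an assumption, not a measured occupancy):

# Back-of-the-envelope wave quantization for a tile-sized grid.
num_sms = 108
grid_size = 128                              # tile-based grid (cutlass/bitblas style)
waves = -(-grid_size // num_sms)             # ceil(128 / 108) = 2 waves
efficiency = grid_size / (waves * num_sms)   # 128 / 216 ~ 0.59
print(f"waves={waves}, efficiency={efficiency:.0%}, waste={1 - efficiency:.0%}")
# -> ~41% of the SM slots across the two waves sit idle, in the ballpark of the
#    "around 50% wave waste" noted above; Marlin instead launches exactly
#    num_sms persistent blocks, so every wave is full.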