From 6f4645f1240f57bbabfa45423571d0448dd6a117 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Mon, 23 Sep 2024 00:48:01 -0700 Subject: [PATCH 01/14] add tuner --- test/benchgc/CMakeLists.txt | 1 + test/benchgc/src/benchgc/__main__.py | 105 ++- test/benchgc/src/benchgc/bench.py | 11 +- test/benchgc/src/benchgc/tuner/CMakeLists.txt | 22 + test/benchgc/src/benchgc/tuner/README.md | 1 + test/benchgc/src/benchgc/tuner/__init__.py | 15 + .../src/benchgc/tuner/config_filter.py | 98 +++ test/benchgc/src/benchgc/tuner/op_config.py | 174 +++++ test/benchgc/src/benchgc/tuner/tuner.py | 608 ++++++++++++++++++ test/benchgc/src/benchgc/tuner/utils.py | 61 ++ 10 files changed, 1088 insertions(+), 8 deletions(-) create mode 100644 test/benchgc/src/benchgc/tuner/CMakeLists.txt create mode 100644 test/benchgc/src/benchgc/tuner/README.md create mode 100644 test/benchgc/src/benchgc/tuner/__init__.py create mode 100644 test/benchgc/src/benchgc/tuner/config_filter.py create mode 100644 test/benchgc/src/benchgc/tuner/op_config.py create mode 100644 test/benchgc/src/benchgc/tuner/tuner.py create mode 100644 test/benchgc/src/benchgc/tuner/utils.py diff --git a/test/benchgc/CMakeLists.txt b/test/benchgc/CMakeLists.txt index ff00d27b7..356cd449f 100644 --- a/test/benchgc/CMakeLists.txt +++ b/test/benchgc/CMakeLists.txt @@ -41,3 +41,4 @@ add_subdirectory("src/benchgc/tensor") add_subdirectory("src/benchgc/arith") add_subdirectory("src/benchgc/pattern") add_subdirectory("src/benchgc/math") +add_subdirectory("src/benchgc/tuner") diff --git a/test/benchgc/src/benchgc/__main__.py b/test/benchgc/src/benchgc/__main__.py index 379078a9d..67945d4ef 100644 --- a/test/benchgc/src/benchgc/__main__.py +++ b/test/benchgc/src/benchgc/__main__.py @@ -31,9 +31,15 @@ set_default_fill, ) from benchgc.arg.arg import Arg -from benchgc.bench import mlir_wrapper_bench, py_timeit_bench +from benchgc.bench import ( + batch_mlir_wrapper_bench, + batch_py_timeit_bench, + mlir_wrapper_bench, + py_timeit_bench, +) from 
benchgc.mlir.arg import get_mlir_args from benchgc.pattern import get_pattern_clz +from benchgc.tuner.tuner import GATuner, GridTuner, Tuner, TuningSpace from gc_mlir import ir from gc_mlir.graph_compiler import GraphCompiler @@ -44,7 +50,7 @@ def add_common_options(parser: argparse.ArgumentParser): "--mode", required=False, help="specify the test mode, C for correctness testing, P for performance testing", - choices=["C", "P"], + choices=["C", "P", "T"], default="C", type=str, ) @@ -198,7 +204,7 @@ def add_common_options(parser: argparse.ArgumentParser): def add_bench_options(parser: argparse.ArgumentParser): """add options for bench mode""" - if parser.parse_known_args()[0].mode == "P": + if parser.parse_known_args()[0].mode in ("P", "T"): parser.add_argument( "--bench_kind", type=str, choices=["py", "wrapper"], default="py" ) @@ -213,6 +219,40 @@ def add_pattern_options(parser: argparse.ArgumentParser): pattern_name = parser.parse_known_args()[0].case get_pattern_clz(pattern_name).add_args(parser) +def add_tuner_options(parser: argparse.ArgumentParser): + """add options for the mode T""" + if parser.parse_known_args()[0].mode == "T": + parser.add_argument( + "--search_alg", type=str, choices=["grid", "ga"], default="grid" + ) + parser.add_argument( + "--tuning_batch", type=int, default=Tuner.DEFAULT_BATCH_SIZE + ) + parser.add_argument("--early_stop", type=int, default=Tuner.DEFAULT_EARLY_STOP) + parser.add_argument( + "--max_tuning_iters", type=int, default=Tuner.DEFAULT_MAX_ITERS + ) + parser.add_argument("--timeout", type=int, default=Tuner.DEFAULT_TIMEOUT) + parser.add_argument( + "--space_percent", type=float, default=TuningSpace.DEFAULT_SPACE_PERCENT + ) + parser.add_argument("--checkpoint_path", type=str, default="") + + if parser.parse_known_args()[0].search_alg == "ga": + parser.add_argument( + "--random_seed", type=int, default=GATuner.DEFAULT_RANDOM_SEED + ) + parser.add_argument( + "--elite_num", type=int, default=GATuner.DEFAULT_ELITE_NUM + ) + 
parser.add_argument( + "--mutation_prob", type=float, default=GATuner.DEFAULT_MUTATION_PROB + ) + parser.add_argument( + "--expected_tune_num", + type=int, + default=GATuner.DEFAULT_EXPECTED_TUNE_NUM, + ) def get_module_and_args(flags: argparse.Namespace): args: List[Arg] = [] @@ -391,11 +431,68 @@ def performance_testing(flags: argparse.Namespace, module: ir.Module, args: List print(json_res) +def performance_tuning(flags: argparse.Namespace, module: ir.Module, args: List[Arg]): + gc_args: List[torch.Tensor | int] = [] + gc_tensors: Dict[str, torch.Tensor] = {} + for i in range(len(args)): + tensor = fill_tensor(flags, args[i], i) + gc_tensors["%arg" + str(i)] = tensor + if args[i].scalar: + gc_args.append(tensor.data_ptr()) + else: + gc_args.append(tensor) + + mlir_args = get_mlir_args(gc_args) + with module.context as ctx, ir.Location.unknown(): + if flags.ir_printing: + ctx.enable_multithreading(False) + batch_bench = ( + batch_py_timeit_bench + if flags.bench_kind == "py" + else batch_mlir_wrapper_bench + ) + + def tuner_batch_bench(ir_moudles): + return batch_bench( + ir_moudles, + flags.entry, + "any(gc-cpu-pipeline)", + mlir_args, + flags.ir_printing, + flags.repeat, + flags.warm_up, + ) + + assert flags.space_percent > 0 and flags.space_percent <= 1.0 + space = TuningSpace(module, flags.space_percent) + print("flags.search_alg", flags.search_alg) + if flags.search_alg == "grid": + tuner = GridTuner( + tuner_batch_bench, + space, + flags.tuning_batch, + flags.early_stop, + flags.checkpoint_path, + ) + else: + tuner = GATuner( + tuner_batch_bench, + space, + flags.tuning_batch, + flags.early_stop, + flags.checkpoint_path, + random_seed=flags.random_seed, + expected_tune_num=flags.expected_tune_num, + ) + tuner.run(flags.max_tuning_iters, flags.timeout) + + if __name__ == "__main__": arg_parser = argparse.ArgumentParser(prog="benchmark tool for graph compiler") add_common_options(arg_parser) add_bench_options(arg_parser) add_pattern_options(arg_parser) + 
add_tuner_options(arg_parser) flags = arg_parser.parse_args() benchgc.util.set_seed(flags.seed) ir_module, module_args = get_module_and_args(flags) @@ -403,5 +500,7 @@ def performance_testing(flags: argparse.Namespace, module: ir.Module, args: List correctness_testing(flags, ir_module, module_args) elif flags.mode == "P": performance_testing(flags, ir_module, module_args) + elif flags.mode == "T": + performance_tuning(flags, ir_module, module_args) else: pass diff --git a/test/benchgc/src/benchgc/bench.py b/test/benchgc/src/benchgc/bench.py index 0c8763191..1dd41a142 100644 --- a/test/benchgc/src/benchgc/bench.py +++ b/test/benchgc/src/benchgc/bench.py @@ -117,21 +117,22 @@ def batch_py_timeit_bench( ir_modules: List[ir.Module], entry_name: str, pipeline: str, - mlir_args: list, + mlir_args: List[Any], ir_printing=False, repeat_time=5, warm_up=2, ) -> List[Tuple[float, float]]: """benchmark a batch of mlir with python timeit.""" compiler = GraphCompiler(pipeline) + engines = [] funcs = [] compile_costs = [] for m in ir_modules: compile_begin = timeit.default_timer() engine = compiler.compile_and_jit(m, ir_printing=ir_printing) + engines.append(engine) compile_cost = (timeit.default_timer() - compile_begin) * 1000 compile_costs.append(compile_cost) - funcs.append(engine.lookup(entry_name)) # Copied from execution_engine.py so that the cost of cast does not affect perf result. 
packed_args = (ctypes.c_void_p * len(mlir_args))() @@ -141,11 +142,11 @@ def batch_py_timeit_bench( def run_bench(func, arg): func(arg) - for func in funcs: + for func in [engine.lookup(entry_name) for engine in engines]: timeit.timeit(lambda: run_bench(func, packed_args), number=warm_up) execute_costs = [] - for func in funcs: + for func in [engine.lookup(entry_name) for engine in engines]: total_time = timeit.timeit( lambda: run_bench(func, packed_args), number=repeat_time ) @@ -158,7 +159,7 @@ def batch_mlir_wrapper_bench( ir_modules: ir.Module, entry_name: str, pipeline: str, - mlir_args: list, + mlir_args: List[Any], ir_printing=False, repeat_time=5, warm_up=2, diff --git a/test/benchgc/src/benchgc/tuner/CMakeLists.txt b/test/benchgc/src/benchgc/tuner/CMakeLists.txt new file mode 100644 index 000000000..506e36153 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/CMakeLists.txt @@ -0,0 +1,22 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# SPDX-License-Identifier: Apache-2.0 +################################################################################ + + +file(GLOB PYTHON_SCRIPTS "*.py") +foreach(PY_SCRIPT ${PYTHON_SCRIPTS}) + configure_file(${PY_SCRIPT} ${CMAKE_BINARY_DIR}/test/benchgc/src/benchgc/tuner/ COPYONLY) +endforeach() diff --git a/test/benchgc/src/benchgc/tuner/README.md b/test/benchgc/src/benchgc/tuner/README.md new file mode 100644 index 000000000..503fa1da0 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/README.md @@ -0,0 +1 @@ +#TODO \ No newline at end of file diff --git a/test/benchgc/src/benchgc/tuner/__init__.py b/test/benchgc/src/benchgc/tuner/__init__.py new file mode 100644 index 000000000..4d3e897ce --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/__init__.py @@ -0,0 +1,15 @@ +################################################################################ +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ diff --git a/test/benchgc/src/benchgc/tuner/config_filter.py b/test/benchgc/src/benchgc/tuner/config_filter.py new file mode 100644 index 000000000..1dff74e27 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/config_filter.py @@ -0,0 +1,98 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import math +from abc import ABC, abstractmethod +from typing import List + + +class ConfigFilter(ABC): + """ + A class used to help filter out unseen configs. 
+ """ + + @abstractmethod + def already_met(self, v: List[int]) -> bool: + """Check if the config has been met before.""" + + @abstractmethod + def add(self, v: List[int]): + """Add the config to the filter.""" + + @abstractmethod + def save(self): + """Save the satus of the filter.""" + + @abstractmethod + def load(self, data): + """Load the status of the filter.""" + + +class BloomFilter(ConfigFilter): + """Bloom Filter""" + + def __init__(self, num_samples: int, err_rate: float): + self.num_bits = int(-(num_samples * math.log(err_rate)) / (math.log(2) ** 2)) + self.num_hashes = int((self.num_bits / num_samples) * math.log(2)) + self.bit_array = [0] * self.num_bits + + def already_met(self, v): + for i in range(int(self.num_hashes)): + try: + import mmh3 + except ImportError: + raise ImportError("Please install mmh3 package") + hash_v = mmh3.hash(v, i) % self.num_bits + if self.bit_array[hash_v] == 0: + return False + return True + + def add(self, v): + for i in range(int(self.num_hashes)): + try: + import mmh3 + except ImportError: + raise ImportError("Please install mmh3 package") + hash_v = mmh3.hash(v, i) % self.num_bits + self.bit_array[hash_v] = 1 + + def save(self): + return self.bit_array + + def load(self, data): + self.bit_array = data + + +class HashSetFilter(ConfigFilter): + """Fliter based on HashSet""" + + def __init__(self): + self.data = set() + + def add(self, v): + self.data.add(tuple(v)) + + def already_met(self, v: List[int]) -> bool: + return tuple(v) in self.data + + def save(self): + return self.data + + def load(self, data): + self.data.clear() + for item in data: + self.add(item) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py new file mode 100644 index 000000000..67b80a701 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -0,0 +1,174 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# 
Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import json +import math +import os + +from gc_mlir.extras import types as T +from gc_mlir.ir import IntegerAttr, OpView + + +class Config: + def __init__(self): + self.field_candidates = {} + self.field_constraints = {} + self.init_candidates() + self.init_constraints() + + def init_candidates(self): + pass + + def init_constraints(self): + pass + + def attach_to_ir(self, op: OpView): + pass + + +def find_factors(num): + factors = set() + for i in range(1, int(math.sqrt(num)) + 1): + if num % i == 0: + factors.add(i) + factors.add(num // i) + return sorted(factors) + + +class MatMulConfig(Config): + def __init__( + self, + op: OpView = None, + M_threads: int = 1, + K_threads: int = 1, + N_threads: int = 1, + M_block: int = 1, + K_block: int = 1, + N_block: int = 1, + innermostM_block: int = 1, + innermostK_block: int = 1, + innermostN_block: int = 1, + ): + # you can set the default value and candidates by info from matmul_op + self.M = op.inputs[0].type.shape[0] + self.K = op.inputs[0].type.shape[1] + self.N = op.inputs[1].type.shape[1] + # self.input_a_dtype = str(op.inputs[0].type.element_type) + self.num_threads = int(os.environ.get("OMP_NUM_THREADS", 1)) + self.M_threads = M_threads + self.K_threads = K_threads + self.N_threads = N_threads + self.M_block = M_block + self.K_block = K_block + 
self.N_block = N_block + self.innermostM_block = innermostM_block + self.innermostK_block = innermostK_block + self.innermostN_block = innermostN_block + super().__init__() + + def init_candidates(self): + default_blocks = [16, 32, 64] + self.field_candidates["M_threads"] = find_factors(self.num_threads) + self.field_candidates["K_threads"] = find_factors(self.num_threads) + self.field_candidates["N_threads"] = find_factors(self.num_threads) + self.field_candidates["M_block"] = [ + block for block in default_blocks if self.M >= block + ] + self.field_candidates["K_block"] = [ + block for block in default_blocks if self.K >= block + ] + self.field_candidates["N_block"] = [ + block for block in default_blocks if self.N >= block + ] + self.field_candidates["innermostM_block"] = [ + block for block in default_blocks if self.M >= block + ] + self.field_candidates["innermostK_block"] = [ + block for block in default_blocks if self.K >= block + ] + self.field_candidates["innermostN_block"] = [ + block for block in default_blocks if self.N >= block + ] + + def init_constraints(self): + # example: using lambda to add constraints, adding constraints by the order of the fields + self.field_constraints["M_threads"] = None + self.field_constraints["K_threads"] = ( + lambda MatMulConfig, K_threads: self.num_threads + % (MatMulConfig.M_threads * K_threads) + == 0 + ) + self.field_constraints["N_threads"] = ( + lambda MatMulConfig, N_threads: self.num_threads + % (MatMulConfig.M_threads * MatMulConfig.K_threads * N_threads) + == 0 + ) + self.field_constraints["M_block"] = None + self.field_constraints["K_block"] = None + self.field_constraints["N_block"] = None + self.field_constraints["innermostM_block"] = ( + lambda MatMulConfig, innermostM_block: MatMulConfig.M_block + % innermostM_block + == 0 + ) + self.field_constraints["innermostK_block"] = ( + lambda MatMulConfig, innermostK_block: MatMulConfig.K_block + % innermostK_block + == 0 + ) + 
self.field_constraints["innermostN_block"] = ( + lambda MatMulConfig, innermostN_block: MatMulConfig.N_block + % innermostN_block + == 0 + ) + + def attach_to_ir(self, op: OpView): + attr_to_field = { + "Mthreads": self.M_threads, + "Kthreads": self.K_threads, + "Nthreads": self.N_threads, + "MBlock": self.M_block, + "KBlock": self.K_block, + "NBlock": self.N_block, + "innermostMBlock": self.innermostM_block, + "innermostKBlock": self.innermostK_block, + "innermostNBlock": self.innermostN_block, + } + for name, value in attr_to_field.items(): + op.attributes[name] = IntegerAttr.get(T.i32(), value) + + def __repr__(self) -> str: + return self.__str__() + + def __str__(self) -> str: + obj_dict = { + "MatMulConfig": { + "M_threads": self.M_threads, + "K_threads": self.K_threads, + "N_threads": self.N_threads, + "M_block": self.M_block, + "K_block": self.K_block, + "N_block": self.N_block, + "innermostM_block": self.innermostM_block, + "innermostK_block": self.innermostK_block, + "innermostN_block": self.innermostN_block, + } + } + return json.dumps(obj_dict, indent=4) + + +OP_TO_CONFIG = {"linalg.matmul": MatMulConfig, "onednn_graph.matmul": MatMulConfig} diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py new file mode 100644 index 000000000..f8c1e1a02 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -0,0 +1,608 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import json +import os +import random +import sys +import time +from abc import ABC, abstractmethod +from copy import deepcopy +from typing import List + +from gc_mlir import ir + +from benchgc.tuner.config_filter import BloomFilter, HashSetFilter +from benchgc.tuner.op_config import * +from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir + +tuner_verbose = False + + +class TuningSpace: + """ + The class works as a bridge between the tuner and the configs in MLIR module. + """ + + DEFAULT_SPACE_PERCENT = 1.0 + + def __init__( + self, ir_module: ir.Module, space_percent: float = DEFAULT_SPACE_PERCENT + ): + self.initial_ir = ir_module + self.graph_config = gen_configs_from_ir(ir_module) + self.space_size = 1 + self.flatten_candidates = [] + self.flatten_field_name = [] + self.flatten_constraints = [] + self.ind_candidate_to_config = {} + candidate_ind = 0 + for config_ind, config in enumerate(self.graph_config): + for field_name, candidates in config.field_candidates.items(): + self.space_size = self.space_size * len(candidates) + self.flatten_candidates.append(candidates) + self.flatten_field_name.append(field_name) + self.flatten_constraints.append(config.field_constraints[field_name]) + self.ind_candidate_to_config[candidate_ind] = config_ind + candidate_ind += 1 + self.space_size = int(self.space_size * space_percent) + + def make_config_from_indexes(self, indexes: List[int]): + """ + Make a config from a list of indexes of candidates. 
+ """ + graph_config = deepcopy(self.graph_config) + for cid, candidate in enumerate(self.flatten_candidates): + val = candidate[indexes[cid]] + config = graph_config[self.ind_candidate_to_config[cid]] + field_name = self.flatten_field_name[cid] + setattr(config, field_name, val) + return graph_config + + def get_cur_config(self, candidate_ind: int): + """ + Get the current config with a incoming candidate index + """ + return self.graph_config[self.ind_candidate_to_config[candidate_ind]] + + def verify_config(self, candidate_idx, val) -> bool: + """ + Verify the config with constraints + """ + config = self.get_cur_config(candidate_idx) + field_name = self.flatten_field_name[candidate_idx] + constraint = self.flatten_constraints[candidate_idx] + val = self.flatten_candidates[candidate_idx][val] + setattr(config, field_name, val) + if constraint: + return constraint(config, val) + return True + + def filter_next_candidates(self, candidate_idx, val) -> List[int]: + """ + Get the next candidates with the incoming candidate index and value + """ + field_name = self.flatten_field_name[candidate_idx] + config = self.get_cur_config(candidate_idx) + setattr( + config, + field_name, + self.flatten_candidates[candidate_idx][val], + ) + if (candidate_idx + 1) >= len(self.flatten_candidates): + return [] + constraint = self.flatten_constraints[candidate_idx + 1] + if constraint: + next_candidates = self.flatten_candidates[candidate_idx + 1] + return [ + index + for index, value in enumerate(next_candidates) + if constraint(config, value) + ] + else: + return list(range(len(self.flatten_candidates[candidate_idx + 1]))) + + +class Tuner(ABC): + """ + Class for creating different configs and choose the config with best perf + """ + + DEFAULT_BATCH_SIZE = 50 + DEFAULT_EARLY_STOP = -1 + DEFAULT_TIMEOUT = -1 + DEFAULT_MAX_ITERS = sys.maxsize + + def __init__( + self, + batch_executor, + tunning_space: TuningSpace, + batch_size=DEFAULT_BATCH_SIZE, + early_stop=DEFAULT_EARLY_STOP, + 
checkpoint="", + ): + self.batch_executor = batch_executor + self.batch_size = batch_size + self.early_stop = early_stop + self.best_cost = sys.float_info.max + self.best = [] + self.iter = 0 + self.last_update_iter = 0 + self.skipped_num = 0 + self.tunning_space = tunning_space + self.checkpoint = checkpoint + if self.checkpoint: + os.makedirs(os.path.dirname(self.checkpoint), exist_ok=True) + assert len(tunning_space.graph_config), "There are no tunable ops" + + def tuner_update(self, config_indices_batch: List[List[int]], costs: List[float]): + """ + Update after each batch of configs was executed + """ + if min(costs) < self.best_cost: + self.best_cost = min(costs) + self.best = config_indices_batch[costs.index(min(costs))] + if self.checkpoint: + self.save_status() + + @abstractmethod + def get_next_config_indices_batch(self) -> List[List[int]]: + """ + Get the next batch of config indices + """ + pass + + @abstractmethod + def load_status(self): + """ + Load the Tuner status from the checkpoint + """ + pass + + @abstractmethod + def save_status(self): + """ + Save the Tuner status to the checkpoint + """ + pass + + def tuner_finish(self, tuning_time): + """ + Execute when tuning is finished + """ + print("Tuning ends in", tuning_time, "s") + best_config = self.tunning_space.make_config_from_indexes(self.best) + print("Best cost:", self.best_cost, "ms") + print("Best config:", best_config) + attach_configs_to_ir(self.tunning_space.initial_ir, best_config) + print( + "mlir:\n", + self.tunning_space.initial_ir, + ) + + def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT): + """ + Start of tuning process + """ + if self.early_stop > 0 and self.iter - self.last_update_iter > self.early_stop: + # in case of resuming from a saved state and it has already + # early-stopped + print("Early stop now") + return + start_time = time.time() + spaces_size = self.tunning_space.space_size + while self.iter < max_iter and self.iter < spaces_size: + 
config_indices_batch = self.get_next_config_indices_batch() + if not config_indices_batch: + print("Tuner returns empty batch, early stop now") + break + if len(config_indices_batch) > min( + max_iter - self.iter, spaces_size - self.iter + ): + config_indices_batch = config_indices_batch[ + : min(max_iter - self.iter, spaces_size - self.iter) + ] + + old_iter = self.iter + self.iter += len(config_indices_batch) + if tuner_verbose: + print("config_indices_batch:", config_indices_batch) + perf_result = [] + ir_modules = [] + for config_indexes in config_indices_batch: + real_config = self.tunning_space.make_config_from_indexes( + config_indexes + ) + # todo : ir.Module can not support deepcopy + new_ir = ir.Module.parse( + str(self.tunning_space.initial_ir), + self.tunning_space.initial_ir.context, + ) + attach_configs_to_ir(new_ir, real_config) + ir_modules.append(new_ir) + res = self.batch_executor(ir_modules) + perf_result = [item[1] for item in res] + old_best = self.best_cost + self.tuner_update(config_indices_batch, perf_result) + print( + "[", + self.iter, + "/", + min(max_iter, spaces_size), + "] skipped:", + self.skipped_num, + "best:", + self.best_cost, + "ms", + ) + if self.best_cost != old_best: + self.last_update_iter = old_iter + else: + if ( + self.early_stop > 0 + and old_iter - self.last_update_iter > self.early_stop + ): + print("Early stop now") + break + if timeout >= 0 and time.time() - start_time > timeout: + print("Tuning timeout...") + break + self.tuner_finish(time.time() - start_time) + + +class GridTuner(Tuner): + """ + Tuner with grid serach + """ + + def __init__( + self, + batch_executor, + tunning_space: TuningSpace, + batch_size=Tuner.DEFAULT_BATCH_SIZE, + early_stop=Tuner.DEFAULT_EARLY_STOP, + checkpoint="", + ): + super().__init__( + batch_executor, tunning_space, batch_size, early_stop, checkpoint + ) + self.current_idx = 0 + self.cumulative_size = [1] * len(self.tunning_space.flatten_candidates) + self.cumulative_size[-1] = 1 + for 
i in range(len(self.cumulative_size) - 2, -1, -1): + self.cumulative_size[i] = self.cumulative_size[i + 1] * len( + self.tunning_space.flatten_candidates[i + 1] + ) + if self.checkpoint: + self.load_status() + + def get_next_config_indices_batch(self) -> list: + config_indices_batch = [] + while len(config_indices_batch) < self.batch_size: + if self.current_idx >= self.tunning_space.space_size: + break + config_ids = [-1] * len(self.tunning_space.flatten_candidates) + remain = self.current_idx + valid_config_idx = True + for j in range(len(config_ids)): + config_ids[j] = remain // self.cumulative_size[j] + valid_config_idx = self.tunning_space.verify_config(j, config_ids[j]) + if not valid_config_idx: + break + remain = remain % self.cumulative_size[j] + self.current_idx = self.current_idx + 1 + if valid_config_idx: + config_indices_batch.append(config_ids) + if tuner_verbose: + print(self.tunning_space.make_config_from_indexes(config_ids)) + else: + self.skipped_num += 1 + if tuner_verbose: + print("bad config, skip") + return config_indices_batch + + def save_status(self): + save_dict = { + "iter": self.iter, + "last_update_iter": self.last_update_iter, + "best": self.best, + "best_cost": self.best_cost, + "current_idx": self.current_idx, + "skipped_num": self.skipped_num, + } + with open(self.checkpoint, "w") as file: + json.dump(save_dict, file, indent=4) + + def load_status(self): + print("continue tuning from checkpoint...") + with open( + self.checkpoint, + "r", + ) as file: + try: + data = json.load(file) + assert set( + [ + "iter", + "last_update_iter", + "best", + "best_cost", + "current_idx", + "skipped_num", + ] + ) == set(data.keys()) + self.iter = data["iter"] + self.last_update_iter = data["last_update_iter"] + self.best = data["best"] + self.best_cost = data["best_cost"] + self.current_idx = data["current_idx"] + self.skipped_num = data["skipped_num"] + except Exception as e: + print("load checkpoint failed", e) + + +class GATuner(Tuner): + """Tuner 
with Genetic Algorithm""" + + DEFAULT_ELITE_NUM = 9 + DEFAULT_MUTATION_PROB = 0.1 + DEFAULT_RANDOM_SEED = 0 + DEFAULT_EXPECTED_TUNE_NUM = 0 + + def __init__( + self, + batch_executor, + tuning_space, + pop_size=Tuner.DEFAULT_BATCH_SIZE, + early_stop=Tuner.DEFAULT_EARLY_STOP, + checkpoint="", + elite_num: int = DEFAULT_ELITE_NUM, + mutation_prob: float = DEFAULT_MUTATION_PROB, + random_seed: int = DEFAULT_RANDOM_SEED, + expected_tune_num: int = DEFAULT_EXPECTED_TUNE_NUM, + ): + super().__init__(batch_executor, tuning_space, pop_size, early_stop, checkpoint) + self.elite_num = min(elite_num, pop_size) + self.mutation_prob = mutation_prob + self.pop_size = pop_size + self.cur_mutation_prob = mutation_prob + self.prev_results = [] + self.elites = [] + random.seed(random_seed) + if expected_tune_num == 0: + self.filter = HashSetFilter() + else: + self.filter = BloomFilter(expected_tune_num, err_rate=0.01) + + self.candidate_indices = [[]] * len(self.tunning_space.flatten_candidates) + self.candidate_indices[0] = list( + range(len(self.tunning_space.flatten_candidates[0])) + ) + if self.checkpoint: + self.load_status() + + def save_status(self): + save_dict = { + "iter": self.iter, + "last_update_iter": self.last_update_iter, + "best": self.best, + "best_cost": self.best_cost, + "skipped_num": self.skipped_num, + "cur_mutation_prob": self.cur_mutation_prob, + "prev_results": self.prev_results, + "elites": self.elites, + "tuned": list(self.filter.save()), + } + with open(self.checkpoint, "w") as file: + json.dump(save_dict, file, indent=4) + + def load_status(self): + print("continue tuning from checkpoint...") + with open( + self.checkpoint, + "r", + ) as file: + try: + data = json.load(file) + assert set( + [ + "iter", + "last_update_iter", + "best", + "best_cost", + "skipped_num", + "cur_mutation_prob", + "prev_results", + "elites", + "tuned", + ] + ) == set(data.keys()) + self.iter = data["iter"] + self.last_update_iter = data["last_update_iter"] + self.best = 
data["best"] + self.best_cost = data["best_cost"] + self.skipped_num = data["skipped_num"] + self.cur_mutation_prob = data["cur_mutation_prob"] + self.prev_results = data["prev_results"] + self.elites = data["elites"] + self.filter.load(data["tuned"]) + except Exception as e: + print("load checkpoint failed", e) + + def set_field(self, gene, idx, val): + gene[idx] = val + self.update_candidate_indices(idx, val) + + def update_candidate_indices(self, idx, val): + next_candidates = self.tunning_space.filter_next_candidates(idx, val) + if idx + 1 < len(self.candidate_indices): + self.candidate_indices[idx + 1] = next_candidates + + @staticmethod + def update_mutation_prob(prob, lower_bound, move_up): + if move_up: + prob = min(prob * 1.01, 0.5) + else: + prob = max(prob * 0.98, lower_bound) + return prob + + @staticmethod + def random_choice(prob_range) -> int: + random_val = random.randint(0, sys.maxsize) / sys.maxsize + for i in range(len(prob_range)): + if random_val <= prob_range[i]: + return i + return -1 + + def push_to_tune(self, to_tune, gene) -> bool: + if self.filter.already_met(gene): + self.cur_mutation_prob = GATuner.update_mutation_prob( + self.cur_mutation_prob, self.mutation_prob, True + ) + return False + if gene in to_tune: + self.cur_mutation_prob = GATuner.update_mutation_prob( + self.cur_mutation_prob, self.mutation_prob, True + ) + return False + + to_tune.append(gene) + self.cur_mutation_prob = GATuner.update_mutation_prob( + self.cur_mutation_prob, self.mutation_prob, False + ) + return True + + def get_next_config_indices_batch(self) -> list: + prob_range = [0.0] * len(self.prev_results) + total_score = 0 + for i, prev_result in enumerate(self.prev_results): + total_score += prev_result[1] + prob_range[i] = total_score + prob_range = [x / total_score for x in prob_range] + to_tune = [] + for i in range(self.pop_size): + self.get_next_config(prob_range, to_tune) + + if tuner_verbose: + print("to_tune", to_tune) + for to_tune_config in to_tune: 
+ print(self.tunning_space.make_config_from_indexes(to_tune_config)) + + if len(to_tune) < self.pop_size: + print( + f"GA Cannot generate enough unmet genes in this batch (batch_size={self.pop_size})" + ) + return to_tune + + def get_next_config(self, prob_range, to_tune): + max_tries = 20 + try_cnt = 0 + while try_cnt < max_tries: + try_cnt += 1 + if not self.elites: + gene = [-1] * len(self.tunning_space.flatten_candidates) + need_repo = True + redo_cnt = 0 + while redo_cnt < 50 and need_repo: + need_repo = False + for j in range(len(gene)): + # try to randomly pick one candidate + data, success = GATuner.random_item_from( + self.candidate_indices[j] + ) + if not success: + need_repo = True + break + else: + self.set_field(gene, j, data) + redo_cnt += 1 + if need_repo: + print("Cannot create a valid random gene") + if self.push_to_tune(to_tune, gene): + return + else: + assert len(self.prev_results) > 0 + # print("len(prob_range) = ", len(prob_range)) + if len(prob_range) == 1: + return + gene_size = len(self.tunning_space.flatten_candidates) + first_gene = GATuner.random_choice(prob_range) + second_gene = GATuner.random_choice(prob_range) + while second_gene == first_gene: + second_gene = GATuner.random_choice(prob_range) + + joint_point = random.randint(0, gene_size) + + new_gene = [-1] * gene_size + need_redo = False + for j in range(gene_size): + candidates = self.candidate_indices[j] + if not candidates: + need_redo = True + continue + if ( + random.randint(0, sys.maxsize) / sys.maxsize + ) < self.cur_mutation_prob: + self.set_field( + new_gene, j, GATuner.random_item_from(candidates)[0] + ) + else: + # inherit from parents + left_gene = self.prev_results[first_gene][0][j] + right_gene = self.prev_results[second_gene][0][j] + if j < joint_point: + prefered_gene = left_gene + unprefered_gene = right_gene + else: + prefered_gene = right_gene + unprefered_gene = left_gene + + if prefered_gene in candidates: + self.set_field(new_gene, j, prefered_gene) + elif 
unprefered_gene in candidates: + self.set_field(new_gene, j, unprefered_gene) + else: + self.set_field( + new_gene, j, GATuner.random_item_from(candidates)[0] + ) + if need_redo: + print("need_redo") + continue + + if self.push_to_tune(to_tune, new_gene): + return + + def tuner_update( + self, config_indices_batch: List[List[int]], perf_result: List[float] + ): + self.prev_results.clear() + for i, config_indices in enumerate(config_indices_batch): + self.filter.add(config_indices) + self.prev_results.append((config_indices, 1 / perf_result[i])) + + for elite in self.elites: + self.prev_results.append(elite) + self.elites = sorted(self.prev_results, key=lambda x: x[1], reverse=True)[ + : self.elite_num + ] + super().tuner_update(config_indices_batch, perf_result) + + @staticmethod + def random_item_from(v: List[int]): + if not v: + return 0, False + return v[random.randint(0, len(v) - 1)], True diff --git a/test/benchgc/src/benchgc/tuner/utils.py b/test/benchgc/src/benchgc/tuner/utils.py new file mode 100644 index 000000000..28e1ce0e1 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/utils.py @@ -0,0 +1,61 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# SPDX-License-Identifier: Apache-2.0 +################################################################################ + + +from typing import List + +from benchgc.tuner.op_config import OP_TO_CONFIG, Config +from gc_mlir import ir + + +def get_all_tunable_ops(op: ir.Operation): + """Get tunable ops from the children op""" + tunable_ops = [] + for region in op.regions: + for block in region: + for child_op in block: + if ( + "skipTuner" in child_op.attributes + and child_op.attributes["skipTuner"] + ): + continue + if child_op.name in OP_TO_CONFIG: + tunable_ops.append(child_op) + tunable_ops = tunable_ops + get_all_tunable_ops(child_op) + return tunable_ops + + +def gen_configs_from_ir(ir_module: ir.Module): + """Generate configs from ir module""" + tunable_ops = get_all_tunable_ops(ir_module.operation) + configs = [] + for op in tunable_ops: + if op.name in OP_TO_CONFIG: + configs.append(OP_TO_CONFIG[op.name](op)) + return configs + + +def attach_configs_to_ir(ir_module: ir.Module, configs: List[Config]): + """Add configs to ir module""" + tunable_ops = get_all_tunable_ops(ir_module.operation) + assert len(tunable_ops) == len( + configs + ), "tunable ops and configs should have the same length" + for i, op in enumerate(tunable_ops): + if op.name in OP_TO_CONFIG: + with ir_module.context: + configs[i].attach_to_ir(op) From f9e1fb08ae7010ae84acdda76e7c1fbbeb3b6265 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 00:07:57 -0700 Subject: [PATCH 02/14] update readme --- test/benchgc/README.md | 3 + test/benchgc/src/benchgc/__main__.py | 29 +++- test/benchgc/src/benchgc/tuner/README.md | 176 +++++++++++++++++++- test/benchgc/src/benchgc/tuner/op_config.py | 9 +- test/benchgc/src/benchgc/tuner/tuner.py | 24 ++- 5 files changed, 219 insertions(+), 22 deletions(-) diff --git a/test/benchgc/README.md b/test/benchgc/README.md index 239105c82..69da9a0d6 100644 --- a/test/benchgc/README.md +++ b/test/benchgc/README.md @@ -138,12 +138,15 @@ module { ### 
--bench_kind [str] * py : use the MLIR Python API to invoke the kernel and use Python to calculate the time cost * wrapper : modify MLIR by wrapping the kernel into a new method and calling the `nanoTime()` method before and after calling the kernel. Finally, calculate the difference as the time cost +* default: `py` ### --warm_up [int] * warm-up times of the execution +* default: 100 ### --repeat [int] * repeat times of the execution +* default: 100 ## Pattern Options Each pattern has its own unique options. diff --git a/test/benchgc/src/benchgc/__main__.py b/test/benchgc/src/benchgc/__main__.py index 67945d4ef..936e25d72 100644 --- a/test/benchgc/src/benchgc/__main__.py +++ b/test/benchgc/src/benchgc/__main__.py @@ -208,9 +208,16 @@ def add_bench_options(parser: argparse.ArgumentParser): parser.add_argument( "--bench_kind", type=str, choices=["py", "wrapper"], default="py" ) - parser.add_argument("--warm_up", type=int, default=100) - parser.add_argument("--repeat", type=int, default=100) - + parser.add_argument( + "--warm_up", + type=int, + default=100 if parser.parse_known_args()[0].mode == "P" else 2, + ) + parser.add_argument( + "--repeat", + type=int, + default=100 if parser.parse_known_args()[0].mode == "P" else 4, + ) def add_pattern_options(parser: argparse.ArgumentParser): @@ -236,20 +243,25 @@ def add_tuner_options(parser: argparse.ArgumentParser): parser.add_argument( "--space_percent", type=float, default=TuningSpace.DEFAULT_SPACE_PERCENT ) + parser.add_argument( + "--tuner_verbose", + action="store_true", + help="if we need print the tuner log", + ) parser.add_argument("--checkpoint_path", type=str, default="") if parser.parse_known_args()[0].search_alg == "ga": parser.add_argument( - "--random_seed", type=int, default=GATuner.DEFAULT_RANDOM_SEED + "--ga_random_seed", type=int, default=GATuner.DEFAULT_RANDOM_SEED ) parser.add_argument( - "--elite_num", type=int, default=GATuner.DEFAULT_ELITE_NUM + "--ga_elite_num", type=int, 
default=GATuner.DEFAULT_ELITE_NUM ) parser.add_argument( - "--mutation_prob", type=float, default=GATuner.DEFAULT_MUTATION_PROB + "--ga_mutation_prob", type=float, default=GATuner.DEFAULT_MUTATION_PROB ) parser.add_argument( - "--expected_tune_num", + "--ga_expected_tune_num", type=int, default=GATuner.DEFAULT_EXPECTED_TUNE_NUM, ) @@ -465,7 +477,6 @@ def tuner_batch_bench(ir_moudles): assert flags.space_percent > 0 and flags.space_percent <= 1.0 space = TuningSpace(module, flags.space_percent) - print("flags.search_alg", flags.search_alg) if flags.search_alg == "grid": tuner = GridTuner( tuner_batch_bench, @@ -473,6 +484,7 @@ def tuner_batch_bench(ir_moudles): flags.tuning_batch, flags.early_stop, flags.checkpoint_path, + flags.tuner_verbose, ) else: tuner = GATuner( @@ -481,6 +493,7 @@ def tuner_batch_bench(ir_moudles): flags.tuning_batch, flags.early_stop, flags.checkpoint_path, + flags.tuner_verbose, random_seed=flags.random_seed, expected_tune_num=flags.expected_tune_num, ) diff --git a/test/benchgc/src/benchgc/tuner/README.md b/test/benchgc/src/benchgc/tuner/README.md index 503fa1da0..94cfceaf0 100644 --- a/test/benchgc/src/benchgc/tuner/README.md +++ b/test/benchgc/src/benchgc/tuner/README.md @@ -1 +1,175 @@ -#TODO \ No newline at end of file +# Tuner - auto tuning tools +## Description +Tuner is a tool used to select the best-performing configuration for a graph with tunable operations. Tunable operations refer to operations, such as matmul, conv, etc., whose kernel performance depends on certain configurations, and a tuner generates different configuration combinations for a graph and records their performance. + +## Prerequisite +`mode T` for benchgc + +## Options +Since bench is also required within the tuner, the tuner also supports benchmarking options. +Unlike bench mode, in tuner mode, a batch quantity of modules is generated each time, and the default values for warm-up and repeat have been adjusted accordingly. 
+* --bench_kind [py, wrapper] +* --warm_up [int], default: 2 +* --repeat [int], default: 4 + +### --tuning_batch [int] +* The batch size of configs, default: `50` +* The tuner first generates a batch of configurations, then proceeds to perform performance testing on these configs. + +### --early_stop [int] +* If the tuner does not find a better performance after testing the number of configurations specified by the `early_stop` value, it will terminate its execution. +* default: `-1`, represents that early stopping is disabled. + +### --max_tuning_iters [int] +* The maximum number of configurations the tuner needs to attempt. +* default: `sys.maxsize` + +### --timeout [int] +* The maximum runtime limit for the tuner, unit: second +* default: `-1`, means there is no limit. + +### --space_percent [float] +* For the set of all possible configurations for a graph, we refer to it as the tuning space. The value of `space_percent` represents the percentage of configurations that we need to tune. +* value range `(0, 1]`, default: 1.0, means 100 percent of tuning space + +### --checkpoint_path [str] +* When the checkpoint file exists, the tuner will first load the contents of the checkpoint to restore the previous state upon startup, and it will update the checkpoint file after executing each batch. + +### --search_alg [str] +* There are two algorithms within the tuner to search for new configurations. +* grid: grid search which is an exhaustive search +* ga: genetic algorithm. +* default: `grid` + +### Options when `--search_alg ga` +* --ga_random_seed [int]: random seed in genetic algorithm, default: 0 +* --ga_elite_num [int]: default: 9 +* --ga_mutation_prob [float]: default: 0.1 +* --ga_expected_tune_num [int] : default: 0, In the tuner implemented with a genetic algorithm, a data structure is needed to determine whether a new config is a duplicate of a previous one. By default, a set is used for this purpose when this option is not specified. 
If the user sets this value, a bloom filter is used instead. + +## OP config +If users need to make adjustments to the candidates in the config of tunable operations, please manually modify `op_config.py`. For example, you can reduce the tuning space by adjusting the candidates. + +## Skip the tuner for the specified OP + +If you need to skip the tuner for certain operations, you can add the following attribute to them in MLIR. +Then you can proceed with tuning by using the `--driver=mlir` option +``` +linalg.matmul {skipTuner = true} ins(..) outs(...) ... +``` + +## Example +* General cmd +``` +OMP_NUM_THREADS=1 python3 -m benchgc --mode T --driver linalg --case matmul --md 0:128x128xf32 --md 1:128x128xf32 --md 2:128x128xf32 --bench_kind wrapper --warm_up 2 --repeat 2 --search_alg grid --tuning_batch 100 --early_stop 1000 --max_tuning_iters 1000000 --timeout 1000000 --space_percent 0.8 --checkpoint_path {path_to_checkpoint_file} +``` + +* single matmul +``` +OMP_NUM_THREADS=1 python3 -m benchgc --mode T --driver linalg --case matmul --md 0:128x128xf32 --md 1:128x128xf32 --md 2:128x128xf32 + +[ 50 / 512 ] skipped: 79 best: 0.025305896997451782 ms +[ 100 / 512 ] skipped: 105 best: 0.025296583771705627 ms +[ 150 / 512 ] skipped: 115 best: 0.025296583771705627 ms +[ 200 / 512 ] skipped: 135 best: 0.025292858481407166 ms +[ 250 / 512 ] skipped: 147 best: 0.025292858481407166 ms +[ 300 / 512 ] skipped: 165 best: 0.025292858481407166 ms +[ 343 / 512 ] skipped: 169 best: 0.025292858481407166 ms +Tuner returns empty batch, early stop now +Tuning ends in 26.26677966117859 s +Best cost: 0.025292858481407166 ms +Best config: [{ + "MatMulConfig": { + "M_threads": 1, + "K_threads": 1, + "N_threads": 1, + "M_block": 64, + "K_block": 32, + "N_block": 64, + "innermostM_block": 16, + "innermostK_block": 16, + "innermostN_block": 16 + } +}] +mlir: + module attributes {dlti.target_system_spec = #dlti.target_system_spec<"CPU" : 
#dlti.target_device_spec<#dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : ui64>, #dlti.dl_entry<"num_threads", 1 : i32>, #dlti.dl_entry<"max_vector_width", 512 : i64>>>} { + func.func @entry(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>) -> tensor<128x128xf32> attributes {llvm.emit_c_interface} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<128x128xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32> + %2 = linalg.matmul {KBlock = 32 : i32, Kthreads = 1 : i32, MBlock = 64 : i32, Mthreads = 1 : i32, NBlock = 64 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32> + return %2 : tensor<128x128xf32> + } +} +``` + +* mlp + +``` +OMP_NUM_THREADS=1 python -m benchgc --mode T --driver pattern --case mlp --batch_size=32 --hidden_size_list=16x32x64 --has_bias=1x1 --act_type=relu --warm_up 2 --repeat 2 +[ 50 / 1536 ] skipped: 352 best: 0.0069122761487960815 ms +[ 100 / 1536 ] skipped: 415 best: 0.006860122084617615 ms +[ 150 / 1536 ] skipped: 662 best: 0.006856396794319153 ms +[ 200 / 1536 ] skipped: 821 best: 0.006856396794319153 ms +[ 250 / 1536 ] skipped: 972 best: 0.006856396794319153 ms +[ 300 / 1536 ] skipped: 1029 best: 0.006856396794319153 ms +[ 350 / 1536 ] skipped: 1080 best: 0.006834045052528381 ms +[ 400 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms +[ 405 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms +Tuner returns empty batch, early stop now +Tuning ends in 80.21396946907043 s +Best cost: 0.006834045052528381 ms +Best config: [{ + "MatMulConfig": { + "M_threads": 1, + "K_threads": 1, + "N_threads": 1, + "M_block": 32, + "K_block": 16, + "N_block": 32, + 
"innermostM_block": 16, + "innermostK_block": 16, + "innermostN_block": 32 + } +}, { + "MatMulConfig": { + "M_threads": 1, + "K_threads": 1, + "N_threads": 1, + "M_block": 32, + "K_block": 32, + "N_block": 64, + "innermostM_block": 16, + "innermostK_block": 16, + "innermostN_block": 32 + } +}] +mlir: + module attributes {dlti.target_system_spec = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : ui64>, #dlti.dl_entry<"num_threads", 1 : i32>, #dlti.dl_entry<"max_vector_width", 512 : i64>>>} { + func.func @entry(%arg0: tensor<32x16xf32>, %arg1: tensor<16x32xf32>, %arg2: tensor<32x64xf32>, %arg3: tensor<32xf32>, %arg4: tensor<64xf32>) -> tensor<32x64xf32> attributes {llvm.emit_c_interface} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<32x32xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32x32xf32>) -> tensor<32x32xf32> + %2 = linalg.matmul {KBlock = 16 : i32, Kthreads = 1 : i32, MBlock = 32 : i32, Mthreads = 1 : i32, NBlock = 32 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 32 : i32} ins(%arg0, %arg1 : tensor<32x16xf32>, tensor<16x32xf32>) outs(%1 : tensor<32x32xf32>) -> tensor<32x32xf32> + %3 = tensor.empty() : tensor<32x32xf32> + %broadcasted = linalg.broadcast ins(%arg3 : tensor<32xf32>) outs(%3 : tensor<32x32xf32>) dimensions = [0] + %4 = tensor.empty() : tensor<32x32xf32> + %5 = linalg.add ins(%2, %broadcasted : tensor<32x32xf32>, tensor<32x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> + %6 = tensor.empty() : tensor<32x32xf32> + %7 = linalg.max ins(%5, %cst_0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + %8 = tensor.empty() : tensor<32x64xf32> + %9 = 
linalg.fill ins(%cst : f32) outs(%8 : tensor<32x64xf32>) -> tensor<32x64xf32> + %10 = linalg.matmul {KBlock = 32 : i32, Kthreads = 1 : i32, MBlock = 32 : i32, Mthreads = 1 : i32, NBlock = 64 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 32 : i32} ins(%7, %arg2 : tensor<32x32xf32>, tensor<32x64xf32>) outs(%9 : tensor<32x64xf32>) -> tensor<32x64xf32> + %11 = tensor.empty() : tensor<32x64xf32> + %broadcasted_1 = linalg.broadcast ins(%arg4 : tensor<64xf32>) outs(%11 : tensor<32x64xf32>) dimensions = [0] + %12 = tensor.empty() : tensor<32x64xf32> + %13 = linalg.add ins(%10, %broadcasted_1 : tensor<32x64xf32>, tensor<32x64xf32>) outs(%12 : tensor<32x64xf32>) -> tensor<32x64xf32> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<32x64xf32> + %14 = tensor.empty() : tensor<32x64xf32> + %15 = linalg.max ins(%13, %cst_2 : tensor<32x64xf32>, tensor<32x64xf32>) outs(%14 : tensor<32x64xf32>) -> tensor<32x64xf32> + return %15 : tensor<32x64xf32> + } +} +``` + diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index 67b80a701..0bb6151d5 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -81,7 +81,8 @@ def __init__( super().__init__() def init_candidates(self): - default_blocks = [16, 32, 64] + default_blocks = [16, 32, 64, 128, 256, 512] + default_innermost_blocks = [16, 32] self.field_candidates["M_threads"] = find_factors(self.num_threads) self.field_candidates["K_threads"] = find_factors(self.num_threads) self.field_candidates["N_threads"] = find_factors(self.num_threads) @@ -95,13 +96,13 @@ def init_candidates(self): block for block in default_blocks if self.N >= block ] self.field_candidates["innermostM_block"] = [ - block for block in default_blocks if self.M >= block + block for block in default_innermost_blocks if self.M >= block ] self.field_candidates["innermostK_block"] = [ - 
block for block in default_blocks if self.K >= block + block for block in default_innermost_blocks if self.K >= block ] self.field_candidates["innermostN_block"] = [ - block for block in default_blocks if self.N >= block + block for block in default_innermost_blocks if self.N >= block ] def init_constraints(self): diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py index f8c1e1a02..8c847a1ed 100644 --- a/test/benchgc/src/benchgc/tuner/tuner.py +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -30,9 +30,6 @@ from benchgc.tuner.op_config import * from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir -tuner_verbose = False - - class TuningSpace: """ The class works as a bridge between the tuner and the configs in MLIR module. @@ -134,6 +131,7 @@ def __init__( batch_size=DEFAULT_BATCH_SIZE, early_stop=DEFAULT_EARLY_STOP, checkpoint="", + tuner_verbose=False, ): self.batch_executor = batch_executor self.batch_size = batch_size @@ -147,6 +145,7 @@ def __init__( self.checkpoint = checkpoint if self.checkpoint: os.makedirs(os.path.dirname(self.checkpoint), exist_ok=True) + self.tuner_verbose = tuner_verbose assert len(tunning_space.graph_config), "There are no tunable ops" def tuner_update(self, config_indices_batch: List[List[int]], costs: List[float]): @@ -219,7 +218,7 @@ def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT) old_iter = self.iter self.iter += len(config_indices_batch) - if tuner_verbose: + if self.tuner_verbose: print("config_indices_batch:", config_indices_batch) perf_result = [] ir_modules = [] @@ -276,9 +275,15 @@ def __init__( batch_size=Tuner.DEFAULT_BATCH_SIZE, early_stop=Tuner.DEFAULT_EARLY_STOP, checkpoint="", + tuner_verbose=False, ): super().__init__( - batch_executor, tunning_space, batch_size, early_stop, checkpoint + batch_executor, + tunning_space, + batch_size, + early_stop, + checkpoint, + tuner_verbose, ) self.current_idx = 0 self.cumulative_size = 
[1] * len(self.tunning_space.flatten_candidates) @@ -307,12 +312,12 @@ def get_next_config_indices_batch(self) -> list: self.current_idx = self.current_idx + 1 if valid_config_idx: config_indices_batch.append(config_ids) - if tuner_verbose: + if self.tuner_verbose: print(self.tunning_space.make_config_from_indexes(config_ids)) else: self.skipped_num += 1 - if tuner_verbose: - print("bad config, skip") + if self.tuner_verbose: + print("bad config, skip...") return config_indices_batch def save_status(self): @@ -370,6 +375,7 @@ def __init__( pop_size=Tuner.DEFAULT_BATCH_SIZE, early_stop=Tuner.DEFAULT_EARLY_STOP, checkpoint="", + tuner_verbose=False, elite_num: int = DEFAULT_ELITE_NUM, mutation_prob: float = DEFAULT_MUTATION_PROB, random_seed: int = DEFAULT_RANDOM_SEED, @@ -497,7 +503,7 @@ def get_next_config_indices_batch(self) -> list: for i in range(self.pop_size): self.get_next_config(prob_range, to_tune) - if tuner_verbose: + if self.tuner_verbose: print("to_tune", to_tune) for to_tune_config in to_tune: print(self.tunning_space.make_config_from_indexes(to_tune_config)) From 6cdcfeb37140af53d3026a191dfc2c17defbeb86 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 00:13:10 -0700 Subject: [PATCH 03/14] fix style --- test/benchgc/src/benchgc/tuner/tuner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py index 8c847a1ed..3b402fc77 100644 --- a/test/benchgc/src/benchgc/tuner/tuner.py +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -24,11 +24,10 @@ from copy import deepcopy from typing import List -from gc_mlir import ir - from benchgc.tuner.config_filter import BloomFilter, HashSetFilter from benchgc.tuner.op_config import * from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir +from gc_mlir import ir class TuningSpace: """ From b545f678ce01d8317579b42a121261852a3a9864 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 
00:16:37 -0700 Subject: [PATCH 04/14] update readme --- test/benchgc/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/test/benchgc/README.md b/test/benchgc/README.md index 69da9a0d6..51e626739 100644 --- a/test/benchgc/README.md +++ b/test/benchgc/README.md @@ -44,6 +44,7 @@ python -m benchgc [OPTIONS] --mode [MODE] --driver [DRIVER] --case [CASE] ### --mode [str] * C : correctness testing (by default) * P : performance testing +* T : performance tuning, see tuner [`README.md`](src/benchgc/tuner/README.md) ### --driver [str] * linalg: test the single op in linalg dialect From 8f27020f32622f0a52130d30b2f36b528054afb7 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 00:20:27 -0700 Subject: [PATCH 05/14] fix style --- test/benchgc/src/benchgc/tuner/tuner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py index 3b402fc77..7c87ff0b7 100644 --- a/test/benchgc/src/benchgc/tuner/tuner.py +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -29,6 +29,7 @@ from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir from gc_mlir import ir + class TuningSpace: """ The class works as a bridge between the tuner and the configs in MLIR module. 
From 1f237f9f57131ca774f4f20a0cf76a620ccfb0ab Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 00:32:07 -0700 Subject: [PATCH 06/14] rm verbose in correctness.sh --- scripts/correctness.sh | 160 ++++++++++++++++++++--------------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/scripts/correctness.sh b/scripts/correctness.sh index 30998d481..d01bd0015 100755 --- a/scripts/correctness.sh +++ b/scripts/correctness.sh @@ -6,113 +6,113 @@ FAIL=0 set -e # bf16 -python3 -m benchgc --verbose 0 --driver linalg --case matmul --md 0:32x128xbf16 --md 1:128x64xbf16 --md 2:32x64xbf16 --cast cast_signed || FAIL=1 +python3 -m benchgc --driver linalg --case matmul --md 0:32x128xbf16 --md 1:128x64xbf16 --md 2:32x64xbf16 --cast cast_signed || FAIL=1 # f32 # reduce -python3 -m benchgc --verbose 0 --driver linalg --case reduce.add --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case reduce.mul --md 0:128x8xf32 --md 1:128xf32 --dimensions=1 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case reduce.max --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case reduce.min --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case reduce.l1 --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case reduce.l2_square --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 +python3 -m benchgc --driver linalg --case reduce.add --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 +python3 -m benchgc --driver linalg --case reduce.mul --md 0:128x8xf32 --md 1:128xf32 --dimensions=1 || FAIL=1 +python3 -m benchgc --driver linalg --case reduce.max --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 
+python3 -m benchgc --driver linalg --case reduce.min --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 +python3 -m benchgc --driver linalg --case reduce.l1 --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 +python3 -m benchgc --driver linalg --case reduce.l2_square --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1 # misc -python3 -m benchgc --verbose 0 --driver linalg --case fill --md 0:f32 --md 1:32x4096xf32 --cmp 1:P:0:0 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case copy --md 0:1024x1024xf32 --md 1:1024x1024xbf16 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case broadcast --md 0:1024xf32 --md 1:2x32x1024xf32 --dimensions=0 --dimensions=1 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case transpose --md 0:32x64x128xf32 --md 1:64x128x32xf32 --permutation=1 --permutation=2 --permutation=0 || FAIL=1 +python3 -m benchgc --driver linalg --case fill --md 0:f32 --md 1:32x4096xf32 --cmp 1:P:0:0 || FAIL=1 +python3 -m benchgc --driver linalg --case copy --md 0:1024x1024xf32 --md 1:1024x1024xbf16 || FAIL=1 +python3 -m benchgc --driver linalg --case broadcast --md 0:1024xf32 --md 1:2x32x1024xf32 --dimensions=0 --dimensions=1 || FAIL=1 +python3 -m benchgc --driver linalg --case transpose --md 0:32x64x128xf32 --md 1:64x128x32xf32 --permutation=1 --permutation=2 --permutation=0 || FAIL=1 # matmul -python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:16x512x32xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul_transpose_a --md 0:16x512x64xf32 --md 1:16x512x32xf32 --md 2:16x64x32xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul_transpose_b --md 0:16x512x64xf32 --md 1:16x128x64xf32 --md 2:16x512x128xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_matvec --md 0:16x512x64xf32 --md 1:16x64xf32 --md 2:16x512xf32 
|| FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_mmt4d --md 0:4x4x8x4x2xf32 --md 1:4x8x8x4x2xf32 --md 2:4x4x8x4x4xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_reduce_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:512x32xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case batch_vecmat --md 0:16x64xf32 --md 1:16x64x512xf32 --md 2:16x512xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case dot --md 0:4096xf32 --md 1:4096xf32 --md 2:0xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case matmul --md 0:1024x512xf32 --md 1:512x512xf32 --md 2:1024x512xf32 --cast cast_signed || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case matmul_transpose_a --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:512x512xf32 --cast cast_signed || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case matmul_transpose_b --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:1024x1024xf32 --cast cast_signed || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case matvec --md 0:512x64xf32 --md 1:64xf32 --md 2:512xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case mmt4d --md 0:4x8x4x2xf32 --md 1:8x8x4x2xf32 --md 2:4x8x4x4xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case vecmat --md 0:512xf32 --md 1:512x64xf32 --md 2:64xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:16x512x32xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_matmul_transpose_a --md 0:16x512x64xf32 --md 1:16x512x32xf32 --md 2:16x64x32xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_matmul_transpose_b --md 0:16x512x64xf32 --md 1:16x128x64xf32 --md 2:16x512x128xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_matvec --md 0:16x512x64xf32 --md 1:16x64xf32 --md 2:16x512xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_mmt4d --md 0:4x4x8x4x2xf32 --md 
1:4x8x8x4x2xf32 --md 2:4x4x8x4x4xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_reduce_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:512x32xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case batch_vecmat --md 0:16x64xf32 --md 1:16x64x512xf32 --md 2:16x512xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case dot --md 0:4096xf32 --md 1:4096xf32 --md 2:0xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case matmul --md 0:1024x512xf32 --md 1:512x512xf32 --md 2:1024x512xf32 --cast cast_signed || FAIL=1 +python3 -m benchgc --driver linalg --case matmul_transpose_a --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:512x512xf32 --cast cast_signed || FAIL=1 +python3 -m benchgc --driver linalg --case matmul_transpose_b --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:1024x1024xf32 --cast cast_signed || FAIL=1 +python3 -m benchgc --driver linalg --case matvec --md 0:512x64xf32 --md 1:64xf32 --md 2:512xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case mmt4d --md 0:4x8x4x2xf32 --md 1:8x8x4x2xf32 --md 2:4x8x4x4xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case vecmat --md 0:512xf32 --md 1:512x64xf32 --md 2:64xf32 || FAIL=1 # binary -python3 -m benchgc --verbose 0 --driver linalg --case add --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case sub --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case mul --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case div --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case max --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case min --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1 +python3 -m benchgc 
--driver linalg --case add --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case sub --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case mul --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case div --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case max --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case min --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1 # element wise -python3 -m benchgc --verbose 0 --driver linalg --case abs --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case ceil --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case erf --md 0:1024x512xf32 --md 1:1024x512xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case floor --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case log --md 0:4096x32xf32 --md 1:4096x32xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case negf --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case exp --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case round --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -# python3 -m benchgc --verbose 0 --driver linalg --case rsqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case sqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case square --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case tanh --md 0:128x128xf32 --md 
1:128x128xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case abs --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case ceil --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case erf --md 0:1024x512xf32 --md 1:1024x512xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case floor --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case log --md 0:4096x32xf32 --md 1:4096x32xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case negf --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case exp --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case round --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +# python3 -m benchgc --driver linalg --case rsqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case sqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case square --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case tanh --md 0:128x128xf32 --md 1:128x128xf32 || FAIL=1 # conv -python3 -m benchgc --verbose 0 --driver linalg --case conv_1d_ncw_fcw --md 0:4x4x32xf32 --md 1:8x4x4xf32 --md 2:4x8x13xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_1d_nwc_wcf --md 0:4x32x4xf32 --md 1:4x4x8xf32 --md 2:4x13x8xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_1d --md 0:32xf32 --md 1:4xf32 --md 2:29xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nchw_fchw --md 0:4x4x32x32xf32 --md 1:8x4x4x4xf32 --md 2:4x8x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_ngchw_fgchw --md 0:4x2x2x32x32xf32 --md 1:4x2x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 
--dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_ngchw_gfchw --md 0:4x2x2x32x32xf32 --md 1:2x4x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nhwc_fhwc --md 0:4x32x32x4xf32 --md 1:8x4x4x4xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nhwc_hwcf --md 0:4x32x32x4xf32 --md 1:4x4x4x8xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_2d --md 0:32x32xf32 --md 1:4x4xf32 --md 2:29x29xf32 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_3d_ncdhw_fcdhw --md 0:4x4x32x32x32xf32 --md 1:8x4x4x4x4xf32 --md 2:4x8x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_3d_ndhwc_dhwcf --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x8xf32 --md 2:4x13x13x13x8xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case conv_3d --md 0:32x32x32xf32 --md 1:4x4x4xf32 --md 2:29x29x29xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_1d_ncw_fcw --md 0:4x4x32xf32 --md 1:8x4x4xf32 --md 2:4x8x13xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_1d_nwc_wcf --md 0:4x32x4xf32 --md 1:4x4x8xf32 --md 2:4x13x8xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_1d --md 0:32xf32 --md 1:4xf32 --md 2:29xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d_nchw_fchw --md 0:4x4x32x32xf32 --md 1:8x4x4x4xf32 --md 2:4x8x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d_ngchw_fgchw --md 
0:4x2x2x32x32xf32 --md 1:4x2x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d_ngchw_gfchw --md 0:4x2x2x32x32xf32 --md 1:2x4x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d_nhwc_fhwc --md 0:4x32x32x4xf32 --md 1:8x4x4x4xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d_nhwc_hwcf --md 0:4x32x32x4xf32 --md 1:4x4x4x8xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_2d --md 0:32x32xf32 --md 1:4x4xf32 --md 2:29x29xf32 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_3d_ncdhw_fcdhw --md 0:4x4x32x32x32xf32 --md 1:8x4x4x4x4xf32 --md 2:4x8x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_3d_ndhwc_dhwcf --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x8xf32 --md 2:4x13x13x13x8xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case conv_3d --md 0:32x32x32xf32 --md 1:4x4x4xf32 --md 2:29x29x29xf32 || FAIL=1 # depthwise conv -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_ncw_cw --md 0:4x4x32xf32 --md 1:4x4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_nwc_wc --md 0:4x32x4xf32 --md 1:4x4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_nwc_wcm --md 0:4x32x4xf32 --md 1:4x4x3xf32 --md 2:4x13x4x3xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nchw_chw --md 0:4x4x32x32xf32 --md 1:4x4x4xf32 --md 
2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nhwc_hwc --md 0:4x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nhwc_hwcm --md 0:4x32x32x4xf32 --md 1:4x4x4x3xf32 --md 2:4x13x13x4x3xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ncdhw_cdhw --md 0:4x4x32x32x32xf32 --md 1:4x4x4x4xf32 --md 2:4x4x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ndhwc_dhwc --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ndhwc_dhwcm --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x3xf32 --md 2:4x13x13x13x4x3xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_1d_ncw_cw --md 0:4x4x32xf32 --md 1:4x4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_1d_nwc_wc --md 0:4x32x4xf32 --md 1:4x4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_1d_nwc_wcm --md 0:4x32x4xf32 --md 1:4x4x3xf32 --md 2:4x13x4x3xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_2d_nchw_chw --md 0:4x4x32x32xf32 --md 1:4x4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_2d_nhwc_hwc --md 0:4x32x32x4xf32 --md 1:4x4x4xf32 --md 
2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_2d_nhwc_hwcm --md 0:4x32x32x4xf32 --md 1:4x4x4x3xf32 --md 2:4x13x13x4x3xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_3d_ncdhw_cdhw --md 0:4x4x32x32x32xf32 --md 1:4x4x4x4xf32 --md 2:4x4x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_3d_ndhwc_dhwc --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case depthwise_conv_3d_ndhwc_dhwcm --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x3xf32 --md 2:4x13x13x13x4x3xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 # pool -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nchw_max --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nchw_sum --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_ncw_max --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_ncw_sum --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_ndhwc_max --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_ndhwc_sum --md 0:4x32x32x32x4xf32 --md 
1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_max --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_sum --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_min --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_max --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_sum --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 -python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_min --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nchw_max --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nchw_sum --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_ncw_max --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_ncw_sum --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_ndhwc_max --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 
2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_ndhwc_sum --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nhwc_max --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nhwc_sum --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nhwc_min --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nwc_max --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nwc_sum --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 +python3 -m benchgc --driver linalg --case pooling_nwc_min --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1 # generic -python3 -m benchgc --verbose 0 --driver mlir --case ${CASE_DIR}/generic.mlir || FAIL=1 +python3 -m benchgc --driver mlir --case ${CASE_DIR}/generic.mlir || FAIL=1 # softmax -# python3 -m benchgc --verbose 0 --driver linalg --case softmax --md 0:32x4096xf32 --md 1:32x4096xf32 --dimension 1 || FAIL=1 +# python3 -m benchgc --driver linalg --case softmax --md 0:32x4096xf32 --md 1:32x4096xf32 --dimension 1 || FAIL=1 # mlir -# python3 -m benchgc --verbose 0 --driver mlir --case ${CASE_DIR}/llama2.mlir || FAIL=1 +# python3 -m benchgc --driver mlir --case ${CASE_DIR}/llama2.mlir || FAIL=1 #mlp -python3 -m benchgc --verbose 1 --driver pattern --case mlp --batch_size=32 --hidden_size_list=32x16x64 --has_bias=1x1 --act_type=noop --dtype=f32 +python3 -m benchgc --driver pattern 
--case mlp --batch_size=32 --hidden_size_list=32x16x64 --has_bias=1x1 --act_type=noop --dtype=f32 set +e exit $FAIL \ No newline at end of file From 662cb3b7a93eaf595c70f45954b243c48c10c10d Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 24 Sep 2024 00:46:42 -0700 Subject: [PATCH 07/14] fix --- test/benchgc/src/benchgc/tuner/op_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index 0bb6151d5..e7a8ecba6 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -52,7 +52,7 @@ def find_factors(num): class MatMulConfig(Config): def __init__( self, - op: OpView = None, + op: OpView, M_threads: int = 1, K_threads: int = 1, N_threads: int = 1, From 76350446c03a3e85bec5d80acb4807ceae7a5d86 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Mon, 7 Oct 2024 19:43:59 -0700 Subject: [PATCH 08/14] fix config field name --- test/benchgc/src/benchgc/tuner/README.md | 64 ++++----- test/benchgc/src/benchgc/tuner/op_config.py | 146 ++++++++++---------- 2 files changed, 105 insertions(+), 105 deletions(-) diff --git a/test/benchgc/src/benchgc/tuner/README.md b/test/benchgc/src/benchgc/tuner/README.md index 94cfceaf0..7baf5dd9b 100644 --- a/test/benchgc/src/benchgc/tuner/README.md +++ b/test/benchgc/src/benchgc/tuner/README.md @@ -80,15 +80,15 @@ Tuning ends in 26.26677966117859 s Best cost: 0.025292858481407166 ms Best config: [{ "MatMulConfig": { - "M_threads": 1, - "K_threads": 1, - "N_threads": 1, - "M_block": 64, - "K_block": 32, - "N_block": 64, - "innermostM_block": 16, - "innermostK_block": 16, - "innermostN_block": 16 + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 128, + "KBlock": 64, + "NBlock": 16, + "innerMostMBlock": 32, + "innerMostKBlock": 16, + "innerMostNBlock": 16 } }] mlir: @@ -97,7 +97,7 @@ mlir: %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<128x128xf32> %1 = 
linalg.fill ins(%cst : f32) outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32> - %2 = linalg.matmul {KBlock = 32 : i32, Kthreads = 1 : i32, MBlock = 64 : i32, Mthreads = 1 : i32, NBlock = 64 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32> + %2 = linalg.matmul {KBlock = 64 : i32, KThreads = 1 : i32, MBlock = 128 : i32, MThreads = 1 : i32, NBlock = 16 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 16 : i32, innerMostMBlock = 32 : i32, innerMostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32> return %2 : tensor<128x128xf32> } } @@ -117,31 +117,31 @@ OMP_NUM_THREADS=1 python -m benchgc --mode T --driver pattern --case mlp --batch [ 400 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms [ 405 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms Tuner returns empty batch, early stop now -Tuning ends in 80.21396946907043 s -Best cost: 0.006834045052528381 ms +Tuning ends in 80.10290145874023 s +Best cost: 0.006632879376411438 ms Best config: [{ "MatMulConfig": { - "M_threads": 1, - "K_threads": 1, - "N_threads": 1, - "M_block": 32, - "K_block": 16, - "N_block": 32, - "innermostM_block": 16, - "innermostK_block": 16, - "innermostN_block": 32 + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 32, + "KBlock": 16, + "NBlock": 32, + "innerMostMBlock": 32, + "innerMostKBlock": 16, + "innerMostNBlock": 16 } }, { "MatMulConfig": { - "M_threads": 1, - "K_threads": 1, - "N_threads": 1, - "M_block": 32, - "K_block": 32, - "N_block": 64, - "innermostM_block": 16, - "innermostK_block": 16, - "innermostN_block": 32 + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 32, + "KBlock": 32, + "NBlock": 16, + "innerMostMBlock": 16, + "innerMostKBlock": 32, + 
"innerMostNBlock": 16 } }] mlir: @@ -150,7 +150,7 @@ mlir: %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<32x32xf32> %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32x32xf32>) -> tensor<32x32xf32> - %2 = linalg.matmul {KBlock = 16 : i32, Kthreads = 1 : i32, MBlock = 32 : i32, Mthreads = 1 : i32, NBlock = 32 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 32 : i32} ins(%arg0, %arg1 : tensor<32x16xf32>, tensor<16x32xf32>) outs(%1 : tensor<32x32xf32>) -> tensor<32x32xf32> + %2 = linalg.matmul {KBlock = 16 : i32, KThreads = 1 : i32, MBlock = 32 : i32, MThreads = 1 : i32, NBlock = 32 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 16 : i32, innerMostMBlock = 32 : i32, innerMostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<32x16xf32>, tensor<16x32xf32>) outs(%1 : tensor<32x32xf32>) -> tensor<32x32xf32> %3 = tensor.empty() : tensor<32x32xf32> %broadcasted = linalg.broadcast ins(%arg3 : tensor<32xf32>) outs(%3 : tensor<32x32xf32>) dimensions = [0] %4 = tensor.empty() : tensor<32x32xf32> @@ -160,7 +160,7 @@ mlir: %7 = linalg.max ins(%5, %cst_0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> %8 = tensor.empty() : tensor<32x64xf32> %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x64xf32>) -> tensor<32x64xf32> - %10 = linalg.matmul {KBlock = 32 : i32, Kthreads = 1 : i32, MBlock = 32 : i32, Mthreads = 1 : i32, NBlock = 64 : i32, Nthreads = 1 : i32, cast = #linalg.type_fn, innermostKBlock = 16 : i32, innermostMBlock = 16 : i32, innermostNBlock = 32 : i32} ins(%7, %arg2 : tensor<32x32xf32>, tensor<32x64xf32>) outs(%9 : tensor<32x64xf32>) -> tensor<32x64xf32> + %10 = linalg.matmul {KBlock = 32 : i32, KThreads = 1 : i32, MBlock = 32 : i32, MThreads = 1 : i32, NBlock = 16 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 32 : i32, innerMostMBlock = 16 : i32, innerMostNBlock = 16 : i32} ins(%7, 
%arg2 : tensor<32x32xf32>, tensor<32x64xf32>) outs(%9 : tensor<32x64xf32>) -> tensor<32x64xf32> %11 = tensor.empty() : tensor<32x64xf32> %broadcasted_1 = linalg.broadcast ins(%arg4 : tensor<64xf32>) outs(%11 : tensor<32x64xf32>) dimensions = [0] %12 = tensor.empty() : tensor<32x64xf32> diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index e7a8ecba6..d7d4039b3 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -53,101 +53,101 @@ class MatMulConfig(Config): def __init__( self, op: OpView, - M_threads: int = 1, - K_threads: int = 1, - N_threads: int = 1, - M_block: int = 1, - K_block: int = 1, - N_block: int = 1, - innermostM_block: int = 1, - innermostK_block: int = 1, - innermostN_block: int = 1, + MThreads: int = 1, + KThreads: int = 1, + NThreads: int = 1, + MBlock: int = 1, + KBlock: int = 1, + NBlock: int = 1, + innerMostMBlock: int = 1, + innerMostKBlock: int = 1, + innerMostNBlock: int = 1, ): # you can set the default value and candidates by info from matmul_op - self.M = op.inputs[0].type.shape[0] - self.K = op.inputs[0].type.shape[1] - self.N = op.inputs[1].type.shape[1] + self.m = op.inputs[0].type.shape[0] + self.k = op.inputs[0].type.shape[1] + self.n = op.inputs[1].type.shape[1] # self.input_a_dtype = str(op.inputs[0].type.element_type) self.num_threads = int(os.environ.get("OMP_NUM_THREADS", 1)) - self.M_threads = M_threads - self.K_threads = K_threads - self.N_threads = N_threads - self.M_block = M_block - self.K_block = K_block - self.N_block = N_block - self.innermostM_block = innermostM_block - self.innermostK_block = innermostK_block - self.innermostN_block = innermostN_block + self.m_threads = MThreads + self.k_threads = KThreads + self.n_threads = NThreads + self.m_block = MBlock + self.k_block = KBlock + self.n_block = NBlock + self.innermost_m_block = innerMostMBlock + self.innermost_k_block = innerMostKBlock + 
self.innermost_n_block = innerMostNBlock super().__init__() def init_candidates(self): default_blocks = [16, 32, 64, 128, 256, 512] default_innermost_blocks = [16, 32] - self.field_candidates["M_threads"] = find_factors(self.num_threads) - self.field_candidates["K_threads"] = find_factors(self.num_threads) - self.field_candidates["N_threads"] = find_factors(self.num_threads) - self.field_candidates["M_block"] = [ - block for block in default_blocks if self.M >= block + self.field_candidates["m_threads"] = find_factors(self.num_threads) + self.field_candidates["k_threads"] = find_factors(self.num_threads) + self.field_candidates["n_threads"] = find_factors(self.num_threads) + self.field_candidates["m_block"] = [ + block for block in default_blocks if self.m >= block ] - self.field_candidates["K_block"] = [ - block for block in default_blocks if self.K >= block + self.field_candidates["k_block"] = [ + block for block in default_blocks if self.k >= block ] - self.field_candidates["N_block"] = [ - block for block in default_blocks if self.N >= block + self.field_candidates["n_block"] = [ + block for block in default_blocks if self.n >= block ] - self.field_candidates["innermostM_block"] = [ - block for block in default_innermost_blocks if self.M >= block + self.field_candidates["innermost_m_block"] = [ + block for block in default_innermost_blocks if self.m >= block ] - self.field_candidates["innermostK_block"] = [ - block for block in default_innermost_blocks if self.K >= block + self.field_candidates["innermost_k_block"] = [ + block for block in default_innermost_blocks if self.k >= block ] - self.field_candidates["innermostN_block"] = [ - block for block in default_innermost_blocks if self.N >= block + self.field_candidates["innermost_n_block"] = [ + block for block in default_innermost_blocks if self.n >= block ] def init_constraints(self): # example: using lambda to add constraints, adding constraints by the order of the fields - 
self.field_constraints["M_threads"] = None - self.field_constraints["K_threads"] = ( - lambda MatMulConfig, K_threads: self.num_threads - % (MatMulConfig.M_threads * K_threads) + self.field_constraints["m_threads"] = None + self.field_constraints["k_threads"] = ( + lambda MatMulConfig, k_threads: self.num_threads + % (MatMulConfig.m_threads * k_threads) == 0 ) - self.field_constraints["N_threads"] = ( - lambda MatMulConfig, N_threads: self.num_threads - % (MatMulConfig.M_threads * MatMulConfig.K_threads * N_threads) + self.field_constraints["n_threads"] = ( + lambda MatMulConfig, n_threads: self.num_threads + % (MatMulConfig.m_threads * MatMulConfig.k_threads * n_threads) == 0 ) - self.field_constraints["M_block"] = None - self.field_constraints["K_block"] = None - self.field_constraints["N_block"] = None - self.field_constraints["innermostM_block"] = ( - lambda MatMulConfig, innermostM_block: MatMulConfig.M_block - % innermostM_block + self.field_constraints["m_block"] = None + self.field_constraints["k_block"] = None + self.field_constraints["n_block"] = None + self.field_constraints["innermost_m_block"] = ( + lambda MatMulConfig, innermost_m_block: MatMulConfig.m_block + % innermost_m_block == 0 ) - self.field_constraints["innermostK_block"] = ( - lambda MatMulConfig, innermostK_block: MatMulConfig.K_block - % innermostK_block + self.field_constraints["innermost_k_block"] = ( + lambda MatMulConfig, innermost_k_block: MatMulConfig.k_block + % innermost_k_block == 0 ) - self.field_constraints["innermostN_block"] = ( - lambda MatMulConfig, innermostN_block: MatMulConfig.N_block - % innermostN_block + self.field_constraints["innermost_n_block"] = ( + lambda MatMulConfig, innermost_n_block: MatMulConfig.n_block + % innermost_n_block == 0 ) def attach_to_ir(self, op: OpView): attr_to_field = { - "Mthreads": self.M_threads, - "Kthreads": self.K_threads, - "Nthreads": self.N_threads, - "MBlock": self.M_block, - "KBlock": self.K_block, - "NBlock": self.N_block, - 
"innermostMBlock": self.innermostM_block, - "innermostKBlock": self.innermostK_block, - "innermostNBlock": self.innermostN_block, + "MThreads": self.m_threads, + "KThreads": self.k_threads, + "NThreads": self.n_threads, + "MBlock": self.m_block, + "KBlock": self.k_block, + "NBlock": self.n_block, + "innerMostMBlock": self.innermost_m_block, + "innerMostKBlock": self.innermost_k_block, + "innerMostNBlock": self.innermost_n_block, } for name, value in attr_to_field.items(): op.attributes[name] = IntegerAttr.get(T.i32(), value) @@ -158,15 +158,15 @@ def __repr__(self) -> str: def __str__(self) -> str: obj_dict = { "MatMulConfig": { - "M_threads": self.M_threads, - "K_threads": self.K_threads, - "N_threads": self.N_threads, - "M_block": self.M_block, - "K_block": self.K_block, - "N_block": self.N_block, - "innermostM_block": self.innermostM_block, - "innermostK_block": self.innermostK_block, - "innermostN_block": self.innermostN_block, + "MThreads": self.m_threads, + "KThreads": self.k_threads, + "NThreads": self.n_threads, + "MBlock": self.m_block, + "KBlock": self.k_block, + "NBlock": self.n_block, + "innerMostMBlock": self.innermost_m_block, + "innerMostKBlock": self.innermost_k_block, + "innerMostNBlock": self.innermost_n_block, } } return json.dumps(obj_dict, indent=4) From 07b698c9d44baa9227253023f31d331cb4379dd6 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Mon, 14 Oct 2024 00:02:49 -0700 Subject: [PATCH 09/14] add check matmul config from binding --- python/CMakeLists.txt | 7 +++ python/gc_mlir/tools/__init__.py | 2 + test/benchgc/src/benchgc/__main__.py | 6 ++- test/benchgc/src/benchgc/tuner/op_config.py | 40 +++++++++++++++++- test/benchgc/src/benchgc/tuner/tuner.py | 47 +++++++++++++++++---- 5 files changed, 90 insertions(+), 12 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 355aba91f..0c83ec158 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -95,6 +95,13 @@ 
declare_mlir_python_extension(GcPythonSources.CpuInfoExtension CPUInfo.cpp ) +declare_mlir_python_extension(GcPythonSources.ToolsExtension + MODULE_NAME _tools + ADD_TO_PARENT GcPythonSources + SOURCES + Tools.cpp +) + ################################################################################ # Common CAPI ################################################################################ diff --git a/python/gc_mlir/tools/__init__.py b/python/gc_mlir/tools/__init__.py index 172887970..ee2856a1a 100644 --- a/python/gc_mlir/tools/__init__.py +++ b/python/gc_mlir/tools/__init__.py @@ -5,3 +5,5 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # # ===-----------------------------------------------------------------------===# + +from .._mlir_libs._tools import validate_matmul_config \ No newline at end of file diff --git a/test/benchgc/src/benchgc/__main__.py b/test/benchgc/src/benchgc/__main__.py index 936e25d72..eaeeaf58b 100644 --- a/test/benchgc/src/benchgc/__main__.py +++ b/test/benchgc/src/benchgc/__main__.py @@ -494,8 +494,10 @@ def tuner_batch_bench(ir_moudles): flags.early_stop, flags.checkpoint_path, flags.tuner_verbose, - random_seed=flags.random_seed, - expected_tune_num=flags.expected_tune_num, + flags.ga_elite_num, + flags.ga_mutation_prob, + random_seed=flags.ga_random_seed, + expected_tune_num=flags.ga_expected_tune_num, ) tuner.run(flags.max_tuning_iters, flags.timeout) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index d7d4039b3..1b7f545cd 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -21,6 +21,7 @@ from gc_mlir.extras import types as T from gc_mlir.ir import IntegerAttr, OpView +from gc_mlir.tools import validate_matmul_config class Config: @@ -39,6 +40,9 @@ def init_constraints(self): def attach_to_ir(self, op: OpView): pass + def verify(self) -> bool: + pass + def find_factors(num): factors = set() @@ -67,7 +71,7 @@ 
def __init__( self.m = op.inputs[0].type.shape[0] self.k = op.inputs[0].type.shape[1] self.n = op.inputs[1].type.shape[1] - # self.input_a_dtype = str(op.inputs[0].type.element_type) + self.input_a_dtype = str(op.inputs[0].type.element_type) self.num_threads = int(os.environ.get("OMP_NUM_THREADS", 1)) self.m_threads = MThreads self.k_threads = KThreads @@ -137,6 +141,26 @@ def init_constraints(self): == 0 ) + def verify(self): + allow_indivisible_innerblock = False + is_vnni_mm2d = True if self.input_a_dtype == "bf16" else False + return validate_matmul_config( + [ + self.m_threads, + self.k_threads, + self.n_threads, + self.m_block, + self.k_block, + self.n_block, + self.innermost_m_block, + self.innermost_k_block, + self.innermost_n_block, + ], + [self.m, self.k, self.n], + allow_indivisible_innerblock, + is_vnni_mm2d, + ) + def attach_to_ir(self, op: OpView): attr_to_field = { "MThreads": self.m_threads, @@ -153,7 +177,19 @@ def attach_to_ir(self, op: OpView): op.attributes[name] = IntegerAttr.get(T.i32(), value) def __repr__(self) -> str: - return self.__str__() + return str( + [ + self.m_threads, + self.k_threads, + self.n_threads, + self.m_block, + self.k_block, + self.n_block, + self.innermost_m_block, + self.innermost_k_block, + self.innermost_n_block, + ] + ) def __str__(self) -> str: obj_dict = { diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py index 7c87ff0b7..f827f895a 100644 --- a/test/benchgc/src/benchgc/tuner/tuner.py +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -85,8 +85,17 @@ def verify_config(self, candidate_idx, val) -> bool: constraint = self.flatten_constraints[candidate_idx] val = self.flatten_candidates[candidate_idx][val] setattr(config, field_name, val) - if constraint: - return constraint(config, val) + if constraint and (not constraint(config, val)): + return False + # verify the config when it has all fields + if (candidate_idx + 1) == len( + self.flatten_candidates + ) or 
self.ind_candidate_to_config[ + candidate_idx + 1 + ] != self.ind_candidate_to_config[ + candidate_idx + ]: + return config.verify() return True def filter_next_candidates(self, candidate_idx, val) -> List[int]: @@ -186,7 +195,7 @@ def tuner_finish(self, tuning_time): print("Tuning ends in", tuning_time, "s") best_config = self.tunning_space.make_config_from_indexes(self.best) print("Best cost:", self.best_cost, "ms") - print("Best config:", best_config) + print("Best config:", str(best_config)) attach_configs_to_ir(self.tunning_space.initial_ir, best_config) print( "mlir:\n", @@ -218,8 +227,6 @@ def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT) old_iter = self.iter self.iter += len(config_indices_batch) - if self.tuner_verbose: - print("config_indices_batch:", config_indices_batch) perf_result = [] ir_modules = [] for config_indexes in config_indices_batch: @@ -235,6 +242,15 @@ def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT) ir_modules.append(new_ir) res = self.batch_executor(ir_modules) perf_result = [item[1] for item in res] + # print the perf result of each config + if self.tuner_verbose: + for i, config_indexes in enumerate(config_indices_batch): + real_config = self.tunning_space.make_config_from_indexes( + config_indexes + ) + perf_to_cfg = {"cost": perf_result[i], "cfg": repr(real_config)} + print(json.dumps(perf_to_cfg)) + old_best = self.best_cost self.tuner_update(config_indices_batch, perf_result) print( @@ -313,7 +329,10 @@ def get_next_config_indices_batch(self) -> list: if valid_config_idx: config_indices_batch.append(config_ids) if self.tuner_verbose: - print(self.tunning_space.make_config_from_indexes(config_ids)) + print( + "find valid config", + self.tunning_space.make_config_from_indexes(config_ids), + ) else: self.skipped_num += 1 if self.tuner_verbose: @@ -381,7 +400,14 @@ def __init__( random_seed: int = DEFAULT_RANDOM_SEED, expected_tune_num: int = DEFAULT_EXPECTED_TUNE_NUM, ): 
- super().__init__(batch_executor, tuning_space, pop_size, early_stop, checkpoint) + super().__init__( + batch_executor, + tuning_space, + pop_size, + early_stop, + checkpoint, + tuner_verbose, + ) self.elite_num = min(elite_num, pop_size) self.mutation_prob = mutation_prob self.pop_size = pop_size @@ -486,6 +512,11 @@ def push_to_tune(self, to_tune, gene) -> bool: ) return False + graph_cfg = self.tunning_space.make_config_from_indexes(gene) + for cfg in graph_cfg: + if not cfg.verify(): + return False + to_tune.append(gene) self.cur_mutation_prob = GATuner.update_mutation_prob( self.cur_mutation_prob, self.mutation_prob, False @@ -504,7 +535,7 @@ def get_next_config_indices_batch(self) -> list: self.get_next_config(prob_range, to_tune) if self.tuner_verbose: - print("to_tune", to_tune) + print("to_tune list:") for to_tune_config in to_tune: print(self.tunning_space.make_config_from_indexes(to_tune_config)) From d7e952e839dbaa6196e644e0e0370d7311b3ad97 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Mon, 14 Oct 2024 00:15:32 -0700 Subject: [PATCH 10/14] fix --- python/Tools.cpp | 40 +++++++++++++++++++++++++ test/benchgc/src/benchgc/tuner/tuner.py | 4 ++- 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 python/Tools.cpp diff --git a/python/Tools.cpp b/python/Tools.cpp new file mode 100644 index 000000000..4953779b3 --- /dev/null +++ b/python/Tools.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "gc/Analysis/MatmulConfigAnalysis.h" +#include "mlir/Bindings/Python/PybindAdaptors.h" + +PYBIND11_MODULE(_tools, m) { + + m.def( + "validate_matmul_config", + [](const std::vector &cfg_list, std::vector &shape, + bool allow_indivisible_innerblock, bool is_vnni_mm2d) { + if (cfg_list.size() != 9) { + throw std::invalid_argument("cfg_list must have exactly 9 elements"); + } + mlir::gc::MatmulConfig cfg{cfg_list[0], cfg_list[1], cfg_list[2], + cfg_list[3], cfg_list[4], cfg_list[5], + cfg_list[6], cfg_list[7], cfg_list[8]}; + return mlir::gc::validateConfig( + cfg, shape, allow_indivisible_innerblock, is_vnni_mm2d); + }, + py::arg("cfg_list"), py::arg("shape"), + py::arg("allow_indivisible_innerblock"), py::arg("is_vnni_mm2d"), + "Validate the matmul configuration"); +} \ No newline at end of file diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py index f827f895a..7e7f6a23f 100644 --- a/test/benchgc/src/benchgc/tuner/tuner.py +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -195,7 +195,7 @@ def tuner_finish(self, tuning_time): print("Tuning ends in", tuning_time, "s") best_config = self.tunning_space.make_config_from_indexes(self.best) print("Best cost:", self.best_cost, "ms") - print("Best config:", str(best_config)) + print("Best config:", [str(single_cfg) for single_cfg in best_config]) attach_configs_to_ir(self.tunning_space.initial_ir, best_config) print( "mlir:\n", @@ -240,6 +240,8 @@ def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT) ) attach_configs_to_ir(new_ir, real_config) ir_modules.append(new_ir) + if self.tuner_verbose: + print("start to execute the batch of configs ...") res = self.batch_executor(ir_modules) perf_result = [item[1] for item in res] # print the perf result of each config From 
f64715e63707ef0899ae79183275e2dc528f7cea Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Tue, 15 Oct 2024 09:34:29 +0800 Subject: [PATCH 11/14] fix name --- test/benchgc/src/benchgc/tuner/op_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index 1b7f545cd..ca6368353 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -156,7 +156,7 @@ def verify(self): self.innermost_k_block, self.innermost_n_block, ], - [self.m, self.k, self.n], + [self.m, self.n, self.k], allow_indivisible_innerblock, is_vnni_mm2d, ) From afa0b41929b449fb696b18fe0c0f1ea4cb8fdacc Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Mon, 14 Oct 2024 19:21:28 -0700 Subject: [PATCH 12/14] fix order of m n k --- test/benchgc/src/benchgc/tuner/op_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index ca6368353..e990fb996 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -147,14 +147,14 @@ def verify(self): return validate_matmul_config( [ self.m_threads, - self.k_threads, self.n_threads, + self.k_threads, self.m_block, - self.k_block, self.n_block, + self.k_block, self.innermost_m_block, - self.innermost_k_block, self.innermost_n_block, + self.innermost_k_block, ], [self.m, self.n, self.k], allow_indivisible_innerblock, From 085ef357d8936577c1695b7af43013295d93eec6 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Thu, 17 Oct 2024 10:12:58 +0800 Subject: [PATCH 13/14] fix attr name --- lib/gc/Analysis/MatmulConfigAnalysis.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/gc/Analysis/MatmulConfigAnalysis.cpp b/lib/gc/Analysis/MatmulConfigAnalysis.cpp index 65758ee2e..539f046f8 100644 --- a/lib/gc/Analysis/MatmulConfigAnalysis.cpp +++ 
b/lib/gc/Analysis/MatmulConfigAnalysis.cpp @@ -386,13 +386,13 @@ bool readConfigFromAttrs(MatmulConfig &config, ArrayRef attrs) { } else if (attr.getName() == "MThreads") { config.MThreads = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innermostMBlock") { + } else if (attr.getName() == "innerMostMBlock") { config.innerMostMBlock = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innermostNBlock") { + } else if (attr.getName() == "innerMostNBlock") { config.innerMostNBlock = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innermostKBlock") { + } else if (attr.getName() == "innerMostKBlock") { config.innerMostKBlock = cast(attr.getValue()).getInt(); cfgItemCnt++; } From 07a94f03394f17caaa5b3157c85e1cbee07131e3 Mon Sep 17 00:00:00 2001 From: "Xu, Rui" Date: Thu, 17 Oct 2024 10:21:22 +0800 Subject: [PATCH 14/14] FIX --- lib/gc/Analysis/MatmulConfigAnalysis.cpp | 6 +++--- test/benchgc/src/benchgc/tuner/op_config.py | 21 +++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/lib/gc/Analysis/MatmulConfigAnalysis.cpp b/lib/gc/Analysis/MatmulConfigAnalysis.cpp index 539f046f8..65758ee2e 100644 --- a/lib/gc/Analysis/MatmulConfigAnalysis.cpp +++ b/lib/gc/Analysis/MatmulConfigAnalysis.cpp @@ -386,13 +386,13 @@ bool readConfigFromAttrs(MatmulConfig &config, ArrayRef attrs) { } else if (attr.getName() == "MThreads") { config.MThreads = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innerMostMBlock") { + } else if (attr.getName() == "innermostMBlock") { config.innerMostMBlock = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innerMostNBlock") { + } else if (attr.getName() == "innermostNBlock") { config.innerMostNBlock = cast(attr.getValue()).getInt(); cfgItemCnt++; - } else if (attr.getName() == "innerMostKBlock") { + } else if (attr.getName() == "innermostKBlock") { config.innerMostKBlock = 
cast(attr.getValue()).getInt(); cfgItemCnt++; } diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py index e990fb996..49b925b9c 100644 --- a/test/benchgc/src/benchgc/tuner/op_config.py +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ -169,9 +169,9 @@ def attach_to_ir(self, op: OpView): "MBlock": self.m_block, "KBlock": self.k_block, "NBlock": self.n_block, - "innerMostMBlock": self.innermost_m_block, - "innerMostKBlock": self.innermost_k_block, - "innerMostNBlock": self.innermost_n_block, + "innermostMBlock": self.innermost_m_block, + "innermostKBlock": self.innermost_k_block, + "innermostNBlock": self.innermost_n_block, } for name, value in attr_to_field.items(): op.attributes[name] = IntegerAttr.get(T.i32(), value) @@ -180,29 +180,30 @@ def __repr__(self) -> str: return str( [ self.m_threads, - self.k_threads, self.n_threads, + self.k_threads, self.m_block, - self.k_block, self.n_block, + self.k_block, self.innermost_m_block, - self.innermost_k_block, self.innermost_n_block, + self.innermost_k_block, ] ) def __str__(self) -> str: obj_dict = { "MatMulConfig": { - "MThreads": self.m_threads, - "KThreads": self.k_threads, + "MThreads": self.m_threads, "NThreads": self.n_threads, + "KThreads": self.k_threads, "MBlock": self.m_block, - "KBlock": self.k_block, "NBlock": self.n_block, + "KBlock": self.k_block, "innerMostMBlock": self.innermost_m_block, - "innerMostKBlock": self.innermost_k_block, "innerMostNBlock": self.innermost_n_block, + "innerMostKBlock": self.innermost_k_block, + } } return json.dumps(obj_dict, indent=4)