[DEV][TL] Support AMD Matrix Code Implementation #237

Merged Nov 7, 2024 (23 commits)

Commits
c4853ec
Refactor Simplify function to handle multiple functions in IRModule
LeiWang1999 Oct 16, 2024
9a21acf
Update submodule commit reference
LeiWang1999 Oct 17, 2024
f8d046b
Add CUDA_DEVICE_ORDER environment variable to bashrc
LeiWang1999 Oct 17, 2024
c1371dd
test fix
LeiWang1999 Oct 17, 2024
416cad2
lint fix
LeiWang1999 Oct 17, 2024
9209d1e
Refactor test_general_matmul_bf16.py to use bitblas.testing.main()
LeiWang1999 Oct 17, 2024
1cf7570
Update submodule commit reference
LeiWang1999 Oct 17, 2024
5fec040
Update Ubuntu version in install scripts based on LLVM version
LeiWang1999 Oct 18, 2024
4e1a0d2
Update Ubuntu version in install scripts based on LLVM version
LeiWang1999 Oct 18, 2024
fa85f8c
Update submodule commit reference
LeiWang1999 Oct 19, 2024
429d5b5
Update submodule commit reference
LeiWang1999 Oct 19, 2024
4003509
Update submodule commit reference
LeiWang1999 Oct 20, 2024
1d86582
Merge branch 'main' of https://github.com/microsoft/BitBLAS into amd_hip
LeiWang1999 Oct 20, 2024
df3af0d
Update submodule commit reference
LeiWang1999 Oct 28, 2024
1f1e027
Merge branch 'main' of https://github.com/microsoft/BitBLAS into amd_hip
LeiWang1999 Oct 28, 2024
732dda6
Update submodule commit reference
LeiWang1999 Oct 29, 2024
ebffbfa
Merge branch 'main' of https://github.com/microsoft/BitBLAS into amd_hip
LeiWang1999 Oct 29, 2024
ff227fa
Merge branch 'main' of https://github.com/microsoft/BitBLAS into amd_hip
LeiWang1999 Nov 4, 2024
ac62936
[Dev] Update subproject commit for TVM
LeiWang1999 Nov 7, 2024
a7a239c
ignore profiler directories.
LeiWang1999 Nov 7, 2024
dcedbde
MFMA Support
LeiWang1999 Nov 7, 2024
e0b36f5
lint fix
LeiWang1999 Nov 7, 2024
fe668f9
Merge branch 'main' of https://github.com/microsoft/BitBLAS into amd_hip
LeiWang1999 Nov 7, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -76,3 +76,6 @@ models/frozenmodels/

# .bitblas_database
.bitblas_database

# rocprof workloads
workloads
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from be013f to c6be66
139 changes: 89 additions & 50 deletions bitblas/__init__.py
@@ -3,62 +3,21 @@
import sys
import os

# installing tvm
install_tvm_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "3rdparty", "tvm")
install_cutlass_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "3rdparty", "cutlass")
if os.path.exists(install_tvm_path) and install_tvm_path not in sys.path:
os.environ["PYTHONPATH"] = install_tvm_path + "/python:" + os.environ.get("PYTHONPATH", "")
os.environ["TL_CUTLASS_PATH"] = install_cutlass_path + "/include"
os.environ["TL_TEMPLATE_PATH"] = os.path.join(install_tvm_path, "src/tl")
sys.path.insert(0, install_tvm_path + "/python")

develop_tvm_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "tvm")
develop_cutlass_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "cutlass")
if os.path.exists(develop_tvm_path) and develop_tvm_path not in sys.path:
os.environ["PYTHONPATH"] = develop_tvm_path + "/python:" + os.environ.get("PYTHONPATH", "")
os.environ["TL_CUTLASS_PATH"] = develop_cutlass_path + "/include"
os.environ["TL_TEMPLATE_PATH"] = os.path.join(install_tvm_path, "src/tl")
sys.path.insert(0, develop_tvm_path + "/python")

import tvm as tvm # noqa: E402
from . import gpu # noqa: F401
from .base import (
TileDevice, # noqa: F401
fast_tune, # noqa: F401
ApplyDefaultSchedule, # noqa: F401
ApplyFastTuning, # noqa: F401
BlockInfo, # noqa: F401
IterInfo, # noqa: F401
ScheduleRule, # noqa: F401
normalize_prim_func, # noqa: F401
try_inline, # noqa: F401
try_inline_contiguous_spatial, # noqa: F401
)

from . import testing # noqa: F401
from .utils import auto_detect_nvidia_target, apply_transform_on_input # noqa: F401
from .ops.general_matmul import MatmulConfig, Matmul # noqa: F401
from .ops.general_matmul_splitk import MatmulConfigWithSplitK, MatmulWithSplitK # noqa: F401
from .ops.general_flashatten import FlashAttenConfig, FlashAtten # noqa: F401
from .module import Linear # noqa: F401

import warnings
import functools
import logging
from tqdm import tqdm


class TqdmLoggingHandler(logging.Handler):
""" Custom logging handler that directs log output to tqdm progress bar to avoid interference. """
"""Custom logging handler that directs log output to tqdm progress bar to avoid interference."""

def __init__(self, level=logging.NOTSET):
""" Initialize the handler with an optional log level. """
"""Initialize the handler with an optional log level."""
super().__init__(level)

def emit(self, record):
""" Emit a log record. Messages are written to tqdm to ensure output in progress bars isn't corrupted. """
"""Emit a log record. Messages are written to tqdm to ensure output in progress bars isn't corrupted."""
try:
msg = self.format(record)
tqdm.write(msg)
@@ -67,8 +26,8 @@ def emit(self, record):


def set_log_level(level):
""" Set the logging level for the module's logger.
"""Set the logging level for the module's logger.

Args:
level (str or int): Can be the string name of the level (e.g., 'INFO') or the actual level (e.g., logging.INFO).
OPTIONS: 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
@@ -80,15 +39,17 @@ def set_log_level(level):


def _init_logger():
""" Initialize the logger specific for this module with custom settings and a Tqdm-based handler. """
"""Initialize the logger specific for this module with custom settings and a Tqdm-based handler."""
logger = logging.getLogger(__name__)
handler = TqdmLoggingHandler()
formatter = logging.Formatter(
fmt="%(asctime)s [BitBLAS:%(levelname)s]: %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
fmt="%(asctime)s [BitBLAS:%(levelname)s]: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.propagate = False
set_log_level('WARNING')
set_log_level("WARNING")


_init_logger()
@@ -107,12 +68,90 @@ def new_func(*args, **kwargs):
warnings.warn(
f"Call to deprecated function {func.__name__} ({reason}).",
category=DeprecationWarning,
stacklevel=2)
stacklevel=2,
)
return func(*args, **kwargs)

return new_func

return decorator


logger = logging.getLogger(__name__)

# SETUP ENVIRONMENT VARIABLES
CUTLASS_NOT_FOUND_MESSAGE = ("CUTLASS is not installed or found in the expected path"
                             ", which may lead to compilation bugs when utilizing the tilelang backend.")
TL_TEMPLATE_NOT_FOUND_MESSAGE = ("TileLang is not installed or found in the expected path"
                                 ", which may lead to compilation bugs when utilizing the tilelang backend.")

# Handle TVM_IMPORT_PYTHON_PATH to import tvm from the specified path
TVM_IMPORT_PYTHON_PATH = os.environ.get("TVM_IMPORT_PYTHON_PATH", None)

if TVM_IMPORT_PYTHON_PATH is not None:
    os.environ["PYTHONPATH"] = (TVM_IMPORT_PYTHON_PATH + ":" + os.environ.get("PYTHONPATH", ""))
    sys.path.insert(0, TVM_IMPORT_PYTHON_PATH + "/python")
else:
    # remove the existing tvm path in PYTHONPATH
    def remove_tvm_path(path):
        return "tvm" in path

    # installed 3rdparty tvm
    install_tvm_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "3rdparty", "tvm")
    if os.path.exists(install_tvm_path) and install_tvm_path not in sys.path:
        os.environ["PYTHONPATH"] = ":".join(
            filter(remove_tvm_path,
                   os.environ.get("PYTHONPATH", "").split(":")))
        sys.path = [path for path in sys.path if not remove_tvm_path(path)]

        os.environ["PYTHONPATH"] = (
            install_tvm_path + "/python:" + os.environ.get("PYTHONPATH", ""))
        sys.path.insert(0, install_tvm_path + "/python")

    # developed 3rdparty tvm
    develop_tvm_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "tvm")
    if os.path.exists(develop_tvm_path) and develop_tvm_path not in sys.path:
        os.environ["PYTHONPATH"] = ":".join(
            filter(remove_tvm_path,
                   os.environ.get("PYTHONPATH", "").split(":")))
        sys.path = [path for path in sys.path if not remove_tvm_path(path)]
        os.environ["PYTHONPATH"] = (
            develop_tvm_path + "/python:" + os.environ.get("PYTHONPATH", ""))
        sys.path.insert(0, develop_tvm_path + "/python")

if os.environ.get("TL_CUTLASS_PATH", None) is None:
    install_cutlass_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "3rdparty", "cutlass")
    develop_cutlass_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..", "3rdparty", "cutlass")
    if os.path.exists(install_cutlass_path):
        os.environ["TL_CUTLASS_PATH"] = install_cutlass_path + "/include"
    elif (os.path.exists(develop_cutlass_path) and develop_cutlass_path not in sys.path):
        os.environ["TL_CUTLASS_PATH"] = develop_cutlass_path + "/include"
    else:
        logger.warning(CUTLASS_NOT_FOUND_MESSAGE)

import tvm as tvm # noqa: E402
from . import gpu # noqa: F401
from .base import (
TileDevice, # noqa: F401
fast_tune, # noqa: F401
ApplyDefaultSchedule, # noqa: F401
ApplyFastTuning, # noqa: F401
BlockInfo, # noqa: F401
IterInfo, # noqa: F401
ScheduleRule, # noqa: F401
normalize_prim_func, # noqa: F401
try_inline, # noqa: F401
try_inline_contiguous_spatial, # noqa: F401
)

from . import testing # noqa: F401
from .utils import auto_detect_nvidia_target, apply_transform_on_input # noqa: F401
from .ops.general_matmul import MatmulConfig, Matmul # noqa: F401
from .ops.general_matmul_splitk import MatmulConfigWithSplitK, MatmulWithSplitK # noqa: F401
from .ops.general_flashatten import FlashAttenConfig, FlashAtten # noqa: F401
from .module import Linear # noqa: F401

__version__ = "0.0.1.dev15"
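The rewritten initialization above makes the TVM and CUTLASS locations overridable through environment variables instead of always bolting the bundled 3rdparty checkouts onto `sys.path`. A minimal usage sketch (the `/opt/...` paths are hypothetical, not values from the PR):

```python
import os

# Both variables are read at import time by the logic in bitblas/__init__.py above.
os.environ["TVM_IMPORT_PYTHON_PATH"] = "/opt/tvm"        # use an out-of-tree TVM; "/opt/tvm/python" is added to sys.path
os.environ["TL_CUTLASS_PATH"] = "/opt/cutlass/include"   # skip the bundled-CUTLASS lookup and its warning

import bitblas  # noqa: E402  (must come after the environment variables are set)

bitblas.set_log_level("DEBUG")  # the tqdm-backed logger configured above defaults to WARNING
```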
@@ -9,7 +9,7 @@
make_swizzle_layout,
)

from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
TensorCoreIntrinEmitter,
TensorCoreIntrinEmitterWithLadderTransform,
)
@@ -13,7 +13,7 @@
MatmulFineGrainScheduler,
MatmulWeightPropagationScheduler,
)
from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
INT4TensorCoreIntrinEmitter,
INT4TensorCoreIntrinEmitterWithLadderTransform,
)
@@ -9,7 +9,7 @@
make_swizzle_layout, # noqa: F401
)

from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
TensorCoreIntrinEmitter, # noqa: F401
)
from bitblas.ops.common import TransformKind # noqa: F401
@@ -10,7 +10,7 @@
index_to_coordinates, # noqa: F401
)

from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
INT4TensorCoreIntrinEmitter, # noqa: F401
)
from bitblas.base.arch import TileDevice
@@ -9,7 +9,7 @@
make_swizzle_layout, # noqa: F401
)
from .finegrained_primitive_tensorcore import MatmulDequantizeFineGrainedScheduler
from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
TensorCoreIntrinEmitterWithLadderTransform, # noqa: F401
)
from bitblas.ops.common import TransformKind # noqa: F401
@@ -11,7 +11,7 @@
)
from bitblas.base.arch import TileDevice
from bitblas.base.roller.hint import Hint
from bitblas.tl.macro_generator import (
from bitblas.tl.mma_macro_generator import (
INT4TensorCoreIntrinEmitterWithLadderTransform, # noqa: F401
)
from bitblas.ops.common import TransformKind # noqa: F401
2 changes: 1 addition & 1 deletion bitblas/tl/__init__.py
@@ -7,7 +7,7 @@
get_ldmatrix_offset, # noqa: F401
)

from .macro_generator import (
from .mma_macro_generator import (
TensorCoreIntrinEmitter, # noqa: F401
TensorCoreIntrinEmitterWithLadderTransform, # noqa: F401
)
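The remaining one-line changes above only touch the import path: the NVIDIA MMA emitters now live in `bitblas.tl.mma_macro_generator`, presumably to keep them apart from the AMD MFMA helpers added below. A sketch of the new spelling for downstream code:

```python
# Before this PR:
#   from bitblas.tl.macro_generator import TensorCoreIntrinEmitter
# After this PR:
from bitblas.tl.mma_macro_generator import (
    TensorCoreIntrinEmitter,
    TensorCoreIntrinEmitterWithLadderTransform,
)
```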
12 changes: 12 additions & 0 deletions bitblas/tl/base_layout.py
@@ -0,0 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.


def make_shared_to_local_linear_layout_2d(i, j, stride=16, local_size=4):

    def shared_to_local_linear_layout_2d(i, j):
        thread_id = j + (i // local_size) * stride
        local = (i % local_size)
        return thread_id, local

    return shared_to_local_linear_layout_2d
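`make_shared_to_local_linear_layout_2d` returns a closure that maps a 2-D shared-memory coordinate to a `(thread_id, local_id)` pair: rows are grouped in chunks of `local_size`, and each chunk spans `stride` threads. Note that the factory's own `i` and `j` parameters are shadowed by the inner function and never used. A few worked examples with the defaults (`stride=16`, `local_size=4`); the values below are illustrative, not part of the PR:

```python
from bitblas.tl.base_layout import make_shared_to_local_linear_layout_2d

layout = make_shared_to_local_linear_layout_2d(0, 0)  # the i, j arguments are effectively ignored

print(layout(0, 0))  # (0, 0)  -> thread 0, local slot 0
print(layout(1, 0))  # (0, 1)  -> same thread, next local slot (rows 0-3 share a thread row)
print(layout(5, 7))  # (23, 1) -> thread 23 = 7 + (5 // 4) * 16, slot 1 = 5 % 4
```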
80 changes: 80 additions & 0 deletions bitblas/tl/mfma_layout.py
@@ -0,0 +1,80 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
from tvm.runtime import convert


def shared_16x4_to_local_64x1_layout_A(i, j):
    thread_id = (j * 16 + i)
    return thread_id, convert(0)


def thread_id_shared_access_64x1_to_16x4_layout_A(thread_id, local_id):
    i = thread_id % 16
    j = thread_id // 16
    return i, j


def shared_4x16_to_local_64x1_layout_B(i, j):
    thread_id = (i * 16 + j)
    return thread_id, convert(0)


def thread_id_shared_access_64x1_to_4x16_layout_B(thread_id, local_id):
    i = thread_id // 16
    j = thread_id % 16
    return i, j


def shared_16x16_to_local_64x4_layout_C(i, j):
    thread_id = j + (i // 4) * 16
    local = (i % 4)
    return thread_id, local


def shared_16x16_to_ldmatrix_64x4_layout(ind):
    i, j = ind[0], ind[1]
    thread_id, local_id = shared_16x16_to_local_64x4_layout_C(i, j)
    return convert([thread_id, local_id])


def thread_id_shared_access_64x4_to_16x16_layout_A(thread_id, local_id):
    i = thread_id % 16
    j = (thread_id // 16) * 4 + local_id
    return i, j


def shared_16x16_to_local_64x4_layout_A(i, j):
    thread_id = i + 16 * (j // 4)
    local = (j % 4)
    return thread_id, local


def thread_id_shared_access_64x4_to_16x16_layout_B(thread_id, local_id):
    i = local_id + (thread_id // 16) * 4
    j = thread_id % 16
    return i, j


def shared_16x16_to_local_64x4_layout_B(i, j):
    thread_id = j + (i // 4) * 16
    local = (i % 4)
    return thread_id, local


def thread_id_shared_access_64x4_to_16x16_layout_C_smooth(thread_id, local_id):
    i = thread_id % 16
    j = local_id + (thread_id // 16) * 4
    return j, i


def thread_id_shared_access_64x4_to_16x16_layout_C(thread_id, local_id):
    # This is a hacky switch to simulate the performance of the smooth layout.
    is_smooth = os.environ.get("TILE_LANG_SMOOTH_LAYOUT") == "1"
    if is_smooth:
        return thread_id_shared_access_64x4_to_16x16_layout_C_smooth(thread_id, local_id)

    i = local_id + (thread_id // 16) * 4
    j = thread_id % 16
    return i, j
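The `mfma_layout.py` helpers describe how 16x16 MFMA fragments are distributed over a 64-lane wavefront with four values per lane (and the 16x4/4x16 variants with one value per lane). As a quick sanity check — a sketch, not part of the PR, and assuming `TILE_LANG_SMOOTH_LAYOUT` is unset — each shared-to-local mapping should round-trip through its thread-access counterpart:

```python
from bitblas.tl.mfma_layout import (
    shared_16x16_to_local_64x4_layout_A,
    shared_16x16_to_local_64x4_layout_B,
    shared_16x16_to_local_64x4_layout_C,
    thread_id_shared_access_64x4_to_16x16_layout_A,
    thread_id_shared_access_64x4_to_16x16_layout_B,
    thread_id_shared_access_64x4_to_16x16_layout_C,
)


def roundtrips(forward, backward):
    """Check that backward(forward(i, j)) recovers (i, j) for every element of a 16x16 tile."""
    for i in range(16):
        for j in range(16):
            thread_id, local_id = forward(i, j)
            assert 0 <= thread_id < 64 and 0 <= local_id < 4
            assert backward(thread_id, local_id) == (i, j)


roundtrips(shared_16x16_to_local_64x4_layout_A, thread_id_shared_access_64x4_to_16x16_layout_A)
roundtrips(shared_16x16_to_local_64x4_layout_B, thread_id_shared_access_64x4_to_16x16_layout_B)
# The C check assumes TILE_LANG_SMOOTH_LAYOUT is not set to "1".
roundtrips(shared_16x16_to_local_64x4_layout_C, thread_id_shared_access_64x4_to_16x16_layout_C)
```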