From 10c8362723e286721a4b85ff183e234e4cd4ef0f Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Sun, 3 Dec 2023 15:40:40 +0000 Subject: [PATCH 01/16] Bump xDSL & MLIR, update codebase. --- .github/workflows/ci-mlir-mpi.yml | 6 +++--- .github/workflows/ci-mlir.yml | 4 ++-- devito/ir/ietxdsl/cluster_to_ssa.py | 6 +++--- devito/ir/ietxdsl/iet_ssa.py | 4 ++-- tests/test_xdsl_iet.py | 2 +- xdsl_llvm.docker | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml index 7b64bf62ce..e581a37943 100644 --- a/.github/workflows/ci-mlir-mpi.yml +++ b/.github/workflows/ci-mlir-mpi.yml @@ -20,8 +20,8 @@ on: jobs: build: - runs-on: ubuntu-20.04 - container: papychacal/xdsl-llvm:04fc02e583b06b846315904a55af9c273c8b20b9 + runs-on: ubuntu-latest + container: papychacal/xdsl-llvm:98e674c9f16d677d95c67bc130e267fae331e43c steps: - name: Checkout Devito uses: actions/checkout@v3 @@ -38,7 +38,7 @@ jobs: run: | pip install -e .[tests] pip install mpi4py - pip install git+https://github.com/xdslproject/xdsl@2825897b87443c9369abf89871f4721e2fce2da9 + pip install git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d - name: Test with MPI run: | diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml index ddd49fdcc7..1e27d3fe6b 100644 --- a/.github/workflows/ci-mlir.yml +++ b/.github/workflows/ci-mlir.yml @@ -21,7 +21,7 @@ on: jobs: build: runs-on: ubuntu-latest - container: papychacal/xdsl-llvm:04fc02e583b06b846315904a55af9c273c8b20b9 + container: papychacal/xdsl-llvm:98e674c9f16d677d95c67bc130e267fae331e43c steps: - name: Checkout Devito uses: actions/checkout@v3 @@ -38,7 +38,7 @@ jobs: run: | pip install -e .[tests] pip install mpi4py - pip install git+https://github.com/xdslproject/xdsl@2825897b87443c9369abf89871f4721e2fce2da9 + pip install git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d - name: Test no-MPI, no-Openmp run: | diff --git a/devito/ir/ietxdsl/cluster_to_ssa.py b/devito/ir/ietxdsl/cluster_to_ssa.py index e51b0c1a7c..42032dfdb6 100644 --- a/devito/ir/ietxdsl/cluster_to_ssa.py +++ b/devito/ir/ietxdsl/cluster_to_ssa.py @@ -188,7 +188,7 @@ def _visit_math_nodes(self, node: Expr) -> SSAValue: else: raise ValueError("Expected float or int as pow args!") - op = op_cls.get(base, ex) + op = op_cls(base, ex) self.block.add_op(op) return op.result # Handle Mod @@ -367,7 +367,7 @@ def match_and_rewrite(self, op: func.FuncOp, rewriter: PatternRewriter): assert ret is not None rewriter.insert_op_before([ - timers := iet_ssa.LoadSymbolic.get('timers', llvm.LLVMPointerType.typed(builtin.f64)), + timers := iet_ssa.LoadSymbolic.get('timers', llvm.LLVMPointerType.opaque()), t1 := func.Call('timer_end', [t0], [builtin.f64]), llvm.StoreOp(t1, timers), ], ret) @@ -441,7 +441,7 @@ def match_and_rewrite(self, op: iet_ssa.Stencil, rewriter: PatternRewriter, /): stencil.IndexAttr.get(*([0] * rank)), stencil.IndexAttr.get(*op.shape), ), - scf.Yield.get(op.output, *op.input_indices), + scf.Yield(op.output, *op.input_indices), ] ) diff --git a/devito/ir/ietxdsl/iet_ssa.py b/devito/ir/ietxdsl/iet_ssa.py index 0e71be1b54..1c1eda3336 100644 --- a/devito/ir/ietxdsl/iet_ssa.py +++ b/devito/ir/ietxdsl/iet_ssa.py @@ -366,7 +366,7 @@ class For(IRDLOperation): subindices: IntAttr = attr_def(IntAttr) - properties: ArrayAttr[builtin.StringAttr] = attr_def(ArrayAttr[builtin.StringAttr]) + _properties: ArrayAttr[builtin.StringAttr] = attr_def(ArrayAttr[builtin.StringAttr]) pragmas: 
ArrayAttr[builtin.StringAttr] = attr_def(ArrayAttr[builtin.StringAttr]) def subindice_ssa_vals(self) -> tuple[SSAValue, ...]: @@ -382,7 +382,7 @@ def parallelism_property(self) -> str | None: Return either "parallel" or "sequential" (or None), depending on the properties present """ - for attr in self.properties.data: + for attr in self._properties.data: if attr.data in ('parallel', 'sequential'): return attr.data return None diff --git a/tests/test_xdsl_iet.py b/tests/test_xdsl_iet.py index 4dc2b559da..33fb291ecb 100644 --- a/tests/test_xdsl_iet.py +++ b/tests/test_xdsl_iet.py @@ -50,7 +50,7 @@ def test_powi(): mod = ModuleOp([ cst1 := Constant.from_int_and_width(1, i32), - ut1 := FPowIOp.get(cst1, cst1), + ut1 := FPowIOp(cst1, cst1), ]) diff --git a/xdsl_llvm.docker b/xdsl_llvm.docker index 9d57b4c46c..1af7f0519c 100644 --- a/xdsl_llvm.docker +++ b/xdsl_llvm.docker @@ -1,6 +1,6 @@ FROM ubuntu:22.04 as build -ARG mlirhash=04fc02e583b06b846315904a55af9c273c8b20b9 +ARG mlirhash=98e674c9f16d677d95c67bc130e267fae331e43c # base requirements RUN apt-get update \ From 954e180fa3bdd59ce11776235eabd7c383ee4a36 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 4 Dec 2023 16:28:45 +0000 Subject: [PATCH 02/16] Bump to new xDSL main, include xDSL canonicalize in passes, also properly handle xDSL dependency. --- .github/workflows/ci-mlir-mpi.yml | 1 - .github/workflows/ci-mlir.yml | 2 - devito/core/cpu.py | 10 +- devito/core/gpu.py | 8 +- devito/operator/xdsl_operator.py | 1350 +++++++++++++++++++++++++++++ requirements.txt | 2 +- 6 files changed, 1364 insertions(+), 9 deletions(-) create mode 100644 devito/operator/xdsl_operator.py diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml index e581a37943..e669d89366 100644 --- a/.github/workflows/ci-mlir-mpi.yml +++ b/.github/workflows/ci-mlir-mpi.yml @@ -38,7 +38,6 @@ jobs: run: | pip install -e .[tests] pip install mpi4py - pip install git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d - name: Test with MPI run: | diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml index 1e27d3fe6b..35b6563d41 100644 --- a/.github/workflows/ci-mlir.yml +++ b/.github/workflows/ci-mlir.yml @@ -38,8 +38,6 @@ jobs: run: | pip install -e .[tests] pip install mpi4py - pip install git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d - - name: Test no-MPI, no-Openmp run: | export DEVITO_MPI=0 diff --git a/devito/core/cpu.py b/devito/core/cpu.py index 685c69f54c..8022e4d6ce 100644 --- a/devito/core/cpu.py +++ b/devito/core/cpu.py @@ -32,6 +32,8 @@ from devito.types import TimeFunction from devito.types.mlir_types import ptr_of, f32 +from devito.arch.archinfo import get_nvidia_cc + from xdsl.printer import Printer @@ -861,7 +863,8 @@ def generate_XDSL_CPU_PIPELINE(nb_tiled_dims): passes = [ "stencil-shape-inference", f"convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}}", - "printf-to-llvm" + "printf-to-llvm", + "canonicalize" ] return generate_pipeline(passes) @@ -879,12 +882,13 @@ def generate_XDSL_CPU_noop_PIPELINE(): def generate_XDSL_MPI_PIPELINE(decomp, nb_tiled_dims): passes = [ - f"dmp-decompose{decomp}", + f"distribute-stencil{decomp}", "canonicalize-dmp", f"convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}}", "dmp-to-mpi{mpi_init=false}", "lower-mpi", - "printf-to-llvm" + "printf-to-llvm", + "canonicalize" ] return generate_pipeline(passes) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 0b2650dc7d..3fa216f1b7 
100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -3,6 +3,7 @@ from io import StringIO import numpy as np +from devito.arch.archinfo import get_nvidia_cc from devito.core.operator import CoreOperator, CustomOperator, ParTile @@ -545,7 +546,8 @@ def generate_XDSL_GPU_PIPELINE(): "stencil-shape-inference", "convert-stencil-to-ll-mlir{target=gpu}", "reconcile-unrealized-casts", - "printf-to-llvm" + "printf-to-llvm", + "canonicalize" ] return generate_pipeline(passes) @@ -580,8 +582,10 @@ def generate_MLIR_GPU_PIPELINE(block_sizes): "canonicalize", "cse", "convert-func-to-llvm{{use-bare-ptr-memref-call-conv}}", - "gpu.module(convert-gpu-to-nvvm,reconcile-unrealized-casts,canonicalize,gpu-to-cubin)", # noqa + f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}", + "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)", "gpu-to-llvm", + "gpu-module-to-binary", "canonicalize", "cse)" ] diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py new file mode 100644 index 0000000000..089bcfa297 --- /dev/null +++ b/devito/operator/xdsl_operator.py @@ -0,0 +1,1350 @@ +import os +import subprocess +import ctypes +import tempfile + +from math import ceil +from collections import OrderedDict, namedtuple +from io import StringIO +from operator import attrgetter + +from cached_property import cached_property + +from devito import Operator +from devito.arch import compiler_registry, platform_registry +from devito.data import default_allocator +from devito.exceptions import InvalidOperator +from devito.ir.clusters import ClusterGroup, clusterize +from devito.ir.equations import LoweredEq, lower_exprs +from devito.ir.iet import (Callable, CInterface, EntryFunction, FindSymbols, MetaCall, + derive_parameters, iet_build) +from devito.ir.ietxdsl import (finalize_module_with_globals) +from devito.ir.stree import stree_build +from devito.ir.support import AccessMode, SymbolRegistry +from devito.ir.ietxdsl.cluster_to_ssa import (ExtractDevitoStencilConversion, + convert_devito_stencil_to_xdsl_stencil) +from devito.logger import debug, info, perf, warning, is_log_enabled_for +from devito.operator.operator import IRs +from devito.operator.profiling import AdvancedProfilerVerbose, create_profile +from devito.parameters import configuration +from devito.passes import (Graph, lower_index_derivatives, generate_implicit, + generate_macros, minimize_symbols, unevaluate) +from devito.passes.iet import CTarget +from devito.symbolics import estimate_cost +from devito.tools import (DAG, OrderedSet, ReducerMap, as_tuple, flatten, + filter_sorted, frozendict, is_integer, split, timed_pass, + contains_val) +from devito.types import Evaluable, TimeFunction, Grid +from devito.types.mlir_types import ptr_of, f32 +from devito.mpi import MPI + +from xdsl.printer import Printer + +# flake8: noqa + +__all__ = ['XDSLOperator'] + + +# small interop shim script for stuff that we don't want to implement in mlir-ir +_INTEROP_C = """ +#include <time.h> + +double timer_start() { + // return a number representing the current point in time + // it might be offset by a fixed amount + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec) + (t.tv_nsec * 1e-9); +} + +double timer_end(double start) { + // return time elapsed since start, in seconds + return (timer_start() - start); +} +""" + + +def generate_tiling_arg(nb_tiled_dims: int): + """ + Generate the tile-sizes arg for the convert-stencil-to-ll-mlir pass. 
Generates no argument if nb_tiled_dims is 0. + """ + if nb_tiled_dims == 0: + return '' + return "tile-sizes=" + ",".join(["64"]*nb_tiled_dims) + + +CFLAGS = "-O3 -march=native -mtune=native -lmlir_c_runner_utils" + +MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' +MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' +# gpu-launch-sink-index-computations seemed to have no impact +MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},gpu.module(convert-gpu-to-nvvm,reconcile-unrealized-casts,canonicalize,gpu-to-cubin),gpu-to-llvm,canonicalize,cse)"' + +XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' +XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" +XDSL_MPI_PIPELINE = lambda decomp, nb_tiled_dims: f'"distribute-stencil{decomp},canonicalize-dmp,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},dmp-to-mpi{{mpi_init=false}},lower-mpi,printf-to-llvm,canonicalize"' + + +class XDSLOperator(Operator): + + _Target = CTarget + + def __new__(cls, expressions, **kwargs): + self = super(XDSLOperator, cls).__new__(cls, expressions, **kwargs) + delete = not os.getenv("XDSL_SKIP_CLEAN", False) + self._tf = tempfile.NamedTemporaryFile(prefix="devito-jit-", suffix='.so', + delete=delete) + self._interop_tf = tempfile.NamedTemporaryFile(prefix="devito-jit-interop-", + suffix=".o", delete=delete) + self._make_interop_o() + self.__class__ = cls + return self + + def _make_interop_o(self): + """ + Compile the interop.o file. + """ + res = subprocess.run( + f'clang -x c - -c -o {self._interop_tf.name}', + shell=True, + input=_INTEROP_C, + text=True, + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + ) + assert res.returncode == 0 + + @property + def mpi_shape(self) -> tuple: + dist = self.functions[0].grid.distributor + + # reverse topology for row->column major + return dist.topology, dist.myrank + + def _jit_compile(self): + """ + JIT-compile the C code generated by the Operator. 
+ It is ensured that JIT compilation will only be performed + once per Operator, regardless of how many times this method + is invoked. + """ + + with self._profiler.timer_on('jit-compile'): + is_mpi = MPI.Is_initialized() + is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX' + is_omp = os.environ.get("DEVITO_LANGUAGE", None) == 'openmp' + + if is_mpi and is_gpu: + raise RuntimeError("Cannot run MPI+GPU for now!") + + if is_omp and is_gpu: + raise RuntimeError("Cannot run OMP+GPU!") + + # specialize the code for the specific apply parameters + finalize_module_with_globals(self._module, self._jit_kernel_constants, + gpu_boilerplate=is_gpu) + + # print module as IR + module_str = StringIO() + Printer(stream=module_str).print(self._module) + module_str = module_str.getvalue() + + to_tile = len(list(filter(lambda s: str(s) in ["x", "y", "z"], self.dimensions)))-1 + + xdsl_pipeline = XDSL_CPU_PIPELINE(to_tile) + mlir_pipeline = MLIR_CPU_PIPELINE + + block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])] + block_sizes = ','.join(str(bs) for bs in block_sizes) + + if is_omp: + mlir_pipeline = MLIR_OPENMP_PIPELINE + + if is_mpi: + shape, mpi_rank = self.mpi_shape + # Run with restrict_domain=false so we only introduce the swaps but don't + # reduce the domain of the computation (as devito has already done that for us) + slices = ','.join(str(x) for x in shape) + + decomp = "2d-grid" if len(shape) == 2 else "3d-grid" + + decomp = f"{{strategy={decomp} slices={slices} restrict_domain=false}}" + xdsl_pipeline = XDSL_MPI_PIPELINE(decomp, to_tile) + elif is_gpu: + xdsl_pipeline = XDSL_GPU_PIPELINE + mlir_pipeline = MLIR_GPU_PIPELINE(block_sizes) + + # allow jit backdooring to provide your own xdsl code + backdoor = os.getenv('XDSL_JIT_BACKDOOR') + if backdoor is not None: + if os.path.splitext(backdoor)[1] == ".so": + info(f"JIT Backdoor: skipping compilation and using {backdoor}") + self._tf.name = backdoor + return + print("JIT Backdoor: loading xdsl file from: " + backdoor) + with open(backdoor, 'r') as f: + module_str = f.read() + source_name = os.path.splitext(self._tf.name)[0] + ".mlir" + source_file = open(source_name, "w") + source_file.write(module_str) + source_file.close() + + # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang + try: + cflags = CFLAGS + cc = "clang" + + if is_mpi: + cflags += ' -lmpi ' + cc = "mpicc -cc=clang" + if is_omp: + cflags += " -fopenmp " + if is_gpu: + cflags += " -lmlir_cuda_runtime " + + # TODO: Handle errors manually and in more detail, + # instead of relying on a bash-only feature. 
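+ # A sketch of the equivalent shell pipeline implemented by the four commands below (file names are placeholders): + # xdsl-opt kernel.mlir -p <xdsl pipeline> | mlir-opt -p <mlir pipeline> | mlir-translate --mlir-to-llvmir | clang <cflags> -shared -o kernel.so interop.o -xir - + # Each stage is run as a separate subprocess so that its return code, stdout and stderr can be recorded individually.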
+ + xdsl_cmd = f'xdsl-opt {source_name} -p {xdsl_pipeline}' + mlir_cmd = f'mlir-opt -p {mlir_pipeline}' + mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir' + clang_cmd = f'{cc} {cflags} -shared -o {self._tf.name} {self._interop_tf.name} -xir -' + + + comp_steps = [ + xdsl_cmd, + mlir_cmd, + mlir_translate_cmd, + clang_cmd + ] + + # Execute each command and store the outputs + outputs = [] + stdout = None + for cmd in comp_steps: + return_code, stdout, stderr = self._cmd_compile(cmd, stdout) + # Use DEVITO_LOGGING=DEBUG to print + debug(cmd) + outputs.append({ + 'command': cmd, + 'return_code': return_code, + 'stdout': stdout, + 'stderr': stderr + }) + + except Exception as ex: + print("error") + raise ex + + elapsed = self._profiler.py_timers['jit-compile'] + + perf("XDSLOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" % + (self.name, source_name, elapsed)) + + + def _cmd_compile(self, cmd, input=None): + stdin = subprocess.PIPE if input is not None else None + + res = subprocess.run( + cmd, + input=input, + shell=True, + text=True, + capture_output=True, + executable="/bin/bash" + ) + + if res.returncode != 0: + print("compilation failed with output:") + print(res.stderr) + + assert res.returncode == 0 + return res.returncode, res.stdout, res.stderr + + @property + def _soname(self): + return self._tf.name + + def setup_memref_args(self): + """ + Add memrefs to args dictionary so they can be passed to the cfunction + """ + args = dict() + for arg in self.functions: + if isinstance(arg, TimeFunction): + data = arg._data_allocated + # iterate over the first dimension (time) + for t in range(data.shape[0]): + args[f'{arg._C_name}_{t}'] = data[t, ...].ctypes.data_as(ptr_of(f32)) + self._jit_kernel_constants.update(args) + + @classmethod + def _normalize_kwargs(cls, **kwargs): + return kwargs + + @classmethod + def _check_kwargs(cls, **kwargs): + return + + @classmethod + def _build(cls, expressions, **kwargs) -> Callable: + debug("-Building operator") + # Python- (i.e., compile-) and C-level (i.e., run-time) performance + profiler = create_profile('timers') + + # Lower the input expressions into an IET + debug("-Lower expressions") + irs, _, module = cls._lower(expressions, profiler=profiler, **kwargs) + + # Make it an actual Operator + op = Callable.__new__(cls, **irs.iet.args) + Callable.__init__(op, **op.args) + + # Header files, etc. 
+ # op._headers = OrderedSet(*cls._default_headers) + # op._headers.update(byproduct.headers) + # op._globals = OrderedSet(*cls._default_globals) + # op._includes = OrderedSet(*cls._default_includes) + # op._includes.update(profiler._default_includes) + # op._includes.update(byproduct.includes) + op._module = module + + # Required for the jit-compilation + op._compiler = kwargs['compiler'] + op._lib = None + op._cfunction = None + + # Potentially required for lazily allocated Functions + op._mode = kwargs['mode'] + op._options = kwargs['options'] + op._allocator = kwargs['allocator'] + op._platform = kwargs['platform'] + + # References to local or external routines + op._func_table = OrderedDict() + op._func_table.update(OrderedDict([(i, MetaCall(None, False)) + for i in profiler._ext_calls])) + # op._func_table.update(OrderedDict([(i.root.name, i) for i in byproduct.funcs])) + + # Internal mutable state to store information about previous runs, autotuning + # reports, etc + op._state = cls._initialize_state(**kwargs) + + # Produced by the various compilation passes + op._reads = filter_sorted(flatten(e.reads for e in irs.expressions)) + op._writes = filter_sorted(flatten(e.writes for e in irs.expressions)) + op._dimensions = set().union(*[e.dimensions for e in irs.expressions]) + op._dtype, op._dspace = irs.clusters.meta + op._profiler = profiler + + return op + + def __init__(self, *args, **kwargs): + # Bypass the silent call to __init__ triggered through the backends engine + pass + + # Compilation -- Expression level + + @classmethod + def _lower(cls, expressions, **kwargs): + """ + Perform the lowering Expressions -> Clusters -> ScheduleTree -> IET. + """ + # Create a symbol registry + kwargs['sregistry'] = SymbolRegistry() + + expressions = as_tuple(expressions) + + # Input check + if any(not isinstance(i, Evaluable) for i in expressions): + raise InvalidOperator("Only `devito.Evaluable` are allowed.") + + # Enable recursive lowering + # This may be used by a compilation pass that constructs a new + # expression for which a partial or complete lowering is desired + kwargs['lower'] = cls._lower + + # [Eq] -> [LoweredEq] + expressions = cls._lower_exprs(expressions, **kwargs) + + conv = ExtractDevitoStencilConversion(expressions) + module = conv.convert() + convert_devito_stencil_to_xdsl_stencil(module, timed=True) + + # [LoweredEq] -> [Clusters] + clusters = cls._lower_clusters(expressions, **kwargs) + + # [Clusters] -> ScheduleTree + stree = cls._lower_stree(clusters, **kwargs) + + # ScheduleTree -> unbounded IET + uiet = cls._lower_uiet(stree, **kwargs) + + # unbounded IET -> IET + iet, byproduct = cls._lower_iet(uiet, **kwargs) + + return IRs(expressions, clusters, stree, uiet, iet), byproduct, module + + @classmethod + def _rcompile_wrapper(cls, **kwargs): + def wrapper(expressions, kwargs=kwargs): + return rcompile(expressions, kwargs) + return wrapper + + @classmethod + def _initialize_state(cls, **kwargs): + return {} + + @classmethod + def _specialize_dsl(cls, expressions, **kwargs): + """ + Backend hook for specialization at the DSL level. The input is made of + expressions and other higher order objects such as Injection or + Interpolation; the expressions are still unevaluated at this stage, + meaning that they are still in tensorial form and derivatives aren't + expanded yet. + """ + return expressions + + @classmethod + def _specialize_exprs(cls, expressions, **kwargs): + """ + Backend hook for specialization at the expression level. 
+ """ + return expressions + + @classmethod + @timed_pass(name='lowering.Expressions') + def _lower_exprs(cls, expressions, **kwargs): + """ + Expression lowering: + + * Apply rewrite rules; + * Evaluate derivatives; + * Flatten vectorial equations; + * Indexify Functions; + * Apply substitution rules; + * Shift indices for domain alignment. + """ + expand = kwargs['options'].get('expand', True) + + # Specialization is performed on unevaluated expressions + expressions = cls._specialize_dsl(expressions, **kwargs) + + # Lower FD derivatives + # NOTE: we force expansion of derivatives along SteppingDimensions + # because it drastically simplifies the subsequent lowering into + # ModuloDimensions + if not expand: + expand = lambda d: d.is_Stepping + expressions = flatten([i._evaluate(expand=expand) for i in expressions]) + + # Scalarize the tensor equations, if any + expressions = [j for i in expressions for j in i._flatten] + + # A second round of specialization is performed on evaluated expressions + expressions = cls._specialize_exprs(expressions, **kwargs) + + # "True" lowering (indexification, shifting, ...) + expressions = lower_exprs(expressions, **kwargs) + + processed = [LoweredEq(i) for i in expressions] + + return processed + + # Compilation -- Cluster level + + @classmethod + def _specialize_clusters(cls, clusters, **kwargs): + """ + Backend hook for specialization at the Cluster level. + """ + return clusters + + @classmethod + @timed_pass(name='lowering.Clusters') + def _lower_clusters(cls, expressions, profiler=None, **kwargs): + """ + Clusters lowering: + + * Group expressions into Clusters; + * Introduce guards for conditional Clusters; + * Analyze Clusters to detect computational properties such + as parallelism. + * Optimize Clusters for performance + """ + sregistry = kwargs['sregistry'] + + # Build a sequence of Clusters from a sequence of Eqs + clusters = clusterize(expressions, **kwargs) + + # Operation count before specialization + init_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense) + + clusters = cls._specialize_clusters(clusters, **kwargs) + + # Operation count after specialization + final_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense) + try: + profiler.record_ops_variation(init_ops, final_ops) + except AttributeError: + pass + + # Generate implicit Clusters from higher level abstractions + clusters = generate_implicit(clusters, sregistry=sregistry) + + # Lower all remaining high order symbolic objects + clusters = lower_index_derivatives(clusters, **kwargs) + + # Make sure no reconstructions can unpick any of the symbolic + # optimizations performed so far + clusters = unevaluate(clusters) + + return ClusterGroup(clusters) + + # Compilation -- ScheduleTree level + + @classmethod + def _specialize_stree(cls, stree, **kwargs): + """ + DEPRECATED: Backend hook for specialization at the Schedule tree level. 
+ """ + return stree + + @classmethod + @timed_pass(name='lowering.ScheduleTree') + def _lower_stree(cls, clusters, **kwargs): + """ + Schedule tree lowering: + + * Turn a sequence of Clusters into a ScheduleTree; + * Derive and attach metadata for distributed-memory parallelism; + * Derive sections for performance profiling + """ + # DEPRECATED: Build a ScheduleTree from a sequence of Clusters + stree = stree_build(clusters, **kwargs) + stree = cls._specialize_stree(stree) + + return stree + + # Compilation -- Iteration/Expression tree level + + @classmethod + def _specialize_iet(cls, graph, **kwargs): + """ + Backend hook for specialization at the Iteration/Expression tree level. + """ + return graph + + @classmethod + @timed_pass(name='lowering.uIET') + def _lower_uiet(cls, stree, profiler=None, **kwargs): + """ + Turn a ScheduleTree into an unbounded Iteration/Expression tree, that is + in essence a "floating" IET where one or more variables may be unbounded + (i.e., no definition placed yet). + """ + # Build an unbounded IET from a ScheduleTree + uiet = iet_build(stree) + + # Analyze the IET Sections for C-level profiling + try: + profiler.analyze(uiet) + except AttributeError: + pass + + return uiet + + @classmethod + @timed_pass(name='lowering.IET') + def _lower_iet(cls, uiet, profiler=None, **kwargs): + """ + Iteration/Expression tree lowering: + + * Introduce distributed-memory, shared-memory, and SIMD parallelism; + * Introduce optimizations for data locality; + * Finalize (e.g., symbol definitions, array casts) + """ + name = kwargs.get("name", "Kernel") + sregistry = kwargs['sregistry'] + + # Wrap the IET with an EntryFunction (a special Callable representing + # the entry point of the generated library) + parameters = derive_parameters(uiet, True) + iet = EntryFunction(name, uiet, 'int', parameters, ()) + + # Lower IET to a target-specific IET + graph = Graph(iet, sregistry=sregistry) + graph = cls._specialize_iet(graph, **kwargs) + + # Instrument the IET for C-level profiling + # Note: this is postponed until after _specialize_iet because during + # specialization further Sections may be introduced + cls._Target.instrument(graph, profiler=profiler, **kwargs) + + # Extract the necessary macros from the symbolic objects + generate_macros(graph) + + # Target-independent optimizations + minimize_symbols(graph) + + return graph.root, graph + + # Read-only properties exposed to the outside world + + @cached_property + def reads(self): + return tuple(self._reads) + + @cached_property + def writes(self): + return tuple(self._writes) + + @cached_property + def dimensions(self): + ret = set().union(*[d._defines for d in self._dimensions]) + + # During compilation other Dimensions may have been produced + dimensions = FindSymbols('dimensions').visit(self) + ret.update(d for d in dimensions if d.is_PerfKnob) + + ret = tuple(sorted(ret, key=attrgetter('name'))) + + return ret + + @cached_property + def input(self): + return tuple(i for i in self.parameters if i.is_Input) + + @cached_property + def temporaries(self): + return tuple(i for i in self.parameters if i.is_TempFunction) + + @cached_property + def objects(self): + return tuple(i for i in self.parameters if i.is_Object) + + # Arguments processing + + @cached_property + def _access_modes(self): + """ + A table providing the AccessMode of all user-accessible symbols in `self`. 
+ """ + return frozendict({i: AccessMode(i in self.reads, i in self.writes) + for i in self.input}) + + def _prepare_arguments(self, autotune=None, **kwargs): + """ + Process runtime arguments passed to ``.apply()` and derive + default values for any remaining arguments. + """ + # Sanity check -- all user-provided keywords must be known to the Operator + if not configuration['ignore-unknowns']: + for k, v in kwargs.items(): + if k not in self._known_arguments: + raise ValueError("Unrecognized argument %s=%s" % (k, v)) + + # Pre-process Dimension overrides. This may help ruling out ambiguities + # when processing the `defaults` arguments. A topological sorting is used + # as DerivedDimensions may depend on their parents + nodes = self.dimensions + edges = [(i, i.parent) for i in self.dimensions + if i.is_Derived and i.parent in set(nodes)] + toposort = DAG(nodes, edges).topological_sort() + + futures = {} + for d in reversed(toposort): + if set(d._arg_names).intersection(kwargs): + futures.update(d._arg_values(self._dspace[d], args={}, **kwargs)) + + overrides, defaults = split(self.input, lambda p: p.name in kwargs) + + # Process data-carrier overrides + args = kwargs['args'] = ReducerMap() + for p in overrides: + args.update(p._arg_values(**kwargs)) + try: + args.reduce_inplace() + except ValueError: + raise ValueError("Override `%s` is incompatible with overrides `%s`" % + (p, [i for i in overrides if i.name in args])) + + # Process data-carrier defaults + for p in defaults: + if p.name in args: + # E.g., SubFunctions + continue + for k, v in p._arg_values(**kwargs).items(): + if k not in args: + args[k] = v + elif k in futures: + # An explicit override is later going to set `args[k]` + pass + elif k in kwargs: + # User is in control + # E.g., given a ConditionalDimension `t_sub` with factor `fact` and + # a TimeFunction `usave(t_sub, x, y)`, an override for `fact` is + # supplied w/o overriding `usave`; that's legal + pass + elif is_integer(args[k]) and not contains_val(args[k], v): + raise ValueError("Default `%s` is incompatible with other args as " + "`%s=%s`, while `%s=%s` is expected. Perhaps you " + "forgot to override `%s`?" % + (p, k, v, k, args[k], p)) + + args = kwargs['args'] = args.reduce_all() + + # DiscreteFunctions may be created from CartesianDiscretizations, which in + # turn could be Grids or SubDomains. Both may provide arguments + discretizations = {getattr(kwargs[p.name], 'grid', None) for p in overrides} + discretizations.update({getattr(p, 'grid', None) for p in defaults}) + discretizations.discard(None) + # Remove subgrids if multiple grids + if len(discretizations) > 1: + discretizations = {g for g in discretizations + if not any(d.is_Derived for d in g.dimensions)} + + for i in discretizations: + args.update(i._arg_values(**kwargs)) + + # There can only be one Grid from which DiscreteFunctions were created + grids = {i for i in discretizations if isinstance(i, Grid)} + if len(grids) > 1: + # We loosely tolerate multiple Grids for backwards compatibility + # with spacial subsampling, which should be revisited however. And + # With MPI it would definitely break! 
+ if configuration['mpi']: + raise ValueError("Multiple Grids found") + try: + grid = grids.pop() + except KeyError: + grid = None + + # An ArgumentsMap carries additional metadata that may be used by + # the subsequent phases of the arguments processing + args = kwargs['args'] = ArgumentsMap(args, grid, self) + + # Process Dimensions + for d in reversed(toposort): + args.update(d._arg_values(self._dspace[d], grid, **kwargs)) + + # Process Objects + for o in self.objects: + args.update(o._arg_values(grid=grid, **kwargs)) + + # In some "lower-level" Operators implementing a random piece of C, such as + # one or more calls to third-party library functions, there could still be + # at this point unprocessed arguments (e.g., scalars) + kwargs.pop('args') + args.update({k: v for k, v in kwargs.items() if k not in args}) + + # Sanity check + for p in self.parameters: + p._arg_check(args, self._dspace[p], am=self._access_modes.get(p)) + for d in self.dimensions: + if d.is_Derived: + d._arg_check(args, self._dspace[p]) + + # Turn arguments into a format suitable for the generated code + # E.g., instead of NumPy arrays for Functions, the generated code expects + # pointers to ctypes.Struct + for p in self.parameters: + try: + args.update(kwargs.get(p.name, p)._arg_finalize(args, alias=p)) + except AttributeError: + # User-provided floats/ndarray obviously do not have `_arg_finalize` + args.update(p._arg_finalize(args, alias=p)) + + # Execute autotuning and adjust arguments accordingly + args.update(self._autotune(args, autotune or configuration['autotuning'])) + + return args + + def _postprocess_arguments(self, args, **kwargs): + """Process runtime arguments upon returning from ``.apply()``.""" + for p in self.parameters: + try: + subfuncs = (args[getattr(p, s).name] for s in p._sub_functions) + p._arg_apply(args[p.name], *subfuncs, alias=kwargs.get(p.name)) + except AttributeError: + p._arg_apply(args[p.name], alias=kwargs.get(p.name)) + + @cached_property + def _known_arguments(self): + """The arguments that can be passed to ``apply`` when running the Operator.""" + ret = set() + for i in self.input: + ret.update(i._arg_names) + try: + ret.update(i.grid._arg_names) + except AttributeError: + pass + for d in self.dimensions: + ret.update(d._arg_names) + ret.update(p.name for p in self.parameters) + return frozenset(ret) + + def _autotune(self, args, setup): + """Auto-tuning to improve runtime performance.""" + return args + + def arguments(self, **kwargs): + """Arguments to run the Operator.""" + args = self._prepare_arguments(**kwargs) + # Check all arguments are present + for p in self.parameters: + if args.get(p.name) is None: + raise ValueError("No value found for parameter %s" % p.name) + return args + + # Code generation and JIT compilation + + #@cached_property + #def _soname(self): + # """A unique name for the shared object resulting from JIT compilation.""" + # return Signer._digest(self, configuration) + + @cached_property + def ccode(self): + try: + return self._ccode_handler(compiler=self._compiler).visit(self) + except (AttributeError, TypeError): + from devito.ir.iet.visitors import CGen + return CGen(compiler=self._compiler).visit(self) + + @property + def cfunction(self): + """The JIT-compiled C function as a ctypes.FuncPtr object.""" + if self._lib is None: + self._jit_compile() + self.setup_memref_args() + self._lib = self._compiler.load(self._tf.name) + self._lib.name = self._tf.name + + if self._cfunction is None: + self._cfunction = getattr(self._lib, "apply_kernel") + # 
Associate a C type to each argument for runtime type check + self._cfunction.argtypes = self._construct_cfunction_args(self._jit_kernel_constants, get_types=True) + + return self._cfunction + + def cinterface(self, force=False): + """ + Generate two files under the prescribed temporary directory: + + * `X.c` (or `X.cpp`): the code generated for this Operator; + * `X.h`: a header file representing the interface of `X.c`. + + Where `X=self.name`. + + Parameters + ---------- + force : bool, optional + Overwrite any existing files. Defaults to False. + """ + dest = self._compiler.get_jit_dir() + name = dest.joinpath(self.name) + + cfile = name.with_suffix(".%s" % self._compiler.src_ext) + hfile = name.with_suffix('.h') + + # Generate the .c and .h code + ccode, hcode = CInterface().visit(self) + + for f, code in [(cfile, ccode), (hfile, hcode)]: + if not force and f.is_file(): + debug("`%s` was not saved in `%s` as it already exists" % (f.name, dest)) + else: + with open(str(f), 'w') as ff: + ff.write(str(code)) + debug("`%s` successfully saved in `%s`" % (f.name, dest)) + + return ccode, hcode + + # Execution + + def __call__(self, **kwargs): + return self.apply(**kwargs) + + def apply(self, **kwargs): + """ + Execute the Operator. + + With no arguments provided, the Operator runs using the data carried by the + objects appearing in the input expressions -- these are referred to as the + "default arguments". + + Optionally, any of the Operator default arguments may be replaced by passing + suitable key-value arguments. Given ``apply(k=v, ...)``, ``(k, v)`` may be + used to: + + * replace a Constant. In this case, ``k`` is the name of the Constant, + ``v`` is either a Constant or a scalar value. + + * replace a Function (SparseFunction). Here, ``k`` is the name of the + Function, ``v`` is either a Function or a numpy.ndarray. + + * alter the iteration interval along a Dimension. Consider a generic + Dimension ``d`` iterated over by the Operator. By default, the Operator + runs over all iterations within the compact interval ``[d_m, d_M]``, + where ``d_m`` and ``d_M`` are, respectively, the smallest and largest + integers not causing out-of-bounds memory accesses (for the Grid + Dimensions, this typically implies iterating over the entire physical + domain). So now ``k`` can be either ``d_m`` or ``d_M``, while ``v`` + is an integer value. + + Examples + -------- + Consider the following Operator + + >>> from devito import Eq, Grid, TimeFunction, Operator + >>> grid = Grid(shape=(3, 3)) + >>> u = TimeFunction(name='u', grid=grid, save=3) + >>> op = Operator(Eq(u.forward, u + 1)) + + The Operator is run by calling ``apply`` + + >>> summary = op.apply() + + The variable ``summary`` contains information about runtime performance. + As no key-value parameters are specified, the Operator runs with its + default arguments, namely ``u=u, x_m=0, x_M=2, y_m=0, y_M=2, time_m=0, + time_M=1``. + + At this point, the same Operator can be used for a completely different + run, for example + + >>> u2 = TimeFunction(name='u', grid=grid, save=5) + >>> summary = op.apply(u=u2, x_m=1, y_M=1) + + Now, the Operator will run with a different set of arguments, namely + ``u=u2, x_m=1, x_M=2, y_m=0, y_M=1, time_m=0, time_M=3``. + + To run an Operator that only uses buffered TimeFunctions, the maximum + iteration point along the time dimension must be explicitly specified + (otherwise, the Operator wouldn't know how many iterations to run). 
+ + >>> u3 = TimeFunction(name='u', grid=grid) + >>> op = Operator(Eq(u3.forward, u3 + 1)) + >>> summary = op.apply(time_M=10) + """ + # Build the arguments list to invoke the kernel function + with self._profiler.timer_on('arguments'): + args = self.arguments(**kwargs) + self._jit_kernel_constants = args + + cfunction = self.cfunction + try: + # Invoke kernel function with args + arg_values = self._construct_cfunction_args(args) + with self._profiler.timer_on('apply', comm=args.comm): + cfunction(*arg_values) + except ctypes.ArgumentError as e: + if e.args[0].startswith("argument "): + argnum = int(e.args[0][9:].split(':')[0]) - 1 + newmsg = "error in argument '%s' with value '%s': %s" % ( + self.parameters[argnum].name, + arg_values[argnum], + e.args[0]) + raise ctypes.ArgumentError(newmsg) from e + else: + raise + + # Post-process runtime arguments + self._postprocess_arguments(args, **kwargs) + + # Output summary of performance achieved + return self._emit_apply_profiling(args) + + def _construct_cfunction_args(self, args, get_types = False): + """ + Either construct the args for the cfunction, or construct the + arg types for it. + """ + ps = { + p._C_name: p._C_ctype for p in self.parameters + } + + things = [] + things_types = [] + + for name in get_arg_names_from_module(self._module): + thing = args[name] + things.append(thing) + if name in ps: + things_types.append(ps[name]) + else: + things_types.append(type(thing)) + + if get_types: + return things_types + else: + return things + + def _emit_build_profiling(self): + if not is_log_enabled_for('PERF'): + return + + # Rounder to K decimal places + fround = lambda i, n=100: ceil(i * n) / n + + timings = self._profiler.py_timers.copy() + + tot = timings.pop('op-compile') + perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot))) + + max_hotspots = 3 + threshold = 20. 
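+ # At most `max_hotspots` entries are reported; an entry is only shown (and recursed into) if it accounts for more than `threshold` per cent of the total compile time.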
+ + def _emit_timings(timings, indent=''): + timings.pop('total', None) + entries = sorted(timings, key=lambda i: timings[i]['total'], reverse=True) + for i in entries[:max_hotspots]: + v = fround(timings[i]['total']) + perc = fround(v/tot*100, n=10) + if perc > threshold: + perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), v, perc)) + _emit_timings(timings[i], ' '*len(indent) + ' * ') + + _emit_timings(timings, ' * ') + + if self._profiler._ops: + ops = ['%d --> %d' % i for i in self._profiler._ops] + perf("Flops reduction after symbolic optimization: [%s]" % ' ; '.join(ops)) + + def _emit_apply_profiling(self, args): + """Produce a performance summary of the profiled sections.""" + # Rounder to 2 decimal places + fround = lambda i: ceil(i * 100) / 100 + + elapsed = fround(self._profiler.py_timers['apply']) + info("Operator `%s` ran in %.2f s" % (self.name, elapsed)) + + summary = self._profiler.summary(args, self._dtype, reduce_over=elapsed) + + if not is_log_enabled_for('PERF'): + # Do not waste time + return summary + + if summary.globals: + # Note that with MPI enabled, the global performance indicators + # represent "cross-rank" performance data + metrics = [] + + v = summary.globals.get('vanilla') + if v is not None: + metrics.append("OI=%.2f" % fround(v.oi)) + metrics.append("%.2f GFlops/s" % fround(v.gflopss)) + + v = summary.globals.get('fdlike') + if v is not None: + metrics.append("%.2f GPts/s" % fround(v.gpointss)) + + if metrics: + perf("Global performance: [%s]" % ', '.join(metrics)) + + perf("Local performance:") + indent = " "*2 + else: + indent = "" + + if isinstance(self._profiler, AdvancedProfilerVerbose): + metrics = [] + + v = summary.globals.get('fdlike-nosetup') + if v is not None: + metrics.append("%.2f GPts/s" % fround(v.gpointss)) + + if metrics: + perf("Global performance : [%s]" % ', '.join(metrics)) + + # Emit local, i.e. "per-rank" performance. 
Without MPI, this is the only + # thing that will be emitted + def lower_perfentry(v): + if v.gflopss: + oi = "OI=%.2f" % fround(v.oi) + gflopss = "%.2f GFlops/s" % fround(v.gflopss) + gpointss = "%.2f GPts/s" % fround(v.gpointss) + return "[%s]" % ", ".join([oi, gflopss, gpointss]) + elif v.gpointss: + gpointss = "%.2f GPts/s" % fround(v.gpointss) + return "[%s]" % gpointss + else: + return "" + + for k, v in summary.items(): + rank = "[rank%d]" % k.rank if k.rank is not None else "" + + metrics = lower_perfentry(v) + + itershapes = [",".join(str(i) for i in its) for its in v.itershapes] + if len(itershapes) > 1: + itershapes = ",".join("<%s>" % i for i in itershapes) + elif len(itershapes) == 1: + itershapes = itershapes[0] + else: + itershapes = "" + name = "%s%s<%s>" % (k.name, rank, itershapes) + + perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) + for n, v1 in summary.subsections.get(k.name, {}).items(): + metrics = lower_perfentry(v1) + + perf("%s+ %s ran in %.2f s [%.2f%%] %s" % + (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), + metrics)) + + # Emit performance mode and arguments + perf_args = {} + for i in self.input + self.dimensions: + if not i.is_PerfKnob: + continue + try: + perf_args[i.name] = args[i.name] + except KeyError: + # Try with the aliases + for a in i._arg_names: + if a in args: + perf_args[a] = args[a] + break + perf("Performance[mode=%s] arguments: %s" % (self._mode, perf_args)) + + return summary + + # Pickling support + + def __getstate__(self): + if self._lib: + state = dict(self.__dict__) + # The compiled shared-object will be pickled; upon unpickling, it + # will be restored into a potentially different temporary directory, + # so the entire process during which the shared-object is loaded and + # given to ctypes must be performed again + state['_lib'] = None + state['_cfunction'] = None + # Do not pickle the `args` used to construct the Operator. Not only + # would this be completely useless, but it might also lead to + # allocating additional memory upon unpickling, as the user-provided + # equations typically carry different instances of the same Function + # (e.g., f(t, x-1), f(t, x), f(t, x+1)), which are different objects + # with distinct `.data` fields + state['_args'] = None + with open(self._lib._name, 'rb') as f: + state['binary'] = f.read() + state['soname'] = self._soname + return state + else: + return self.__dict__ + + def __getnewargs_ex__(self): + return (None,), {} + + def __setstate__(self, state): + soname = state.pop('soname', None) + binary = state.pop('binary', None) + for k, v in state.items(): + setattr(self, k, v) + if soname is not None: + self._compiler.save(soname, binary) + self._lib = self._compiler.load(soname) + self._lib.name = soname + + +# Default action (perform or bypass) for selected compilation passes upon +# recursive compilation +# NOTE: it may not only be pointless to apply the following passes recursively +# (because once, during the main compilation phase, is simply enough), but also +# dangerous as some of them (the minority) might break in some circumstances +# if applied in cascade (e.g., `linearization` on top of `linearization`) +rcompile_registry = { + 'mpi': False, + 'linearize': False, + 'place-transfers': False +} + + +def rcompile(expressions, kwargs=None): + """ + Perform recursive compilation on an ordered sequence of symbolic expressions. 
+ """ + if not kwargs or 'options' not in kwargs: + kwargs = parse_kwargs(**kwargs) + cls = operator_selector(**kwargs) + kwargs = cls._normalize_kwargs(**kwargs) + else: + cls = operator_selector(**kwargs) + + # Tweak the compilation kwargs + options = dict(kwargs['options']) + options.update(rcompile_registry) + kwargs['options'] = options + + # Recursive profiling not supported -- would be a complete mess + kwargs.pop('profiler', None) + + return cls._lower(expressions, **kwargs) + + +# Misc helpers + + +IRs = namedtuple('IRs', 'expressions clusters stree uiet iet') + + +class ArgumentsMap(dict): + + def __init__(self, args, grid, op): + super().__init__(args) + + self.grid = grid + + self.allocator = op._allocator + self.platform = op._platform + # self.language = op._language + self.compiler = op._compiler + self.options = op._options + + @property + def comm(self): + """The MPI communicator the arguments are collective over.""" + return self.grid.comm if self.grid is not None else MPI.COMM_NULL + + @property + def opkwargs(self): + temp_registry = {v: k for k, v in platform_registry.items()} + platform = temp_registry[self.platform] + + temp_registry = {v: k for k, v in compiler_registry.items()} + compiler = temp_registry[self.compiler.__class__] + + return {'platform': platform, 'compiler': compiler, 'language': self.language} + + +def parse_kwargs(**kwargs): + """ + Parse keyword arguments provided to an Operator. + """ + # `dse` -- deprecated, dropped + dse = kwargs.pop("dse", None) + if dse is not None: + warning("The `dse` argument is deprecated. " + "The optimization level is now controlled via the `opt` argument") + + # `dle` -- deprecated, replaced by `opt` + if 'dle' in kwargs: + warning("The `dle` argument is deprecated. " + "The optimization level is now controlled via the `opt` argument") + dle = kwargs.pop('dle') + if 'opt' in kwargs: + warning("Both `dle` and `opt` were passed; ignoring `dle` argument") + opt = kwargs.pop('opt') + else: + warning("Setting `opt=%s`" % str(dle)) + opt = dle + elif 'opt' in kwargs: + opt = kwargs.pop('opt') + else: + opt = configuration['opt'] + + if not opt or isinstance(opt, str): + mode, options = opt, {} + elif isinstance(opt, tuple): + if len(opt) == 0: + mode, options = 'noop', {} + elif isinstance(opt[-1], dict): + if len(opt) == 2: + mode, options = opt + else: + mode, options = tuple(flatten(i.split(',') for i in opt[:-1])), opt[-1] + else: + mode, options = tuple(flatten(i.split(',') for i in opt)), {} + else: + raise InvalidOperator("Illegal `opt=%s`" % str(opt)) + + # `opt`, deprecated kwargs + kwopenmp = kwargs.get('openmp', options.get('openmp')) + if kwopenmp is None: + openmp = kwargs.get('language', configuration['language']) == 'openmp' + else: + openmp = kwopenmp + + # `opt`, options + options = dict(options) + options.setdefault('openmp', openmp) + options.setdefault('mpi', configuration['mpi']) + for k, v in configuration['opt-options'].items(): + options.setdefault(k, v) + # Handle deprecations + deprecated_options = ('cire-mincost-inv', 'cire-mincost-sops', 'cire-maxalias') + for i in deprecated_options: + try: + options.pop(i) + warning("Ignoring deprecated optimization option `%s`" % i) + except KeyError: + pass + kwargs['options'] = options + + # `opt`, mode + if mode is None: + mode = 'noop' + kwargs['mode'] = mode + + # `platform` + platform = kwargs.get('platform') + if platform is not None: + if not isinstance(platform, str): + raise ValueError("Argument `platform` should be a `str`") + if platform not in 
configuration._accepted['platform']: + raise InvalidOperator("Illegal `platform=%s`" % str(platform)) + kwargs['platform'] = platform_registry[platform]() + else: + kwargs['platform'] = configuration['platform'] + + # `language` + language = kwargs.get('language') + if language is not None: + if not isinstance(language, str): + raise ValueError("Argument `language` should be a `str`") + if language not in configuration._accepted['language']: + raise InvalidOperator("Illegal `language=%s`" % str(language)) + kwargs['language'] = language + elif kwopenmp is not None: + # Handle deprecated `openmp` kwarg for backward compatibility + kwargs['language'] = 'openmp' if openmp else 'C' + else: + kwargs['language'] = configuration['language'] + + # `compiler` + compiler = kwargs.get('compiler') + if compiler is not None: + if not isinstance(compiler, str): + raise ValueError("Argument `compiler` should be a `str`") + if compiler not in configuration._accepted['compiler']: + raise InvalidOperator("Illegal `compiler=%s`" % str(compiler)) + kwargs['compiler'] = compiler_registry[compiler](platform=kwargs['platform'], + language=kwargs['language'], + mpi=configuration['mpi']) + elif any([platform, language]): + kwargs['compiler'] =\ + configuration['compiler'].__new_with__(platform=kwargs['platform'], + language=kwargs['language'], + mpi=configuration['mpi']) + else: + kwargs['compiler'] = configuration['compiler'].__new_with__() + + # `allocator` + kwargs['allocator'] = default_allocator( + '%s.%s.%s' % (kwargs['compiler'].name, + kwargs['language'], + kwargs['platform']) + ) + + return kwargs + + +def get_arg_names_from_module(op): + return [ + str_attr.data + for str_attr in op.body.block.ops.first.attributes['param_names'].data + ] diff --git a/requirements.txt b/requirements.txt index fdd231d76f..d0a267109e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,5 +15,5 @@ distributed<2022.10 pytest>=3.6,<8.0 pytest-runner pytest-cov -xdsl>=0.11 +git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d frozenlist<=1.4 From 07a5ec0e44e4c06bb7ca4c477ac778db10fd01d5 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 4 Dec 2023 16:49:39 +0000 Subject: [PATCH 03/16] Revert versioning. 
--- .github/workflows/ci-mlir-mpi.yml | 1 + .github/workflows/ci-mlir.yml | 2 ++ requirements.txt | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml index e669d89366..3a6eda0b3f 100644 --- a/.github/workflows/ci-mlir-mpi.yml +++ b/.github/workflows/ci-mlir-mpi.yml @@ -38,6 +38,7 @@ jobs: run: | pip install -e .[tests] pip install mpi4py + pip install git+https://github.com/xdslproject/xdsl@5500ff6d82d1a920b369615292ba507ecbf92fc9 - name: Test with MPI run: | diff --git a/.github/workflows/ci-mlir.yml b/.github/workflows/ci-mlir.yml index 35b6563d41..0515757c09 100644 --- a/.github/workflows/ci-mlir.yml +++ b/.github/workflows/ci-mlir.yml @@ -38,6 +38,8 @@ jobs: run: | pip install -e .[tests] pip install mpi4py + pip install git+https://github.com/xdslproject/xdsl@5500ff6d82d1a920b369615292ba507ecbf92fc9 + - name: Test no-MPI, no-Openmp run: | export DEVITO_MPI=0 diff --git a/requirements.txt b/requirements.txt index d0a267109e..903f18d84e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,5 +15,5 @@ distributed<2022.10 pytest>=3.6,<8.0 pytest-runner pytest-cov -git+https://github.com/xdslproject/xdsl@cd020eace6f2a06c33916d5283efbb24362ab61d +xdsl frozenlist<=1.4 From 380175b24e992c9ffc9e7a797d5d889f5c06e77f Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 4 Dec 2023 17:46:18 +0000 Subject: [PATCH 04/16] Update GPU lowering to new MLIR. --- devito/operator/xdsl_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index 089bcfa297..cbdb804642 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -78,7 +78,7 @@ def generate_tiling_arg(nb_tiled_dims: int): MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},gpu.module(convert-gpu-to-nvvm,reconcile-unrealized-casts,canonicalize,gpu-to-cubin),gpu-to-llvm,canonicalize,cse)"' +MLIR_GPU_PIPELINE 
= lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},test-lower-to-nvvm,canonicalize,cse)"' XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" From 8efbc1ed8a2082f79f1c6dc5c984e0fa767e8389 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 11 Dec 2023 16:30:42 +0000 Subject: [PATCH 05/16] Bare pointers for GPU lowering. --- devito/operator/xdsl_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index cbdb804642..7b0845e99c 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -78,7 +78,7 @@ def generate_tiling_arg(nb_tiled_dims: int): MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},test-lower-to-nvvm,canonicalize,cse)"' +MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, 
canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},test-lower-to-nvvm{{host-bare-ptr-calling-convention kernel-bare-ptr-calling-convention}},canonicalize,cse)"' XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" From 9eebab42e5f5a000acb1ca8f1375d739297c848e Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 11 Dec 2023 16:58:04 +0000 Subject: [PATCH 06/16] Try. --- devito/operator/xdsl_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index 7b0845e99c..3bea734081 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -78,7 +78,7 @@ def generate_tiling_arg(nb_tiled_dims: int): MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},test-lower-to-nvvm{{host-bare-ptr-calling-convention kernel-bare-ptr-calling-convention}},canonicalize,cse)"' +MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, 
canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" From 89c780d5aa995415b0faed1d57cafc89c6350d38 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 11 Dec 2023 17:10:41 +0000 Subject: [PATCH 07/16] Explicit chip. --- devito/operator/xdsl_operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index 3bea734081..800e9ac677 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -78,7 +78,7 @@ def generate_tiling_arg(nb_tiled_dims: int): MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' +MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, 
canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast chip=sm_70}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" From ef4a89ac935669d7128d1eeb33be7577e2dddc3e Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 11 Dec 2023 18:10:31 +0000 Subject: [PATCH 08/16] Use Devito helper to determine compute capability. --- devito/operator/xdsl_operator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py index 800e9ac677..ed6c416125 100644 --- a/devito/operator/xdsl_operator.py +++ b/devito/operator/xdsl_operator.py @@ -12,6 +12,7 @@ from devito import Operator from devito.arch import compiler_registry, platform_registry +from devito.arch.archinfo import get_nvidia_cc from devito.data import default_allocator from devito.exceptions import InvalidOperator from devito.ir.clusters import ClusterGroup, clusterize @@ -78,7 +79,7 @@ def generate_tiling_arg(nb_tiled_dims: int): MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' # gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast chip=sm_70}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' +MLIR_GPU_PIPELINE = lambda block_sizes: 
f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" From 1fafa57aa69fbb3d35beea256680c60078806329 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Mon, 11 Dec 2023 20:27:28 +0000 Subject: [PATCH 09/16] Conditional import of matplotlib --- fast/diffusion_2D_wBCs.py | 4 +++- fast/diffusion_3D_wBCs.py | 4 +++- fast/wave2d_b.py | 4 +++- fast/wave3d_b.py | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fast/diffusion_2D_wBCs.py b/fast/diffusion_2D_wBCs.py index ff6535ec99..a3f205b6f5 100644 --- a/fast/diffusion_2D_wBCs.py +++ b/fast/diffusion_2D_wBCs.py @@ -8,7 +8,6 @@ from devito import (Grid, TimeFunction, Eq, solve, Operator, Constant, norm, configuration) from examples.cfd import init_hat -from fast.bench_utils import plot_2dfunc parser = argparse.ArgumentParser(description='Process arguments.') @@ -27,6 +26,9 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool, help="xDSL run") args = parser.parse_args() +if args.plot: + from fast.bench_utils import plot_2dfunc + mpiconf = configuration['mpi'] # flake8: noqa diff --git a/fast/diffusion_3D_wBCs.py b/fast/diffusion_3D_wBCs.py index f497a6729e..99919bfd13 100644 --- a/fast/diffusion_3D_wBCs.py +++ b/fast/diffusion_3D_wBCs.py @@ -7,7 +7,6 @@ from devito import (Grid, TimeFunction, Eq, solve, Constant, norm, Operator, configuration) -from fast.bench_utils import plot_3dfunc parser = argparse.ArgumentParser(description='Process arguments.') @@ -26,6 +25,9 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool, help="xDSL run") args = parser.parse_args() +if args.plot: + from fast.bench_utils import plot_3dfunc + mpiconf = configuration['mpi'] # Some variable declarations diff --git a/fast/wave2d_b.py b/fast/wave2d_b.py index 5c8a7f41ff..43e4b13372 100644 --- a/fast/wave2d_b.py +++ b/fast/wave2d_b.py @@ -4,7 +4,6 @@ from devito import (TimeFunction, Eq, Operator, solve, norm, configuration, Grid) -from fast.bench_utils import plot_2dfunc from devito.tools import as_tuple import argparse @@ -28,6 +27,9 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool, help="xDSL run") args = parser.parse_args() +if args.plot: + from fast.bench_utils import plot_2dfunc + mpiconf = configuration['mpi'] diff --git a/fast/wave3d_b.py b/fast/wave3d_b.py index b17307eae8..e71f481190 100644 --- a/fast/wave3d_b.py +++ b/fast/wave3d_b.py @@ -4,7 +4,6 @@ from devito import (TimeFunction, Eq, Operator, solve, norm, configuration, Grid) -from fast.bench_utils import plot_3dfunc from 
devito.tools import as_tuple import argparse @@ -28,6 +27,9 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool, help="xDSL run") args = parser.parse_args() +if args.plot: + from fast.bench_utils import plot_3dfunc + mpiconf = configuration['mpi'] From 7ef35118ae53a06d4b3329b3f352c48dd4277c01 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Thu, 14 Dec 2023 11:30:20 +0000 Subject: [PATCH 10/16] More updating. --- devito/ir/ietxdsl/lowering.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/devito/ir/ietxdsl/lowering.py b/devito/ir/ietxdsl/lowering.py index a74e7ae908..2327422131 100644 --- a/devito/ir/ietxdsl/lowering.py +++ b/devito/ir/ietxdsl/lowering.py @@ -16,9 +16,9 @@ def match_and_rewrite(self, op: scf.For, rewriter: PatternRewriter, /): if isinstance(op.lb.type, builtin.IndexType): return for val in (op.lb, op.ub, op.step): - cast = arith.IndexCastOp.get(val, builtin.IndexType()) + cast = arith.IndexCastOp(val, builtin.IndexType()) rewriter.insert_op_before_matched_op(cast) - op.replace_operand(op.operands.index(val), cast.result) + op.operands[op.operands.index(val)] = cast.result class ConvertScfParallelArgsToIndex(RewritePattern): @@ -41,9 +41,9 @@ def match_and_rewrite(self, op: scf.ParallelOp, rewriter: PatternRewriter, for val in (*op.lowerBound, *op.upperBound, *op.step): if isinstance(val.type, builtin.IndexType): continue - cast = arith.IndexCastOp.get(val, builtin.IndexType()) + cast = arith.IndexCastOp(val, builtin.IndexType()) rewriter.insert_op_before_matched_op(cast) - op.replace_operand(op.operands.index(val), cast.result) + op.operands[op.operands.index(val)] = cast.result class ConvertForLoopVarToIndex(RewritePattern): @@ -60,12 +60,12 @@ def match_and_rewrite(self, op: Operation, rewriter: PatternRewriter, /): # insert a cast from index to i64 at the start of the loop rewriter.insert_op_at_start( - i64_val := arith.IndexCastOp.get(loop_var, builtin.i64), + i64_val := arith.IndexCastOp(loop_var, builtin.i64), block, ) loop_var.replace_by(i64_val.result) - i64_val.replace_operand(0, loop_var) + i64_val.operands[0] = loop_var class LowerIetForToScfFor(RewritePattern): From 91897f9f64acbbf0325184d165aa0a4c4b1f8d3b Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Thu, 14 Dec 2023 11:31:23 +0000 Subject: [PATCH 11/16] Re-remove xdsl_operator.py --- devito/operator/xdsl_operator.py | 1351 ------------------------------ 1 file changed, 1351 deletions(-) delete mode 100644 devito/operator/xdsl_operator.py diff --git a/devito/operator/xdsl_operator.py b/devito/operator/xdsl_operator.py deleted file mode 100644 index ed6c416125..0000000000 --- a/devito/operator/xdsl_operator.py +++ /dev/null @@ -1,1351 +0,0 @@ -import os -import subprocess -import ctypes -import tempfile - -from math import ceil -from collections import OrderedDict, namedtuple -from io import StringIO -from operator import attrgetter - -from cached_property import cached_property - -from devito import Operator -from devito.arch import compiler_registry, platform_registry -from devito.arch.archinfo import get_nvidia_cc -from devito.data import default_allocator -from devito.exceptions import InvalidOperator -from devito.ir.clusters import ClusterGroup, clusterize -from devito.ir.equations import LoweredEq, lower_exprs -from devito.ir.iet import (Callable, CInterface, EntryFunction, FindSymbols, MetaCall, - derive_parameters, iet_build) -from devito.ir.ietxdsl import (finalize_module_with_globals) -from devito.ir.stree import stree_build -from devito.ir.support 
import AccessMode, SymbolRegistry -from devito.ir.ietxdsl.cluster_to_ssa import (ExtractDevitoStencilConversion, - convert_devito_stencil_to_xdsl_stencil) -from devito.logger import debug, info, perf, warning, is_log_enabled_for -from devito.operator.operator import IRs -from devito.operator.profiling import AdvancedProfilerVerbose, create_profile -from devito.parameters import configuration -from devito.passes import (Graph, lower_index_derivatives, generate_implicit, - generate_macros, minimize_symbols, unevaluate) -from devito.passes.iet import CTarget -from devito.symbolics import estimate_cost -from devito.tools import (DAG, OrderedSet, ReducerMap, as_tuple, flatten, - filter_sorted, frozendict, is_integer, split, timed_pass, - contains_val) -from devito.types import Evaluable, TimeFunction, Grid -from devito.types.mlir_types import ptr_of, f32 -from devito.mpi import MPI - -from xdsl.printer import Printer - -# flake8: noqa - -__all__ = ['XDSLOperator'] - - -# small interop shim script for stuff that we don't want to implement in mlir-ir -_INTEROP_C = """ -#include - -double timer_start() { - // return a number representing the current point in time - // it might be offset by a fixed ammount - struct timespec t; - clock_gettime(CLOCK_MONOTONIC, &t); - return (t.tv_sec) + (t.tv_nsec * 1e-9); -} - -double timer_end(double start) { - // return time elaspes since start in seconds - return (timer_start() - start); -} -""" - - -def generate_tiling_arg(nb_tiled_dims: int): - """ - Generate the tile-sizes arg for the convert-stencil-to-ll-mlir pass. Generating no argument if the diled_dims arg is 0 - """ - if nb_tiled_dims == 0: - return '' - return "tile-sizes=" + ",".join(["64"]*nb_tiled_dims) - - -CFLAGS = "-O3 -march=native -mtune=native -lmlir_c_runner_utils" - -MLIR_CPU_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,convert-scf-to-cf,convert-math-to-llvm,convert-func-to-llvm{use-bare-ptr-memref-call-conv},finalize-memref-to-llvm,canonicalize,cse)"' -MLIR_OPENMP_PIPELINE = '"builtin.module(canonicalize, cse, loop-invariant-code-motion, canonicalize, cse, loop-invariant-code-motion,cse,canonicalize,fold-memref-alias-ops,expand-strided-metadata, loop-invariant-code-motion,lower-affine,finalize-memref-to-llvm,loop-invariant-code-motion,canonicalize,cse,convert-scf-to-openmp,finalize-memref-to-llvm,convert-scf-to-cf,convert-func-to-llvm{use-bare-ptr-memref-call-conv},convert-openmp-to-llvm,convert-math-to-llvm,reconcile-unrealized-casts,canonicalize,cse)"' -# gpu-launch-sink-index-computations seemed to have no impact -MLIR_GPU_PIPELINE = lambda block_sizes: f'"builtin.module(test-math-algebraic-simplification,scf-parallel-loop-tiling{{parallel-loop-tile-sizes={block_sizes}}},func.func(gpu-map-parallel-loops),convert-parallel-loops-to-gpu,lower-affine, canonicalize,cse, fold-memref-alias-ops, gpu-launch-sink-index-computations, gpu-kernel-outlining, canonicalize{{region-simplify}},cse,fold-memref-alias-ops,expand-strided-metadata,lower-affine,canonicalize,cse,func.func(gpu-async-region),canonicalize,cse,convert-arith-to-llvm{{index-bitwidth=64}},convert-scf-to-cf,convert-cf-to-llvm{{index-bitwidth=64}},canonicalize,cse,convert-func-to-llvm{{use-bare-ptr-memref-call-conv}},nvvm-attach-target{{O=3 ftz fast 
chip=sm_{get_nvidia_cc()}}},gpu.module(convert-gpu-to-nvvm,canonicalize,cse),gpu-to-llvm,gpu-module-to-binary,canonicalize,cse)"' - -XDSL_CPU_PIPELINE = lambda nb_tiled_dims: f'"stencil-shape-inference,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},printf-to-llvm,canonicalize"' -XDSL_GPU_PIPELINE = "stencil-shape-inference,convert-stencil-to-ll-mlir{target=gpu},reconcile-unrealized-casts,printf-to-llvm,canonicalize" -XDSL_MPI_PIPELINE = lambda decomp, nb_tiled_dims: f'"distribute-stencil{decomp},canonicalize-dmp,convert-stencil-to-ll-mlir{{{generate_tiling_arg(nb_tiled_dims)}}},dmp-to-mpi{{mpi_init=false}},lower-mpi,printf-to-llvm,canonicalize"' - - -class XDSLOperator(Operator): - - _Target = CTarget - - def __new__(cls, expressions, **kwargs): - self = super(XDSLOperator, cls).__new__(cls, expressions, **kwargs) - delete = not os.getenv("XDSL_SKIP_CLEAN", False) - self._tf = tempfile.NamedTemporaryFile(prefix="devito-jit-", suffix='.so', - delete=delete) - self._interop_tf = tempfile.NamedTemporaryFile(prefix="devito-jit-interop-", - suffix=".o", delete=delete) - self._make_interop_o() - self.__class__ = cls - return self - - def _make_interop_o(self): - """ - compile the interop.o file - """ - res = subprocess.run( - f'clang -x c - -c -o {self._interop_tf.name}', - shell=True, - input=_INTEROP_C, - text=True, - stderr=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - ) - assert res.returncode == 0 - - @property - def mpi_shape(self) -> tuple: - dist = self.functions[0].grid.distributor - - # reverse topology for row->column major - return dist.topology, dist.myrank - - def _jit_compile(self): - """ - JIT-compile the C code generated by the Operator. - It is ensured that JIT compilation will only be performed - once per Operator, reagardless of how many times this method - is invoked. 
- """ - - with self._profiler.timer_on('jit-compile'): - is_mpi = MPI.Is_initialized() - is_gpu = os.environ.get("DEVITO_PLATFORM", None) == 'nvidiaX' - is_omp = os.environ.get("DEVITO_LANGUAGE", None) == 'openmp' - - if is_mpi and is_gpu: - raise RuntimeError("Cannot run MPI+GPU for now!") - - if is_omp and is_gpu: - raise RuntimeError("Cannot run OMP+GPU!") - - # specialize the code for the specific apply parameters - finalize_module_with_globals(self._module, self._jit_kernel_constants, - gpu_boilerplate=is_gpu) - - # print module as IR - module_str = StringIO() - Printer(stream=module_str).print(self._module) - module_str = module_str.getvalue() - - to_tile = len(list(filter(lambda s: str(s) in ["x", "y", "z"], self.dimensions)))-1 - - xdsl_pipeline = XDSL_CPU_PIPELINE(to_tile) - mlir_pipeline = MLIR_CPU_PIPELINE - - block_sizes: list[int] = [min(target, self._jit_kernel_constants.get(f"{dim}_size", 1)) for target, dim in zip([32, 4, 8], ["x", "y", "z"])] - block_sizes = ','.join(str(bs) for bs in block_sizes) - - if is_omp: - mlir_pipeline = MLIR_OPENMP_PIPELINE - - if is_mpi: - shape, mpi_rank = self.mpi_shape - # Run with restrict domain=false so we only introduce the swaps but don't - # reduce the domain of the computation (as devito has already done that for us) - slices = ','.join(str(x) for x in shape) - - decomp = "2d-grid" if len(shape) == 2 else "3d-grid" - - decomp = f"{{strategy={decomp} slices={slices} restrict_domain=false}}" - xdsl_pipeline = XDSL_MPI_PIPELINE(decomp, to_tile) - elif is_gpu: - xdsl_pipeline = XDSL_GPU_PIPELINE - mlir_pipeline = MLIR_GPU_PIPELINE(block_sizes) - - # allow jit backdooring to provide your own xdsl code - backdoor = os.getenv('XDSL_JIT_BACKDOOR') - if backdoor is not None: - if os.path.splitext(backdoor)[1] == ".so": - info(f"JIT Backdoor: skipping compilation and using {backdoor}") - self._tf.name = backdoor - return - print("JIT Backdoor: loading xdsl file from: " + backdoor) - with open(backdoor, 'r') as f: - module_str = f.read() - source_name = os.path.splitext(self._tf.name)[0] + ".mlir" - source_file = open(source_name, "w") - source_file.write(module_str) - source_file.close() - - # Compile IR using xdsl-opt | mlir-opt | mlir-translate | clang - try: - cflags = CFLAGS - cc = "clang" - - if is_mpi: - cflags += ' -lmpi ' - cc = "mpicc -cc=clang" - if is_omp: - cflags += " -fopenmp " - if is_gpu: - cflags += " -lmlir_cuda_runtime " - - # TODO More detailed error handling manually, - # instead of relying on a bash-only feature. 
- - xdsl_cmd = f'xdsl-opt {source_name} -p {xdsl_pipeline}' - mlir_cmd = f'mlir-opt -p {mlir_pipeline}' - mlir_translate_cmd = 'mlir-translate --mlir-to-llvmir' - clang_cmd = f'{cc} {cflags} -shared -o {self._tf.name} {self._interop_tf.name} -xir -' - - - comp_steps = [ - xdsl_cmd, - mlir_cmd, - mlir_translate_cmd, - clang_cmd - ] - - # Execute each command and store the outputs - outputs = [] - stdout = None - for cmd in comp_steps: - return_code, stdout, stderr = self._cmd_compile(cmd, stdout) - # Use DEVITO_LOGGING=DEBUG to print - debug(cmd) - outputs.append({ - 'command': cmd, - 'return_code': return_code, - 'stdout': stdout, - 'stderr': stderr - }) - - except Exception as ex: - print("error") - raise ex - - elapsed = self._profiler.py_timers['jit-compile'] - - perf("XDSLOperator `%s` jit-compiled `%s` in %.2f s with `mlir-opt`" % - (self.name, source_name, elapsed)) - - - def _cmd_compile(self, cmd, input=None): - stdin = subprocess.PIPE if input is not None else None - - res = subprocess.run( - cmd, - input=input, - shell=True, - text=True, - capture_output=True, - executable="/bin/bash" - ) - - if res.returncode != 0: - print("compilation failed with output:") - print(res.stderr) - - assert res.returncode == 0 - return res.returncode, res.stdout, res.stderr - - @property - def _soname(self): - return self._tf.name - - def setup_memref_args(self): - """ - Add memrefs to args dictionary so they can be passed to the cfunction - """ - args = dict() - for arg in self.functions: - if isinstance(arg, TimeFunction): - data = arg._data_allocated - # iterate over the first dimension (time) - for t in range(data.shape[0]): - args[f'{arg._C_name}_{t}'] = data[t, ...].ctypes.data_as(ptr_of(f32)) - self._jit_kernel_constants.update(args) - - @classmethod - def _normalize_kwargs(cls, **kwargs): - return kwargs - - @classmethod - def _check_kwargs(cls, **kwargs): - return - - @classmethod - def _build(cls, expressions, **kwargs) -> Callable: - debug("-Building operator") - # Python- (i.e., compile-) and C-level (i.e., run-time) performance - profiler = create_profile('timers') - - # Lower the input expressions into an IET - debug("-Lower expressions") - irs, _, module = cls._lower(expressions, profiler=profiler, **kwargs) - - # Make it an actual Operator - op = Callable.__new__(cls, **irs.iet.args) - Callable.__init__(op, **op.args) - - # Header files, etc. 
- # op._headers = OrderedSet(*cls._default_headers) - # op._headers.update(byproduct.headers) - # op._globals = OrderedSet(*cls._default_globals) - # op._includes = OrderedSet(*cls._default_includes) - # op._includes.update(profiler._default_includes) - # op._includes.update(byproduct.includes) - op._module = module - - # Required for the jit-compilation - op._compiler = kwargs['compiler'] - op._lib = None - op._cfunction = None - - # Potentially required for lazily allocated Functions - op._mode = kwargs['mode'] - op._options = kwargs['options'] - op._allocator = kwargs['allocator'] - op._platform = kwargs['platform'] - - # References to local or external routines - op._func_table = OrderedDict() - op._func_table.update(OrderedDict([(i, MetaCall(None, False)) - for i in profiler._ext_calls])) - # op._func_table.update(OrderedDict([(i.root.name, i) for i in byproduct.funcs])) - - # Internal mutable state to store information about previous runs, autotuning - # reports, etc - op._state = cls._initialize_state(**kwargs) - - # Produced by the various compilation passes - op._reads = filter_sorted(flatten(e.reads for e in irs.expressions)) - op._writes = filter_sorted(flatten(e.writes for e in irs.expressions)) - op._dimensions = set().union(*[e.dimensions for e in irs.expressions]) - op._dtype, op._dspace = irs.clusters.meta - op._profiler = profiler - - return op - - def __init__(self, *args, **kwargs): - # Bypass the silent call to __init__ triggered through the backends engine - pass - - # Compilation -- Expression level - - @classmethod - def _lower(cls, expressions, **kwargs): - """ - Perform the lowering Expressions -> Clusters -> ScheduleTree -> IET. - """ - # Create a symbol registry - kwargs['sregistry'] = SymbolRegistry() - - expressions = as_tuple(expressions) - - # Input check - if any(not isinstance(i, Evaluable) for i in expressions): - raise InvalidOperator("Only `devito.Evaluable` are allowed.") - - # Enable recursive lowering - # This may be used by a compilation pass that constructs a new - # expression for which a partial or complete lowering is desired - kwargs['lower'] = cls._lower - - # [Eq] -> [LoweredEq] - expressions = cls._lower_exprs(expressions, **kwargs) - - conv = ExtractDevitoStencilConversion(expressions) - module = conv.convert() - convert_devito_stencil_to_xdsl_stencil(module, timed=True) - - # [LoweredEq] -> [Clusters] - clusters = cls._lower_clusters(expressions, **kwargs) - - # [Clusters] -> ScheduleTree - stree = cls._lower_stree(clusters, **kwargs) - - # ScheduleTree -> unbounded IET - uiet = cls._lower_uiet(stree, **kwargs) - - # unbounded IET -> IET - iet, byproduct = cls._lower_iet(uiet, **kwargs) - - return IRs(expressions, clusters, stree, uiet, iet), byproduct, module - - @classmethod - def _rcompile_wrapper(cls, **kwargs): - def wrapper(expressions, kwargs=kwargs): - return rcompile(expressions, kwargs) - return wrapper - - @classmethod - def _initialize_state(cls, **kwargs): - return {} - - @classmethod - def _specialize_dsl(cls, expressions, **kwargs): - """ - Backend hook for specialization at the DSL level. The input is made of - expressions and other higher order objects such as Injection or - Interpolation; the expressions are still unevaluated at this stage, - meaning that they are still in tensorial form and derivatives aren't - expanded yet. - """ - return expressions - - @classmethod - def _specialize_exprs(cls, expressions, **kwargs): - """ - Backend hook for specialization at the expression level. 
- """ - return expressions - - @classmethod - @timed_pass(name='lowering.Expressions') - def _lower_exprs(cls, expressions, **kwargs): - """ - Expression lowering: - - * Apply rewrite rules; - * Evaluate derivatives; - * Flatten vectorial equations; - * Indexify Functions; - * Apply substitution rules; - * Shift indices for domain alignment. - """ - expand = kwargs['options'].get('expand', True) - - # Specialization is performed on unevaluated expressions - expressions = cls._specialize_dsl(expressions, **kwargs) - - # Lower FD derivatives - # NOTE: we force expansion of derivatives along SteppingDimensions - # because it drastically simplifies the subsequent lowering into - # ModuloDimensions - if not expand: - expand = lambda d: d.is_Stepping - expressions = flatten([i._evaluate(expand=expand) for i in expressions]) - - # Scalarize the tensor equations, if any - expressions = [j for i in expressions for j in i._flatten] - - # A second round of specialization is performed on evaluated expressions - expressions = cls._specialize_exprs(expressions, **kwargs) - - # "True" lowering (indexification, shifting, ...) - expressions = lower_exprs(expressions, **kwargs) - - processed = [LoweredEq(i) for i in expressions] - - return processed - - # Compilation -- Cluster level - - @classmethod - def _specialize_clusters(cls, clusters, **kwargs): - """ - Backend hook for specialization at the Cluster level. - """ - return clusters - - @classmethod - @timed_pass(name='lowering.Clusters') - def _lower_clusters(cls, expressions, profiler=None, **kwargs): - """ - Clusters lowering: - - * Group expressions into Clusters; - * Introduce guards for conditional Clusters; - * Analyze Clusters to detect computational properties such - as parallelism. - * Optimize Clusters for performance - """ - sregistry = kwargs['sregistry'] - - # Build a sequence of Clusters from a sequence of Eqs - clusters = clusterize(expressions, **kwargs) - - # Operation count before specialization - init_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense) - - clusters = cls._specialize_clusters(clusters, **kwargs) - - # Operation count after specialization - final_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense) - try: - profiler.record_ops_variation(init_ops, final_ops) - except AttributeError: - pass - - # Generate implicit Clusters from higher level abstractions - clusters = generate_implicit(clusters, sregistry=sregistry) - - # Lower all remaining high order symbolic objects - clusters = lower_index_derivatives(clusters, **kwargs) - - # Make sure no reconstructions can unpick any of the symbolic - # optimizations performed so far - clusters = unevaluate(clusters) - - return ClusterGroup(clusters) - - # Compilation -- ScheduleTree level - - @classmethod - def _specialize_stree(cls, stree, **kwargs): - """ - DEPRECATED: Backend hook for specialization at the Schedule tree level. 
- """ - return stree - - @classmethod - @timed_pass(name='lowering.ScheduleTree') - def _lower_stree(cls, clusters, **kwargs): - """ - Schedule tree lowering: - - * Turn a sequence of Clusters into a ScheduleTree; - * Derive and attach metadata for distributed-memory parallelism; - * Derive sections for performance profiling - """ - # DEPRECATED: Build a ScheduleTree from a sequence of Clusters - stree = stree_build(clusters, **kwargs) - stree = cls._specialize_stree(stree) - - return stree - - # Compilation -- Iteration/Expression tree level - - @classmethod - def _specialize_iet(cls, graph, **kwargs): - """ - Backend hook for specialization at the Iteration/Expression tree level. - """ - return graph - - @classmethod - @timed_pass(name='lowering.uIET') - def _lower_uiet(cls, stree, profiler=None, **kwargs): - """ - Turn a ScheduleTree into an unbounded Iteration/Expression tree, that is - in essence a "floating" IET where one or more variables may be unbounded - (i.e., no definition placed yet). - """ - # Build an unbounded IET from a ScheduleTree - uiet = iet_build(stree) - - # Analyze the IET Sections for C-level profiling - try: - profiler.analyze(uiet) - except AttributeError: - pass - - return uiet - - @classmethod - @timed_pass(name='lowering.IET') - def _lower_iet(cls, uiet, profiler=None, **kwargs): - """ - Iteration/Expression tree lowering: - - * Introduce distributed-memory, shared-memory, and SIMD parallelism; - * Introduce optimizations for data locality; - * Finalize (e.g., symbol definitions, array casts) - """ - name = kwargs.get("name", "Kernel") - sregistry = kwargs['sregistry'] - - # Wrap the IET with an EntryFunction (a special Callable representing - # the entry point of the generated library) - parameters = derive_parameters(uiet, True) - iet = EntryFunction(name, uiet, 'int', parameters, ()) - - # Lower IET to a target-specific IET - graph = Graph(iet, sregistry=sregistry) - graph = cls._specialize_iet(graph, **kwargs) - - # Instrument the IET for C-level profiling - # Note: this is postponed until after _specialize_iet because during - # specialization further Sections may be introduced - cls._Target.instrument(graph, profiler=profiler, **kwargs) - - # Extract the necessary macros from the symbolic objects - generate_macros(graph) - - # Target-independent optimizations - minimize_symbols(graph) - - return graph.root, graph - - # Read-only properties exposed to the outside world - - @cached_property - def reads(self): - return tuple(self._reads) - - @cached_property - def writes(self): - return tuple(self._writes) - - @cached_property - def dimensions(self): - ret = set().union(*[d._defines for d in self._dimensions]) - - # During compilation other Dimensions may have been produced - dimensions = FindSymbols('dimensions').visit(self) - ret.update(d for d in dimensions if d.is_PerfKnob) - - ret = tuple(sorted(ret, key=attrgetter('name'))) - - return ret - - @cached_property - def input(self): - return tuple(i for i in self.parameters if i.is_Input) - - @cached_property - def temporaries(self): - return tuple(i for i in self.parameters if i.is_TempFunction) - - @cached_property - def objects(self): - return tuple(i for i in self.parameters if i.is_Object) - - # Arguments processing - - @cached_property - def _access_modes(self): - """ - A table providing the AccessMode of all user-accessible symbols in `self`. 
- """ - return frozendict({i: AccessMode(i in self.reads, i in self.writes) - for i in self.input}) - - def _prepare_arguments(self, autotune=None, **kwargs): - """ - Process runtime arguments passed to ``.apply()` and derive - default values for any remaining arguments. - """ - # Sanity check -- all user-provided keywords must be known to the Operator - if not configuration['ignore-unknowns']: - for k, v in kwargs.items(): - if k not in self._known_arguments: - raise ValueError("Unrecognized argument %s=%s" % (k, v)) - - # Pre-process Dimension overrides. This may help ruling out ambiguities - # when processing the `defaults` arguments. A topological sorting is used - # as DerivedDimensions may depend on their parents - nodes = self.dimensions - edges = [(i, i.parent) for i in self.dimensions - if i.is_Derived and i.parent in set(nodes)] - toposort = DAG(nodes, edges).topological_sort() - - futures = {} - for d in reversed(toposort): - if set(d._arg_names).intersection(kwargs): - futures.update(d._arg_values(self._dspace[d], args={}, **kwargs)) - - overrides, defaults = split(self.input, lambda p: p.name in kwargs) - - # Process data-carrier overrides - args = kwargs['args'] = ReducerMap() - for p in overrides: - args.update(p._arg_values(**kwargs)) - try: - args.reduce_inplace() - except ValueError: - raise ValueError("Override `%s` is incompatible with overrides `%s`" % - (p, [i for i in overrides if i.name in args])) - - # Process data-carrier defaults - for p in defaults: - if p.name in args: - # E.g., SubFunctions - continue - for k, v in p._arg_values(**kwargs).items(): - if k not in args: - args[k] = v - elif k in futures: - # An explicit override is later going to set `args[k]` - pass - elif k in kwargs: - # User is in control - # E.g., given a ConditionalDimension `t_sub` with factor `fact` and - # a TimeFunction `usave(t_sub, x, y)`, an override for `fact` is - # supplied w/o overriding `usave`; that's legal - pass - elif is_integer(args[k]) and not contains_val(args[k], v): - raise ValueError("Default `%s` is incompatible with other args as " - "`%s=%s`, while `%s=%s` is expected. Perhaps you " - "forgot to override `%s`?" % - (p, k, v, k, args[k], p)) - - args = kwargs['args'] = args.reduce_all() - - # DiscreteFunctions may be created from CartesianDiscretizations, which in - # turn could be Grids or SubDomains. Both may provide arguments - discretizations = {getattr(kwargs[p.name], 'grid', None) for p in overrides} - discretizations.update({getattr(p, 'grid', None) for p in defaults}) - discretizations.discard(None) - # Remove subgrids if multiple grids - if len(discretizations) > 1: - discretizations = {g for g in discretizations - if not any(d.is_Derived for d in g.dimensions)} - - for i in discretizations: - args.update(i._arg_values(**kwargs)) - - # There can only be one Grid from which DiscreteFunctions were created - grids = {i for i in discretizations if isinstance(i, Grid)} - if len(grids) > 1: - # We loosely tolerate multiple Grids for backwards compatibility - # with spacial subsampling, which should be revisited however. And - # With MPI it would definitely break! 
- if configuration['mpi']: - raise ValueError("Multiple Grids found") - try: - grid = grids.pop() - except KeyError: - grid = None - - # An ArgumentsMap carries additional metadata that may be used by - # the subsequent phases of the arguments processing - args = kwargs['args'] = ArgumentsMap(args, grid, self) - - # Process Dimensions - for d in reversed(toposort): - args.update(d._arg_values(self._dspace[d], grid, **kwargs)) - - # Process Objects - for o in self.objects: - args.update(o._arg_values(grid=grid, **kwargs)) - - # In some "lower-level" Operators implementing a random piece of C, such as - # one or more calls to third-party library functions, there could still be - # at this point unprocessed arguments (e.g., scalars) - kwargs.pop('args') - args.update({k: v for k, v in kwargs.items() if k not in args}) - - # Sanity check - for p in self.parameters: - p._arg_check(args, self._dspace[p], am=self._access_modes.get(p)) - for d in self.dimensions: - if d.is_Derived: - d._arg_check(args, self._dspace[p]) - - # Turn arguments into a format suitable for the generated code - # E.g., instead of NumPy arrays for Functions, the generated code expects - # pointers to ctypes.Struct - for p in self.parameters: - try: - args.update(kwargs.get(p.name, p)._arg_finalize(args, alias=p)) - except AttributeError: - # User-provided floats/ndarray obviously do not have `_arg_finalize` - args.update(p._arg_finalize(args, alias=p)) - - # Execute autotuning and adjust arguments accordingly - args.update(self._autotune(args, autotune or configuration['autotuning'])) - - return args - - def _postprocess_arguments(self, args, **kwargs): - """Process runtime arguments upon returning from ``.apply()``.""" - for p in self.parameters: - try: - subfuncs = (args[getattr(p, s).name] for s in p._sub_functions) - p._arg_apply(args[p.name], *subfuncs, alias=kwargs.get(p.name)) - except AttributeError: - p._arg_apply(args[p.name], alias=kwargs.get(p.name)) - - @cached_property - def _known_arguments(self): - """The arguments that can be passed to ``apply`` when running the Operator.""" - ret = set() - for i in self.input: - ret.update(i._arg_names) - try: - ret.update(i.grid._arg_names) - except AttributeError: - pass - for d in self.dimensions: - ret.update(d._arg_names) - ret.update(p.name for p in self.parameters) - return frozenset(ret) - - def _autotune(self, args, setup): - """Auto-tuning to improve runtime performance.""" - return args - - def arguments(self, **kwargs): - """Arguments to run the Operator.""" - args = self._prepare_arguments(**kwargs) - # Check all arguments are present - for p in self.parameters: - if args.get(p.name) is None: - raise ValueError("No value found for parameter %s" % p.name) - return args - - # Code generation and JIT compilation - - #@cached_property - #def _soname(self): - # """A unique name for the shared object resulting from JIT compilation.""" - # return Signer._digest(self, configuration) - - @cached_property - def ccode(self): - try: - return self._ccode_handler(compiler=self._compiler).visit(self) - except (AttributeError, TypeError): - from devito.ir.iet.visitors import CGen - return CGen(compiler=self._compiler).visit(self) - - @property - def cfunction(self): - """The JIT-compiled C function as a ctypes.FuncPtr object.""" - if self._lib is None: - self._jit_compile() - self.setup_memref_args() - self._lib = self._compiler.load(self._tf.name) - self._lib.name = self._tf.name - - if self._cfunction is None: - self._cfunction = getattr(self._lib, "apply_kernel") - # 
Associate a C type to each argument for runtime type check - self._cfunction.argtypes = self._construct_cfunction_args(self._jit_kernel_constants, get_types=True) - - return self._cfunction - - def cinterface(self, force=False): - """ - Generate two files under the prescribed temporary directory: - - * `X.c` (or `X.cpp`): the code generated for this Operator; - * `X.h`: an header file representing the interface of `X.c`. - - Where `X=self.name`. - - Parameters - ---------- - force : bool, optional - Overwrite any existing files. Defaults to False. - """ - dest = self._compiler.get_jit_dir() - name = dest.joinpath(self.name) - - cfile = name.with_suffix(".%s" % self._compiler.src_ext) - hfile = name.with_suffix('.h') - - # Generate the .c and .h code - ccode, hcode = CInterface().visit(self) - - for f, code in [(cfile, ccode), (hfile, hcode)]: - if not force and f.is_file(): - debug("`%s` was not saved in `%s` as it already exists" % (f.name, dest)) - else: - with open(str(f), 'w') as ff: - ff.write(str(code)) - debug("`%s` successfully saved in `%s`" % (f.name, dest)) - - return ccode, hcode - - # Execution - - def __call__(self, **kwargs): - return self.apply(**kwargs) - - def apply(self, **kwargs): - """ - Execute the Operator. - - With no arguments provided, the Operator runs using the data carried by the - objects appearing in the input expressions -- these are referred to as the - "default arguments". - - Optionally, any of the Operator default arguments may be replaced by passing - suitable key-value arguments. Given ``apply(k=v, ...)``, ``(k, v)`` may be - used to: - - * replace a Constant. In this case, ``k`` is the name of the Constant, - ``v`` is either a Constant or a scalar value. - - * replace a Function (SparseFunction). Here, ``k`` is the name of the - Function, ``v`` is either a Function or a numpy.ndarray. - - * alter the iteration interval along a Dimension. Consider a generic - Dimension ``d`` iterated over by the Operator. By default, the Operator - runs over all iterations within the compact interval ``[d_m, d_M]``, - where ``d_m`` and ``d_M`` are, respectively, the smallest and largest - integers not causing out-of-bounds memory accesses (for the Grid - Dimensions, this typically implies iterating over the entire physical - domain). So now ``k`` can be either ``d_m`` or ``d_M``, while ``v`` - is an integer value. - - Examples - -------- - Consider the following Operator - - >>> from devito import Eq, Grid, TimeFunction, Operator - >>> grid = Grid(shape=(3, 3)) - >>> u = TimeFunction(name='u', grid=grid, save=3) - >>> op = Operator(Eq(u.forward, u + 1)) - - The Operator is run by calling ``apply`` - - >>> summary = op.apply() - - The variable ``summary`` contains information about runtime performance. - As no key-value parameters are specified, the Operator runs with its - default arguments, namely ``u=u, x_m=0, x_M=2, y_m=0, y_M=2, time_m=0, - time_M=1``. - - At this point, the same Operator can be used for a completely different - run, for example - - >>> u2 = TimeFunction(name='u', grid=grid, save=5) - >>> summary = op.apply(u=u2, x_m=1, y_M=1) - - Now, the Operator will run with a different set of arguments, namely - ``u=u2, x_m=1, x_M=2, y_m=0, y_M=1, time_m=0, time_M=3``. - - To run an Operator that only uses buffered TimeFunctions, the maximum - iteration point along the time dimension must be explicitly specified - (otherwise, the Operator wouldn't know how many iterations to run). 
- - >>> u3 = TimeFunction(name='u', grid=grid) - >>> op = Operator(Eq(u3.forward, u3 + 1)) - >>> summary = op.apply(time_M=10) - """ - # Build the arguments list to invoke the kernel function - with self._profiler.timer_on('arguments'): - args = self.arguments(**kwargs) - self._jit_kernel_constants = args - - cfunction = self.cfunction - try: - # Invoke kernel function with args - arg_values = self._construct_cfunction_args(args) - with self._profiler.timer_on('apply', comm=args.comm): - cfunction(*arg_values) - except ctypes.ArgumentError as e: - if e.args[0].startswith("argument "): - argnum = int(e.args[0][9:].split(':')[0]) - 1 - newmsg = "error in argument '%s' with value '%s': %s" % ( - self.parameters[argnum].name, - arg_values[argnum], - e.args[0]) - raise ctypes.ArgumentError(newmsg) from e - else: - raise - - # Post-process runtime arguments - self._postprocess_arguments(args, **kwargs) - - # Output summary of performance achieved - return self._emit_apply_profiling(args) - - def _construct_cfunction_args(self, args, get_types = False): - """ - Either construct the args for the cfunction, or construct the - arg types for it. - """ - ps = { - p._C_name: p._C_ctype for p in self.parameters - } - - things = [] - things_types = [] - - for name in get_arg_names_from_module(self._module): - thing = args[name] - things.append(thing) - if name in ps: - things_types.append(ps[name]) - else: - things_types.append(type(thing)) - - if get_types: - return things_types - else: - return things - - def _emit_build_profiling(self): - if not is_log_enabled_for('PERF'): - return - - # Rounder to K decimal places - fround = lambda i, n=100: ceil(i * n) / n - - timings = self._profiler.py_timers.copy() - - tot = timings.pop('op-compile') - perf("Operator `%s` generated in %.2f s" % (self.name, fround(tot))) - - max_hotspots = 3 - threshold = 20. 
- - def _emit_timings(timings, indent=''): - timings.pop('total', None) - entries = sorted(timings, key=lambda i: timings[i]['total'], reverse=True) - for i in entries[:max_hotspots]: - v = fround(timings[i]['total']) - perc = fround(v/tot*100, n=10) - if perc > threshold: - perf("%s%s: %.2f s (%.1f %%)" % (indent, i.lstrip('_'), v, perc)) - _emit_timings(timings[i], ' '*len(indent) + ' * ') - - _emit_timings(timings, ' * ') - - if self._profiler._ops: - ops = ['%d --> %d' % i for i in self._profiler._ops] - perf("Flops reduction after symbolic optimization: [%s]" % ' ; '.join(ops)) - - def _emit_apply_profiling(self, args): - """Produce a performance summary of the profiled sections.""" - # Rounder to 2 decimal places - fround = lambda i: ceil(i * 100) / 100 - - elapsed = fround(self._profiler.py_timers['apply']) - info("Operator `%s` ran in %.2f s" % (self.name, elapsed)) - - summary = self._profiler.summary(args, self._dtype, reduce_over=elapsed) - - if not is_log_enabled_for('PERF'): - # Do not waste time - return summary - - if summary.globals: - # Note that with MPI enabled, the global performance indicators - # represent "cross-rank" performance data - metrics = [] - - v = summary.globals.get('vanilla') - if v is not None: - metrics.append("OI=%.2f" % fround(v.oi)) - metrics.append("%.2f GFlops/s" % fround(v.gflopss)) - - v = summary.globals.get('fdlike') - if v is not None: - metrics.append("%.2f GPts/s" % fround(v.gpointss)) - - if metrics: - perf("Global performance: [%s]" % ', '.join(metrics)) - - perf("Local performance:") - indent = " "*2 - else: - indent = "" - - if isinstance(self._profiler, AdvancedProfilerVerbose): - metrics = [] - - v = summary.globals.get('fdlike-nosetup') - if v is not None: - metrics.append("%.2f GPts/s" % fround(v.gpointss)) - - if metrics: - perf("Global performance : [%s]" % ', '.join(metrics)) - - # Emit local, i.e. "per-rank" performance. 
Without MPI, this is the only - # thing that will be emitted - def lower_perfentry(v): - if v.gflopss: - oi = "OI=%.2f" % fround(v.oi) - gflopss = "%.2f GFlops/s" % fround(v.gflopss) - gpointss = "%.2f GPts/s" % fround(v.gpointss) - return "[%s]" % ", ".join([oi, gflopss, gpointss]) - elif v.gpointss: - gpointss = "%.2f GPts/s" % fround(v.gpointss) - return "[%s]" % gpointss - else: - return "" - - for k, v in summary.items(): - rank = "[rank%d]" % k.rank if k.rank is not None else "" - - metrics = lower_perfentry(v) - - itershapes = [",".join(str(i) for i in its) for its in v.itershapes] - if len(itershapes) > 1: - itershapes = ",".join("<%s>" % i for i in itershapes) - elif len(itershapes) == 1: - itershapes = itershapes[0] - else: - itershapes = "" - name = "%s%s<%s>" % (k.name, rank, itershapes) - - perf("%s* %s ran in %.2f s %s" % (indent, name, fround(v.time), metrics)) - for n, v1 in summary.subsections.get(k.name, {}).items(): - metrics = lower_perfentry(v1) - - perf("%s+ %s ran in %.2f s [%.2f%%] %s" % - (indent*2, n, fround(v1.time), fround(v1.time/v.time*100), - metrics)) - - # Emit performance mode and arguments - perf_args = {} - for i in self.input + self.dimensions: - if not i.is_PerfKnob: - continue - try: - perf_args[i.name] = args[i.name] - except KeyError: - # Try with the aliases - for a in i._arg_names: - if a in args: - perf_args[a] = args[a] - break - perf("Performance[mode=%s] arguments: %s" % (self._mode, perf_args)) - - return summary - - # Pickling support - - def __getstate__(self): - if self._lib: - state = dict(self.__dict__) - # The compiled shared-object will be pickled; upon unpickling, it - # will be restored into a potentially different temporary directory, - # so the entire process during which the shared-object is loaded and - # given to ctypes must be performed again - state['_lib'] = None - state['_cfunction'] = None - # Do not pickle the `args` used to construct the Operator. Not only - # would this be completely useless, but it might also lead to - # allocating additional memory upon unpickling, as the user-provided - # equations typically carry different instances of the same Function - # (e.g., f(t, x-1), f(t, x), f(t, x+1)), which are different objects - # with distinct `.data` fields - state['_args'] = None - with open(self._lib._name, 'rb') as f: - state['binary'] = f.read() - state['soname'] = self._soname - return state - else: - return self.__dict__ - - def __getnewargs_ex__(self): - return (None,), {} - - def __setstate__(self, state): - soname = state.pop('soname', None) - binary = state.pop('binary', None) - for k, v in state.items(): - setattr(self, k, v) - if soname is not None: - self._compiler.save(soname, binary) - self._lib = self._compiler.load(soname) - self._lib.name = soname - - -# Default action (perform or bypass) for selected compilation passes upon -# recursive compilation -# NOTE: it may not only be pointless to apply the following passes recursively -# (because once, during the main compilation phase, is simply enough), but also -# dangerous as some of them (the minority) might break in some circumstances -# if applied in cascade (e.g., `linearization` on top of `linearization`) -rcompile_registry = { - 'mpi': False, - 'linearize': False, - 'place-transfers': False -} - - -def rcompile(expressions, kwargs=None): - """ - Perform recursive compilation on an ordered sequence of symbolic expressions. 
- """ - if not kwargs or 'options' not in kwargs: - kwargs = parse_kwargs(**kwargs) - cls = operator_selector(**kwargs) - kwargs = cls._normalize_kwargs(**kwargs) - else: - cls = operator_selector(**kwargs) - - # Tweak the compilation kwargs - options = dict(kwargs['options']) - options.update(rcompile_registry) - kwargs['options'] = options - - # Recursive profiling not supported -- would be a complete mess - kwargs.pop('profiler', None) - - return cls._lower(expressions, **kwargs) - - -# Misc helpers - - -IRs = namedtuple('IRs', 'expressions clusters stree uiet iet') - - -class ArgumentsMap(dict): - - def __init__(self, args, grid, op): - super().__init__(args) - - self.grid = grid - - self.allocator = op._allocator - self.platform = op._platform - # self.language = op._language - self.compiler = op._compiler - self.options = op._options - - @property - def comm(self): - """The MPI communicator the arguments are collective over.""" - return self.grid.comm if self.grid is not None else MPI.COMM_NULL - - @property - def opkwargs(self): - temp_registry = {v: k for k, v in platform_registry.items()} - platform = temp_registry[self.platform] - - temp_registry = {v: k for k, v in compiler_registry.items()} - compiler = temp_registry[self.compiler.__class__] - - return {'platform': platform, 'compiler': compiler, 'language': self.language} - - -def parse_kwargs(**kwargs): - """ - Parse keyword arguments provided to an Operator. - """ - # `dse` -- deprecated, dropped - dse = kwargs.pop("dse", None) - if dse is not None: - warning("The `dse` argument is deprecated. " - "The optimization level is now controlled via the `opt` argument") - - # `dle` -- deprecated, replaced by `opt` - if 'dle' in kwargs: - warning("The `dle` argument is deprecated. " - "The optimization level is now controlled via the `opt` argument") - dle = kwargs.pop('dle') - if 'opt' in kwargs: - warning("Both `dle` and `opt` were passed; ignoring `dle` argument") - opt = kwargs.pop('opt') - else: - warning("Setting `opt=%s`" % str(dle)) - opt = dle - elif 'opt' in kwargs: - opt = kwargs.pop('opt') - else: - opt = configuration['opt'] - - if not opt or isinstance(opt, str): - mode, options = opt, {} - elif isinstance(opt, tuple): - if len(opt) == 0: - mode, options = 'noop', {} - elif isinstance(opt[-1], dict): - if len(opt) == 2: - mode, options = opt - else: - mode, options = tuple(flatten(i.split(',') for i in opt[:-1])), opt[-1] - else: - mode, options = tuple(flatten(i.split(',') for i in opt)), {} - else: - raise InvalidOperator("Illegal `opt=%s`" % str(opt)) - - # `opt`, deprecated kwargs - kwopenmp = kwargs.get('openmp', options.get('openmp')) - if kwopenmp is None: - openmp = kwargs.get('language', configuration['language']) == 'openmp' - else: - openmp = kwopenmp - - # `opt`, options - options = dict(options) - options.setdefault('openmp', openmp) - options.setdefault('mpi', configuration['mpi']) - for k, v in configuration['opt-options'].items(): - options.setdefault(k, v) - # Handle deprecations - deprecated_options = ('cire-mincost-inv', 'cire-mincost-sops', 'cire-maxalias') - for i in deprecated_options: - try: - options.pop(i) - warning("Ignoring deprecated optimization option `%s`" % i) - except KeyError: - pass - kwargs['options'] = options - - # `opt`, mode - if mode is None: - mode = 'noop' - kwargs['mode'] = mode - - # `platform` - platform = kwargs.get('platform') - if platform is not None: - if not isinstance(platform, str): - raise ValueError("Argument `platform` should be a `str`") - if platform not in 
configuration._accepted['platform']: - raise InvalidOperator("Illegal `platform=%s`" % str(platform)) - kwargs['platform'] = platform_registry[platform]() - else: - kwargs['platform'] = configuration['platform'] - - # `language` - language = kwargs.get('language') - if language is not None: - if not isinstance(language, str): - raise ValueError("Argument `language` should be a `str`") - if language not in configuration._accepted['language']: - raise InvalidOperator("Illegal `language=%s`" % str(language)) - kwargs['language'] = language - elif kwopenmp is not None: - # Handle deprecated `openmp` kwarg for backward compatibility - kwargs['language'] = 'openmp' if openmp else 'C' - else: - kwargs['language'] = configuration['language'] - - # `compiler` - compiler = kwargs.get('compiler') - if compiler is not None: - if not isinstance(compiler, str): - raise ValueError("Argument `compiler` should be a `str`") - if compiler not in configuration._accepted['compiler']: - raise InvalidOperator("Illegal `compiler=%s`" % str(compiler)) - kwargs['compiler'] = compiler_registry[compiler](platform=kwargs['platform'], - language=kwargs['language'], - mpi=configuration['mpi']) - elif any([platform, language]): - kwargs['compiler'] =\ - configuration['compiler'].__new_with__(platform=kwargs['platform'], - language=kwargs['language'], - mpi=configuration['mpi']) - else: - kwargs['compiler'] = configuration['compiler'].__new_with__() - - # `allocator` - kwargs['allocator'] = default_allocator( - '%s.%s.%s' % (kwargs['compiler'].name, - kwargs['language'], - kwargs['platform']) - ) - - return kwargs - - -def get_arg_names_from_module(op): - return [ - str_attr.data - for str_attr in op.body.block.ops.first.attributes['param_names'].data - ] From 4c48dd766d203b4315114588031eda41e72a87f4 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Tue, 19 Dec 2023 11:27:45 +0000 Subject: [PATCH 12/16] GPU pipelie syntax fixes. --- devito/core/gpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/devito/core/gpu.py b/devito/core/gpu.py index 3fa216f1b7..d1e88faf7d 100644 --- a/devito/core/gpu.py +++ b/devito/core/gpu.py @@ -566,7 +566,7 @@ def generate_MLIR_GPU_PIPELINE(block_sizes): "fold-memref-alias-ops", "gpu-launch-sink-index-computations", "gpu-kernel-outlining", - "canonicalize{{region-simplify}}", + "canonicalize{region-simplify}", "cse", "fold-memref-alias-ops", "expand-strided-metadata", @@ -576,12 +576,12 @@ def generate_MLIR_GPU_PIPELINE(block_sizes): "func.func(gpu-async-region)", "canonicalize", "cse", - "convert-arith-to-llvm{{index-bitwidth=64}}", + "convert-arith-to-llvm{index-bitwidth=64}", "convert-scf-to-cf", - "convert-cf-to-llvm{{index-bitwidth=64}}", + "convert-cf-to-llvm{index-bitwidth=64}", "canonicalize", "cse", - "convert-func-to-llvm{{use-bare-ptr-memref-call-conv}}", + "convert-func-to-llvm{use-bare-ptr-memref-call-conv}", f"nvvm-attach-target{{O=3 ftz fast chip=sm_{get_nvidia_cc()}}}", "gpu.module(convert-gpu-to-nvvm,canonicalize,cse)", "gpu-to-llvm", From 498a3aa3a10c14e0758099e925173a61e81f4696 Mon Sep 17 00:00:00 2001 From: Emilien Bauer Date: Tue, 19 Dec 2023 13:12:02 +0000 Subject: [PATCH 13/16] Revert import changes. 
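
Moves the fast.bench_utils imports back to plain, unconditional
top-level imports, undoing the plot-gated form. In sketch form
(fragments lifted from the diffs below; plot_2dfunc stands in for the
3D variant as well):

    # Before this patch: import deferred until plotting was requested
    if args.plot:
        from fast.bench_utils import plot_2dfunc

    # After: unconditional top-level import
    from fast.bench_utils import plot_2dfunc

The trade-off: the top-level form makes fast.bench_utils a hard
dependency of these scripts even for runs that never plot, but it keeps
the import blocks flat and lint-friendly.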
---
 fast/diffusion_2D_wBCs.py | 4 +---
 fast/diffusion_3D_wBCs.py | 4 +---
 fast/wave2d_b.py          | 6 +-----
 fast/wave3d_b.py          | 6 +-----
 4 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/fast/diffusion_2D_wBCs.py b/fast/diffusion_2D_wBCs.py
index a3f205b6f5..ff6535ec99 100644
--- a/fast/diffusion_2D_wBCs.py
+++ b/fast/diffusion_2D_wBCs.py
@@ -8,6 +8,7 @@
 from devito import (Grid, TimeFunction, Eq, solve, Operator, Constant,
                     norm, configuration)
 from examples.cfd import init_hat
+from fast.bench_utils import plot_2dfunc
 
 parser = argparse.ArgumentParser(description='Process arguments.')
 
@@ -26,9 +27,6 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool,
                     help="xDSL run")
 args = parser.parse_args()
 
-if args.plot:
-    from fast.bench_utils import plot_2dfunc
-
 mpiconf = configuration['mpi']
 
 # flake8: noqa

diff --git a/fast/diffusion_3D_wBCs.py b/fast/diffusion_3D_wBCs.py
index 99919bfd13..f497a6729e 100644
--- a/fast/diffusion_3D_wBCs.py
+++ b/fast/diffusion_3D_wBCs.py
@@ -7,6 +7,7 @@
 from devito import (Grid, TimeFunction, Eq, solve, Constant,
                     norm, Operator, configuration)
+from fast.bench_utils import plot_3dfunc
 
 parser = argparse.ArgumentParser(description='Process arguments.')
 
@@ -25,9 +26,6 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool,
                     help="xDSL run")
 args = parser.parse_args()
 
-if args.plot:
-    from fast.bench_utils import plot_3dfunc
-
 mpiconf = configuration['mpi']
 
 # Some variable declarations

diff --git a/fast/wave2d_b.py b/fast/wave2d_b.py
index 43e4b13372..ac20870282 100644
--- a/fast/wave2d_b.py
+++ b/fast/wave2d_b.py
@@ -8,7 +8,7 @@
 import argparse
 
 np.set_printoptions(threshold=np.inf)
-
+from fast.bench_utils import plot_2dfunc
 
 parser = argparse.ArgumentParser(description='Process arguments.')
 
@@ -27,10 +27,6 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool,
                     help="xDSL run")
 args = parser.parse_args()
 
-if args.plot:
-    from fast.bench_utils import plot_2dfunc
-
-
 mpiconf = configuration['mpi']
 
 # Define a physical size

diff --git a/fast/wave3d_b.py b/fast/wave3d_b.py
index e71f481190..aa56e076df 100644
--- a/fast/wave3d_b.py
+++ b/fast/wave3d_b.py
@@ -8,7 +8,7 @@
 import argparse
 
 np.set_printoptions(threshold=np.inf)
-
+from fast.bench_utils import plot_3dfunc
 
 parser = argparse.ArgumentParser(description='Process arguments.')
 
@@ -27,10 +27,6 @@ parser.add_argument("-xdsl", "--xdsl", default=False, type=bool,
                     help="xDSL run")
 args = parser.parse_args()
 
-if args.plot:
-    from fast.bench_utils import plot_3dfunc
-
-
 mpiconf = configuration['mpi']
 
 # Define a physical size

From f4e08605655d09250feee3da56bbb48b893165da Mon Sep 17 00:00:00 2001
From: Emilien Bauer
Date: Tue, 19 Dec 2023 13:12:59 +0000
Subject: [PATCH 14/16] EOL.

---
 .github/workflows/ci-mlir-mpi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-mlir-mpi.yml b/.github/workflows/ci-mlir-mpi.yml
index 3a6eda0b3f..a3c23be800 100644
--- a/.github/workflows/ci-mlir-mpi.yml
+++ b/.github/workflows/ci-mlir-mpi.yml
@@ -44,4 +44,4 @@ jobs:
       run: |
         # Add mlir-opt to the path
         export PATH=/xdsl-sc/llvm-project/build/bin/:$PATH
-        pytest -m "parallel" -k "not adjoint" tests/test_xdsl_*
\ No newline at end of file
+        pytest -m "parallel" -k "not adjoint" tests/test_xdsl_*

From f6507f988ed565131341e3561e173e938a87366b Mon Sep 17 00:00:00 2001
From: Emilien Bauer
Date: Tue, 19 Dec 2023 13:13:57 +0000
Subject: [PATCH 15/16] Flake pass.
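
Removes an import that flake8 flags as unused (F401): get_nvidia_cc is
only consumed by the GPU pipeline string in devito/core/gpu.py (see the
nvvm-attach-target pass in PATCH 12), not by the CPU pipeline. A
minimal illustration of the warning class, on a hypothetical module:

    # lint_demo.py -- hypothetical file, for illustration only
    from devito.arch.archinfo import get_nvidia_cc  # F401: imported but unused

    def cpu_pipeline():
        # Never references get_nvidia_cc, so the import above is dead code
        return "builtin.module(canonicalize,cse)"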
---
 devito/core/cpu.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/devito/core/cpu.py b/devito/core/cpu.py
index 8022e4d6ce..a3220ba210 100644
--- a/devito/core/cpu.py
+++ b/devito/core/cpu.py
@@ -32,8 +32,6 @@
 from devito.types import TimeFunction
 from devito.types.mlir_types import ptr_of, f32
 
-from devito.arch.archinfo import get_nvidia_cc
-
 from xdsl.printer import Printer
 

From adfe3cc6ec0d1414ef9324f4368eadd67903caea Mon Sep 17 00:00:00 2001
From: Emilien Bauer
Date: Tue, 19 Dec 2023 15:29:12 +0000
Subject: [PATCH 16/16] Flake pass returns.

---
 fast/wave2d_b.py | 3 ++-
 fast/wave3d_b.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fast/wave2d_b.py b/fast/wave2d_b.py
index ac20870282..7e0857f6c1 100644
--- a/fast/wave2d_b.py
+++ b/fast/wave2d_b.py
@@ -7,9 +7,10 @@
 from devito.tools import as_tuple
 import argparse
 
-np.set_printoptions(threshold=np.inf)
 from fast.bench_utils import plot_2dfunc
 
+np.set_printoptions(threshold=np.inf)
+
 parser = argparse.ArgumentParser(description='Process arguments.')
 
 parser.add_argument("-d", "--shape", default=(16, 16), type=int, nargs="+",

diff --git a/fast/wave3d_b.py b/fast/wave3d_b.py
index aa56e076df..c7631b7e0d 100644
--- a/fast/wave3d_b.py
+++ b/fast/wave3d_b.py
@@ -7,9 +7,10 @@
 from devito.tools import as_tuple
 import argparse
 
-np.set_printoptions(threshold=np.inf)
 from fast.bench_utils import plot_3dfunc
 
+np.set_printoptions(threshold=np.inf)
+
 parser = argparse.ArgumentParser(description='Process arguments.')
 
 parser.add_argument("-d", "--shape", default=(16, 16, 16), type=int, nargs="+",