SIMD/Loop framework upgrade #2937

Merged

27 commits (changes shown from all commits)
02d4cc0
transformed simd iterate to a list of function
AlexandreEichenberger Sep 3, 2024
5d579bd
works with collections of function
AlexandreEichenberger Sep 4, 2024
3338ec9
cleanup
AlexandreEichenberger Sep 4, 2024
6888081
cleanup
AlexandreEichenberger Sep 4, 2024
23c4ea6
initial new version for 2D reduce
AlexandreEichenberger Sep 5, 2024
940cba5
working but not expanded opportunities
AlexandreEichenberger Sep 5, 2024
2d65922
cleanup
AlexandreEichenberger Sep 5, 2024
e380d5d
working without extending to non fullSimd
AlexandreEichenberger Sep 5, 2024
f2456dc
update
AlexandreEichenberger Sep 5, 2024
a51656b
update
AlexandreEichenberger Sep 6, 2024
396e9bd
comments
AlexandreEichenberger Sep 6, 2024
1ca7b33
moved fn defs to top header file
AlexandreEichenberger Sep 6, 2024
135b115
update
AlexandreEichenberger Sep 9, 2024
8db1952
common for / parallel interface
AlexandreEichenberger Sep 9, 2024
ae5d6d2
working
AlexandreEichenberger Sep 10, 2024
0c12c11
upgrade of tests
AlexandreEichenberger Sep 10, 2024
b50bd6a
update
AlexandreEichenberger Sep 10, 2024
3d376cf
format
AlexandreEichenberger Sep 10, 2024
f89c8e7
added comments
AlexandreEichenberger Sep 10, 2024
766d84a
remove {} that are not necessary in load/store anymore
AlexandreEichenberger Sep 10, 2024
03b2543
update
AlexandreEichenberger Sep 10, 2024
9d7be77
flipped debug mode back to normal
AlexandreEichenberger Sep 10, 2024
8f3eb75
format
AlexandreEichenberger Sep 10, 2024
9cdfc55
Merge branch 'main' into simd-framwork-v1
AlexandreEichenberger Sep 11, 2024
3fbd9fe
Merge branch 'main' into simd-framwork-v1
AlexandreEichenberger Sep 13, 2024
7add7d9
Merge branch 'main' into simd-framwork-v1
AlexandreEichenberger Sep 16, 2024
4836f22
Merge branch 'main' into simd-framwork-v1
AlexandreEichenberger Sep 17, 2024
6 changes: 3 additions & 3 deletions src/Accelerators/NNPA/Transform/ZLow/ZLowStickExpansion.cpp
@@ -261,7 +261,7 @@ class UnstickExpansionPattern : public OpRewritePattern<ZLowUnstickOp> {
// Store f32 values back to the (normal layout) output.
DimsExpr outputAF = SymListIE(inputAF);
outputAF[E1] = outputAF[E1] + l;
create.vec.storeIE(vecF32H, alloc, outputAF, {});
create.vec.storeIE(vecF32H, alloc, outputAF);
create.vec.storeIE(
vecF32L, alloc, outputAF, {litArchVLHalf.getValue()});
});
@@ -277,8 +277,8 @@ class UnstickExpansionPattern : public OpRewritePattern<ZLowUnstickOp> {
Value vecF32L = convertOp.getResult(1);
// Save into archVL value buffer.
Value bufferF32 = create.mem.alignedAlloca(bufferType);
create.vec.storeIE(vecF32H, bufferF32, {litZero}, {});
create.vec.storeIE(vecF32L, bufferF32, {litArchVLHalf}, {});
create.vec.storeIE(vecF32H, bufferF32, {litZero});
create.vec.storeIE(vecF32L, bufferF32, {litArchVLHalf});
// Save the remaining values as scalars.
create.scf.forLoop(litZero.getValue(),
remainingScalarValues.getValue(), 1,
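A note on the change above: the builder load/store helpers now default their trailing offset argument, so the explicit empty {} can be dropped. A hedged sketch of the equivalence, reusing the values from the hunk and assuming the VectorBuilder uses the same default as the KrnlBuilder signatures shown later in this diff:

// With offsets defaulting to {}, these two calls are expected to be
// equivalent; the second spells out the old explicit empty offset list.
create.vec.storeIE(vecF32H, alloc, outputAF);
create.vec.storeIE(vecF32H, alloc, outputAF, /*offsets=*/{});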
2 changes: 1 addition & 1 deletion src/Conversion/KrnlToAffine/KrnlCopyFromBuffer.cpp
@@ -124,7 +124,7 @@ class KrnlCopyFromBufferLowering : public ConversionPattern {
// Nothing to write.
} else {
// Loop to copy the data.
createAffine.forLoopIE(zeroIE, writeUBs[i], 1,
createAffine.forLoopIE(zeroIE, writeUBs[i], 1, false /*parallel*/,
[&](AffineBuilderKrnlMem &createAffine, ValueRange loopInd) {
loopIndices.emplace_back(loopInd[0]);
genCopyLoops(createAffine, enclosingScope, buffMemref, destMemref,
9 changes: 4 additions & 5 deletions src/Conversion/ONNXToKrnl/Math/Elementwise.cpp
@@ -1527,8 +1527,7 @@ static LogicalResult getPartiallyFlattenedSimdCode(

create.krnl.simdIterateIE(zero, SymIE(simdUb), VL, simdOnly,
useParallelInSimdLoop, inputs, inputAFs, {output}, {outputAF},
[&](KrnlBuilder &kb, ArrayRef<Value> inputVals,
SmallVectorImpl<Value> &resVals, int64_t VL) {
{[&](const KrnlBuilder &kb, ArrayRef<Value> inputVals, int64_t VL) {
MultiDialectBuilder<MathBuilder> create(kb);
Type currElementType = outputElementType;
if (VL > 1)
@@ -1557,9 +1556,9 @@
res = emitPostProcessingFor<OP_TYPE>(rewriter, create.getLoc(),
op, currElementType, accumulated);
}
resVals.emplace_back(res);
}); // SIMD kernel.
}); // Outer loops.
return res;
}}); // SIMD kernel.
}); // Outer loops.

rewriter.replaceOp(op, alloc);
return success();
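The Elementwise.cpp hunks above change the SIMD kernel callback from an out-parameter style (appending to resVals) to a braced list of body functions, each returning the value for one output. A minimal, hypothetical sketch of the new shape for a single-output kernel, reusing the bounds, flags, and buffers named in the hunk and substituting an illustrative computation:

// Hypothetical single-output kernel using the new list-of-body-functions form:
// each body function gets the loaded inputs (scalars when VL == 1, vectors of
// length VL otherwise) and returns the one value to store for its output.
create.krnl.simdIterateIE(zero, SymIE(simdUb), VL, simdOnly,
    useParallelInSimdLoop, inputs, inputAFs, {output}, {outputAF},
    {[&](const KrnlBuilder &kb, ArrayRef<Value> inputVals, int64_t VL) {
      MultiDialectBuilder<MathBuilder> create(kb);
      Value x = inputVals[0];
      // Illustrative computation only: out = x + x.
      return create.math.add(x, x);
    }});

Returning the value lets simdIterateIE pair each body function with its output memref and access function, so several kernels can share one SIMD loop.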
265 changes: 128 additions & 137 deletions src/Conversion/ONNXToKrnl/Math/Reduction.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/Conversion/ONNXToKrnl/NN/Normalization.cpp
@@ -255,7 +255,7 @@ struct ONNXInstanceNormalizationOpLowering
for (int d = 0; d < rank - 2; ++d)
inputAccessFct.emplace_back(spatial_loopInd[d]);
// tmp += input[n,c, spatial dims]
Value oldSum = create.krnl.load(tmpMemRef, {});
Value oldSum = create.krnl.load(tmpMemRef);
Value val = create.krnl.load(inputMemRef, inputAccessFct);
val = create.math.sub(val, mean);
val = create.math.mul(val, val);
7 changes: 3 additions & 4 deletions src/Conversion/ONNXToKrnl/Quantization/QuantizeLinear.cpp
@@ -70,8 +70,7 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
outputAF.emplace_back(zero);
create.krnl.simdIterateIE(simdLb, simdUb, totVL, simdOnly, enableParallel,
{flatInput}, {inputAF}, {flatAlloc}, {outputAF},
[&](KrnlBuilder &kb, ArrayRef<Value> inputVals,
SmallVectorImpl<Value> &resVals, int64_t VL) {
{[&](const KrnlBuilder &kb, ArrayRef<Value> inputVals, int64_t VL) {
MultiDialectBuilder<MathBuilder> create(kb);
Value x = inputVals[0];
// Scale
@@ -87,8 +86,8 @@ void emitQuantizationLinearScalarParameters(ConversionPatternRewriter &rewriter,
// Saturate
Value saturateX = create.math.clip(adjustX, qMin, qMax);
Value res = create.math.cast(quantizedElementType, saturateX);
resVals.emplace_back(res);
});
return res;
}});
if (totVL > 1)
onnxToKrnlSimdReport(op, /*successful*/ true, totVL,
simdLoopStaticTripCount, "quantizationLinear whole tensor");
45 changes: 32 additions & 13 deletions src/Dialect/Krnl/DialectBuilder.cpp
@@ -222,37 +222,56 @@ KrnlIterateOp KrnlBuilder::iterateIE(ValueRange originalLoops,
});
}

void KrnlBuilder::forLoopIE(IndexExpr lb, IndexExpr ub, int64_t step,
bool useParallel, KrnlLoopBodyFn builderFn) const {
ValueRange originalLoopDef = defineLoops(1);
llvm::SmallVector<Value, 1> optLoopDef(1, originalLoopDef[0]);
if (step > 1) {
// Block loop by step.
ValueRange blockedLoopDef = block(originalLoopDef[0], step);
optLoopDef[0] = blockedLoopDef[0];
}
if (useParallel)
parallel(optLoopDef[0]);
iterateIE(originalLoopDef, optLoopDef, {lb}, {ub}, builderFn);
}

void KrnlBuilder::simdIterateIE(IndexExpr lb, IndexExpr ub, int64_t VL,
bool fullySimd, bool useParallel, ArrayRef<Value> inputs,
ArrayRef<DimsExpr> inputAFs, ArrayRef<Value> outputs,
ArrayRef<DimsExpr> outputAFs,
function_ref<void(KrnlBuilder &b, ArrayRef<Value> inputVals,
llvm::SmallVectorImpl<Value> &resultVals, int64_t VL)>
bodyBuilderFn) const {
ArrayRef<KrnlSimdIterateBodyFn> iterateBodyFnList) const {
onnx_mlir::impl::simdIterateIE<KrnlBuilder, KrnlBuilder>(*this, lb, ub, VL,
fullySimd, useParallel, inputs, inputAFs, outputs, outputAFs,
bodyBuilderFn);
iterateBodyFnList);
}

void KrnlBuilder::simdReduceIE(IndexExpr lb, IndexExpr ub, int64_t VL,
bool fullySimd, ArrayRef<Value> inputs, ArrayRef<DimsExpr> inputAFs,
ArrayRef<Value> tmps, ArrayRef<DimsExpr> tmpAFs, ArrayRef<Value> outputs,
ArrayRef<DimsExpr> outputAFs, ArrayRef<Value> initVals,
/* reduction function (simd or scalar) */
function_ref<void(const KrnlBuilder &b, ArrayRef<Value> inputVals,
ArrayRef<Value> tmpVals, llvm::SmallVectorImpl<Value> &resultVals,
int64_t VL)>
reductionBuilderFn,
ArrayRef<KrnlSimdReductionBodyFn> reductionBodyFnList,
/* post reduction function (simd to scalar + post processing)*/
function_ref<void(const KrnlBuilder &b, ArrayRef<Value> tmpVals,
llvm::SmallVectorImpl<Value> &scalarOutputs, int64_t VL)>
postProcessingBuilderFn) const {
ArrayRef<KrnlSimdPostReductionBodyFn> postReductionBodyFnList) const {
onnx_mlir::impl::simdReduceIE<KrnlBuilder, KrnlBuilder>(*this, lb, ub, VL,
fullySimd, inputs, inputAFs, tmps, tmpAFs, outputs, outputAFs, initVals,
reductionBuilderFn, postProcessingBuilderFn);
reductionBodyFnList, postReductionBodyFnList);
}

void KrnlBuilder::simdReduce2DIE(IndexExpr lb, IndexExpr ub, int64_t VL,
bool fullySimd, Value input, DimsExpr inputAF, Value tmp, DimsExpr tmpAF,
Value output, DimsExpr outputAF, Value initVal,
/* reduction functions (simd or scalar) */
KrnlSimdReductionBodyFn reductionBodyFn,
/* post reduction functions (post processing ONLY)*/
KrnlSimdPostReductionBodyFn postReductionBodyFn) const {
onnx_mlir::impl::simdReduce2DIE<KrnlBuilder, KrnlBuilder>(*this, lb, ub, VL,
fullySimd, input, inputAF, tmp, tmpAF, output, outputAF, initVal,
reductionBodyFn, postReductionBodyFn);
}

void KrnlBuilder::yield(mlir::ValueRange iterArgs) const {
void KrnlBuilder::yield(ValueRange iterArgs) const {
b().create<KrnlYieldOp>(loc(), iterArgs);
}

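The new KrnlBuilder::forLoopIE above wraps the defineLoops / block / parallel / iterateIE sequence behind a single call. A hypothetical usage sketch; the multi-dialect builder create, the bounds lb and ub, and the buffer memref are assumed to exist, and the body is illustrative only:

// Iterate [lb, ub) with the given step, optionally in parallel; the body
// lambda receives the Krnl builder and the loop index.
create.krnl.forLoopIE(lb, ub, /*step=*/1, /*useParallel=*/false,
    [&](KrnlBuilder &ck, ValueRange loopInd) {
      MultiDialectBuilder<KrnlBuilder, MathBuilder> create(ck);
      IndexExpr i = DimIndexExpr(loopInd[0]);
      // Example body: buffer[i] = buffer[i] + buffer[i].
      Value x = create.krnl.loadIE(buffer, {i});
      create.krnl.storeIE(create.math.add(x, x), buffer, {i});
    });

Setting useParallel to true marks the (possibly blocked) loop as parallel, matching the parallel(optLoopDef[0]) call in the implementation above; a step greater than 1 first blocks the loop by that step.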
96 changes: 60 additions & 36 deletions src/Dialect/Krnl/DialectBuilder.hpp
@@ -30,12 +30,12 @@ struct KrnlBuilder : public DialectBuilder {
KrnlBuilder(const DialectBuilder &db) : DialectBuilder(db) {}
virtual ~KrnlBuilder() {}

// Common load/store interface (krnl/affine/memref)
// Add offsets (if any) to the least significant dims.
mlir::Value load(mlir::Value memref, mlir::ValueRange indices = {},
mlir::ValueRange offsets = {}) const;
mlir::Value loadIE(mlir::Value memref, mlir::ArrayRef<IndexExpr> indices = {},
mlir::ValueRange offsets = {}) const;
// Add offsets (if any) to the least significant dims.
void store(mlir::Value val, mlir::Value memref, mlir::ValueRange indices = {},
mlir::ValueRange offsets = {}) const;
void storeIE(mlir::Value val, mlir::Value memref,
@@ -70,11 +70,12 @@ struct KrnlBuilder : public DialectBuilder {
// Iterate over optimized loops given the original loops, lbs and ubs. The
// lambda function implements the body of the loop, and receives a KRNL
// builder and the loop indices.
using KrnlLoopBodyFn =
mlir::function_ref<void(KrnlBuilder &, mlir::ValueRange)>;

void iterate(mlir::ValueRange originalLoops, mlir::ValueRange optimizedLoops,
mlir::ValueRange lbs, mlir::ValueRange ubs,
mlir::function_ref<void(
KrnlBuilder &createKrnl, mlir::ValueRange indices)>
bodyBuilderFn) const;
KrnlLoopBodyFn bodyBuilderFn) const;
mlir::KrnlIterateOp iterate(mlir::ValueRange originalLoops,
mlir::ValueRange optimizedLoops, mlir::ValueRange lbs,
mlir::ValueRange ubs, mlir::ValueRange inits,
@@ -87,31 +88,38 @@ struct KrnlBuilder : public DialectBuilder {
// Same versions with Index Expressions for bounds.
void iterateIE(mlir::ValueRange originalLoops,
mlir::ValueRange optimizedLoops, mlir::ArrayRef<IndexExpr> lbs,
mlir::ArrayRef<IndexExpr> ubs,
mlir::function_ref<void(
KrnlBuilder &createKrnl, mlir::ValueRange indices)>
bodyBuilderFn) const;
mlir::ArrayRef<IndexExpr> ubs, KrnlLoopBodyFn bodyBuilderFn) const;
mlir::KrnlIterateOp iterateIE(mlir::ValueRange originalLoops,
mlir::ValueRange optimizedLoops, mlir::ArrayRef<IndexExpr> lbs,
mlir::ArrayRef<IndexExpr> ubs, mlir::ValueRange inits,
mlir::function_ref<void(KrnlBuilder &createKrnl, mlir::ValueRange indices,
mlir::ValueRange blockIters)>
bodyBuilderFn) const;

// Common loop interface (krnl/affine/scf).
void forLoopIE(IndexExpr lb, IndexExpr ub, int64_t step, bool useParallel,
KrnlLoopBodyFn builderFn) const;

// Common simd loop interface (krnl/affine/scf).
/*
Iterate over a loop executing the loop body in SIMD mode (of vector length
VL) from lb to ub. A scalar loop may execute up to VL-1 loop
iterations when the trip count is not a multiple of VL. If fullySimd is
true, then the call assumes that the trip count is a multiple of VL.

This call needs to be given each of the memref inputs to the loop body,
given as an ordered pair: the memref value and its corresponding access
function. The same holds for all the memref outputs of the loop body.
This simdIterateIE needs to be given each of the memref inputs to the loop
body, given as an ordered pair: the memref value and its corresponding
access function. The same holds for all the memref outputs of the loop body.

The loop body is constructed by calling each of the KrnlSimdIterateBodyFn
functions given in the list. Each function is responsible for returning one
output value. The returned values are eventually stored in the output
memrefs at locations given by their respective output access functions.

The loop body is given a KRNL builder and a list of loaded inputs (in the
same order as the input memrefs and access functions). It will generate
values that must be placed in the result list in the same order as the
output memrefs and access functions.
To generate its output, each KrnlSimdIterateBodyFn function is given a KRNL
builder, a list of loaded inputs (in the same order as the input memrefs and
access functions), and the current vector length (VL). VL is either the
original VL or 1 (when executing in scalar mode).

It will be the responsibility of this call to load each of the inputs and
store each of the outputs. When operating in SIMD mode, every input and
@@ -129,45 +137,61 @@ struct KrnlBuilder : public DialectBuilder {
Dialect/Mlir/DialectBuilder.hpp.inc.
*/

using KrnlSimdIterateBodyFn = impl::SimdIterateBodyFn<KrnlBuilder>;
void simdIterateIE(IndexExpr lb, IndexExpr ub, int64_t VL, bool fullySimd,
bool useParallel, mlir::ArrayRef<mlir::Value> inputs,
mlir::ArrayRef<DimsExpr> inputAFs, mlir::ArrayRef<mlir::Value> outputs,
mlir::ArrayRef<DimsExpr> outputAFs,
mlir::function_ref<void(KrnlBuilder &b,
mlir::ArrayRef<mlir::Value> inputVals,
llvm::SmallVectorImpl<mlir::Value> &resultVals, int64_t VL)>
bodyBuilderFn) const;
mlir::ArrayRef<KrnlSimdIterateBodyFn> bodyBuilderFnList) const;

/*
Works similarly to simdIterateIE, but performs a reduction to a single
scalar per output value. Inputs must be strided in their innermost
dimensions. Temps are used to hold the temporary results (partial results
per SIMD lane), and the outputs hold the scalar reduction outputs.
Two functions are given: reductionBuilderFn performs the partial
reductions into the temporary values tmps, finishing with up to VL partial
reductions.
The second function, postProcessingBuilderFn, performs the reduction of the
up to VL partial reductions into a final scalar reduction to be stored into
the outputs (a scalar value). For some reductions, post processing is also
needed; for example, a mean reduction divides the accumulated sum by the
number of elements. That step is also performed here.

Two function lists are given: a list of reductionBodyFn to perform the
partial reductions into the temporary values tmps, finishing with up to VL
partial reductions. The second list of postReductionBodyFn performs the
reduction of the up to VL partial reductions into a final scalar reduction
to be stored into the outputs (a scalar value). For some reductions, post
processing is also needed; for example, a mean reduction divides the
accumulated sum by the number of elements. That step is also performed
here.
*/
using KrnlSimdReductionBodyFn = impl::SimdReductionBodyFn<KrnlBuilder>;
using KrnlSimdPostReductionBodyFn =
impl::SimdPostReductionBodyFn<KrnlBuilder>;

void simdReduceIE(IndexExpr lb, IndexExpr ub, int64_t VL, bool fullySimd,
mlir::ArrayRef<mlir::Value> inputs, mlir::ArrayRef<DimsExpr> inputAFs,
mlir::ArrayRef<mlir::Value> tmps, mlir::ArrayRef<DimsExpr> tmpAFs,
mlir::ArrayRef<mlir::Value> outputs, mlir::ArrayRef<DimsExpr> outputAFs,
mlir::ArrayRef<mlir::Value> initVals,
/* reduction function (simd or scalar) */
mlir::function_ref<void(const KrnlBuilder &b,
mlir::ArrayRef<mlir::Value> inputVals,
mlir::ArrayRef<mlir::Value> tmpVals,
llvm::SmallVectorImpl<mlir::Value> &resultVals, int64_t VL)>
reductionBuilderFn,
mlir::ArrayRef<KrnlSimdReductionBodyFn> reductionBodyFnList,
/* post reduction function (simd to scalar + post processing)*/
mlir::function_ref<void(const KrnlBuilder &b,
mlir::ArrayRef<mlir::Value> tmpVals,
llvm::SmallVectorImpl<mlir::Value> &scalarOutputs, int64_t VL)>
postProcessingBuilderFn) const;
mlir::ArrayRef<KrnlSimdPostReductionBodyFn> postReductionBodyFnList)
const;

/*
Same as simdReduceIE, but performs VL reductions at once. It expects at
least VL iterations in the second-to-last dimension of the inputs/outputs.

Unlike simdReduceIE, the second function is for post processing only. In
simdReduceIE, that function was also used to reduce the SIMD temporary
reduction into a single scalar.

Also, at this time, simdReduce2DIE processes only one reduction at a time,
whereas simdReduceIE can process an arbitrary number of reductions.
*/
void simdReduce2DIE(IndexExpr lb, IndexExpr ub, int64_t VL, bool fullySimd,
mlir::Value input, DimsExpr inputAF, mlir::Value tmp, DimsExpr tmpAF,
mlir::Value output, DimsExpr outputAF, mlir::Value initVal,
/* reduction functions (simd or scalar) */
KrnlSimdReductionBodyFn reductionBodyFn,
/* post reduction functions (post processing ONLY)*/
KrnlSimdPostReductionBodyFn postReductionBodyFn) const;

void yield(mlir::ValueRange iterArgs) const;

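To make the two-phase structure described in the simdReduceIE comment concrete, here is a small plain-C++ model of what the generated code conceptually does for a single mean reduction with vector length VL. This is not the builder API and not the code the framework emits; it only mirrors the roles of the reduction body (partial sums per SIMD lane), the scalar remainder, and the post-reduction step (folding the VL partials and applying the post processing):

#include <array>
#include <cstddef>
#include <vector>

// Conceptual model of a simdReduceIE-style mean reduction (assumes a
// non-empty input). tmp plays the role of the VL temporary partial results.
template <std::size_t VL>
double meanReduction(const std::vector<double> &input) {
  std::array<double, VL> tmp{}; // initVals: 0.0 for a sum-based reduction
  const std::size_t n = input.size();
  const std::size_t simdUb = n - (n % VL); // trip count rounded down to VL
  // Reduction body, SIMD flavor: accumulate VL elements per iteration.
  for (std::size_t i = 0; i < simdUb; i += VL)
    for (std::size_t l = 0; l < VL; ++l)
      tmp[l] += input[i + l];
  // Reduction body, scalar flavor: up to VL-1 leftover iterations, needed
  // only when the trip count is not a multiple of VL (fullySimd == false).
  for (std::size_t i = simdUb; i < n; ++i)
    tmp[0] += input[i];
  // Post-reduction body: fold the VL partial sums into one scalar, then apply
  // the post processing (for a mean: divide by the number of elements).
  double sum = 0.0;
  for (std::size_t l = 0; l < VL; ++l)
    sum += tmp[l];
  return sum / static_cast<double>(n);
}

// Example: meanReduction<4>(std::vector<double>{1, 2, 3, 4, 5}) returns 3.0.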