From 8ea914ddda39d7158c1d2797819abe66a01e0e96 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy@amd.com>
Date: Fri, 26 Jul 2024 23:36:45 -0500
Subject: [PATCH 1/3] [flang][OpenMP] Implement more robust loop-nest detection
 logic

The previous loop-nest detection algorithm fell short, in some cases, to
detect whether a pair of `do concurrent` loops are perfectly nested or
not. This is a re-implementation using forward and backward slice
extraction algorithms to compare the set of ops required to setup the
inner loop bounds vs. the set of ops nested in the outer loop other
thatn the nested loop itself.
---
 .../OpenMP/DoConcurrentConversion.cpp         | 130 +++++++++++-------
 .../DoConcurrent/loop_nest_test.f90           |  89 ++++++++++++
 2 files changed, 167 insertions(+), 52 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/loop_nest_test.f90

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index a929b0bdd16943..75a0c4d52f1d00 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -36,7 +36,8 @@ namespace flangomp {
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 } // namespace flangomp
 
-#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
+#define DEBUG_TYPE "do-concurrent-conversion"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
 
 namespace Fortran {
 namespace lower {
@@ -359,6 +360,81 @@ void collectIndirectConstOpChain(mlir::Operation *link,
   opChain.insert(link);
 }
 
+/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
+/// there are no operations in \p outerloop's other than:
+///
+/// 1. those operations needed to setup \p innerLoop's LB, UB, and step values,
+/// 2. the operations needed to assing/update \p outerLoop's induction variable.
+/// 3. \p innerLoop itself.
+///
+/// \p return true if \p innerLoop is perfectly nested inside \p outerLoop
+/// according to the above definition.
+bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
+  mlir::BackwardSliceOptions backwardSliceOptions;
+  backwardSliceOptions.inclusive = true;
+  // We will collect the backward slices for innerLoop's LB, UB, and step.
+  // However, we want to limit the scope of these slices to the scope of
+  // outerLoop's region.
+  backwardSliceOptions.filter = [&](mlir::Operation *op) {
+    return !mlir::areValuesDefinedAbove(op->getResults(),
+                                        outerLoop.getRegion());
+  };
+
+  llvm::SetVector<mlir::Operation *> lbSlice;
+  mlir::getBackwardSlice(innerLoop.getLowerBound(), &lbSlice,
+                         backwardSliceOptions);
+
+  llvm::SetVector<mlir::Operation *> ubSlice;
+  mlir::getBackwardSlice(innerLoop.getUpperBound(), &ubSlice,
+                         backwardSliceOptions);
+
+  llvm::SetVector<mlir::Operation *> stepSlice;
+  mlir::getBackwardSlice(innerLoop.getStep(), &stepSlice, backwardSliceOptions);
+
+  mlir::ForwardSliceOptions forwardSliceOptions;
+  forwardSliceOptions.inclusive = true;
+  // We don't care of the outer loop's induction variable's uses within the
+  // inner loop, so we filter out these uses.
+  forwardSliceOptions.filter = [&](mlir::Operation *op) {
+    return mlir::areValuesDefinedAbove(op->getResults(), innerLoop.getRegion());
+  };
+
+  llvm::SetVector<mlir::Operation *> indVarSlice;
+  mlir::getForwardSlice(outerLoop.getInductionVar(), &indVarSlice,
+                        forwardSliceOptions);
+
+  llvm::SetVector<mlir::Operation *> innerLoopSetupOpsVec;
+  innerLoopSetupOpsVec.set_union(indVarSlice);
+  innerLoopSetupOpsVec.set_union(lbSlice);
+  innerLoopSetupOpsVec.set_union(ubSlice);
+  innerLoopSetupOpsVec.set_union(stepSlice);
+  llvm::DenseSet<mlir::Operation *> innerLoopSetupOpsSet;
+
+  for (mlir::Operation *op : innerLoopSetupOpsVec)
+    innerLoopSetupOpsSet.insert(op);
+
+  llvm::DenseSet<mlir::Operation *> loopBodySet;
+  outerLoop.walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *op) {
+    if (op == outerLoop)
+      return mlir::WalkResult::advance();
+
+    if (op == innerLoop)
+      return mlir::WalkResult::skip();
+
+    if (op->hasTrait<mlir::OpTrait::IsTerminator>())
+      return mlir::WalkResult::advance();
+
+    loopBodySet.insert(op);
+    return mlir::WalkResult::advance();
+  });
+
+  bool result = (loopBodySet == innerLoopSetupOpsSet);
+  LLVM_DEBUG(DBGS() << "Loop pair starting at location " << outerLoop.getLoc()
+                    << " is" << (result ? "" : " not")
+                    << " perfectly nested\n");
+  return result;
+}
+
 /// Starting with `outerLoop` collect a perfectly nested loop nest, if any. This
 /// function collects as much as possible loops in the nest; it case it fails to
 /// recognize a certain nested loop as part of the nest it just returns the
@@ -399,57 +475,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,
     llvm::SmallVector<mlir::Value> nestedLiveIns;
     collectLoopLiveIns(nestedUnorderedLoop, nestedLiveIns);
 
-    llvm::DenseSet<mlir::Value> outerLiveInsSet;
-    llvm::DenseSet<mlir::Value> nestedLiveInsSet;
-
-    // Returns a "unified" view of an mlir::Value. This utility checks if the
-    // value is defined by an op, and if so, return the first value defined by
-    // that op (if there are many), otherwise just returns the value.
-    //
-    // This serves the purpose that if, for example, `%op_res#0` is used in the
-    // outer loop and `%op_res#1` is used in the nested loop (or vice versa),
-    // that we detect both as the same value. If we did not do so, we might
-    // falesely detect that the 2 loops are not perfectly nested since they use
-    // "different" sets of values.
-    auto getUnifiedLiveInView = [](mlir::Value liveIn) {
-      return liveIn.getDefiningOp() != nullptr
-                 ? liveIn.getDefiningOp()->getResult(0)
-                 : liveIn;
-    };
-
-    // Re-package both lists of live-ins into sets so that we can use set
-    // equality to compare the values used in the outerloop vs. the nestd one.
-
-    for (auto liveIn : nestedLiveIns)
-      nestedLiveInsSet.insert(getUnifiedLiveInView(liveIn));
-
-    mlir::Value outerLoopIV;
-    for (auto liveIn : outerLoopLiveIns) {
-      outerLiveInsSet.insert(getUnifiedLiveInView(liveIn));
-
-      // Keep track of the IV of the outerloop. See `isPerfectlyNested` for more
-      // info on the reason.
-      if (outerLoopIV == nullptr)
-        outerLoopIV = getUnifiedLiveInView(liveIn);
-    }
-
-    // For the 2 loops to be perfectly nested, either:
-    // * both would have exactly the same set of live-in values or,
-    // * the outer loop would have exactly 1 extra live-in value: the outer
-    //   loop's induction variable; this happens when the outer loop's IV is
-    //   *not* referenced in the nested loop.
-    bool isPerfectlyNested = [&]() {
-      if (outerLiveInsSet == nestedLiveInsSet)
-        return true;
-
-      if ((outerLiveInsSet.size() == nestedLiveIns.size() + 1) &&
-          !nestedLiveInsSet.contains(outerLoopIV))
-        return true;
-
-      return false;
-    }();
-
-    if (!isPerfectlyNested)
+    if (!isPerfectlyNested(outerLoop, nestedUnorderedLoop))
       return mlir::failure();
 
     outerLoop = nestedUnorderedLoop;
diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
new file mode 100644
index 00000000000000..a0463ea6fdb601
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90
@@ -0,0 +1,89 @@
+! Tests loop-nest detection algorithm for do-concurrent mapping.
+
+! REQUIRES: asserts
+
+! RUN: %flang_fc1 -emit-hlfir  -fopenmp -fdo-concurrent-parallel=host \
+! RUN:   -mmlir -debug %s -o - 2> %t.log || true
+
+! RUN: FileCheck %s < %t.log
+
+program main
+  implicit none
+
+contains
+
+subroutine foo(n)
+  implicit none
+  integer :: n, m
+  integer :: i, j, k
+  integer :: x
+  integer, dimension(n) :: a
+  integer, dimension(n, n, n) :: b
+
+  ! NOTE This for sure is a perfect loop nest. However, the way `do-concurrent`
+  ! loops are now emitted by flang is probably not correct. This is being looked
+  ! into at the moment and once we have flang emitting proper loop headers, we
+  ! will revisit this.
+  !
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is not perfectly nested
+  do concurrent(i=1:n, j=1:bar(n*m, n/m))
+    a(i) = n
+  end do
+
+  ! NOTE same as above.
+  !
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is not perfectly nested
+  do concurrent(i=bar(n, x):n, j=1:bar(n*m, n/m))
+    a(i) = n
+  end do
+
+  ! NOTE This is **not** a perfect nest since the inner call to `bar` will allocate
+  ! memory for the temp results of `n*m` and `n/m` **inside** the outer loop.
+  !
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is not perfectly nested
+  do concurrent(i=bar(n, x):n)
+    do concurrent(j=1:bar(n*m, n/m))
+      a(i) = n
+    end do
+  end do
+
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is not perfectly nested
+  do concurrent(i=1:n)
+    x = 10
+    do concurrent(j=1:m)
+      b(i,j,k) = i * j + k
+    end do
+  end do
+
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is not perfectly nested
+  do concurrent(i=1:n)
+    do concurrent(j=1:m)
+      b(i,j,k) = i * j + k
+    end do
+    x = 10
+  end do
+
+  ! CHECK:      Loop pair starting at location loc("{{.*}}":[[# @LINE + 2]]:{{.*}})
+  ! CHECK-SAME: is perfectly nested
+  do concurrent(i=1:n)
+    do concurrent(j=1:m)
+      b(i,j,k) = i * j + k
+      x = 10
+    end do
+  end do
+end subroutine
+
+pure function bar(n, m)
+    implicit none
+    integer, intent(in) :: n, m
+    integer :: bar
+
+    bar = n + m
+end function
+
+end program main

From 3e674f5e9a4a5f75f29cd0e5a45eb89f1d4f9c16 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy@amd.com>
Date: Tue, 20 Aug 2024 08:40:17 -0500
Subject: [PATCH 2/3] [flang][OpenMP][DoConcurrent] Map loop bounds/step to the
 target region.

---
 .../OpenMP/DoConcurrentConversion.cpp         | 262 ++++++++++++------
 .../Transforms/DoConcurrent/basic_device.f90  |  35 ++-
 .../multiple_iteration_ranges.f90             |  76 +++--
 .../DoConcurrent/not_perfectly_nested.f90     |   9 +-
 .../DoConcurrent/runtime_sized_array.f90      |   5 +-
 .../DoConcurrent/skip_all_nested_loops.f90    |   8 +-
 6 files changed, 275 insertions(+), 120 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 75a0c4d52f1d00..9e86ab2f9de068 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -24,11 +24,13 @@
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 
 #include <algorithm>
 #include <memory>
+#include <string>
 #include <utility>
 
 namespace flangomp {
@@ -249,29 +251,59 @@ bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
   return false;
 };
 
+mlir::Value findLoopIndVar(fir::DoLoopOp doLoop) {
+  mlir::Value result = nullptr;
+  mlir::visitUsedValuesDefinedAbove(
+      doLoop.getRegion(), [&](mlir::OpOperand *operand) {
+        if (isIndVarUltimateOperand(operand->getOwner(), doLoop))
+          result = operand->get();
+      });
+
+  assert(result != nullptr);
+  return result;
+}
+
 /// Collect the list of values used inside the loop but defined outside of it.
 /// The first item in the returned list is always the loop's induction
 /// variable.
-void collectLoopLiveIns(fir::DoLoopOp doLoop,
-                        llvm::SmallVectorImpl<mlir::Value> &liveIns) {
+void collectLoopNestLiveIns(
+    LoopNestToIndVarMap &loopNest, llvm::SmallVectorImpl<mlir::Value> &liveIns,
+    llvm::DenseMap<mlir::Value, std::string> *liveInToName = nullptr) {
   llvm::SmallDenseSet<mlir::Value> seenValues;
   llvm::SmallDenseSet<mlir::Operation *> seenOps;
 
-  mlir::visitUsedValuesDefinedAbove(
-      doLoop.getRegion(), [&](mlir::OpOperand *operand) {
-        if (!seenValues.insert(operand->get()).second)
-          return;
+  auto addValueToLiveIns = [&](mlir::Value liveIn) {
+    if (!seenValues.insert(liveIn).second)
+      return false;
 
-        mlir::Operation *definingOp = operand->get().getDefiningOp();
-        // We want to collect ops corresponding to live-ins only once.
-        if (definingOp && !seenOps.insert(definingOp).second)
-          return;
+    mlir::Operation *definingOp = liveIn.getDefiningOp();
+    // We want to collect ops corresponding to live-ins only once.
+    if (definingOp && !seenOps.insert(definingOp).second)
+      return false;
 
-        liveIns.push_back(operand->get());
+    liveIns.push_back(liveIn);
+    return true;
+  };
 
-        if (isIndVarUltimateOperand(operand->getOwner(), doLoop))
-          std::swap(*liveIns.begin(), *liveIns.rbegin());
-      });
+  size_t nestLevel = 0;
+  for (auto [loop, _] : loopNest) {
+    auto addBoundOrStepToLiveIns = [&](mlir::Value operand, std::string name) {
+      (*liveInToName)[operand] = name;
+      addValueToLiveIns(operand);
+    };
+
+    addBoundOrStepToLiveIns(loop.getLowerBound(),
+                            "loop." + std::to_string(nestLevel) + ".lb");
+    addBoundOrStepToLiveIns(loop.getUpperBound(),
+                            "loop." + std::to_string(nestLevel) + ".ub");
+    addBoundOrStepToLiveIns(loop.getStep(),
+                            "loop." + std::to_string(nestLevel) + ".step");
+    ++nestLevel;
+  }
+
+  mlir::visitUsedValuesDefinedAbove(
+      loopNest.front().first.getRegion(),
+      [&](mlir::OpOperand *operand) { addValueToLiveIns(operand->get()); });
 }
 
 /// Collects the op(s) responsible for updating a loop's iteration variable with
@@ -439,20 +471,17 @@ bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
 /// function collects as much as possible loops in the nest; it case it fails to
 /// recognize a certain nested loop as part of the nest it just returns the
 /// parent loops it discovered before.
-mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,
+mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
                                     LoopNestToIndVarMap &loopNest) {
-  assert(outerLoop.getUnordered());
-  llvm::SmallVector<mlir::Value> outerLoopLiveIns;
-  collectLoopLiveIns(outerLoop, outerLoopLiveIns);
-
+  assert(currentLoop.getUnordered());
   while (true) {
     loopNest.try_emplace(
-        outerLoop,
+        currentLoop,
         InductionVariableInfo{
-            outerLoopLiveIns.front().getDefiningOp(),
-            std::move(looputils::extractIndVarUpdateOps(outerLoop))});
+            findLoopIndVar(currentLoop).getDefiningOp(),
+            std::move(looputils::extractIndVarUpdateOps(currentLoop))});
 
-    auto directlyNestedLoops = outerLoop.getRegion().getOps<fir::DoLoopOp>();
+    auto directlyNestedLoops = currentLoop.getRegion().getOps<fir::DoLoopOp>();
     llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
 
     for (auto nestedLoop : directlyNestedLoops)
@@ -472,14 +501,10 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp outerLoop,
         (nestedUnorderedLoop.getStep().getDefiningOp() == nullptr))
       return mlir::failure();
 
-    llvm::SmallVector<mlir::Value> nestedLiveIns;
-    collectLoopLiveIns(nestedUnorderedLoop, nestedLiveIns);
-
-    if (!isPerfectlyNested(outerLoop, nestedUnorderedLoop))
+    if (!isPerfectlyNested(currentLoop, nestedUnorderedLoop))
       return mlir::failure();
 
-    outerLoop = nestedUnorderedLoop;
-    outerLoopLiveIns = std::move(nestedLiveIns);
+    currentLoop = nestedUnorderedLoop;
   }
 
   return mlir::success();
@@ -653,10 +678,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
                   "defining operation.");
     }
 
-    llvm::SmallVector<mlir::Value> outermostLoopLiveIns;
-    looputils::collectLoopLiveIns(doLoop, outermostLoopLiveIns);
-    assert(!outermostLoopLiveIns.empty());
-
     looputils::LoopNestToIndVarMap loopNest;
     bool hasRemainingNestedLoops =
         failed(looputils::collectLoopNest(doLoop, loopNest));
@@ -665,15 +686,57 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
                         "Some `do concurent` loops are not perfectly-nested. "
                         "These will be serialzied.");
 
+    llvm::DenseMap<mlir::Value, std::string> liveInToName;
+    llvm::SmallVector<mlir::Value> loopNestLiveIns;
+
+    // TODO outline into a separete function. This hoists the ops to compute
+    // bounds of all loops in the entire loop nest outside the outermost loop.
+    // Without this hoisting, values/variables that are required to compute
+    // these bounds will be stuck inside the original `fir.do_loop` ops and
+    // therefore their SSA values won't be visible from within the `target`
+    // region.
+    {
+      fir::DoLoopOp outermostLoop = loopNest.front().first;
+
+      mlir::BackwardSliceOptions backwardSliceOptions;
+      backwardSliceOptions.inclusive = true;
+      // We will collect the backward slices for innerLoop's LB, UB, and step.
+      // However, we want to limit the scope of these slices to the scope of
+      // outerLoop's region.
+      backwardSliceOptions.filter = [&](mlir::Operation *op) {
+        return !mlir::areValuesDefinedAbove(op->getResults(),
+                                            outermostLoop.getRegion());
+      };
+
+      for (auto [loop, _] : loopNest) {
+        auto moveBoundOrStepOutOfLoopNest = [&](mlir::Value operand) {
+          llvm::SetVector<mlir::Operation *> loopOperandSlice;
+          mlir::getBackwardSlice(operand, &loopOperandSlice,
+                                 backwardSliceOptions);
+
+          for (mlir::Operation *sliceOp : loopOperandSlice) {
+            outermostLoop.moveOutOfLoop(sliceOp);
+          }
+        };
+
+        moveBoundOrStepOutOfLoopNest(loop.getLowerBound());
+        moveBoundOrStepOutOfLoopNest(loop.getUpperBound());
+        moveBoundOrStepOutOfLoopNest(loop.getStep());
+      }
+    }
+
+    looputils::collectLoopNestLiveIns(loopNest, loopNestLiveIns, &liveInToName);
+    assert(!loopNestLiveIns.empty());
+
     llvm::SetVector<mlir::Value> locals;
     looputils::collectLoopLocalValues(loopNest.back().first, locals);
     // We do not want to map "loop-local" values to the device through
     // `omp.map.info` ops. Therefore, we remove them from the list of live-ins.
-    outermostLoopLiveIns.erase(llvm::remove_if(outermostLoopLiveIns,
-                                               [&](mlir::Value liveIn) {
-                                                 return locals.contains(liveIn);
-                                               }),
-                               outermostLoopLiveIns.end());
+    loopNestLiveIns.erase(llvm::remove_if(loopNestLiveIns,
+                                          [&](mlir::Value liveIn) {
+                                            return locals.contains(liveIn);
+                                          }),
+                          loopNestLiveIns.end());
 
     looputils::sinkLoopIVArgs(rewriter, loopNest);
 
@@ -688,12 +751,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       // The outermost loop will contain all the live-in values in all nested
       // loops since live-in values are collected recursively for all nested
       // ops.
-      for (mlir::Value liveIn : outermostLoopLiveIns)
+      for (mlir::Value liveIn : loopNestLiveIns) {
         targetClauseOps.mapVars.push_back(
-            genMapInfoOpForLiveIn(rewriter, liveIn));
+            genMapInfoOpForLiveIn(rewriter, liveIn, liveInToName));
+      }
 
-      targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper,
-                             outermostLoopLiveIns, targetClauseOps);
+      targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
+                             targetClauseOps);
       genTeamsOp(doLoop.getLoc(), rewriter);
     }
 
@@ -746,14 +810,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
 
 private:
   void genBoundsOps(mlir::ConversionPatternRewriter &rewriter,
-                    mlir::Location loc, hlfir::DeclareOp declareOp,
+                    mlir::Location loc, mlir::Value shape,
                     llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
-    if (declareOp.getShape() == nullptr) {
+    if (shape == nullptr) {
       return;
     }
 
-    auto shapeOp = mlir::dyn_cast_if_present<fir::ShapeOp>(
-        declareOp.getShape().getDefiningOp());
+    auto shapeOp =
+        mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
 
     if (shapeOp == nullptr)
       TODO(loc, "Shapes not defined by shape op's are not supported yet.");
@@ -778,15 +842,36 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       boundsOps.push_back(genBoundsOp(extent));
   }
 
-  mlir::omp::MapInfoOp
-  genMapInfoOpForLiveIn(mlir::ConversionPatternRewriter &rewriter,
-                        mlir::Value liveIn) const {
-    auto declareOp =
-        mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveIn.getDefiningOp());
+  mlir::omp::MapInfoOp genMapInfoOpForLiveIn(
+      mlir::ConversionPatternRewriter &rewriter, mlir::Value liveIn,
+      const llvm::DenseMap<mlir::Value, std::string> &liveInToName) const {
+    mlir::Value rawAddr = liveIn;
+    mlir::Value shape = nullptr;
+    std::string name = "";
 
-    if (declareOp == nullptr)
-      TODO(liveIn.getLoc(),
-           "Values not defined by declare op's are not supported yet.");
+    mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
+    auto declareOp =
+        mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
+
+    if (declareOp != nullptr) {
+      // Use the raw address to avoid unboxing `fir.box` values whenever
+      // possible. Put differently, if we have access to the direct value memory
+      // reference/address, we use it.
+      rawAddr = declareOp.getOriginalBase();
+      shape = declareOp.getShape();
+      name = declareOp.getUniqName().str();
+    } else if (liveInToName.contains(liveIn))
+      name = liveInToName.at(liveIn);
+
+    if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
+      fir::FirOpBuilder builder(
+          rewriter, fir::getKindMapping(
+                        liveInDefiningOp->getParentOfType<mlir::ModuleOp>()));
+      builder.setInsertionPointAfter(liveInDefiningOp);
+      auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
+      builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
+      rawAddr = copyVal;
+    }
 
     mlir::Type liveInType = liveIn.getType();
     mlir::Type eleType = liveInType;
@@ -806,15 +891,11 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     }
 
     llvm::SmallVector<mlir::Value> boundsOps;
-    genBoundsOps(rewriter, liveIn.getLoc(), declareOp, boundsOps);
+    genBoundsOps(rewriter, liveIn.getLoc(), shape, boundsOps);
 
-    // Use the raw address to avoid unboxing `fir.box` values whenever possible.
-    // Put differently, if we have access to the direct value memory
-    // reference/address, we use it.
-    mlir::Value rawAddr = declareOp.getOriginalBase();
     return Fortran::lower::omp ::internal::createMapInfoOp(
         rewriter, liveIn.getLoc(), rawAddr,
-        /*varPtrPtr=*/{}, declareOp.getUniqName().str(), boundsOps,
+        /*varPtrPtr=*/{}, name, boundsOps,
         /*members=*/{},
         /*membersIndex=*/mlir::DenseIntElementsAttr{},
         static_cast<
@@ -835,34 +916,40 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     llvm::SmallVector<mlir::Type> liveInTypes;
     llvm::SmallVector<mlir::Location> liveInLocs;
 
-    for (mlir::Value liveIn : liveIns) {
-      liveInTypes.push_back(liveIn.getType());
-      liveInLocs.push_back(liveIn.getLoc());
+    for (mlir::Value mapInfoOp : clauseOps.mapVars) {
+      auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
+      liveInTypes.push_back(miOp.getVarPtr().getType());
+      liveInLocs.push_back(miOp.getVarPtr().getLoc());
     }
 
     rewriter.createBlock(&region, {}, liveInTypes, liveInLocs);
+    fir::FirOpBuilder firBuilder(
+        rewriter,
+        fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
 
-    for (auto [arg, mapInfoOp] :
-         llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) {
+    for (auto [liveIn, arg, mapInfoOp] :
+         llvm::zip_equal(liveIns, region.getArguments(), clauseOps.mapVars)) {
       auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
       hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp);
-      mlir::Value miOperand = miOp.getVariableOperand(0);
 
-      // TODO If `miOperand.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
+      // TODO If `liveIn.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
       // need to "unpack" the box by getting the defining op of it's value.
       // However, we did not hit this case in reality yet so leaving it as a
       // todo for now.
 
-      mapper.map(miOperand, liveInDeclare.getOriginalBase());
+      if (!llvm::isa<mlir::omp::PointerLikeType>(liveIn.getType()))
+        mapper.map(liveIn,
+                   firBuilder.loadIfRef(liveIn.getLoc(),
+                                        liveInDeclare.getOriginalBase()));
+      else
+        mapper.map(liveIn, liveInDeclare.getOriginalBase());
 
       if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
-              miOperand.getDefiningOp()))
+              liveIn.getDefiningOp())) {
         mapper.map(origDeclareOp.getBase(), liveInDeclare.getBase());
+      }
     }
 
-    fir::FirOpBuilder firBuilder(
-        rewriter,
-        fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
     Fortran::lower::omp::internal::cloneOrMapRegionOutsiders(firBuilder,
                                                              targetOp);
     rewriter.setInsertionPoint(
@@ -943,24 +1030,31 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       looputils::collectIndirectConstOpChain(operation, opChain);
 
       mlir::Operation *result;
-      for (mlir::Operation *link : opChain)
+      for (mlir::Operation *link : opChain) {
         result = rewriter.clone(*link, mapper);
+      }
 
       return result;
     };
 
     for (auto &[doLoop, _] : loopNest) {
-      mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp();
-      loopNestClauseOps.loopLowerBounds.push_back(
-          cloneBoundOrStepOpChain(lbOp)->getResult(0));
-
-      mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp();
-      loopNestClauseOps.loopUpperBounds.push_back(
-          cloneBoundOrStepOpChain(ubOp)->getResult(0));
-
-      mlir::Operation *stepOp = doLoop.getStep().getDefiningOp();
-      loopNestClauseOps.loopSteps.push_back(
-          cloneBoundOrStepOpChain(stepOp)->getResult(0));
+      auto addBoundsOrStep =
+          [&](mlir::Value value,
+              llvm::SmallVectorImpl<mlir::Value> &boundsOrStepVec) {
+            if (mapper.contains(value))
+              boundsOrStepVec.push_back(mapper.lookup(value));
+            else {
+              mlir::Operation *definingOp = value.getDefiningOp();
+              boundsOrStepVec.push_back(
+                  cloneBoundOrStepOpChain(definingOp)->getResult(0));
+            }
+          };
+
+      addBoundsOrStep(doLoop.getLowerBound(),
+                      loopNestClauseOps.loopLowerBounds);
+      addBoundsOrStep(doLoop.getUpperBound(),
+                      loopNestClauseOps.loopUpperBounds);
+      addBoundsOrStep(doLoop.getStep(), loopNestClauseOps.loopSteps);
     }
 
     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90
index 2f762c003ddf16..a8b70e67839975 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_device.f90
@@ -22,6 +22,11 @@ program do_concurrent_basic
     ! CHECK-NOT: fir.do_loop
 
     ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1
+
+    ! CHECK-DAG: %[[LB_MAP_INFO:.*]] = omp.map.info {{.*}} !fir.ref<index> {name = "loop.0.lb"}
+    ! CHECK-DAG: %[[UB_MAP_INFO:.*]] = omp.map.info {{.*}} !fir.ref<index> {name = "loop.0.ub"}
+    ! CHECK-DAG: %[[STEP_MAP_INFO:.*]] = omp.map.info {{.*}} !fir.ref<index> {name = "loop.0.step"}
+
     ! CHECK: %[[C0:.*]] = arith.constant 0 : index
     ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index
 
@@ -33,31 +38,40 @@ program do_concurrent_basic
     ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]])
 
     ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64
-
     ! CHECK: omp.target
-    ! CHECK-SAME: map_entries(%[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
+    ! CHECK-SAME: map_entries(%[[LB_MAP_INFO]] -> %[[LB_ARG:.[[:alnum:]]+]],
+    ! CHECK-SAME:             %[[UB_MAP_INFO]] -> %[[UB_ARG:.[[:alnum:]]+]],
+    ! CHECK-SAME:             %[[STEP_MAP_INFO]] -> %[[STEP_ARG:.[[:alnum:]]+]],
+    ! CHECK-SAME:             %[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
     ! CHECK-SAME:             %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]]
     ! CHECK-SAME: trip_count(%[[TRIP_COUNT]] : i64)
 
-    ! CHECK-NEXT: ^{{.*}}(%[[I_ARG]]: !fir.ref<i32>, %[[A_ARG]]: !fir.ref<!fir.array<10xi32>>):
+    ! CHECK-NEXT: ^{{.*}}(%[[LB_ARG]]: !fir.ref<index>,
+    ! CHECK-SAME:         %[[UB_ARG]]: !fir.ref<index>, %[[STEP_ARG]]: !fir.ref<index>,
+    ! CHECK-SAME:         %[[I_ARG]]: !fir.ref<i32>,
+    ! CHECK-SAME:         %[[A_ARG]]: !fir.ref<!fir.array<10xi32>>, %[[A_EXT_ARG]]: !fir.ref<index>):
+
+    ! CHECK: %[[LB_DEV_DECL:.*]]:2 = hlfir.declare %[[LB_ARG]]
+    ! CHECK: %[[LB_DEV_VAL:.*]] = fir.load %[[LB_DEV_DECL]]#1
+
+    ! CHECK: %[[UB_DEV_DECL:.*]]:2 = hlfir.declare %[[UB_ARG]]
+    ! CHECK: %[[UB_DEV_VAL:.*]] = fir.load %[[UB_DEV_DECL]]#1
+
+    ! CHECK: %[[STEP_DEV_DECL:.*]]:2 = hlfir.declare %[[STEP_ARG]]
+    ! CHECK: %[[STEP_DEV_VAL:.*]] = fir.load %[[STEP_DEV_DECL]]#1
 
     ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]]
+
     ! CHECK: omp.teams {
     ! CHECK-NEXT: omp.parallel {
 
     ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
     ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 
-    ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
-    ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
-    ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
-    ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
-    ! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-
     ! CHECK-NEXT: omp.distribute {
     ! CHECK-NEXT: omp.wsloop {
 
-    ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+    ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB_DEV_VAL]]) to (%[[UB_DEV_VAL]]) inclusive step (%[[STEP_DEV_VAL]]) {
     ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
     ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
     ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
@@ -78,6 +92,7 @@ program do_concurrent_basic
     ! CHECK-NEXT: }
     ! CHECK-NEXT: omp.terminator
     ! CHECK-NEXT: }
+
     do concurrent (i=1:10)
         a(i) = i
     end do
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index 17cf27a9b70b27..18758cfc5efcdd 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -20,6 +20,9 @@
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %t/partially_nested.f90 -o - \
 ! RUN:   | FileCheck %s --check-prefixes=DEVICE,COMMON
 
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %t/dummy_arg_loop_bounds.f90 -o - \
+! RUN:   | FileCheck %s --check-prefixes=DUMMY_UBS
+
 !--- multi_range.f90
 program main
    integer, parameter :: n = 10
@@ -76,32 +79,14 @@ program main
 ! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
 ! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
 
-! COMMON: %[[C1_1:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index
-! COMMON: %[[C10:.*]] = arith.constant 10 : i32
-! COMMON: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index
-! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index
-
-! COMMON: %[[C1_2:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index
-! COMMON: %[[C20:.*]] = arith.constant 20 : i32
-! COMMON: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index
-! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index
-
-! COMMON: %[[C1_3:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index
-! COMMON: %[[C30:.*]] = arith.constant 30 : i32
-! COMMON: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index
-! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index
-
 ! DEVICE: omp.distribute
 
 ! COMMON: omp.wsloop {
 ! COMMON-NEXT: omp.loop_nest
 ! COMMON-SAME:   (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
-! COMMON-SAME:   : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
-! COMMON-SAME:     to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
-! COMMON-SAME:     step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
+! COMMON-SAME:   : index = (%{{[^[:space:]]+}}, %{{[^[:space:]]+}}, %{{[^[:space:]]+}})
+! COMMON-SAME:     to (%{{[^[:space:]]+}}, %{{[^[:space:]]+}}, %{{[^[:space:]]+}}) inclusive
+! COMMON-SAME:     step (%{{[^[:space:]]+}}, %{{[^[:space:]]+}}, %{{[^[:space:]]+}}) {
 
 ! COMMON-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
 ! COMMON-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1
@@ -119,3 +104,52 @@ program main
 
 ! HOST-NEXT: omp.terminator
 ! HOST-NEXT: }
+
+!--- dummy_arg_loop_bounds.f90
+
+subroutine foo(n, m)
+   implicit none
+   integer :: n, m
+   integer :: i, j
+   integer :: a(n, m)
+
+   do concurrent(i=1:n, j=1:m)
+       a(i,j) = i * j
+   end do
+end subroutine
+
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.0.lb"}
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.0.ub"}
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.0.step"}
+
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.1.lb"}
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.1.ub"}
+! DUMMY_UBS-DAG: omp.map.info {{.*}} {name = "loop.1.step"}
+
+
+! DUMMY_UBS: omp.target {{.*}} {
+
+! DUMMY_UBS-DAG:   %[[LOOP0_LB_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.0.lb"}
+! DUMMY_UBS-DAG:   %[[LOOP0_UB_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.0.ub"}
+! DUMMY_UBS-DAG:   %[[LOOP0_STEP_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.0.step"}
+
+! DUMMY_UBS-DAG:   %[[LOOP1_LB_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.1.lb"}
+! DUMMY_UBS-DAG:   %[[LOOP1_UB_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.1.ub"}
+! DUMMY_UBS-DAG:   %[[LOOP1_STEP_DECL:.*]]:2 = hlfir.declare %arg{{.*}} {uniq_name = "loop.1.step"}
+
+! DUMMY_UBS-DAG:   %[[LOOP0_LB:.*]] = fir.load %[[LOOP0_LB_DECL]]#1
+! DUMMY_UBS-DAG:   %[[LOOP0_UB:.*]] = fir.load %[[LOOP0_UB_DECL]]#1
+! DUMMY_UBS-DAG:   %[[LOOP0_STEP:.*]] = fir.load %[[LOOP0_STEP_DECL]]#1
+
+! DUMMY_UBS-DAG:   %[[LOOP1_LB:.*]] = fir.load %[[LOOP1_LB_DECL]]#1
+! DUMMY_UBS-DAG:   %[[LOOP1_UB:.*]] = fir.load %[[LOOP1_UB_DECL]]#1
+! DUMMY_UBS-DAG:   %[[LOOP1_STEP:.*]] = fir.load %[[LOOP1_STEP_DECL]]#1
+
+! DUMMY_UBS:       omp.loop_nest (%{{.*}}, %{{.*}}) : index
+! DUMMY_UBS-SAME:  = (%[[LOOP0_LB]], %[[LOOP1_LB]])
+! DUMMY_UBS-SAME:  to (%[[LOOP0_UB]], %[[LOOP1_UB]])
+! DUMMY_UBS-SAME:  inclusive step (%[[LOOP0_STEP]], %[[LOOP1_STEP]])
+
+! DUMMY_UBS: omp.terminator
+! DUMMY_UBS: }
+
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index f3f2e78f5b3183..9b0c9fa5b25179 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -31,9 +31,14 @@ program main
 
 ! DEVICE: omp.target
 
-! DEVICE:   ^bb0(%[[I_ARG:[^[:space:]]+]]: !fir.ref<i32>, %[[X_ARG:[^[:space:]]+]]: !fir.ref<i32>,
+! DEVICE:   ^bb0(
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %[[I_ARG:[^[:space:]]+]]: !fir.ref<i32>,
+! DEVICE-SAME:   %[[X_ARG:[^[:space:]]+]]: !fir.ref<i32>,
 ! DEVICE-SAME:   %[[J_ARG:[^[:space:]]+]]: !fir.ref<i32>, %[[K_ARG:[^[:space:]]+]]: !fir.ref<i32>,
-! DEVICE-SAME:   %[[A_ARG:[^[:space:]]+]]: !fir.ref<!fir.array<10x20x30xi32>>):
+! DEVICE-SAME:   %[[A_ARG:[^[:space:]]+]]: !fir.ref<!fir.array<10x20x30xi32>>
 
 ! DEVICE: %[[TARGET_J_DECL:.*]]:2 = hlfir.declare %[[J_ARG]] {uniq_name = "_QFEj"}
 ! DEVICE: %[[TARGET_K_DECL:.*]]:2 = hlfir.declare %[[K_ARG]] {uniq_name = "_QFEk"}
diff --git a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
index 5420ff4586be60..5be66261b48443 100644
--- a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
+++ b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
@@ -26,7 +26,10 @@ subroutine foo(n)
 ! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%[[N_ALLOC]] : {{.*}})
 
 ! CHECK: omp.target
-! CHECK-SAME: map_entries(%[[I_MAP]] -> %[[I_ARG:arg[0-9]*]],
+! CHECK-SAME: map_entries(%[[LOOP_LB_MAP]] -> %[[LB_ARG:arg[0-9]*]],
+! CHECK-SAME:             %[[LOOP_UB_MAP]] -> %[[UB_ARG:arg[0-9]*]],
+! CHECK-SAME:             %[[LOOP_STEP_MAP]] -> %[[STEP_ARG:arg[0-9]*]],
+! CHECK-SAME:             %[[I_MAP]] -> %[[I_ARG:arg[0-9]*]],
 ! CHECK-SAME:             %[[A_MAP]] -> %[[A_ARG:arg[0-9]*]],
 ! CHECK-SAME:             %[[N_MAP]] -> %[[N_ARG:arg[0-9]*]] : {{.*}})
 ! CHECK-SAME: {{.*}} {
diff --git a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90
index 429500cead1073..49174fd6d40610 100644
--- a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90
+++ b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90
@@ -31,9 +31,13 @@ program main
 
 ! DEVICE: omp.target
 
-! DEVICE:   ^bb0(%[[I_ARG:[^[:space:]]+]]: !fir.ref<i32>,
+! DEVICE:   ^bb0(
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}}: {{[^[:space:]]+}},
+! DEVICE-SAME:   %[[I_ARG:[^[:space:]]+]]: !fir.ref<i32>,
 ! DEVICE-SAME:   %[[J_ARG:[^[:space:]]+]]: !fir.ref<i32>, %[[K_ARG:[^[:space:]]+]]: !fir.ref<i32>,
-! DEVICE-SAME:   %[[A_ARG:[^[:space:]]+]]: !fir.ref<!fir.array<10x20x30xi32>>):
+! DEVICE-SAME:   %[[A_ARG:[^[:space:]]+]]: !fir.ref<!fir.array<10x20x30xi32>>
 
 ! DEVICE: %[[TARGET_J_DECL:.*]]:2 = hlfir.declare %[[J_ARG]] {uniq_name = "_QFEj"}
 ! DEVICE: %[[TARGET_K_DECL:.*]]:2 = hlfir.declare %[[K_ARG]] {uniq_name = "_QFEk"}

From e509514417066243df4200edbceab06e74a92f32 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy@amd.com>
Date: Thu, 22 Aug 2024 04:26:25 -0500
Subject: [PATCH 3/3] [flang][OpenMP][DoConcurrent] Support `fir.shape_shift`
 values

---
 .../OpenMP/DoConcurrentConversion.cpp         | 238 +++++++++++-------
 .../Transforms/DoConcurrent/basic_device.f90  |  10 +-
 .../DoConcurrent/runtime_sized_array.f90      |  13 +-
 3 files changed, 167 insertions(+), 94 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 9e86ab2f9de068..75392c601789da 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -151,6 +151,36 @@ mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc,
   return tripCount;
 }
 
+mlir::Value mapTemporaryValue(fir::FirOpBuilder &builder,
+                              mlir::omp::TargetOp targetOp, mlir::Value val,
+                              std::string name = "") {
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointAfterValue(val);
+  auto copyVal = builder.createTemporary(val.getLoc(), val.getType());
+  builder.createStoreWithConvert(copyVal.getLoc(), val, copyVal);
+
+  llvm::SmallVector<mlir::Value> bounds;
+  builder.setInsertionPoint(targetOp);
+  mlir::Value mapOp = createMapInfoOp(
+      builder, copyVal.getLoc(), copyVal,
+      /*varPtrPtr=*/mlir::Value{}, name, bounds,
+      /*members=*/llvm::SmallVector<mlir::Value>{},
+      /*membersIndex=*/mlir::DenseIntElementsAttr{},
+      static_cast<std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
+          llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
+      mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType());
+  targetOp.getMapVarsMutable().append(mapOp);
+
+  mlir::Region &targetRegion = targetOp.getRegion();
+  mlir::Block *targetEntryBlock = &targetRegion.getBlocks().front();
+  mlir::Value clonedValArg =
+      targetRegion.addArgument(copyVal.getType(), copyVal.getLoc());
+  builder.setInsertionPointToStart(targetEntryBlock);
+  auto loadOp =
+      builder.create<fir::LoadOp>(clonedValArg.getLoc(), clonedValArg);
+  return loadOp.getResult();
+}
+
 /// Check if cloning the bounds introduced any dependency on the outer region.
 /// If so, then either clone them as well if they are MemoryEffectFree, or else
 /// copy them to a new temporary and add them to the map and block_argument
@@ -179,31 +209,9 @@ void cloneOrMapRegionOutsiders(fir::FirOpBuilder &builder,
               return use.getOwner()->getBlock() == targetEntryBlock;
             });
       } else {
-        mlir::OpBuilder::InsertionGuard guard(builder);
-        builder.setInsertionPointAfter(valOp);
-        auto copyVal = builder.createTemporary(val.getLoc(), val.getType());
-        builder.createStoreWithConvert(copyVal.getLoc(), val, copyVal);
-
-        llvm::SmallVector<mlir::Value> bounds;
-        std::stringstream name;
-        builder.setInsertionPoint(targetOp);
-        mlir::Value mapOp = createMapInfoOp(
-            builder, copyVal.getLoc(), copyVal,
-            /*varPtrPtr=*/mlir::Value{}, name.str(), bounds,
-            /*members=*/llvm::SmallVector<mlir::Value>{},
-            /*membersIndex=*/mlir::DenseIntElementsAttr{},
-            static_cast<
-                std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>(
-                llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT),
-            mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType());
-        targetOp.getMapVarsMutable().append(mapOp);
-        mlir::Value clonedValArg =
-            targetRegion.addArgument(copyVal.getType(), copyVal.getLoc());
-        builder.setInsertionPointToStart(targetEntryBlock);
-        auto loadOp =
-            builder.create<fir::LoadOp>(clonedValArg.getLoc(), clonedValArg);
+        mlir::Value mappedTemp = mapTemporaryValue(builder, targetOp, val);
         val.replaceUsesWithIf(
-            loadOp->getResult(0), [targetEntryBlock](mlir::OpOperand &use) {
+            mappedTemp, [targetEntryBlock](mlir::OpOperand &use) {
               return use.getOwner()->getBlock() == targetEntryBlock;
             });
       }
@@ -747,17 +755,18 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
 
     if (mapToDevice) {
       mlir::omp::TargetOperands targetClauseOps;
+      LiveInShapeInfoMap liveInShapeInfoMap;
 
       // The outermost loop will contain all the live-in values in all nested
       // loops since live-in values are collected recursively for all nested
       // ops.
       for (mlir::Value liveIn : loopNestLiveIns) {
-        targetClauseOps.mapVars.push_back(
-            genMapInfoOpForLiveIn(rewriter, liveIn, liveInToName));
+        targetClauseOps.mapVars.push_back(genMapInfoOpForLiveIn(
+            rewriter, liveIn, liveInToName, liveInShapeInfoMap[liveIn]));
       }
 
       targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
-                             targetClauseOps);
+                             targetClauseOps, liveInShapeInfoMap);
       genTeamsOp(doLoop.getLoc(), rewriter);
     }
 
@@ -809,42 +818,76 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
   }
 
 private:
-  void genBoundsOps(mlir::ConversionPatternRewriter &rewriter,
-                    mlir::Location loc, mlir::Value shape,
-                    llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
+  struct TargetDeclareShapeCreationInfo {
+    std::vector<mlir::Value> startIndices{};
+    std::vector<mlir::Value> extents{};
+
+    bool isShapedValue() const { return !extents.empty(); }
+    bool isShapeShiftedValue() const { return !startIndices.empty(); }
+  };
+
+  using LiveInShapeInfoMap =
+      llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
+
+  void
+  genBoundsOps(mlir::ConversionPatternRewriter &rewriter, mlir::Location loc,
+               mlir::Value shape, llvm::SmallVectorImpl<mlir::Value> &boundsOps,
+               TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
     if (shape == nullptr) {
       return;
     }
 
     auto shapeOp =
         mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
+    auto shapeShiftOp =
+        mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp());
 
-    if (shapeOp == nullptr)
-      TODO(loc, "Shapes not defined by shape op's are not supported yet.");
+    if (shapeOp == nullptr && shapeShiftOp == nullptr)
+      TODO(loc,
+           "Shapes not defined by `fir.shape` or `fir.shape_shift` op's are "
+           "not supported yet.");
 
-    auto extents = shapeOp.getExtents();
+    auto extents = shapeOp != nullptr
+                       ? std::vector<mlir::Value>(shapeOp.getExtents().begin(),
+                                                  shapeOp.getExtents().end())
+                       : shapeShiftOp.getExtents();
 
-    auto genBoundsOp = [&](mlir::Value extent) {
-      mlir::Type extentType = extent.getType();
-      auto lb = rewriter.create<mlir::arith::ConstantOp>(
-          loc, extentType, rewriter.getIntegerAttr(extentType, 0));
-      // TODO I think this caluclation might not be correct. But this is how
-      // it is done in PFT->OpenMP lowering. So keeping it like this until we
-      // double check.
-      mlir::Value ub = rewriter.create<mlir::arith::SubIOp>(loc, extent, lb);
+    mlir::Type idxType = extents.front().getType();
+
+    auto one = rewriter.create<mlir::arith::ConstantOp>(
+        loc, idxType, rewriter.getIntegerAttr(idxType, 1));
+    // For non-shifted values, that starting index is the default Fortran
+    // value: 1.
+    std::vector<mlir::Value> startIndices =
+        shapeOp != nullptr ? std::vector<mlir::Value>(extents.size(), one)
+                           : shapeShiftOp.getOrigins();
+
+    auto genBoundsOp = [&](mlir::Value startIndex, mlir::Value extent) {
+      // We map the entire range of data by default, therefore, we always map
+      // from the start.
+      auto normalizedLB = rewriter.create<mlir::arith::ConstantOp>(
+          loc, idxType, rewriter.getIntegerAttr(idxType, 0));
+
+      mlir::Value ub = rewriter.create<mlir::arith::SubIOp>(loc, extent, one);
 
       return rewriter.create<mlir::omp::MapBoundsOp>(
-          loc, rewriter.getType<mlir::omp::MapBoundsType>(), lb, ub, extent,
-          mlir::Value{}, false, mlir::Value{});
+          loc, rewriter.getType<mlir::omp::MapBoundsType>(), normalizedLB, ub,
+          extent,
+          /*stride=*/mlir::Value{}, /*stride_in_bytes=*/false, startIndex);
     };
 
-    for (auto extent : extents)
-      boundsOps.push_back(genBoundsOp(extent));
+    for (auto [startIndex, extent] : llvm::zip_equal(startIndices, extents))
+      boundsOps.push_back(genBoundsOp(startIndex, extent));
+
+    if (shapeShiftOp != nullptr)
+      targetShapeCreationInfo.startIndices = std::move(startIndices);
+    targetShapeCreationInfo.extents = std::move(extents);
   }
 
   mlir::omp::MapInfoOp genMapInfoOpForLiveIn(
       mlir::ConversionPatternRewriter &rewriter, mlir::Value liveIn,
-      const llvm::DenseMap<mlir::Value, std::string> &liveInToName) const {
+      const llvm::DenseMap<mlir::Value, std::string> &liveInToName,
+      TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
     mlir::Value rawAddr = liveIn;
     mlir::Value shape = nullptr;
     std::string name = "";
@@ -891,7 +934,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     }
 
     llvm::SmallVector<mlir::Value> boundsOps;
-    genBoundsOps(rewriter, liveIn.getLoc(), shape, boundsOps);
+    genBoundsOps(rewriter, liveIn.getLoc(), shape, boundsOps,
+                 targetShapeCreationInfo);
 
     return Fortran::lower::omp ::internal::createMapInfoOp(
         rewriter, liveIn.getLoc(), rawAddr,
@@ -904,11 +948,12 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
         captureKind, rawAddr.getType());
   }
 
-  mlir::omp::TargetOp genTargetOp(mlir::Location loc,
-                                  mlir::ConversionPatternRewriter &rewriter,
-                                  mlir::IRMapping &mapper,
-                                  llvm::ArrayRef<mlir::Value> liveIns,
-                                  mlir::omp::TargetOperands &clauseOps) const {
+  mlir::omp::TargetOp
+  genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+              mlir::IRMapping &mapper,
+              const llvm::ArrayRef<mlir::Value> liveIns,
+              const mlir::omp::TargetOperands &clauseOps,
+              const LiveInShapeInfoMap &liveInShapeInfoMap) const {
     auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
 
     mlir::Region &region = targetOp.getRegion();
@@ -923,14 +968,17 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     }
 
     rewriter.createBlock(&region, {}, liveInTypes, liveInLocs);
-    fir::FirOpBuilder firBuilder(
+    fir::FirOpBuilder builder(
         rewriter,
         fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
 
-    for (auto [liveIn, arg, mapInfoOp] :
-         llvm::zip_equal(liveIns, region.getArguments(), clauseOps.mapVars)) {
+    size_t argIdx = 0;
+    for (auto [liveIn, mapInfoOp] :
+         llvm::zip_equal(liveIns, clauseOps.mapVars)) {
       auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
-      hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp);
+      hlfir::DeclareOp liveInDeclare =
+          genLiveInDeclare(builder, targetOp, region.getArgument(argIdx), miOp,
+                           liveInShapeInfoMap.at(liveIn));
 
       // TODO If `liveIn.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
       // need to "unpack" the box by getting the defining op of it's value.
@@ -938,9 +986,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       // todo for now.
 
       if (!llvm::isa<mlir::omp::PointerLikeType>(liveIn.getType()))
-        mapper.map(liveIn,
-                   firBuilder.loadIfRef(liveIn.getLoc(),
-                                        liveInDeclare.getOriginalBase()));
+        mapper.map(liveIn, builder.loadIfRef(liveIn.getLoc(),
+                                             liveInDeclare.getOriginalBase()));
       else
         mapper.map(liveIn, liveInDeclare.getOriginalBase());
 
@@ -948,56 +995,75 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
               liveIn.getDefiningOp())) {
         mapper.map(origDeclareOp.getBase(), liveInDeclare.getBase());
       }
+      ++argIdx;
     }
 
-    Fortran::lower::omp::internal::cloneOrMapRegionOutsiders(firBuilder,
-                                                             targetOp);
+    Fortran::lower::omp::internal::cloneOrMapRegionOutsiders(builder, targetOp);
     rewriter.setInsertionPoint(
         rewriter.create<mlir::omp::TerminatorOp>(targetOp.getLoc()));
 
     return targetOp;
   }
 
-  hlfir::DeclareOp
-  genLiveInDeclare(mlir::ConversionPatternRewriter &rewriter,
-                   mlir::Value liveInArg,
-                   mlir::omp::MapInfoOp liveInMapInfoOp) const {
+  hlfir::DeclareOp genLiveInDeclare(
+      fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
+      mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
+      const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
     mlir::Type liveInType = liveInArg.getType();
+    std::string liveInName = liveInMapInfoOp.getName().has_value()
+                                 ? liveInMapInfoOp.getName().value().str()
+                                 : std::string("");
 
     if (fir::isa_ref_type(liveInType))
       liveInType = fir::unwrapRefType(liveInType);
 
     mlir::Value shape = [&]() -> mlir::Value {
-      if (hlfir::isFortranScalarNumericalType(liveInType))
+      if (!targetShapeCreationInfo.isShapedValue())
         return {};
 
-      if (hlfir::isFortranArrayObject(liveInType)) {
-        llvm::SmallVector<mlir::Value> shapeOpOperands;
+      llvm::SmallVector<mlir::Value> extentOperands;
+      llvm::SmallVector<mlir::Value> startIndexOperands;
+
+      if (targetShapeCreationInfo.isShapeShiftedValue()) {
+        llvm::SmallVector<mlir::Value> shapeShiftOperands;
+
+        size_t shapeIdx = 0;
+        for (auto [startIndex, extent] :
+             llvm::zip_equal(targetShapeCreationInfo.startIndices,
+                             targetShapeCreationInfo.extents)) {
+          shapeShiftOperands.push_back(
+              Fortran::lower::omp::internal::mapTemporaryValue(
+                  builder, targetOp, startIndex,
+                  liveInName + ".start_idx.dim" + std::to_string(shapeIdx)));
+          shapeShiftOperands.push_back(
+              Fortran::lower::omp::internal::mapTemporaryValue(
+                  builder, targetOp, extent,
+                  liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+          ++shapeIdx;
+        }
 
-        for (auto boundsOperand : liveInMapInfoOp.getBounds()) {
-          auto boundsOp =
-              mlir::cast<mlir::omp::MapBoundsOp>(boundsOperand.getDefiningOp());
-          mlir::Operation *localExtentDef =
-              boundsOp.getExtent().getDefiningOp()->clone();
-          rewriter.getInsertionBlock()->push_back(localExtentDef);
-          assert(localExtentDef->getNumResults() == 1);
+        auto shapeShiftType = fir::ShapeShiftType::get(
+            builder.getContext(), shapeShiftOperands.size() / 2);
+        return builder.create<fir::ShapeShiftOp>(
+            liveInArg.getLoc(), shapeShiftType, shapeShiftOperands);
+      }
 
-          shapeOpOperands.push_back(localExtentDef->getResult(0));
-        }
+      llvm::SmallVector<mlir::Value> shapeOperands;
 
-        return rewriter.create<fir::ShapeOp>(liveInArg.getLoc(),
-                                             shapeOpOperands);
+      size_t shapeIdx = 0;
+      for (auto extent : targetShapeCreationInfo.extents) {
+        shapeOperands.push_back(
+            Fortran::lower::omp::internal::mapTemporaryValue(
+                builder, targetOp, extent,
+                liveInName + ".extent.dim" + std::to_string(shapeIdx)));
+        ++shapeIdx;
       }
 
-      std::string opStr;
-      llvm::raw_string_ostream opOs(opStr);
-      opOs << "Unsupported type: " << liveInType;
-      llvm_unreachable(opOs.str().c_str());
+      return builder.create<fir::ShapeOp>(liveInArg.getLoc(), shapeOperands);
     }();
 
-    return rewriter.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
-                                             liveInMapInfoOp.getName().value(),
-                                             shape);
+    return builder.create<hlfir::DeclareOp>(liveInArg.getLoc(), liveInArg,
+                                            liveInName, shape);
   }
 
   mlir::omp::TeamsOp
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90
index a8b70e67839975..e5de687cad758a 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_device.f90
@@ -27,12 +27,14 @@ program do_concurrent_basic
     ! CHECK-DAG: %[[UB_MAP_INFO:.*]] = omp.map.info {{.*}} !fir.ref<index> {name = "loop.0.ub"}
     ! CHECK-DAG: %[[STEP_MAP_INFO:.*]] = omp.map.info {{.*}} !fir.ref<index> {name = "loop.0.step"}
 
+    ! CHECK: %[[C1:.*]] = arith.constant 1 : index
     ! CHECK: %[[C0:.*]] = arith.constant 0 : index
-    ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index
+    ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C1]] : index
 
     ! CHECK: %[[A_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index)
     ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index)
     ! CHECK-SAME: extent(%[[A_EXTENT]] : index)
+    ! CHECK-SAME: start_idx(%[[C1]] : index)
 
     ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}})
     ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]])
@@ -44,6 +46,7 @@ program do_concurrent_basic
     ! CHECK-SAME:             %[[STEP_MAP_INFO]] -> %[[STEP_ARG:.[[:alnum:]]+]],
     ! CHECK-SAME:             %[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
     ! CHECK-SAME:             %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]]
+    ! CHECK-SAME:             %[[A_EXT:.*]] -> %[[A_EXT_ARG:.[[:alnum:]]+]]
     ! CHECK-SAME: trip_count(%[[TRIP_COUNT]] : i64)
 
     ! CHECK-NEXT: ^{{.*}}(%[[LB_ARG]]: !fir.ref<index>,
@@ -51,6 +54,8 @@ program do_concurrent_basic
     ! CHECK-SAME:         %[[I_ARG]]: !fir.ref<i32>,
     ! CHECK-SAME:         %[[A_ARG]]: !fir.ref<!fir.array<10xi32>>, %[[A_EXT_ARG]]: !fir.ref<index>):
 
+    ! CHECK: %[[A_EXT:.*]] = fir.load %[[A_EXT_ARG]] : !fir.ref<index>
+
     ! CHECK: %[[LB_DEV_DECL:.*]]:2 = hlfir.declare %[[LB_ARG]]
     ! CHECK: %[[LB_DEV_VAL:.*]] = fir.load %[[LB_DEV_DECL]]#1
 
@@ -60,7 +65,8 @@ program do_concurrent_basic
     ! CHECK: %[[STEP_DEV_DECL:.*]]:2 = hlfir.declare %[[STEP_ARG]]
     ! CHECK: %[[STEP_DEV_VAL:.*]] = fir.load %[[STEP_DEV_DECL]]#1
 
-    ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]]
+    ! CHECK: %[[A_SHAPE:.*]] = fir.shape %[[A_EXT]] : (index) -> !fir.shape<1>
+    ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]](%[[A_SHAPE]])
 
     ! CHECK: omp.teams {
     ! CHECK-NEXT: omp.parallel {
diff --git a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
index 5be66261b48443..b7a3bf8e5b6b87 100644
--- a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
+++ b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
@@ -23,7 +23,10 @@ subroutine foo(n)
 
 ! CHECK-DAG: %[[I_MAP:.*]] = omp.map.info var_ptr(%[[I_DECL]]#1 : {{.*}})
 ! CHECK-DAG: %[[A_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : {{.*}})
-! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%[[N_ALLOC]] : {{.*}})
+! CHECK-DAG: %[[LOOP_LB_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}) {{.*}} {name = "loop.0.lb"}
+! CHECK-DAG: %[[LOOP_UB_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}) {{.*}} {name = "loop.0.ub"}
+! CHECK-DAG: %[[LOOP_STEP_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}) {{.*}} {name = "loop.0.step"}
+! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}})
 
 ! CHECK: omp.target
 ! CHECK-SAME: map_entries(%[[LOOP_LB_MAP]] -> %[[LB_ARG:arg[0-9]*]],
@@ -34,12 +37,10 @@ subroutine foo(n)
 ! CHECK-SAME:             %[[N_MAP]] -> %[[N_ARG:arg[0-9]*]] : {{.*}})
 ! CHECK-SAME: {{.*}} {
 
+! CHECK-DAG:  %[[N_VAL:.*]] = fir.load %[[N_ARG]]
+! CHECK-DAG:  %[[A_SHAPE:.*]] = fir.shape %[[N_VAL]] : (index) -> !fir.shape<1>
 ! CHECK-DAG:  %{{.*}} = hlfir.declare %[[I_ARG]]
-! CHECK-DAG:  %{{.*}} = hlfir.declare %[[A_ARG]]
-! CHECK-DAG:  %{{.*}} = fir.load %[[N_ARG]]
+! CHECK-DAG:  %{{.*}} = hlfir.declare %[[A_ARG]](%[[A_SHAPE]])
 
 ! CHECK:   omp.terminator
 ! CHECK: }
-
-
-