diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md index 4f5741f724d60a..ce5b7711192b87 100644 --- a/flang/docs/DoConcurrentConversionToOpenMP.md +++ b/flang/docs/DoConcurrentConversionToOpenMP.md @@ -234,9 +234,16 @@ see the "Data environment" section below. By default, variables that are used inside a `do concurernt` loop nest are either treated as `shared` in case of mapping to `host`, or mapped into the `target` region using a `map` clause in case of mapping to `device`. The only -exception to this is the loop's iteration variable(s) (IV) of **perfect** loop -nest. In that case, for each IV, we allocate a local copy as shown the by the -mapping examples above. +exceptions to this are: + 1. the loop's iteration variable(s) (IV) of **perfect** loop nests. In that + case, for each IV, we allocate a local copy as shown the by the mapping + examples above. + 1. any values that are from allocations outside the loop nest and used + exclusively inside of it. In such cases, a local privatized + value is created in the OpenMP region to prevent multiple teams of threads + from accessing and destroying the same memory block which causes runtime + issues. For an example of such cases, see + `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90`. #### Non-perfectly-nested loops' IVs diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index b30379da272ea6..da11941e10e41c 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -15,7 +15,10 @@ #include "flang/Optimizer/HLFIR/HLFIRDialect.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/IRMapping.h" @@ -468,6 +471,61 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter, ++idx; } } + +/// Collects values that are local to a loop: "loop-local values". A loop-local +/// value is one that is used exclusively inside the loop but allocated outside +/// of it. This usually corresponds to temporary values that are used inside the +/// loop body for initialzing other variables for example. +/// +/// \param [in] doLoop - the loop within which the function searches for values +/// used exclusively inside. +/// +/// \param [out] locals - the list of loop-local values detected for \p doLoop. +static void collectLoopLocalValues(fir::DoLoopOp doLoop, + llvm::SetVector &locals) { + doLoop.walk([&](mlir::Operation *op) { + for (mlir::Value operand : op->getOperands()) { + if (locals.contains(operand)) + continue; + + bool isLocal = true; + + if (!mlir::isa_and_present(operand.getDefiningOp())) + continue; + + // Values defined inside the loop are not interesting since they do not + // need to be localized. + if (doLoop->isAncestor(operand.getDefiningOp())) + continue; + + for (auto *user : operand.getUsers()) { + if (!doLoop->isAncestor(user)) { + isLocal = false; + break; + } + } + + if (isLocal) + locals.insert(operand); + } + }); +} + +/// For a "loop-local" value \p local within a loop's scope, localizes that +/// value within the scope of the parallel region the loop maps to. Towards that +/// end, this function moves the allocation of \p local within \p allocRegion. +/// +/// \param local - the value used exclusively within a loop's scope (see +/// collectLoopLocalValues). +/// +/// \param allocRegion - the parallel region where \p local's allocation will be +/// privatized. +/// +/// \param rewriter - builder used for updating \p allocRegion. +static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion, + mlir::ConversionPatternRewriter &rewriter) { + rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front()); +} } // namespace looputils class DoConcurrentConversion : public mlir::OpConversionPattern { @@ -519,9 +577,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { bool hasRemainingNestedLoops = failed(looputils::collectLoopNest(doLoop, loopNest)); + mlir::IRMapping mapper; + + llvm::SetVector locals; + looputils::collectLoopLocalValues(loopNest.back().first, locals); + looputils::sinkLoopIVArgs(rewriter, loopNest); - mlir::IRMapping mapper; mlir::omp::TargetOp targetOp; mlir::omp::LoopNestClauseOps loopNestClauseOps; @@ -541,8 +603,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { genDistributeOp(doLoop.getLoc(), rewriter); } - genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper, - loopNestClauseOps); + mlir::omp::ParallelOp parallelOp = genParallelOp( + doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps); + + for (mlir::Value local : locals) + looputils::localizeLoopLocalValue(local, parallelOp.getRegion(), + rewriter); + mlir::omp::LoopNestOp ompLoopNest = genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps); @@ -919,9 +986,10 @@ class DoConcurrentConversionPass context, mapTo == fir::omp::DoConcurrentMappingKind::DCMK_Device, concurrentLoopsToSkip); mlir::ConversionTarget target(*context); - target.addLegalDialect(); + target.addLegalDialect< + fir::FIROpsDialect, hlfir::hlfirDialect, mlir::arith::ArithDialect, + mlir::func::FuncDialect, mlir::omp::OpenMPDialect, + mlir::cf::ControlFlowDialect, mlir::math::MathDialect>(); target.addDynamicallyLegalOp([&](fir::DoLoopOp op) { return !op.getUnordered() || concurrentLoopsToSkip.contains(op); diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 new file mode 100644 index 00000000000000..8b79d87d12c907 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 @@ -0,0 +1,66 @@ +! Tests that locally destroyed values in a `do concurrent` loop are properly +! handled. Locally destroyed values are those values for which the Fortran runtime +! calls `@_FortranADestroy` inside the loops body. If these values are allocated +! outside the loop, and the loop is mapped to OpenMP, then a runtime error would +! occur due to multiple teams trying to access the same allocation. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: | FileCheck %s + +module struct_mod + type test_struct + integer, allocatable :: x_ + end type + + interface test_struct + pure module function construct_from_components(x) result(struct) + implicit none + integer, intent(in) :: x + type(test_struct) struct + end function + end interface +end module + +submodule(struct_mod) struct_sub + implicit none + +contains + module procedure construct_from_components + struct%x_ = x + end procedure +end submodule struct_sub + +program main + use struct_mod, only : test_struct + + implicit none + type(test_struct), dimension(10) :: a + integer :: i + integer :: total + + do concurrent (i=1:10) + a(i) = test_struct(i) + end do + + do i=1,10 + total = total + a(i)%x_ + end do + + print *, "total =", total +end program main + +! CHECK: omp.parallel { +! CHECK: %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box>}> {bindc_name = ".result"} +! CHECK: omp.wsloop { +! CHECK: omp.loop_nest {{.*}} { +! CHECK: %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components +! CHECK: fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]] +! CHECK: %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]] +! CHECK: %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]] +! CHECK: fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]]) +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: }