Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[flang][OpenMP] Privatize locally destroyed values in do concurrent #112

Merged
merged 6 commits into from
Jul 14, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions flang/docs/DoConcurrentConversionToOpenMP.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,16 @@ see the "Data environment" section below.
By default, variables that are used inside a `do concurrent` loop nest are
either treated as `shared` in case of mapping to `host`, or mapped into the
`target` region using a `map` clause in case of mapping to `device`. The only
exception to this is the loop's iteration variable(s) (IV) of **perfect** loop
nest. In that case, for each IV, we allocate a local copy as shown the by the
mapping examples above.
exceptions to this are:
1. the loop's iteration variable(s) (IV) of **perfect** loop nests. In that
case, for each IV, we allocate a local copy as shown by the mapping
examples above.
1. any values that are from allocations outside the loop nest and destroyed
inside of it. In such cases, a local privatized value is created in the
OpenMP region to prevent multiple teams of threads from accessing and
ergawy marked this conversation as resolved.
Show resolved Hide resolved
destroying the same memory block which causes runtime issues. For an
example of such cases, see
`flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90`.

#### Non-perfectly-nested loops' IVs

Expand Down
97 changes: 91 additions & 6 deletions flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/IRMapping.h"
Expand Down Expand Up @@ -468,6 +471,77 @@ void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
++idx;
}
}

/// Collects values that are destroyed by the Fortran runtime within the
/// loop's scope.
///
/// \param [in] doLoop - the loop within which the function searches for locally
/// destroyed values.
ergawy marked this conversation as resolved.
Show resolved Hide resolved
///
/// \param [out] locals - a map from locally destroyed values to the runtime
/// destroy operations that destroy them.
void collectLocallyDestroyedValuesInLoop(
    fir::DoLoopOp doLoop,
    llvm::DenseMap<mlir::Value, mlir::Operation *> &locals) {
  // Name of the Fortran runtime entry point that destroys a value.
  static constexpr auto destroyFuncName{"_FortranADestroy"};

  doLoop.getRegion().walk([&](fir::CallOp call) {
    auto callee = call.getCallee();

    // Skip indirect calls and calls to anything other than the runtime's
    // destroy entry point.
    if (!callee.has_value() ||
        callee.value().getLeafReference().str() != destroyFuncName)
      return;

    // The destroy runtime call takes exactly one operand: the destroyed value.
    assert(call.getNumOperands() == 1);

    // Walk the backward slice of the destroy call (including the call
    // itself); the slice's first op is expected to be the allocation that
    // produced the destroyed value.
    mlir::BackwardSliceOptions sliceOptions;
    sliceOptions.inclusive = true;
    llvm::SetVector<mlir::Operation *> slice;
    mlir::getBackwardSlice(call, &slice, sliceOptions);

    if (auto alloca = mlir::dyn_cast_if_present<fir::AllocaOp>(slice.front()))
      locals.try_emplace(alloca.getResult(), call);
  });
}

/// For a locally destroyed value \p local within a loop's scope, localizes that
/// value within the scope of the parallel region the loop maps to. Towards that
/// end, this function allocates a private copy of \p local within \p
/// allocRegion.
///
/// \param local - the locally destroyed value within a loop's scope (see
/// collectLocallyDestroyedValuesInLoop).
///
/// \param localDestroyer - the Fortran runtime call operation that destroys \p
/// local.
///
/// \param allocRegion - the parallel region where \p local's allocation will be
/// cloned (i.e. privatized).
///
/// \param rewriter - builder used for updating \p allocRegion.
///
/// \param mapper - mapper to track updated references to \p local within \p
/// allocRegion.
void localizeLocallyDestroyedValue(mlir::Value local,
                                   mlir::Operation *localDestroyer,
                                   mlir::Region &allocRegion,
                                   mlir::ConversionPatternRewriter &rewriter,
                                   mlir::IRMapping &mapper) {
  mlir::Region *loopRegion = localDestroyer->getParentRegion();
  assert(loopRegion != nullptr);

  // Save the caller's insertion point so it can be restored afterwards.
  mlir::IRRewriter::InsertPoint savedIP = rewriter.saveInsertionPoint();

  // Privatize the value: clone its defining (allocation) op at the start of
  // the parallel region's entry block.
  rewriter.setInsertionPointToStart(&allocRegion.front());
  mlir::Operation *privateDef = rewriter.clone(*local.getDefiningOp(), mapper);
  mlir::Value privateVal = privateDef->getResult(0);

  // Redirect only the uses inside the loop's region to the private copy;
  // uses outside the loop keep referring to the original value.
  rewriter.replaceUsesWithIf(local, privateVal,
                             [&](mlir::OpOperand &use) {
                               return use.getOwner()->getParentRegion() ==
                                      loopRegion;
                             });
  mapper.map(local, privateVal);

  rewriter.restoreInsertionPoint(savedIP);
}
} // namespace looputils

class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
Expand Down Expand Up @@ -519,9 +593,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
bool hasRemainingNestedLoops =
failed(looputils::collectLoopNest(doLoop, loopNest));

mlir::IRMapping mapper;

llvm::DenseMap<mlir::Value, mlir::Operation *> locals;
looputils::collectLocallyDestroyedValuesInLoop(loopNest.back().first,
locals);

looputils::sinkLoopIVArgs(rewriter, loopNest);

mlir::IRMapping mapper;
mlir::omp::TargetOp targetOp;
mlir::omp::LoopNestClauseOps loopNestClauseOps;

Expand All @@ -541,8 +620,13 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
genDistributeOp(doLoop.getLoc(), rewriter);
}

genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper,
loopNestClauseOps);
mlir::omp::ParallelOp parallelOp = genParallelOp(
doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps);

for (auto &[local, localDestroyer] : locals)
looputils::localizeLocallyDestroyedValue(
local, localDestroyer, parallelOp.getRegion(), rewriter, mapper);

mlir::omp::LoopNestOp ompLoopNest =
genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps);

Expand Down Expand Up @@ -919,9 +1003,10 @@ class DoConcurrentConversionPass
context, mapTo == fir::omp::DoConcurrentMappingKind::DCMK_Device,
concurrentLoopsToSkip);
mlir::ConversionTarget target(*context);
target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
mlir::arith::ArithDialect, mlir::func::FuncDialect,
mlir::omp::OpenMPDialect>();
target.addLegalDialect<
fir::FIROpsDialect, hlfir::hlfirDialect, mlir::arith::ArithDialect,
mlir::func::FuncDialect, mlir::omp::OpenMPDialect,
mlir::cf::ControlFlowDialect, mlir::math::MathDialect>();

target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
Expand Down
66 changes: 66 additions & 0 deletions flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
! Tests that locally destroyed values in a `do concurrent` loop are properly
! handled. Locally destroyed values are those values for which the Fortran runtime
! calls `@_FortranADestroy` inside the loop's body. If these values are allocated
! outside the loop, and the loop is mapped to OpenMP, then a runtime error would
! occur due to multiple teams trying to access the same allocation.

! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \
! RUN: | FileCheck %s

! Derived type with an allocatable component; assigning to `x_` gives values
! of `test_struct` an allocatable part, which is what triggers the
! `_FortranADestroy` calls this test checks for (see the CHECK lines at the
! bottom of the file).
module struct_mod
type test_struct
! Allocatable component: gives the type a deallocatable part.
integer, allocatable :: x_
end type

! Generic interface so `test_struct(i)` invokes the module-procedure
! constructor declared here and implemented in submodule struct_sub.
interface test_struct
pure module function construct_from_components(x) result(struct)
implicit none
integer, intent(in) :: x
type(test_struct) struct
end function
end interface
end module

! Implements the structure constructor declared in struct_mod: intrinsic
! assignment to the (unallocated) allocatable component allocates it and
! copies the value.
submodule(struct_mod) struct_sub
implicit none

contains
module procedure construct_from_components
struct%x_ = x
end procedure
end submodule struct_sub

! Driver: constructs `test_struct` values inside a `do concurrent` loop (the
! construct under test) and then sums the components so the results are
! actually used.
program main
use struct_mod, only : test_struct

implicit none
type(test_struct), dimension(10) :: a
integer :: i
integer :: total

do concurrent (i=1:10)
a(i) = test_struct(i)
end do

! Fix: `total` was accumulated without initialization; local integers start
! with an undefined value in Fortran, so zero it before summing.
total = 0

do i=1,10
total = total + a(i)%x_
end do

print *, "total =", total
end program main

! CHECK: omp.parallel {
! CHECK: %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box<!fir.heap<i32>>}> {bindc_name = ".result"}
! CHECK: omp.wsloop {
! CHECK: omp.loop_nest {{.*}} {
! CHECK: %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components
! CHECK: fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]]
! CHECK: %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]]
! CHECK: %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]]
! CHECK: fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]])
! CHECK: omp.yield
! CHECK: }
! CHECK: omp.terminator
! CHECK: }
! CHECK: omp.terminator
! CHECK: }