1//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "flang/Optimizer/Builder/BoxValue.h"
10#include "flang/Optimizer/Builder/CUFCommon.h"
11#include "flang/Optimizer/Builder/FIRBuilder.h"
12#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
13#include "flang/Optimizer/Builder/Todo.h"
14#include "flang/Optimizer/CodeGen/Target.h"
15#include "flang/Optimizer/CodeGen/TypeConverter.h"
16#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
17#include "flang/Optimizer/Dialect/FIRAttr.h"
18#include "flang/Optimizer/Dialect/FIRDialect.h"
19#include "flang/Optimizer/Dialect/FIROps.h"
20#include "flang/Optimizer/Dialect/FIROpsSupport.h"
21#include "flang/Optimizer/Dialect/FIRType.h"
22#include "flang/Optimizer/Support/DataLayout.h"
23#include "flang/Runtime/CUDA/registration.h"
24#include "flang/Runtime/entry-names.h"
25#include "mlir/Dialect/DLTI/DLTI.h"
26#include "mlir/Dialect/GPU/IR/GPUDialect.h"
27#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
28#include "mlir/IR/Value.h"
29#include "mlir/Pass/Pass.h"
30#include "llvm/ADT/SmallVector.h"
31
32namespace fir {
33#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
34#include "flang/Optimizer/Transforms/Passes.h.inc"
35} // namespace fir
36
37using namespace Fortran::runtime::cuda;
38
39namespace {
40
41struct CUFComputeSharedMemoryOffsetsAndSize
42 : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
43 CUFComputeSharedMemoryOffsetsAndSize> {
44
45 void runOnOperation() override {
46 mlir::ModuleOp mod = getOperation();
47 mlir::SymbolTable symTab(mod);
48 mlir::OpBuilder opBuilder{mod.getBodyRegion()};
49 fir::FirOpBuilder builder(opBuilder, mod);
50 fir::KindMapping kindMap{fir::getKindMapping(mod)};
51 std::optional<mlir::DataLayout> dl =
52 fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
53 if (!dl) {
54 mlir::emitError(mod.getLoc(),
55 "data layout attribute is required to perform " +
56 getName() + "pass");
57 }
58
59 auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
60 mlir::Type i8Ty = builder.getI8Type();
61 mlir::Type i32Ty = builder.getI32Type();
62 mlir::Type idxTy = builder.getIndexType();
63 for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
64 unsigned nbDynamicSharedVariables = 0;
65 unsigned nbStaticSharedVariables = 0;
66 uint64_t sharedMemSize = 0;
67 unsigned short alignment = 0;
68 mlir::Value crtDynOffset;
69
70 // Go over each shared memory operation and compute their start offset and
71 // the size and alignment of the global to be generated if all variables
72 // are static. If this is dynamic shared memory, then only the alignment
73 // is computed.
74 for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
75 mlir::Location loc = sharedOp.getLoc();
76 builder.setInsertionPoint(sharedOp);
77 if (fir::hasDynamicSize(sharedOp.getInType())) {
78 mlir::Type ty = sharedOp.getInType();
79 if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
80 ty = seqTy.getEleTy();
81 unsigned short align = dl->getTypeABIAlignment(ty);
82 alignment = std::max(alignment, align);
83 uint64_t tySize = dl->getTypeSize(ty);
84 ++nbDynamicSharedVariables;
85 if (crtDynOffset) {
86 sharedOp.getOffsetMutable().assign(
87 builder.createConvert(loc, i32Ty, crtDynOffset));
88 } else {
89 mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
90 sharedOp.getOffsetMutable().assign(zero);
91 }
92
93 mlir::Value dynSize =
94 builder.createIntegerConstant(loc, idxTy, tySize);
95 for (auto extent : sharedOp.getShape())
96 dynSize = builder.create<mlir::arith::MulIOp>(loc, dynSize, extent);
97 if (crtDynOffset)
98 crtDynOffset =
99 builder.create<mlir::arith::AddIOp>(loc, crtDynOffset, dynSize);
100 else
101 crtDynOffset = dynSize;
102
103 continue;
104 }
105 auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
106 sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
107 ++nbStaticSharedVariables;
108 mlir::Value offset = builder.createIntegerConstant(
109 loc, i32Ty, llvm::alignTo(sharedMemSize, align));
110 sharedOp.getOffsetMutable().assign(offset);
111 sharedMemSize =
112 llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
113 alignment = std::max(alignment, align);
114 }
115
116 if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
117 continue;
118
119 if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
120 mlir::emitError(
121 funcOp.getLoc(),
122 "static and dynamic shared variables in a single kernel");
123
124 mlir::DenseElementsAttr init = {};
125 if (sharedMemSize > 0) {
126 auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
127 mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
128 init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
129 }
130
131 // Create the shared memory global where each shared variable will point
132 // to.
133 auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
134 std::string sharedMemGlobalName =
135 (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
136 mlir::StringAttr linkage = builder.createInternalLinkage();
137 builder.setInsertionPointToEnd(gpuMod.getBody());
138 llvm::SmallVector<mlir::NamedAttribute> attrs;
139 auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
140 gpuMod.getContext());
141 attrs.push_back(mlir::NamedAttribute(
142 fir::GlobalOp::getDataAttrAttrName(globalOpName),
143 cuf::DataAttributeAttr::get(gpuMod.getContext(),
144 cuf::DataAttribute::Shared)));
145 auto sharedMem = builder.create<fir::GlobalOp>(
146 funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
147 init, linkage, attrs);
148 sharedMem.setAlignment(alignment);
149 }
150 }
151};
152
153} // end anonymous namespace
154

// source code of flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp