1 | //===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Optimizer/Builder/BoxValue.h" |
10 | #include "flang/Optimizer/Builder/CUFCommon.h" |
11 | #include "flang/Optimizer/Builder/FIRBuilder.h" |
12 | #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" |
13 | #include "flang/Optimizer/Builder/Todo.h" |
14 | #include "flang/Optimizer/CodeGen/Target.h" |
15 | #include "flang/Optimizer/CodeGen/TypeConverter.h" |
16 | #include "flang/Optimizer/Dialect/CUF/CUFOps.h" |
17 | #include "flang/Optimizer/Dialect/FIRAttr.h" |
18 | #include "flang/Optimizer/Dialect/FIRDialect.h" |
19 | #include "flang/Optimizer/Dialect/FIROps.h" |
20 | #include "flang/Optimizer/Dialect/FIROpsSupport.h" |
21 | #include "flang/Optimizer/Dialect/FIRType.h" |
22 | #include "flang/Optimizer/Support/DataLayout.h" |
23 | #include "flang/Runtime/CUDA/registration.h" |
24 | #include "flang/Runtime/entry-names.h" |
25 | #include "mlir/Dialect/DLTI/DLTI.h" |
26 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
27 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
28 | #include "mlir/IR/Value.h" |
29 | #include "mlir/Pass/Pass.h" |
30 | #include "llvm/ADT/SmallVector.h" |
31 | |
32 | namespace fir { |
33 | #define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE |
34 | #include "flang/Optimizer/Transforms/Passes.h.inc" |
35 | } // namespace fir |
36 | |
37 | using namespace Fortran::runtime::cuda; |
38 | |
39 | namespace { |
40 | |
41 | struct CUFComputeSharedMemoryOffsetsAndSize |
42 | : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase< |
43 | CUFComputeSharedMemoryOffsetsAndSize> { |
44 | |
45 | void runOnOperation() override { |
46 | mlir::ModuleOp mod = getOperation(); |
47 | mlir::SymbolTable symTab(mod); |
48 | mlir::OpBuilder opBuilder{mod.getBodyRegion()}; |
49 | fir::FirOpBuilder builder(opBuilder, mod); |
50 | fir::KindMapping kindMap{fir::getKindMapping(mod)}; |
51 | std::optional<mlir::DataLayout> dl = |
52 | fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false); |
53 | if (!dl) { |
54 | mlir::emitError(mod.getLoc(), |
55 | "data layout attribute is required to perform " + |
56 | getName() + "pass" ); |
57 | } |
58 | |
59 | auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab); |
60 | mlir::Type i8Ty = builder.getI8Type(); |
61 | mlir::Type i32Ty = builder.getI32Type(); |
62 | mlir::Type idxTy = builder.getIndexType(); |
63 | for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) { |
64 | unsigned nbDynamicSharedVariables = 0; |
65 | unsigned nbStaticSharedVariables = 0; |
66 | uint64_t sharedMemSize = 0; |
67 | unsigned short alignment = 0; |
68 | mlir::Value crtDynOffset; |
69 | |
70 | // Go over each shared memory operation and compute their start offset and |
71 | // the size and alignment of the global to be generated if all variables |
72 | // are static. If this is dynamic shared memory, then only the alignment |
73 | // is computed. |
74 | for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) { |
75 | mlir::Location loc = sharedOp.getLoc(); |
76 | builder.setInsertionPoint(sharedOp); |
77 | if (fir::hasDynamicSize(sharedOp.getInType())) { |
78 | mlir::Type ty = sharedOp.getInType(); |
79 | if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty)) |
80 | ty = seqTy.getEleTy(); |
81 | unsigned short align = dl->getTypeABIAlignment(ty); |
82 | alignment = std::max(alignment, align); |
83 | uint64_t tySize = dl->getTypeSize(ty); |
84 | ++nbDynamicSharedVariables; |
85 | if (crtDynOffset) { |
86 | sharedOp.getOffsetMutable().assign( |
87 | builder.createConvert(loc, i32Ty, crtDynOffset)); |
88 | } else { |
89 | mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0); |
90 | sharedOp.getOffsetMutable().assign(zero); |
91 | } |
92 | |
93 | mlir::Value dynSize = |
94 | builder.createIntegerConstant(loc, idxTy, tySize); |
95 | for (auto extent : sharedOp.getShape()) |
96 | dynSize = builder.create<mlir::arith::MulIOp>(loc, dynSize, extent); |
97 | if (crtDynOffset) |
98 | crtDynOffset = |
99 | builder.create<mlir::arith::AddIOp>(loc, crtDynOffset, dynSize); |
100 | else |
101 | crtDynOffset = dynSize; |
102 | |
103 | continue; |
104 | } |
105 | auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash( |
106 | sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap); |
107 | ++nbStaticSharedVariables; |
108 | mlir::Value offset = builder.createIntegerConstant( |
109 | loc, i32Ty, llvm::alignTo(sharedMemSize, align)); |
110 | sharedOp.getOffsetMutable().assign(offset); |
111 | sharedMemSize = |
112 | llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align); |
113 | alignment = std::max(alignment, align); |
114 | } |
115 | |
116 | if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0) |
117 | continue; |
118 | |
119 | if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0) |
120 | mlir::emitError( |
121 | funcOp.getLoc(), |
122 | "static and dynamic shared variables in a single kernel" ); |
123 | |
124 | mlir::DenseElementsAttr init = {}; |
125 | if (sharedMemSize > 0) { |
126 | auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty); |
127 | mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0); |
128 | init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero)); |
129 | } |
130 | |
131 | // Create the shared memory global where each shared variable will point |
132 | // to. |
133 | auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty); |
134 | std::string sharedMemGlobalName = |
135 | (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str(); |
136 | mlir::StringAttr linkage = builder.createInternalLinkage(); |
137 | builder.setInsertionPointToEnd(gpuMod.getBody()); |
138 | llvm::SmallVector<mlir::NamedAttribute> attrs; |
139 | auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(), |
140 | gpuMod.getContext()); |
141 | attrs.push_back(mlir::NamedAttribute( |
142 | fir::GlobalOp::getDataAttrAttrName(globalOpName), |
143 | cuf::DataAttributeAttr::get(gpuMod.getContext(), |
144 | cuf::DataAttribute::Shared))); |
145 | auto sharedMem = builder.create<fir::GlobalOp>( |
146 | funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType, |
147 | init, linkage, attrs); |
148 | sharedMem.setAlignment(alignment); |
149 | } |
150 | } |
151 | }; |
152 | |
153 | } // end anonymous namespace |
154 | |