CUFComputeSharedMemoryOffsetsAndSize.cpp source code [flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp]

1	//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "flang/Optimizer/Builder/BoxValue.h"
10	#include "flang/Optimizer/Builder/CUFCommon.h"
11	#include "flang/Optimizer/Builder/FIRBuilder.h"
12	#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
13	#include "flang/Optimizer/Builder/Todo.h"
14	#include "flang/Optimizer/CodeGen/Target.h"
15	#include "flang/Optimizer/CodeGen/TypeConverter.h"
16	#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
17	#include "flang/Optimizer/Dialect/FIRAttr.h"
18	#include "flang/Optimizer/Dialect/FIRDialect.h"
19	#include "flang/Optimizer/Dialect/FIROps.h"
20	#include "flang/Optimizer/Dialect/FIROpsSupport.h"
21	#include "flang/Optimizer/Dialect/FIRType.h"
22	#include "flang/Optimizer/Support/DataLayout.h"
23	#include "flang/Runtime/CUDA/registration.h"
24	#include "flang/Runtime/entry-names.h"
25	#include "mlir/Dialect/DLTI/DLTI.h"
26	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
27	#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
28	#include "mlir/IR/Value.h"
29	#include "mlir/Pass/Pass.h"
30	#include "llvm/ADT/SmallVector.h"
31
32	namespace fir {
33	#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
34	#include "flang/Optimizer/Transforms/Passes.h.inc"
35	} // namespace fir
36
37	using namespace Fortran::runtime::cuda;
38
39	namespace {
40
41	struct CUFComputeSharedMemoryOffsetsAndSize
42	: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
43	CUFComputeSharedMemoryOffsetsAndSize> {
44
45	void runOnOperation() override {
46	mlir::ModuleOp mod = getOperation();
47	mlir::SymbolTable symTab(mod);
48	mlir::OpBuilder opBuilder{mod.getBodyRegion()};
49	fir::FirOpBuilder builder(opBuilder, mod);
50	fir::KindMapping kindMap{fir::getKindMapping(mod)};
51	std::optional<mlir::DataLayout> dl =
52	fir::support::getOrSetMLIRDataLayout(mod, /allowDefaultLayout=/false);
53	if (!dl) {
54	mlir::emitError(mod.getLoc(),
55	"data layout attribute is required to perform " +
56	getName() + "pass");
57	}
58
59	auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
60	mlir::Type i8Ty = builder.getI8Type();
61	mlir::Type i32Ty = builder.getI32Type();
62	mlir::Type idxTy = builder.getIndexType();
63	for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
64	unsigned nbDynamicSharedVariables = `0`;
65	unsigned nbStaticSharedVariables = `0`;
66	uint64_t sharedMemSize = `0`;
67	unsigned short alignment = `0`;
68	mlir::Value crtDynOffset;
69
70	// Go over each shared memory operation and compute their start offset and
71	// the size and alignment of the global to be generated if all variables
72	// are static. If this is dynamic shared memory, then only the alignment
73	// is computed.
74	for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
75	mlir::Location loc = sharedOp.getLoc();
76	builder.setInsertionPoint(sharedOp);
77	if (fir::hasDynamicSize(sharedOp.getInType())) {
78	mlir::Type ty = sharedOp.getInType();
79	if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
80	ty = seqTy.getEleTy();
81	unsigned short align = dl->getTypeABIAlignment(ty);
82	alignment = std::max(alignment, align);
83	uint64_t tySize = dl->getTypeSize(ty);
84	++nbDynamicSharedVariables;
85	if (crtDynOffset) {
86	sharedOp.getOffsetMutable().assign(
87	builder.createConvert(loc, i32Ty, crtDynOffset));
88	} else {
89	mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, `0`);
90	sharedOp.getOffsetMutable().assign(zero);
91	}
92
93	mlir::Value dynSize =
94	builder.createIntegerConstant(loc, idxTy, tySize);
95	for (auto extent : sharedOp.getShape())
96	dynSize = builder.create<mlir::arith::MulIOp>(loc, dynSize, extent);
97	if (crtDynOffset)
98	crtDynOffset =
99	builder.create<mlir::arith::AddIOp>(loc, crtDynOffset, dynSize);
100	else
101	crtDynOffset = dynSize;
102
103	continue;
104	}
105	auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
106	sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
107	++nbStaticSharedVariables;
108	mlir::Value offset = builder.createIntegerConstant(
109	loc, i32Ty, llvm::alignTo(sharedMemSize, align));
110	sharedOp.getOffsetMutable().assign(offset);
111	sharedMemSize =
112	llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
113	alignment = std::max(alignment, align);
114	}
115
116	if (nbDynamicSharedVariables == `0` && nbStaticSharedVariables == `0`)
117	continue;
118
119	if (nbDynamicSharedVariables > `0` && nbStaticSharedVariables > `0`)
120	mlir::emitError(
121	funcOp.getLoc(),
122	"static and dynamic shared variables in a single kernel");
123
124	mlir::DenseElementsAttr init = {};
125	if (sharedMemSize > `0`) {
126	auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
127	mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, `0`);
128	init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
129	}
130
131	// Create the shared memory global where each shared variable will point
132	// to.
133	auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
134	std::string sharedMemGlobalName =
135	(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
136	mlir::StringAttr linkage = builder.createInternalLinkage();
137	builder.setInsertionPointToEnd(gpuMod.getBody());
138	llvm::SmallVector<mlir::NamedAttribute> attrs;
139	auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
140	gpuMod.getContext());
141	attrs.push_back(mlir::NamedAttribute(
142	fir::GlobalOp::getDataAttrAttrName(globalOpName),
143	cuf::DataAttributeAttr::get(gpuMod.getContext(),
144	cuf::DataAttribute::Shared)));
145	auto sharedMem = builder.create<fir::GlobalOp>(
146	funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
147	init, linkage, attrs);
148	sharedMem.setAlignment(alignment);
149	}
150	}
151	};
152
153	} // end anonymous namespace
154

source code of flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp