1//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass eliminates local data store, LDS, uses from non-kernel functions.
10// LDS is contiguous memory allocated per kernel execution.
11//
12// Background.
13//
14// The programming model is global variables, or equivalently function local
15// static variables, accessible from kernels or other functions. For uses from
16// kernels this is straightforward - assign an integer to the kernel for the
17// memory required by all the variables combined, allocate them within that.
18// For uses from functions there are performance tradeoffs to choose between.
19//
20// This model means the GPU runtime can specify the amount of memory allocated.
21// If this is more than the kernel assumed, the excess can be made available
22// using a language specific feature, which IR represents as a variable with
23// no initializer. This feature is referred to here as "Dynamic LDS" and is
24// lowered slightly differently to the normal case.
25//
26// Consequences of this GPU feature:
27// - memory is limited and exceeding it halts compilation
28// - a global accessed by one kernel exists independent of other kernels
29// - a global exists independent of simultaneous execution of the same kernel
30// - the address of the global may be different from different kernels as they
31// do not alias, which permits only allocating variables they use
32// - if the address is allowed to differ, functions need help to find it
33//
34// Uses from kernels are implemented here by grouping them in a per-kernel
35// struct instance. This duplicates the variables, accurately modelling their
36// aliasing properties relative to a single global representation. It also
37// permits control over alignment via padding.
38//
39// Uses from functions are more complicated and the primary purpose of this
40// IR pass. Several different lowering are chosen between to meet requirements
41// to avoid allocating any LDS where it is not necessary, as that impacts
42// occupancy and may fail the compilation, while not imposing overhead on a
43// feature whose primary advantage over global memory is performance. The basic
44// design goal is to avoid one kernel imposing overhead on another.
45//
46// Implementation.
47//
48// LDS variables with constant annotation or non-undef initializer are passed
49// through unchanged for simplification or error diagnostics in later passes.
50// Non-undef initializers are not yet implemented for LDS.
51//
52// LDS variables that are always allocated at the same address can be found
53// by lookup at that address. Otherwise runtime information/cost is required.
54//
55// The simplest strategy possible is to group all LDS variables in a single
56// struct and allocate that struct in every kernel such that the original
57// variables are always at the same address. LDS is however a limited resource
58// so this strategy is unusable in practice. It is not implemented here.
59//
60// Strategy | Precise allocation | Zero runtime cost | General purpose |
61// --------+--------------------+-------------------+-----------------+
62// Module | No | Yes | Yes |
63// Table | Yes | No | Yes |
64// Kernel | Yes | Yes | No |
65// Hybrid | Yes | Partial | Yes |
66//
67// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
68// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
69// for variables that are known reachable from a single kernel. "Hybrid" picks
70// between all three. When forced to choose between LDS and cycles we minimise
71// LDS use.
72
73// The "module" lowering implemented here finds LDS variables which are used by
74// non-kernel functions and creates a new struct with a field for each of those
75// LDS variables. Variables that are only used from kernels are excluded.
76//
77// The "table" lowering implemented here has three components.
78// First kernels are assigned a unique integer identifier which is available in
79// functions it calls through the intrinsic amdgcn_lds_kernel_id. The integer
80// is passed through a specific SGPR, thus works with indirect calls.
81// Second, each kernel allocates LDS variables independent of other kernels and
82// writes the addresses it chose for each variable into an array in consistent
83// order. If the kernel does not allocate a given variable, it writes undef to
84// the corresponding array location. These arrays are written to a constant
85// table in the order matching the kernel unique integer identifier.
86// Third, uses from non-kernel functions are replaced with a table lookup using
87// the intrinsic function to find the address of the variable.
88//
89// "Kernel" lowering is only applicable for variables that are unambiguously
90// reachable from exactly one kernel. For those cases, accesses to the variable
91// can be lowered to ConstantExpr address of a struct instance specific to that
92// one kernel. This is zero cost in space and in compute. It will raise a fatal
93// error on any variable that might be reachable from multiple kernels and is
94// thus most easily used as part of the hybrid lowering strategy.
95//
96// Hybrid lowering is a mixture of the above. It uses the zero cost kernel
97// lowering where it can. It lowers the variable accessed by the greatest
98// number of kernels using the module strategy as that is free for the first
99// variable. Any futher variables that can be lowered with the module strategy
100// without incurring LDS memory overhead are. The remaining ones are lowered
101// via table.
102//
103// Consequences
104// - No heuristics or user controlled magic numbers, hybrid is the right choice
105// - Kernels that don't use functions (or have had them all inlined) are not
106// affected by any lowering for kernels that do.
107// - Kernels that don't make indirect function calls are not affected by those
108// that do.
109// - Variables which are used by lots of kernels, e.g. those injected by a
110// language runtime in most kernels, are expected to have no overhead
111// - Implementations that instantiate templates per-kernel where those templates
112// use LDS are expected to hit the "Kernel" lowering strategy
113// - The runtime properties impose a cost in compiler implementation complexity
114//
115// Dynamic LDS implementation
116// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
117// same intrinsic to identify which kernel is at the root of the dynamic call
118// graph. This relies on the specified behaviour that all dynamic LDS variables
119// alias one another, i.e. are at the same address, with respect to a given
120// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
121// that allocates any dynamic LDS and builds a table of addresses out of those.
122// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
123// The corresponding optimisation for "kernel" lowering where the table lookup
124// is elided is not implemented.
125//
126//
127// Implementation notes / limitations
128// A single LDS global variable represents an instance per kernel that can reach
129// said variables. This pass essentially specialises said variables per kernel.
130// Handling ConstantExpr during the pass complicated this significantly so now
131// all ConstantExpr uses of LDS variables are expanded to instructions. This
132// may need amending when implementing non-undef initialisers.
133//
134// Lowering is split between this IR pass and the back end. This pass chooses
135// where given variables should be allocated and marks them with metadata,
136// MD_absolute_symbol. The backend places the variables in coincidentally the
137// same location and raises a fatal error if something has gone awry. This works
138// in practice because the only pass between this one and the backend that
139// changes LDS is PromoteAlloca and the changes it makes do not conflict.
140//
141// Addresses are written to constant global arrays based on the same metadata.
142//
143// The backend lowers LDS variables in the order of traversal of the function.
144// This is at odds with the deterministic layout required. The workaround is to
145// allocate the fixed-address variables immediately upon starting the function
146// where they can be placed as intended. This requires a means of mapping from
147// the function to the variables that it allocates. For the module scope lds,
148// this is via metadata indicating whether the variable is not required. If a
149// pass deletes that metadata, a fatal error on disagreement with the absolute
150// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
151// correspondence between the function and the variable. It requires the
152// kernel to have a name (which is only a limitation for tests in practice) and
153// for nothing to rename the corresponding symbols. This is a hazard if the pass
154// is run multiple times during debugging. Alternative schemes considered all
155// involve bespoke metadata.
156//
157// If the name correspondence can be replaced, multiple distinct kernels that
158// have the same memory layout can map to the same kernel id (as the address
159// itself is handled by the absolute symbol metadata) and that will allow more
160// uses of the "kernel" style faster lowering and reduce the size of the lookup
161// tables.
162//
163// There is a test that checks this does not fire for a graphics shader. This
164// lowering is expected to work for graphics if the isKernel test is changed.
165//
166// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
167// before codegen. Replacing this with an equivalent intrinsic which lasts until
168// shortly after the machine function lowering of LDS would help break the name
169// mapping. The other part needed is probably to amend PromoteAlloca to embed
170// the LDS variables it creates in the same struct created here. That avoids the
171// current hazard where a PromoteAlloca LDS variable might be allocated before
172// the kernel scope (and thus error on the address check). Given a new invariant
173// that no LDS variables exist outside of the structs managed here, and an
174// intrinsic that lasts until after the LDS frame lowering, it should be
175// possible to drop the name mapping and fold equivalent memory layouts.
176//
177//===----------------------------------------------------------------------===//
178
179#include "AMDGPU.h"
180#include "AMDGPUTargetMachine.h"
181#include "Utils/AMDGPUBaseInfo.h"
182#include "Utils/AMDGPUMemoryUtils.h"
183#include "llvm/ADT/BitVector.h"
184#include "llvm/ADT/DenseMap.h"
185#include "llvm/ADT/DenseSet.h"
186#include "llvm/ADT/STLExtras.h"
187#include "llvm/ADT/SetOperations.h"
188#include "llvm/Analysis/CallGraph.h"
189#include "llvm/CodeGen/TargetPassConfig.h"
190#include "llvm/IR/Constants.h"
191#include "llvm/IR/DerivedTypes.h"
192#include "llvm/IR/IRBuilder.h"
193#include "llvm/IR/InlineAsm.h"
194#include "llvm/IR/Instructions.h"
195#include "llvm/IR/IntrinsicsAMDGPU.h"
196#include "llvm/IR/MDBuilder.h"
197#include "llvm/IR/ReplaceConstant.h"
198#include "llvm/InitializePasses.h"
199#include "llvm/Pass.h"
200#include "llvm/Support/CommandLine.h"
201#include "llvm/Support/Debug.h"
202#include "llvm/Support/Format.h"
203#include "llvm/Support/OptimizedStructLayout.h"
204#include "llvm/Support/raw_ostream.h"
205#include "llvm/Transforms/Utils/BasicBlockUtils.h"
206#include "llvm/Transforms/Utils/ModuleUtils.h"
207
208#include <vector>
209
210#include <cstdio>
211
212#define DEBUG_TYPE "amdgpu-lower-module-lds"
213
214using namespace llvm;
215
216namespace {
217
218cl::opt<bool> SuperAlignLDSGlobals(
219 "amdgpu-super-align-lds-globals",
220 cl::desc("Increase alignment of LDS if it is not on align boundary"),
221 cl::init(Val: true), cl::Hidden);
222
223enum class LoweringKind { module, table, kernel, hybrid };
224cl::opt<LoweringKind> LoweringKindLoc(
225 "amdgpu-lower-module-lds-strategy",
226 cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
227 cl::init(Val: LoweringKind::hybrid),
228 cl::values(
229 clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
230 clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
231 clEnumValN(
232 LoweringKind::kernel, "kernel",
233 "Lower variables reachable from one kernel, otherwise abort"),
234 clEnumValN(LoweringKind::hybrid, "hybrid",
235 "Lower via mixture of above strategies")));
236
237bool isKernelLDS(const Function *F) {
238 // Some weirdness here. AMDGPU::isKernelCC does not call into
239 // AMDGPU::isKernel with the calling conv, it instead calls into
240 // isModuleEntryFunction which returns true for more calling conventions
241 // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
242 // There's also a test that checks that the LDS lowering does not hit on
243 // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
244 // Putting LDS in the name of the function to draw attention to this.
245 return AMDGPU::isKernel(CC: F->getCallingConv());
246}
247
248template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
249 llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
250 return L->getName() < R->getName();
251 });
252 return {std::move(V)};
253}
254
255class AMDGPULowerModuleLDS {
256 const AMDGPUTargetMachine &TM;
257
258 static void
259 removeLocalVarsFromUsedLists(Module &M,
260 const DenseSet<GlobalVariable *> &LocalVars) {
261 // The verifier rejects used lists containing an inttoptr of a constant
262 // so remove the variables from these lists before replaceAllUsesWith
263 SmallPtrSet<Constant *, 8> LocalVarsSet;
264 for (GlobalVariable *LocalVar : LocalVars)
265 LocalVarsSet.insert(Ptr: cast<Constant>(Val: LocalVar->stripPointerCasts()));
266
267 removeFromUsedLists(
268 M, ShouldRemove: [&LocalVarsSet](Constant *C) { return LocalVarsSet.count(Ptr: C); });
269
270 for (GlobalVariable *LocalVar : LocalVars)
271 LocalVar->removeDeadConstantUsers();
272 }
273
274 static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
275 // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
276 // that might call a function which accesses a field within it. This is
277 // presently approximated to 'all kernels' if there are any such functions
278 // in the module. This implicit use is redefined as an explicit use here so
279 // that later passes, specifically PromoteAlloca, account for the required
280 // memory without any knowledge of this transform.
281
282 // An operand bundle on llvm.donothing works because the call instruction
283 // survives until after the last pass that needs to account for LDS. It is
284 // better than inline asm as the latter survives until the end of codegen. A
285 // totally robust solution would be a function with the same semantics as
286 // llvm.donothing that takes a pointer to the instance and is lowered to a
287 // no-op after LDS is allocated, but that is not presently necessary.
288
289 // This intrinsic is eliminated shortly before instruction selection. It
290 // does not suffice to indicate to ISel that a given global which is not
291 // immediately used by the kernel must still be allocated by it. An
292 // equivalent target specific intrinsic which lasts until immediately after
293 // codegen would suffice for that, but one would still need to ensure that
294 // the variables are allocated in the anticpated order.
295 BasicBlock *Entry = &Func->getEntryBlock();
296 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
297
298 Function *Decl =
299 Intrinsic::getDeclaration(M: Func->getParent(), Intrinsic::id: donothing, Tys: {});
300
301 Value *UseInstance[1] = {
302 Builder.CreateConstInBoundsGEP1_32(Ty: SGV->getValueType(), Ptr: SGV, Idx0: 0)};
303
304 Builder.CreateCall(
305 Callee: Decl, Args: {}, OpBundles: {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
306 }
307
308 static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
309 // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
310 // global may have uses from multiple different functions as a result.
311 // This pass specialises LDS variables with respect to the kernel that
312 // allocates them.
313
314 // This is semantically equivalent to (the unimplemented as slow):
315 // for (auto &F : M.functions())
316 // for (auto &BB : F)
317 // for (auto &I : BB)
318 // for (Use &Op : I.operands())
319 // if (constantExprUsesLDS(Op))
320 // replaceConstantExprInFunction(I, Op);
321
322 SmallVector<Constant *> LDSGlobals;
323 for (auto &GV : M.globals())
324 if (AMDGPU::isLDSVariableToLower(GV))
325 LDSGlobals.push_back(Elt: &GV);
326
327 return convertUsersOfConstantsToInstructions(Consts: LDSGlobals);
328 }
329
330public:
331 AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
332
333 using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
334
335 using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
336
337 static void getUsesOfLDSByFunction(CallGraph const &CG, Module &M,
338 FunctionVariableMap &kernels,
339 FunctionVariableMap &functions) {
340
341 // Get uses from the current function, excluding uses by called functions
342 // Two output variables to avoid walking the globals list twice
343 for (auto &GV : M.globals()) {
344 if (!AMDGPU::isLDSVariableToLower(GV)) {
345 continue;
346 }
347
348 for (User *V : GV.users()) {
349 if (auto *I = dyn_cast<Instruction>(Val: V)) {
350 Function *F = I->getFunction();
351 if (isKernelLDS(F)) {
352 kernels[F].insert(V: &GV);
353 } else {
354 functions[F].insert(V: &GV);
355 }
356 }
357 }
358 }
359 }
360
361 struct LDSUsesInfoTy {
362 FunctionVariableMap direct_access;
363 FunctionVariableMap indirect_access;
364 };
365
366 static LDSUsesInfoTy getTransitiveUsesOfLDS(CallGraph const &CG, Module &M) {
367
368 FunctionVariableMap direct_map_kernel;
369 FunctionVariableMap direct_map_function;
370 getUsesOfLDSByFunction(CG, M, kernels&: direct_map_kernel, functions&: direct_map_function);
371
372 // Collect variables that are used by functions whose address has escaped
373 DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
374 for (Function &F : M.functions()) {
375 if (!isKernelLDS(F: &F))
376 if (F.hasAddressTaken(nullptr,
377 /* IgnoreCallbackUses */ false,
378 /* IgnoreAssumeLikeCalls */ false,
379 /* IgnoreLLVMUsed */ IngoreLLVMUsed: true,
380 /* IgnoreArcAttachedCall */ IgnoreARCAttachedCall: false)) {
381 set_union(S1&: VariablesReachableThroughFunctionPointer,
382 S2: direct_map_function[&F]);
383 }
384 }
385
386 auto functionMakesUnknownCall = [&](const Function *F) -> bool {
387 assert(!F->isDeclaration());
388 for (const CallGraphNode::CallRecord &R : *CG[F]) {
389 if (!R.second->getFunction()) {
390 return true;
391 }
392 }
393 return false;
394 };
395
396 // Work out which variables are reachable through function calls
397 FunctionVariableMap transitive_map_function = direct_map_function;
398
399 // If the function makes any unknown call, assume the worst case that it can
400 // access all variables accessed by functions whose address escaped
401 for (Function &F : M.functions()) {
402 if (!F.isDeclaration() && functionMakesUnknownCall(&F)) {
403 if (!isKernelLDS(F: &F)) {
404 set_union(S1&: transitive_map_function[&F],
405 S2: VariablesReachableThroughFunctionPointer);
406 }
407 }
408 }
409
410 // Direct implementation of collecting all variables reachable from each
411 // function
412 for (Function &Func : M.functions()) {
413 if (Func.isDeclaration() || isKernelLDS(F: &Func))
414 continue;
415
416 DenseSet<Function *> seen; // catches cycles
417 SmallVector<Function *, 4> wip{&Func};
418
419 while (!wip.empty()) {
420 Function *F = wip.pop_back_val();
421
422 // Can accelerate this by referring to transitive map for functions that
423 // have already been computed, with more care than this
424 set_union(S1&: transitive_map_function[&Func], S2: direct_map_function[F]);
425
426 for (const CallGraphNode::CallRecord &R : *CG[F]) {
427 Function *ith = R.second->getFunction();
428 if (ith) {
429 if (!seen.contains(V: ith)) {
430 seen.insert(V: ith);
431 wip.push_back(Elt: ith);
432 }
433 }
434 }
435 }
436 }
437
438 // direct_map_kernel lists which variables are used by the kernel
439 // find the variables which are used through a function call
440 FunctionVariableMap indirect_map_kernel;
441
442 for (Function &Func : M.functions()) {
443 if (Func.isDeclaration() || !isKernelLDS(F: &Func))
444 continue;
445
446 for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
447 Function *ith = R.second->getFunction();
448 if (ith) {
449 set_union(S1&: indirect_map_kernel[&Func], S2: transitive_map_function[ith]);
450 } else {
451 set_union(S1&: indirect_map_kernel[&Func],
452 S2: VariablesReachableThroughFunctionPointer);
453 }
454 }
455 }
456
457 // Verify that we fall into one of 2 cases:
458 // - All variables are absolute: this is a re-run of the pass
459 // so we don't have anything to do.
460 // - No variables are absolute.
461 std::optional<bool> HasAbsoluteGVs;
462 for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
463 for (auto &[Fn, GVs] : Map) {
464 for (auto *GV : GVs) {
465 bool IsAbsolute = GV->isAbsoluteSymbolRef();
466 if (HasAbsoluteGVs.has_value()) {
467 if (*HasAbsoluteGVs != IsAbsolute) {
468 report_fatal_error(
469 reason: "Module cannot mix absolute and non-absolute LDS GVs");
470 }
471 } else
472 HasAbsoluteGVs = IsAbsolute;
473 }
474 }
475 }
476
477 // If we only had absolute GVs, we have nothing to do, return an empty
478 // result.
479 if (HasAbsoluteGVs && *HasAbsoluteGVs)
480 return {.direct_access: FunctionVariableMap(), .indirect_access: FunctionVariableMap()};
481
482 return {.direct_access: std::move(direct_map_kernel), .indirect_access: std::move(indirect_map_kernel)};
483 }
484
485 struct LDSVariableReplacement {
486 GlobalVariable *SGV = nullptr;
487 DenseMap<GlobalVariable *, Constant *> LDSVarsToConstantGEP;
488 };
489
490 // remap from lds global to a constantexpr gep to where it has been moved to
491 // for each kernel
492 // an array with an element for each kernel containing where the corresponding
493 // variable was remapped to
494
495 static Constant *getAddressesOfVariablesInKernel(
496 LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
497 const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
498 // Create a ConstantArray containing the address of each Variable within the
499 // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
500 // does not allocate it
501 // TODO: Drop the ptrtoint conversion
502
503 Type *I32 = Type::getInt32Ty(C&: Ctx);
504
505 ArrayType *KernelOffsetsType = ArrayType::get(ElementType: I32, NumElements: Variables.size());
506
507 SmallVector<Constant *> Elements;
508 for (size_t i = 0; i < Variables.size(); i++) {
509 GlobalVariable *GV = Variables[i];
510 auto ConstantGepIt = LDSVarsToConstantGEP.find(Val: GV);
511 if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
512 auto elt = ConstantExpr::getPtrToInt(C: ConstantGepIt->second, Ty: I32);
513 Elements.push_back(Elt: elt);
514 } else {
515 Elements.push_back(Elt: PoisonValue::get(T: I32));
516 }
517 }
518 return ConstantArray::get(T: KernelOffsetsType, V: Elements);
519 }
520
521 static GlobalVariable *buildLookupTable(
522 Module &M, ArrayRef<GlobalVariable *> Variables,
523 ArrayRef<Function *> kernels,
524 DenseMap<Function *, LDSVariableReplacement> &KernelToReplacement) {
525 if (Variables.empty()) {
526 return nullptr;
527 }
528 LLVMContext &Ctx = M.getContext();
529
530 const size_t NumberVariables = Variables.size();
531 const size_t NumberKernels = kernels.size();
532
533 ArrayType *KernelOffsetsType =
534 ArrayType::get(ElementType: Type::getInt32Ty(C&: Ctx), NumElements: NumberVariables);
535
536 ArrayType *AllKernelsOffsetsType =
537 ArrayType::get(ElementType: KernelOffsetsType, NumElements: NumberKernels);
538
539 Constant *Missing = PoisonValue::get(T: KernelOffsetsType);
540 std::vector<Constant *> overallConstantExprElts(NumberKernels);
541 for (size_t i = 0; i < NumberKernels; i++) {
542 auto Replacement = KernelToReplacement.find(Val: kernels[i]);
543 overallConstantExprElts[i] =
544 (Replacement == KernelToReplacement.end())
545 ? Missing
546 : getAddressesOfVariablesInKernel(
547 Ctx, Variables, LDSVarsToConstantGEP: Replacement->second.LDSVarsToConstantGEP);
548 }
549
550 Constant *init =
551 ConstantArray::get(T: AllKernelsOffsetsType, V: overallConstantExprElts);
552
553 return new GlobalVariable(
554 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
555 "llvm.amdgcn.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
556 AMDGPUAS::CONSTANT_ADDRESS);
557 }
558
559 void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
560 GlobalVariable *LookupTable,
561 GlobalVariable *GV, Use &U,
562 Value *OptionalIndex) {
563 // Table is a constant array of the same length as OrderedKernels
564 LLVMContext &Ctx = M.getContext();
565 Type *I32 = Type::getInt32Ty(C&: Ctx);
566 auto *I = cast<Instruction>(Val: U.getUser());
567
568 Value *tableKernelIndex = getTableLookupKernelIndex(M, F: I->getFunction());
569
570 if (auto *Phi = dyn_cast<PHINode>(Val: I)) {
571 BasicBlock *BB = Phi->getIncomingBlock(U);
572 Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
573 } else {
574 Builder.SetInsertPoint(I);
575 }
576
577 SmallVector<Value *, 3> GEPIdx = {
578 ConstantInt::get(Ty: I32, V: 0),
579 tableKernelIndex,
580 };
581 if (OptionalIndex)
582 GEPIdx.push_back(Elt: OptionalIndex);
583
584 Value *Address = Builder.CreateInBoundsGEP(
585 Ty: LookupTable->getValueType(), Ptr: LookupTable, IdxList: GEPIdx, Name: GV->getName());
586
587 Value *loaded = Builder.CreateLoad(Ty: I32, Ptr: Address);
588
589 Value *replacement =
590 Builder.CreateIntToPtr(V: loaded, DestTy: GV->getType(), Name: GV->getName());
591
592 U.set(replacement);
593 }
594
595 void replaceUsesInInstructionsWithTableLookup(
596 Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
597 GlobalVariable *LookupTable) {
598
599 LLVMContext &Ctx = M.getContext();
600 IRBuilder<> Builder(Ctx);
601 Type *I32 = Type::getInt32Ty(C&: Ctx);
602
603 for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
604 auto *GV = ModuleScopeVariables[Index];
605
606 for (Use &U : make_early_inc_range(Range: GV->uses())) {
607 auto *I = dyn_cast<Instruction>(Val: U.getUser());
608 if (!I)
609 continue;
610
611 replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
612 OptionalIndex: ConstantInt::get(Ty: I32, V: Index));
613 }
614 }
615 }
616
617 static DenseSet<Function *> kernelsThatIndirectlyAccessAnyOfPassedVariables(
618 Module &M, LDSUsesInfoTy &LDSUsesInfo,
619 DenseSet<GlobalVariable *> const &VariableSet) {
620
621 DenseSet<Function *> KernelSet;
622
623 if (VariableSet.empty())
624 return KernelSet;
625
626 for (Function &Func : M.functions()) {
627 if (Func.isDeclaration() || !isKernelLDS(F: &Func))
628 continue;
629 for (GlobalVariable *GV : LDSUsesInfo.indirect_access[&Func]) {
630 if (VariableSet.contains(V: GV)) {
631 KernelSet.insert(V: &Func);
632 break;
633 }
634 }
635 }
636
637 return KernelSet;
638 }
639
640 static GlobalVariable *
641 chooseBestVariableForModuleStrategy(const DataLayout &DL,
642 VariableFunctionMap &LDSVars) {
643 // Find the global variable with the most indirect uses from kernels
644
645 struct CandidateTy {
646 GlobalVariable *GV = nullptr;
647 size_t UserCount = 0;
648 size_t Size = 0;
649
650 CandidateTy() = default;
651
652 CandidateTy(GlobalVariable *GV, uint64_t UserCount, uint64_t AllocSize)
653 : GV(GV), UserCount(UserCount), Size(AllocSize) {}
654
655 bool operator<(const CandidateTy &Other) const {
656 // Fewer users makes module scope variable less attractive
657 if (UserCount < Other.UserCount) {
658 return true;
659 }
660 if (UserCount > Other.UserCount) {
661 return false;
662 }
663
664 // Bigger makes module scope variable less attractive
665 if (Size < Other.Size) {
666 return false;
667 }
668
669 if (Size > Other.Size) {
670 return true;
671 }
672
673 // Arbitrary but consistent
674 return GV->getName() < Other.GV->getName();
675 }
676 };
677
678 CandidateTy MostUsed;
679
680 for (auto &K : LDSVars) {
681 GlobalVariable *GV = K.first;
682 if (K.second.size() <= 1) {
683 // A variable reachable by only one kernel is best lowered with kernel
684 // strategy
685 continue;
686 }
687 CandidateTy Candidate(
688 GV, K.second.size(),
689 DL.getTypeAllocSize(Ty: GV->getValueType()).getFixedValue());
690 if (MostUsed < Candidate)
691 MostUsed = Candidate;
692 }
693
694 return MostUsed.GV;
695 }
696
697 static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
698 uint32_t Address) {
699 // Write the specified address into metadata where it can be retrieved by
700 // the assembler. Format is a half open range, [Address Address+1)
701 LLVMContext &Ctx = M->getContext();
702 auto *IntTy =
703 M->getDataLayout().getIntPtrType(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
704 auto *MinC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address));
705 auto *MaxC = ConstantAsMetadata::get(C: ConstantInt::get(Ty: IntTy, V: Address + 1));
706 GV->setMetadata(KindID: LLVMContext::MD_absolute_symbol,
707 Node: MDNode::get(Context&: Ctx, MDs: {MinC, MaxC}));
708 }
709
710 DenseMap<Function *, Value *> tableKernelIndexCache;
711 Value *getTableLookupKernelIndex(Module &M, Function *F) {
712 // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
713 // lowers to a read from a live in register. Emit it once in the entry
714 // block to spare deduplicating it later.
715 auto [It, Inserted] = tableKernelIndexCache.try_emplace(Key: F);
716 if (Inserted) {
717 Function *Decl =
718 Intrinsic::getDeclaration(M: &M, Intrinsic::id: amdgcn_lds_kernel_id, Tys: {});
719
720 auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
721 IRBuilder<> Builder(&*InsertAt);
722
723 It->second = Builder.CreateCall(Callee: Decl, Args: {});
724 }
725
726 return It->second;
727 }
728
729 static std::vector<Function *> assignLDSKernelIDToEachKernel(
730 Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
731 DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
732 // Associate kernels in the set with an arbirary but reproducible order and
733 // annotate them with that order in metadata. This metadata is recognised by
734 // the backend and lowered to a SGPR which can be read from using
735 // amdgcn_lds_kernel_id.
736
737 std::vector<Function *> OrderedKernels;
738 if (!KernelsThatAllocateTableLDS.empty() ||
739 !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
740
741 for (Function &Func : M->functions()) {
742 if (Func.isDeclaration())
743 continue;
744 if (!isKernelLDS(F: &Func))
745 continue;
746
747 if (KernelsThatAllocateTableLDS.contains(V: &Func) ||
748 KernelsThatIndirectlyAllocateDynamicLDS.contains(V: &Func)) {
749 assert(Func.hasName()); // else fatal error earlier
750 OrderedKernels.push_back(x: &Func);
751 }
752 }
753
754 // Put them in an arbitrary but reproducible order
755 OrderedKernels = sortByName(V: std::move(OrderedKernels));
756
757 // Annotate the kernels with their order in this vector
758 LLVMContext &Ctx = M->getContext();
759 IRBuilder<> Builder(Ctx);
760
761 if (OrderedKernels.size() > UINT32_MAX) {
762 // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
763 report_fatal_error(reason: "Unimplemented LDS lowering for > 2**32 kernels");
764 }
765
766 for (size_t i = 0; i < OrderedKernels.size(); i++) {
767 Metadata *AttrMDArgs[1] = {
768 ConstantAsMetadata::get(C: Builder.getInt32(C: i)),
769 };
770 OrderedKernels[i]->setMetadata(Kind: "llvm.amdgcn.lds.kernel.id",
771 Node: MDNode::get(Context&: Ctx, MDs: AttrMDArgs));
772 }
773 }
774 return OrderedKernels;
775 }
776
777 static void partitionVariablesIntoIndirectStrategies(
778 Module &M, LDSUsesInfoTy const &LDSUsesInfo,
779 VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
780 DenseSet<GlobalVariable *> &ModuleScopeVariables,
781 DenseSet<GlobalVariable *> &TableLookupVariables,
782 DenseSet<GlobalVariable *> &KernelAccessVariables,
783 DenseSet<GlobalVariable *> &DynamicVariables) {
784
785 GlobalVariable *HybridModuleRoot =
786 LoweringKindLoc != LoweringKind::hybrid
787 ? nullptr
788 : chooseBestVariableForModuleStrategy(
789 DL: M.getDataLayout(), LDSVars&: LDSToKernelsThatNeedToAccessItIndirectly);
790
791 DenseSet<Function *> const EmptySet;
792 DenseSet<Function *> const &HybridModuleRootKernels =
793 HybridModuleRoot
794 ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
795 : EmptySet;
796
797 for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
798 // Each iteration of this loop assigns exactly one global variable to
799 // exactly one of the implementation strategies.
800
801 GlobalVariable *GV = K.first;
802 assert(AMDGPU::isLDSVariableToLower(*GV));
803 assert(K.second.size() != 0);
804
805 if (AMDGPU::isDynamicLDS(GV: *GV)) {
806 DynamicVariables.insert(V: GV);
807 continue;
808 }
809
810 switch (LoweringKindLoc) {
811 case LoweringKind::module:
812 ModuleScopeVariables.insert(V: GV);
813 break;
814
815 case LoweringKind::table:
816 TableLookupVariables.insert(V: GV);
817 break;
818
819 case LoweringKind::kernel:
820 if (K.second.size() == 1) {
821 KernelAccessVariables.insert(V: GV);
822 } else {
823 report_fatal_error(
824 reason: "cannot lower LDS '" + GV->getName() +
825 "' to kernel access as it is reachable from multiple kernels");
826 }
827 break;
828
829 case LoweringKind::hybrid: {
830 if (GV == HybridModuleRoot) {
831 assert(K.second.size() != 1);
832 ModuleScopeVariables.insert(V: GV);
833 } else if (K.second.size() == 1) {
834 KernelAccessVariables.insert(V: GV);
835 } else if (set_is_subset(S1: K.second, S2: HybridModuleRootKernels)) {
836 ModuleScopeVariables.insert(V: GV);
837 } else {
838 TableLookupVariables.insert(V: GV);
839 }
840 break;
841 }
842 }
843 }
844
845 // All LDS variables accessed indirectly have now been partitioned into
846 // the distinct lowering strategies.
847 assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
848 KernelAccessVariables.size() + DynamicVariables.size() ==
849 LDSToKernelsThatNeedToAccessItIndirectly.size());
850 }
851
852 static GlobalVariable *lowerModuleScopeStructVariables(
853 Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
854 DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
855 // Create a struct to hold the ModuleScopeVariables
856 // Replace all uses of those variables from non-kernel functions with the
857 // new struct instance Replace only the uses from kernel functions that will
858 // allocate this instance. That is a space optimisation - kernels that use a
859 // subset of the module scope struct and do not need to allocate it for
860 // indirect calls will only allocate the subset they use (they do so as part
861 // of the per-kernel lowering).
862 if (ModuleScopeVariables.empty()) {
863 return nullptr;
864 }
865
866 LLVMContext &Ctx = M.getContext();
867
868 LDSVariableReplacement ModuleScopeReplacement =
869 createLDSVariableReplacement(M, VarName: "llvm.amdgcn.module.lds",
870 LDSVarsToTransform: ModuleScopeVariables);
871
872 appendToCompilerUsed(M, Values: {static_cast<GlobalValue *>(
873 ConstantExpr::getPointerBitCastOrAddrSpaceCast(
874 C: cast<Constant>(Val: ModuleScopeReplacement.SGV),
875 Ty: PointerType::getUnqual(C&: Ctx)))});
876
877 // module.lds will be allocated at zero in any kernel that allocates it
878 recordLDSAbsoluteAddress(M: &M, GV: ModuleScopeReplacement.SGV, Address: 0);
879
880 // historic
881 removeLocalVarsFromUsedLists(M, LocalVars: ModuleScopeVariables);
882
883 // Replace all uses of module scope variable from non-kernel functions
884 replaceLDSVariablesWithStruct(
885 M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) {
886 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
887 if (!I) {
888 return false;
889 }
890 Function *F = I->getFunction();
891 return !isKernelLDS(F);
892 });
893
894 // Replace uses of module scope variable from kernel functions that
895 // allocate the module scope variable, otherwise leave them unchanged
896 // Record on each kernel whether the module scope global is used by it
897
898 for (Function &Func : M.functions()) {
899 if (Func.isDeclaration() || !isKernelLDS(F: &Func))
900 continue;
901
902 if (KernelsThatAllocateModuleLDS.contains(V: &Func)) {
903 replaceLDSVariablesWithStruct(
904 M, LDSVarsToTransformArg: ModuleScopeVariables, Replacement: ModuleScopeReplacement, Predicate: [&](Use &U) {
905 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
906 if (!I) {
907 return false;
908 }
909 Function *F = I->getFunction();
910 return F == &Func;
911 });
912
913 markUsedByKernel(Func: &Func, SGV: ModuleScopeReplacement.SGV);
914 }
915 }
916
917 return ModuleScopeReplacement.SGV;
918 }
919
920 static DenseMap<Function *, LDSVariableReplacement>
921 lowerKernelScopeStructVariables(
922 Module &M, LDSUsesInfoTy &LDSUsesInfo,
923 DenseSet<GlobalVariable *> const &ModuleScopeVariables,
924 DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
925 GlobalVariable *MaybeModuleScopeStruct) {
926
927 // Create a struct for each kernel for the non-module-scope variables.
928
929 DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
930 for (Function &Func : M.functions()) {
931 if (Func.isDeclaration() || !isKernelLDS(F: &Func))
932 continue;
933
934 DenseSet<GlobalVariable *> KernelUsedVariables;
935 // Allocating variables that are used directly in this struct to get
936 // alignment aware allocation and predictable frame size.
937 for (auto &v : LDSUsesInfo.direct_access[&Func]) {
938 if (!AMDGPU::isDynamicLDS(GV: *v)) {
939 KernelUsedVariables.insert(V: v);
940 }
941 }
942
943 // Allocating variables that are accessed indirectly so that a lookup of
944 // this struct instance can find them from nested functions.
945 for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
946 if (!AMDGPU::isDynamicLDS(GV: *v)) {
947 KernelUsedVariables.insert(V: v);
948 }
949 }
950
951 // Variables allocated in module lds must all resolve to that struct,
952 // not to the per-kernel instance.
953 if (KernelsThatAllocateModuleLDS.contains(V: &Func)) {
954 for (GlobalVariable *v : ModuleScopeVariables) {
955 KernelUsedVariables.erase(V: v);
956 }
957 }
958
959 if (KernelUsedVariables.empty()) {
960 // Either used no LDS, or the LDS it used was all in the module struct
961 // or dynamically sized
962 continue;
963 }
964
965 // The association between kernel function and LDS struct is done by
966 // symbol name, which only works if the function in question has a
967 // name This is not expected to be a problem in practice as kernels
968 // are called by name making anonymous ones (which are named by the
969 // backend) difficult to use. This does mean that llvm test cases need
970 // to name the kernels.
971 if (!Func.hasName()) {
972 report_fatal_error(reason: "Anonymous kernels cannot use LDS variables");
973 }
974
975 std::string VarName =
976 (Twine("llvm.amdgcn.kernel.") + Func.getName() + ".lds").str();
977
978 auto Replacement =
979 createLDSVariableReplacement(M, VarName, LDSVarsToTransform: KernelUsedVariables);
980
981 // If any indirect uses, create a direct use to ensure allocation
982 // TODO: Simpler to unconditionally mark used but that regresses
983 // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
984 auto Accesses = LDSUsesInfo.indirect_access.find(Val: &Func);
985 if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
986 !Accesses->second.empty())
987 markUsedByKernel(Func: &Func, SGV: Replacement.SGV);
988
989 // remove preserves existing codegen
990 removeLocalVarsFromUsedLists(M, LocalVars: KernelUsedVariables);
991 KernelToReplacement[&Func] = Replacement;
992
993 // Rewrite uses within kernel to the new struct
994 replaceLDSVariablesWithStruct(
995 M, LDSVarsToTransformArg: KernelUsedVariables, Replacement, Predicate: [&Func](Use &U) {
996 Instruction *I = dyn_cast<Instruction>(Val: U.getUser());
997 return I && I->getFunction() == &Func;
998 });
999 }
1000 return KernelToReplacement;
1001 }
1002
1003 static GlobalVariable *
1004 buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
1005 Function *func) {
1006 // Create a dynamic lds variable with a name associated with the passed
1007 // function that has the maximum alignment of any dynamic lds variable
1008 // reachable from this kernel. Dynamic LDS is allocated after the static LDS
1009 // allocation, possibly after alignment padding. The representative variable
1010 // created here has the maximum alignment of any other dynamic variable
1011 // reachable by that kernel. All dynamic LDS variables are allocated at the
1012 // same address in each kernel in order to provide the documented aliasing
1013 // semantics. Setting the alignment here allows this IR pass to accurately
1014 // predict the exact constant at which it will be allocated.
1015
1016 assert(isKernelLDS(func));
1017
1018 LLVMContext &Ctx = M.getContext();
1019 const DataLayout &DL = M.getDataLayout();
1020 Align MaxDynamicAlignment(1);
1021
1022 auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
1023 if (AMDGPU::isDynamicLDS(GV: *GV)) {
1024 MaxDynamicAlignment =
1025 std::max(a: MaxDynamicAlignment, b: AMDGPU::getAlign(DL, GV));
1026 }
1027 };
1028
1029 for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
1030 UpdateMaxAlignment(GV);
1031 }
1032
1033 for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
1034 UpdateMaxAlignment(GV);
1035 }
1036
1037 assert(func->hasName()); // Checked by caller
1038 auto emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0);
1039 GlobalVariable *N = new GlobalVariable(
1040 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
1041 Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
1042 false);
1043 N->setAlignment(MaxDynamicAlignment);
1044
1045 assert(AMDGPU::isDynamicLDS(*N));
1046 return N;
1047 }
1048
1049 /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
1050 /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
1051 /// the lack of llvm.amdgcn.lds.kernel.id calls.
1052 void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
1053 KernelRoot->removeFnAttr(Kind: "amdgpu-no-lds-kernel-id");
1054
1055 SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
1056 SmallPtrSet<Function *, 8> Visited;
1057 bool SeenUnknownCall = false;
1058
1059 while (!WorkList.empty()) {
1060 Function *F = WorkList.pop_back_val();
1061
1062 for (auto &CallRecord : *CG[F]) {
1063 if (!CallRecord.second)
1064 continue;
1065
1066 Function *Callee = CallRecord.second->getFunction();
1067 if (!Callee) {
1068 if (!SeenUnknownCall) {
1069 SeenUnknownCall = true;
1070
1071 // If we see any indirect calls, assume nothing about potential
1072 // targets.
1073 // TODO: This could be refined to possible LDS global users.
1074 for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
1075 Function *PotentialCallee =
1076 ExternalCallRecord.second->getFunction();
1077 assert(PotentialCallee);
1078 if (!isKernelLDS(F: PotentialCallee))
1079 PotentialCallee->removeFnAttr(Kind: "amdgpu-no-lds-kernel-id");
1080 }
1081 }
1082 } else {
1083 Callee->removeFnAttr(Kind: "amdgpu-no-lds-kernel-id");
1084 if (Visited.insert(Ptr: Callee).second)
1085 WorkList.push_back(Elt: Callee);
1086 }
1087 }
1088 }
1089 }
1090
1091 DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
1092 Module &M, LDSUsesInfoTy &LDSUsesInfo,
1093 DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
1094 DenseSet<GlobalVariable *> const &DynamicVariables,
1095 std::vector<Function *> const &OrderedKernels) {
1096 DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
1097 if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
1098 LLVMContext &Ctx = M.getContext();
1099 IRBuilder<> Builder(Ctx);
1100 Type *I32 = Type::getInt32Ty(C&: Ctx);
1101
1102 std::vector<Constant *> newDynamicLDS;
1103
1104 // Table is built in the same order as OrderedKernels
1105 for (auto &func : OrderedKernels) {
1106
1107 if (KernelsThatIndirectlyAllocateDynamicLDS.contains(V: func)) {
1108 assert(isKernelLDS(func));
1109 if (!func->hasName()) {
1110 report_fatal_error(reason: "Anonymous kernels cannot use LDS variables");
1111 }
1112
1113 GlobalVariable *N =
1114 buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
1115
1116 KernelToCreatedDynamicLDS[func] = N;
1117
1118 markUsedByKernel(Func: func, SGV: N);
1119
1120 auto emptyCharArray = ArrayType::get(ElementType: Type::getInt8Ty(C&: Ctx), NumElements: 0);
1121 auto GEP = ConstantExpr::getGetElementPtr(
1122 Ty: emptyCharArray, C: N, Idx: ConstantInt::get(Ty: I32, V: 0), InBounds: true);
1123 newDynamicLDS.push_back(x: ConstantExpr::getPtrToInt(C: GEP, Ty: I32));
1124 } else {
1125 newDynamicLDS.push_back(x: PoisonValue::get(T: I32));
1126 }
1127 }
1128 assert(OrderedKernels.size() == newDynamicLDS.size());
1129
1130 ArrayType *t = ArrayType::get(ElementType: I32, NumElements: newDynamicLDS.size());
1131 Constant *init = ConstantArray::get(T: t, V: newDynamicLDS);
1132 GlobalVariable *table = new GlobalVariable(
1133 M, t, true, GlobalValue::InternalLinkage, init,
1134 "llvm.amdgcn.dynlds.offset.table", nullptr,
1135 GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
1136
1137 for (GlobalVariable *GV : DynamicVariables) {
1138 for (Use &U : make_early_inc_range(Range: GV->uses())) {
1139 auto *I = dyn_cast<Instruction>(Val: U.getUser());
1140 if (!I)
1141 continue;
1142 if (isKernelLDS(F: I->getFunction()))
1143 continue;
1144
1145 replaceUseWithTableLookup(M, Builder, LookupTable: table, GV, U, OptionalIndex: nullptr);
1146 }
1147 }
1148 }
1149 return KernelToCreatedDynamicLDS;
1150 }
1151
1152 bool runOnModule(Module &M) {
1153 CallGraph CG = CallGraph(M);
1154 bool Changed = superAlignLDSGlobals(M);
1155
1156 Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
1157
1158 Changed = true; // todo: narrow this down
1159
1160 // For each kernel, what variables does it access directly or through
1161 // callees
1162 LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
1163
1164 // For each variable accessed through callees, which kernels access it
1165 VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
1166 for (auto &K : LDSUsesInfo.indirect_access) {
1167 Function *F = K.first;
1168 assert(isKernelLDS(F));
1169 for (GlobalVariable *GV : K.second) {
1170 LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(V: F);
1171 }
1172 }
1173
1174 // Partition variables accessed indirectly into the different strategies
1175 DenseSet<GlobalVariable *> ModuleScopeVariables;
1176 DenseSet<GlobalVariable *> TableLookupVariables;
1177 DenseSet<GlobalVariable *> KernelAccessVariables;
1178 DenseSet<GlobalVariable *> DynamicVariables;
1179 partitionVariablesIntoIndirectStrategies(
1180 M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
1181 ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
1182 DynamicVariables);
1183
1184 // If the kernel accesses a variable that is going to be stored in the
1185 // module instance through a call then that kernel needs to allocate the
1186 // module instance
1187 const DenseSet<Function *> KernelsThatAllocateModuleLDS =
1188 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1189 VariableSet: ModuleScopeVariables);
1190 const DenseSet<Function *> KernelsThatAllocateTableLDS =
1191 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1192 VariableSet: TableLookupVariables);
1193
1194 const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
1195 kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
1196 VariableSet: DynamicVariables);
1197
1198 GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
1199 M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
1200
1201 DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
1202 lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
1203 KernelsThatAllocateModuleLDS,
1204 MaybeModuleScopeStruct);
1205
1206 // Lower zero cost accesses to the kernel instances just created
1207 for (auto &GV : KernelAccessVariables) {
1208 auto &funcs = LDSToKernelsThatNeedToAccessItIndirectly[GV];
1209 assert(funcs.size() == 1); // Only one kernel can access it
1210 LDSVariableReplacement Replacement =
1211 KernelToReplacement[*(funcs.begin())];
1212
1213 DenseSet<GlobalVariable *> Vec;
1214 Vec.insert(V: GV);
1215
1216 replaceLDSVariablesWithStruct(M, LDSVarsToTransformArg: Vec, Replacement, Predicate: [](Use &U) {
1217 return isa<Instruction>(Val: U.getUser());
1218 });
1219 }
1220
1221 // The ith element of this vector is kernel id i
1222 std::vector<Function *> OrderedKernels =
1223 assignLDSKernelIDToEachKernel(M: &M, KernelsThatAllocateTableLDS,
1224 KernelsThatIndirectlyAllocateDynamicLDS);
1225
1226 if (!KernelsThatAllocateTableLDS.empty()) {
1227 LLVMContext &Ctx = M.getContext();
1228 IRBuilder<> Builder(Ctx);
1229
1230 // The order must be consistent between lookup table and accesses to
1231 // lookup table
1232 auto TableLookupVariablesOrdered =
1233 sortByName(V: std::vector<GlobalVariable *>(TableLookupVariables.begin(),
1234 TableLookupVariables.end()));
1235
1236 GlobalVariable *LookupTable = buildLookupTable(
1237 M, Variables: TableLookupVariablesOrdered, kernels: OrderedKernels, KernelToReplacement);
1238 replaceUsesInInstructionsWithTableLookup(M, ModuleScopeVariables: TableLookupVariablesOrdered,
1239 LookupTable);
1240
1241 // Strip amdgpu-no-lds-kernel-id from all functions reachable from the
1242 // kernel. We may have inferred this wasn't used prior to the pass.
1243 //
1244 // TODO: We could filter out subgraphs that do not access LDS globals.
1245 for (Function *F : KernelsThatAllocateTableLDS)
1246 removeNoLdsKernelIdFromReachable(CG, KernelRoot: F);
1247 }
1248
1249 DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
1250 lowerDynamicLDSVariables(M, LDSUsesInfo,
1251 KernelsThatIndirectlyAllocateDynamicLDS,
1252 DynamicVariables, OrderedKernels);
1253
1254 // All kernel frames have been allocated. Calculate and record the
1255 // addresses.
1256 {
1257 const DataLayout &DL = M.getDataLayout();
1258
1259 for (Function &Func : M.functions()) {
1260 if (Func.isDeclaration() || !isKernelLDS(F: &Func))
1261 continue;
1262
1263 // All three of these are optional. The first variable is allocated at
1264 // zero. They are allocated by AMDGPUMachineFunction as one block.
1265 // Layout:
1266 //{
1267 // module.lds
1268 // alignment padding
1269 // kernel instance
1270 // alignment padding
1271 // dynamic lds variables
1272 //}
1273
1274 const bool AllocateModuleScopeStruct =
1275 MaybeModuleScopeStruct &&
1276 KernelsThatAllocateModuleLDS.contains(V: &Func);
1277
1278 auto Replacement = KernelToReplacement.find(Val: &Func);
1279 const bool AllocateKernelScopeStruct =
1280 Replacement != KernelToReplacement.end();
1281
1282 const bool AllocateDynamicVariable =
1283 KernelToCreatedDynamicLDS.contains(Val: &Func);
1284
1285 uint32_t Offset = 0;
1286
1287 if (AllocateModuleScopeStruct) {
1288 // Allocated at zero, recorded once on construction, not once per
1289 // kernel
1290 Offset += DL.getTypeAllocSize(Ty: MaybeModuleScopeStruct->getValueType());
1291 }
1292
1293 if (AllocateKernelScopeStruct) {
1294 GlobalVariable *KernelStruct = Replacement->second.SGV;
1295 Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: KernelStruct));
1296 recordLDSAbsoluteAddress(M: &M, GV: KernelStruct, Address: Offset);
1297 Offset += DL.getTypeAllocSize(Ty: KernelStruct->getValueType());
1298 }
1299
1300 // If there is dynamic allocation, the alignment needed is included in
1301 // the static frame size. There may be no reference to the dynamic
1302 // variable in the kernel itself, so without including it here, that
1303 // alignment padding could be missed.
1304 if (AllocateDynamicVariable) {
1305 GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
1306 Offset = alignTo(Size: Offset, A: AMDGPU::getAlign(DL, GV: DynamicVariable));
1307 recordLDSAbsoluteAddress(M: &M, GV: DynamicVariable, Address: Offset);
1308 }
1309
1310 if (Offset != 0) {
1311 (void)TM; // TODO: Account for target maximum LDS
1312 std::string Buffer;
1313 raw_string_ostream SS{Buffer};
1314 SS << format(Fmt: "%u", Vals: Offset);
1315
1316 // Instead of explictly marking kernels that access dynamic variables
1317 // using special case metadata, annotate with min-lds == max-lds, i.e.
1318 // that there is no more space available for allocating more static
1319 // LDS variables. That is the right condition to prevent allocating
1320 // more variables which would collide with the addresses assigned to
1321 // dynamic variables.
1322 if (AllocateDynamicVariable)
1323 SS << format(Fmt: ",%u", Vals: Offset);
1324
1325 Func.addFnAttr(Kind: "amdgpu-lds-size", Val: Buffer);
1326 }
1327 }
1328 }
1329
1330 for (auto &GV : make_early_inc_range(Range: M.globals()))
1331 if (AMDGPU::isLDSVariableToLower(GV)) {
1332 // probably want to remove from used lists
1333 GV.removeDeadConstantUsers();
1334 if (GV.use_empty())
1335 GV.eraseFromParent();
1336 }
1337
1338 return Changed;
1339 }
1340
1341private:
1342 // Increase the alignment of LDS globals if necessary to maximise the chance
1343 // that we can use aligned LDS instructions to access them.
1344 static bool superAlignLDSGlobals(Module &M) {
1345 const DataLayout &DL = M.getDataLayout();
1346 bool Changed = false;
1347 if (!SuperAlignLDSGlobals) {
1348 return Changed;
1349 }
1350
1351 for (auto &GV : M.globals()) {
1352 if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
1353 // Only changing alignment of LDS variables
1354 continue;
1355 }
1356 if (!GV.hasInitializer()) {
1357 // cuda/hip extern __shared__ variable, leave alignment alone
1358 continue;
1359 }
1360
1361 Align Alignment = AMDGPU::getAlign(DL, GV: &GV);
1362 TypeSize GVSize = DL.getTypeAllocSize(Ty: GV.getValueType());
1363
1364 if (GVSize > 8) {
1365 // We might want to use a b96 or b128 load/store
1366 Alignment = std::max(a: Alignment, b: Align(16));
1367 } else if (GVSize > 4) {
1368 // We might want to use a b64 load/store
1369 Alignment = std::max(a: Alignment, b: Align(8));
1370 } else if (GVSize > 2) {
1371 // We might want to use a b32 load/store
1372 Alignment = std::max(a: Alignment, b: Align(4));
1373 } else if (GVSize > 1) {
1374 // We might want to use a b16 load/store
1375 Alignment = std::max(a: Alignment, b: Align(2));
1376 }
1377
1378 if (Alignment != AMDGPU::getAlign(DL, GV: &GV)) {
1379 Changed = true;
1380 GV.setAlignment(Alignment);
1381 }
1382 }
1383 return Changed;
1384 }
1385
1386 static LDSVariableReplacement createLDSVariableReplacement(
1387 Module &M, std::string VarName,
1388 DenseSet<GlobalVariable *> const &LDSVarsToTransform) {
1389 // Create a struct instance containing LDSVarsToTransform and map from those
1390 // variables to ConstantExprGEP
1391 // Variables may be introduced to meet alignment requirements. No aliasing
1392 // metadata is useful for these as they have no uses. Erased before return.
1393
1394 LLVMContext &Ctx = M.getContext();
1395 const DataLayout &DL = M.getDataLayout();
1396 assert(!LDSVarsToTransform.empty());
1397
1398 SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
1399 LayoutFields.reserve(N: LDSVarsToTransform.size());
1400 {
1401 // The order of fields in this struct depends on the order of
1402 // varables in the argument which varies when changing how they
1403 // are identified, leading to spurious test breakage.
1404 auto Sorted = sortByName(V: std::vector<GlobalVariable *>(
1405 LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
1406
1407 for (GlobalVariable *GV : Sorted) {
1408 OptimizedStructLayoutField F(GV,
1409 DL.getTypeAllocSize(Ty: GV->getValueType()),
1410 AMDGPU::getAlign(DL, GV));
1411 LayoutFields.emplace_back(Args&: F);
1412 }
1413 }

    performOptimizedStructLayout(LayoutFields);

    std::vector<GlobalVariable *> LocalVars;
    BitVector IsPaddingField;
    LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
    IsPaddingField.reserve(LDSVarsToTransform.size());
    {
      uint64_t CurrentOffset = 0;
      for (size_t I = 0; I < LayoutFields.size(); I++) {
        GlobalVariable *FGV = static_cast<GlobalVariable *>(
            const_cast<void *>(LayoutFields[I].Id));
        Align DataAlign = LayoutFields[I].Alignment;

        uint64_t DataAlignV = DataAlign.value();
        if (uint64_t Rem = CurrentOffset % DataAlignV) {
          uint64_t Padding = DataAlignV - Rem;

          // Append an array of padding bytes to meet the requested alignment.
          // Note: (o      + (a - (o % a))) % a     == 0
          //       (offset + Padding      ) % align == 0
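          // Worked example (illustrative): CurrentOffset == 6, DataAlignV == 8
          // gives Rem == 6 and Padding == 2, so the next field starts at
          // offset 8, which is 8-byte aligned.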

          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
          LocalVars.push_back(new GlobalVariable(
              M, ATy, false, GlobalValue::InternalLinkage,
              PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal,
              AMDGPUAS::LOCAL_ADDRESS, false));
          IsPaddingField.push_back(true);
          CurrentOffset += Padding;
        }

        LocalVars.push_back(FGV);
        IsPaddingField.push_back(false);
        CurrentOffset += LayoutFields[I].Size;
      }
    }

    std::vector<Type *> LocalVarTypes;
    LocalVarTypes.reserve(LocalVars.size());
    std::transform(
        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });

    StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");

    Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);

    GlobalVariable *SGV = new GlobalVariable(
        M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy),
        VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
        false);
    SGV->setAlignment(StructAlign);

    DenseMap<GlobalVariable *, Constant *> Map;
    Type *I32 = Type::getInt32Ty(Ctx);
    for (size_t I = 0; I < LocalVars.size(); I++) {
      GlobalVariable *GV = LocalVars[I];
      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
      Constant *GEP = ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx, true);
      if (IsPaddingField[I]) {
        assert(GV->use_empty());
        GV->eraseFromParent();
      } else {
        Map[GV] = GEP;
      }
    }
    assert(Map.size() == LDSVarsToTransform.size());
    return {SGV, std::move(Map)};
  }

  template <typename PredicateTy>
  static void replaceLDSVariablesWithStruct(
      Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
      const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
    LLVMContext &Ctx = M.getContext();
    const DataLayout &DL = M.getDataLayout();

    // A hack: we need to insert the aliasing info in a predictable order for
    // lit tests. It would be better to have a stable order already, ideally
    // the same order in which the variables are allocated, which might mean
    // an ordered set container.
    auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
        LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));

    // Create the alias.scope metadata and the corresponding lists. Each field
    // in the new structure does not alias any of the other fields.
    SmallVector<MDNode *> AliasScopes;
    SmallVector<Metadata *> NoAliasList;
    const size_t NumberVars = LDSVarsToTransform.size();
    if (NumberVars > 1) {
      MDBuilder MDB(Ctx);
      AliasScopes.reserve(NumberVars);
      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
      for (size_t I = 0; I < NumberVars; I++) {
        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
        AliasScopes.push_back(Scope);
      }
      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
    }
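    // Rough shape of the metadata this produces (illustrative):
    //   !0 = distinct !{!0}       ; scope domain
    //   !1 = distinct !{!1, !0}   ; scope for field 0
    //   !2 = distinct !{!2, !0}   ; scope for field 1
    // so accesses to field 0 can carry !alias.scope !{!1} and !noalias !{!2}.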

    // Replace uses of the ith variable with a constantexpr to the
    // corresponding field of the instance that will be allocated by
    // AMDGPUMachineFunction.
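    // e.g. a use of the original variable becomes (names illustrative):
    //   getelementptr inbounds (%SGV.t, ptr addrspace(3) @SGV, i32 0, i32 1)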
    for (size_t I = 0; I < NumberVars; I++) {
      GlobalVariable *GV = LDSVarsToTransform[I];
      Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);

      GV->replaceUsesWithIf(GEP, Predicate);

      APInt APOff(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
      GEP->stripAndAccumulateInBoundsConstantOffsets(DL, APOff);
      uint64_t Offset = APOff.getZExtValue();

      Align A =
          commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);

      if (I)
        NoAliasList[I - 1] = AliasScopes[I - 1];
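      // NoAliasList started as scopes [1, N); restoring scope I-1 here leaves
      // it holding every scope except AliasScopes[I], so the ith field is
      // marked noalias against all of the other fields.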
      MDNode *NoAlias =
          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
      MDNode *AliasScope =
          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});

      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
    }
  }

  static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
                                       const DataLayout &DL, MDNode *AliasScope,
                                       MDNode *NoAlias, unsigned MaxDepth = 5) {
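    // Bail out when there is nothing useful to propagate: either the recursion
    // depth budget is spent, or the alignment is the trivial 1 and there is no
    // aliasing metadata to attach.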
    if (!MaxDepth || (A == 1 && !AliasScope))
      return;

    for (User *U : Ptr->users()) {
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (AliasScope && I->mayReadOrWriteMemory()) {
          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
                   : AliasScope);
          I->setMetadata(LLVMContext::MD_alias_scope, AS);

          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
          I->setMetadata(LLVMContext::MD_noalias, NA);
        }
      }

      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        if (SI->getPointerOperand() == Ptr)
          SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        // No atomicrmw operation currently works on pointer values, but check
        // the pointer operand anyway in case that changes or we are processing
        // a ConstantExpr.
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        if (AI->getPointerOperand() == Ptr)
          AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr) {
          Align GA;
          if (GEP->accumulateConstantOffset(DL, Off))
            GA = commonAlignment(A, Off.getLimitedValue());
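          // e.g. (illustrative) A == 16 with a constant offset of 4 gives
          // GA == 4; a non-constant offset leaves GA at the default Align(1),
          // so no alignment information is propagated further down that chain.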
          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
                                   MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
      }
    }
  }
};

class AMDGPULowerModuleLDSLegacy : public ModulePass {
public:
  const AMDGPUTargetMachine *TM;
  static char ID;

  AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr)
      : ModulePass(ID), TM(TM_) {
    initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    if (!TM)
      AU.addRequired<TargetPassConfig>();
  }

  bool runOnModule(Module &M) override {
    if (!TM) {
      auto &TPC = getAnalysis<TargetPassConfig>();
      TM = &TPC.getTM<AMDGPUTargetMachine>();
    }

    return AMDGPULowerModuleLDS(*TM).runOnModule(M);
  }
};

} // namespace
char AMDGPULowerModuleLDSLegacy::ID = 0;

char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID;

INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                      "Lower uses of LDS variables from non-kernel functions",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
                    "Lower uses of LDS variables from non-kernel functions",
                    false, false)

ModulePass *
llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
  return new AMDGPULowerModuleLDSLegacy(TM);
}

PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
                                                ModuleAnalysisManager &) {
  return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
                                                 : PreservedAnalyses::all();
}
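
// A minimal way to exercise this pass in isolation (assuming it is registered
// under the new-PM name "amdgpu-lower-module-lds"):
//   opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-module-lds -S in.ll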