//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then");
52 ContBlock = CGF.createBasicBlock(name: "omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock);
55 CGF.EmitBlock(BB: ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(Block: ContBlock);
61 CGF.EmitBlock(BB: ContBlock, IsFinished: true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(Val: RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Val: Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr))
105 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(Val: RefExpr);
107 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108}
109
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (decl_align)
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      if (BufSize > 1) {
        llvm::APInt ArraySize(32, BufSize);
        Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                      ArraySizeModifier::Normal, 0);
      }
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(X: VD);
242 else
243 DelayedVariableLengthDecls.insert(X: VD);
244 } else
245 EscapedDecls.insert(X: VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
252 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(Val: VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
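    // BufSize below is the warp size: in a parallel (non-teams) region every
    // lane of a warp may need its own copy of each escaped variable, so the
    // globalized record holds one slot per lane.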
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, BufSize: WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504/// Get the id of the warp in the block.
505/// We assume that the warp size is 32, which is always the case
506/// on the NVPTX device, to generate more efficient code.
507static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
508 CGBuilderTy &Bld = CGF.Builder;
509 unsigned LaneIDBits =
510 llvm::Log2_32(Value: CGF.getTarget().getGridValue().GV_Warp_Size);
511 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
512 return Bld.CreateAShr(LHS: RT.getGPUThreadID(CGF), RHS: LaneIDBits, Name: "nvptx_warp_id");
513}
514
515/// Get the id of the current lane in the Warp.
516/// We assume that the warp size is 32, which is always the case
517/// on the NVPTX device, to generate more efficient code.
518static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
519 CGBuilderTy &Bld = CGF.Builder;
520 unsigned LaneIDBits =
521 llvm::Log2_32(Value: CGF.getTarget().getGridValue().GV_Warp_Size);
522 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
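  // Build a mask with the low log2(WarpSize) bits set; AND-ing it with the
  // thread id yields the lane index within the warp.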
523 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
524 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
525 return Bld.CreateAnd(LHS: RT.getGPUThreadID(CGF), RHS: Bld.getInt32(C: LaneIDMask),
526 Name: "nvptx_lane_id");
527}
528
529CGOpenMPRuntimeGPU::ExecutionMode
530CGOpenMPRuntimeGPU::getExecutionMode() const {
531 return CurrentExecutionMode;
532}
533
534CGOpenMPRuntimeGPU::DataSharingMode
535CGOpenMPRuntimeGPU::getDataSharingMode() const {
536 return CurrentDataSharingMode;
537}
538
539/// Check for inner (nested) SPMD construct, if any
540static bool hasNestedSPMDDirective(ASTContext &Ctx,
541 const OMPExecutableDirective &D) {
542 const auto *CS = D.getInnermostCapturedStmt();
543 const auto *Body =
544 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546
547 if (const auto *NestedDir =
548 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
549 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550 switch (D.getDirectiveKind()) {
551 case OMPD_target:
552 if (isOpenMPParallelDirective(DKind))
553 return true;
554 if (DKind == OMPD_teams) {
555 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556 /*IgnoreCaptured=*/true);
557 if (!Body)
558 return false;
559 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560 if (const auto *NND =
561 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
562 DKind = NND->getDirectiveKind();
563 if (isOpenMPParallelDirective(DKind))
564 return true;
565 }
566 }
567 return false;
568 case OMPD_target_teams:
569 return isOpenMPParallelDirective(DKind);
570 case OMPD_target_simd:
571 case OMPD_target_parallel:
572 case OMPD_target_parallel_for:
573 case OMPD_target_parallel_for_simd:
574 case OMPD_target_teams_distribute:
575 case OMPD_target_teams_distribute_simd:
576 case OMPD_target_teams_distribute_parallel_for:
577 case OMPD_target_teams_distribute_parallel_for_simd:
578 case OMPD_parallel:
579 case OMPD_for:
580 case OMPD_parallel_for:
581 case OMPD_parallel_master:
582 case OMPD_parallel_sections:
583 case OMPD_for_simd:
584 case OMPD_parallel_for_simd:
585 case OMPD_cancel:
586 case OMPD_cancellation_point:
587 case OMPD_ordered:
588 case OMPD_threadprivate:
589 case OMPD_allocate:
590 case OMPD_task:
591 case OMPD_simd:
592 case OMPD_sections:
593 case OMPD_section:
594 case OMPD_single:
595 case OMPD_master:
596 case OMPD_critical:
597 case OMPD_taskyield:
598 case OMPD_barrier:
599 case OMPD_taskwait:
600 case OMPD_taskgroup:
601 case OMPD_atomic:
602 case OMPD_flush:
603 case OMPD_depobj:
604 case OMPD_scan:
605 case OMPD_teams:
606 case OMPD_target_data:
607 case OMPD_target_exit_data:
608 case OMPD_target_enter_data:
609 case OMPD_distribute:
610 case OMPD_distribute_simd:
611 case OMPD_distribute_parallel_for:
612 case OMPD_distribute_parallel_for_simd:
613 case OMPD_teams_distribute:
614 case OMPD_teams_distribute_simd:
615 case OMPD_teams_distribute_parallel_for:
616 case OMPD_teams_distribute_parallel_for_simd:
617 case OMPD_target_update:
618 case OMPD_declare_simd:
619 case OMPD_declare_variant:
620 case OMPD_begin_declare_variant:
621 case OMPD_end_declare_variant:
622 case OMPD_declare_target:
623 case OMPD_end_declare_target:
624 case OMPD_declare_reduction:
625 case OMPD_declare_mapper:
626 case OMPD_taskloop:
627 case OMPD_taskloop_simd:
628 case OMPD_master_taskloop:
629 case OMPD_master_taskloop_simd:
630 case OMPD_parallel_master_taskloop:
631 case OMPD_parallel_master_taskloop_simd:
632 case OMPD_requires:
633 case OMPD_unknown:
634 default:
635 llvm_unreachable("Unexpected directive.");
636 }
637 }
638
639 return false;
640}
641
642static bool supportsSPMDExecutionMode(ASTContext &Ctx,
643 const OMPExecutableDirective &D) {
644 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
645 switch (DirectiveKind) {
646 case OMPD_target:
647 case OMPD_target_teams:
648 return hasNestedSPMDDirective(Ctx, D);
649 case OMPD_target_parallel_loop:
650 case OMPD_target_parallel:
651 case OMPD_target_parallel_for:
652 case OMPD_target_parallel_for_simd:
653 case OMPD_target_teams_distribute_parallel_for:
654 case OMPD_target_teams_distribute_parallel_for_simd:
655 case OMPD_target_simd:
656 case OMPD_target_teams_distribute_simd:
657 return true;
658 case OMPD_target_teams_distribute:
659 return false;
660 case OMPD_target_teams_loop:
661 // Whether this is true or not depends on how the directive will
662 // eventually be emitted.
663 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D))
664 return TTLD->canBeParallelFor();
665 return false;
666 case OMPD_parallel:
667 case OMPD_for:
668 case OMPD_parallel_for:
669 case OMPD_parallel_master:
670 case OMPD_parallel_sections:
671 case OMPD_for_simd:
672 case OMPD_parallel_for_simd:
673 case OMPD_cancel:
674 case OMPD_cancellation_point:
675 case OMPD_ordered:
676 case OMPD_threadprivate:
677 case OMPD_allocate:
678 case OMPD_task:
679 case OMPD_simd:
680 case OMPD_sections:
681 case OMPD_section:
682 case OMPD_single:
683 case OMPD_master:
684 case OMPD_critical:
685 case OMPD_taskyield:
686 case OMPD_barrier:
687 case OMPD_taskwait:
688 case OMPD_taskgroup:
689 case OMPD_atomic:
690 case OMPD_flush:
691 case OMPD_depobj:
692 case OMPD_scan:
693 case OMPD_teams:
694 case OMPD_target_data:
695 case OMPD_target_exit_data:
696 case OMPD_target_enter_data:
697 case OMPD_distribute:
698 case OMPD_distribute_simd:
699 case OMPD_distribute_parallel_for:
700 case OMPD_distribute_parallel_for_simd:
701 case OMPD_teams_distribute:
702 case OMPD_teams_distribute_simd:
703 case OMPD_teams_distribute_parallel_for:
704 case OMPD_teams_distribute_parallel_for_simd:
705 case OMPD_target_update:
706 case OMPD_declare_simd:
707 case OMPD_declare_variant:
708 case OMPD_begin_declare_variant:
709 case OMPD_end_declare_variant:
710 case OMPD_declare_target:
711 case OMPD_end_declare_target:
712 case OMPD_declare_reduction:
713 case OMPD_declare_mapper:
714 case OMPD_taskloop:
715 case OMPD_taskloop_simd:
716 case OMPD_master_taskloop:
717 case OMPD_master_taskloop_simd:
718 case OMPD_parallel_master_taskloop:
719 case OMPD_parallel_master_taskloop_simd:
720 case OMPD_requires:
721 case OMPD_unknown:
722 default:
723 break;
724 }
725 llvm_unreachable(
726 "Unknown programming model for OpenMP directive on NVPTX target.");
727}
728
729void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
730 StringRef ParentName,
731 llvm::Function *&OutlinedFn,
732 llvm::Constant *&OutlinedFnID,
733 bool IsOffloadEntry,
734 const RegionCodeGenTy &CodeGen) {
735 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
736 EntryFunctionState EST;
737 WrapperFunctionsMap.clear();
738
739 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
740 assert(!IsBareKernel && "bare kernel should not be at generic mode");
741
742 // Emit target region as a standalone region.
743 class NVPTXPrePostActionTy : public PrePostActionTy {
744 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
745 const OMPExecutableDirective &D;
746
747 public:
748 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
749 const OMPExecutableDirective &D)
750 : EST(EST), D(D) {}
751 void Enter(CodeGenFunction &CGF) override {
752 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
753 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
754 // Skip target region initialization.
755 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
756 }
757 void Exit(CodeGenFunction &CGF) override {
758 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
759 RT.clearLocThreadIdInsertPt(CGF);
760 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
761 }
762 } Action(EST, D);
763 CodeGen.setAction(Action);
764 IsInTTDRegion = true;
765 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
766 IsOffloadEntry, CodeGen);
767 IsInTTDRegion = false;
768}
769
770void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
771 CodeGenFunction &CGF,
772 EntryFunctionState &EST, bool IsSPMD) {
773 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
774 MaxTeamsVal = -1;
775 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
776 MinTeamsVal, MaxTeamsVal);
777
778 CGBuilderTy &Bld = CGF.Builder;
779 Bld.restoreIP(IP: OMPBuilder.createTargetInit(
780 Loc: Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
781 if (!IsSPMD)
782 emitGenericVarsProlog(CGF, Loc: EST.Loc);
783}
784
785void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
786 EntryFunctionState &EST,
787 bool IsSPMD) {
788 if (!IsSPMD)
789 emitGenericVarsEpilog(CGF);
790
791 // This is temporary until we remove the fixed sized buffer.
792 ASTContext &C = CGM.getContext();
793 RecordDecl *StaticRD = C.buildImplicitRecord(
794 Name: "_openmp_teams_reduction_type_$_", TK: RecordDecl::TagKind::Union);
795 StaticRD->startDefinition();
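  // The union gets one member per reduction record seen in this kernel, so its
  // allocation size is the maximum of the member sizes -- exactly the buffer
  // size the runtime needs for teams reductions.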
796 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
797 QualType RecTy = C.getRecordType(Decl: TeamReductionRec);
798 auto *Field = FieldDecl::Create(
799 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
800 C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()),
801 /*BW=*/nullptr, /*Mutable=*/false,
802 /*InitStyle=*/ICIS_NoInit);
803 Field->setAccess(AS_public);
804 StaticRD->addDecl(D: Field);
805 }
806 StaticRD->completeDefinition();
807 QualType StaticTy = C.getRecordType(Decl: StaticRD);
808 llvm::Type *LLVMReductionsBufferTy =
809 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
810 const auto &DL = CGM.getModule().getDataLayout();
811 uint64_t ReductionDataSize =
812 TeamsReductions.empty()
813 ? 0
814 : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue();
815 CGBuilderTy &Bld = CGF.Builder;
816 OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize,
817 TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum);
818 TeamsReductions.clear();
819}
820
821void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
822 StringRef ParentName,
823 llvm::Function *&OutlinedFn,
824 llvm::Constant *&OutlinedFnID,
825 bool IsOffloadEntry,
826 const RegionCodeGenTy &CodeGen) {
827 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
828 EntryFunctionState EST;
829
830 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
831
832 // Emit target region as a standalone region.
833 class NVPTXPrePostActionTy : public PrePostActionTy {
834 CGOpenMPRuntimeGPU &RT;
835 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
836 bool IsBareKernel;
837 DataSharingMode Mode;
838 const OMPExecutableDirective &D;
839
840 public:
841 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
842 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
843 bool IsBareKernel, const OMPExecutableDirective &D)
844 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
845 Mode(RT.CurrentDataSharingMode), D(D) {}
846 void Enter(CodeGenFunction &CGF) override {
847 if (IsBareKernel) {
848 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
849 return;
850 }
851 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
852 // Skip target region initialization.
853 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
854 }
855 void Exit(CodeGenFunction &CGF) override {
856 if (IsBareKernel) {
857 RT.CurrentDataSharingMode = Mode;
858 return;
859 }
860 RT.clearLocThreadIdInsertPt(CGF);
861 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
862 }
863 } Action(*this, EST, IsBareKernel, D);
864 CodeGen.setAction(Action);
865 IsInTTDRegion = true;
866 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
867 IsOffloadEntry, CodeGen);
868 IsInTTDRegion = false;
869}
870
871void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
872 const OMPExecutableDirective &D, StringRef ParentName,
873 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
874 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
875 if (!IsOffloadEntry) // Nothing to do.
876 return;
877
878 assert(!ParentName.empty() && "Invalid target region parent name!");
879
880 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
881 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
882 if (Mode || IsBareKernel)
883 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
884 CodeGen);
885 else
886 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
887 CodeGen);
888}
889
890CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
891 : CGOpenMPRuntime(CGM) {
892 llvm::OpenMPIRBuilderConfig Config(
893 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
894 CGM.getLangOpts().OpenMPOffloadMandatory,
895 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
896 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
897 OMPBuilder.setConfig(Config);
898
899 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
900 llvm_unreachable("OpenMP can only handle device code.");
901
902 if (CGM.getLangOpts().OpenMPCUDAMode)
903 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
904
905 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
906 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
907 return;
908
909 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
910 Name: "__omp_rtl_debug_kind");
911 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
912 Name: "__omp_rtl_assume_teams_oversubscription");
913 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
914 Name: "__omp_rtl_assume_threads_oversubscription");
915 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
916 Name: "__omp_rtl_assume_no_thread_state");
917 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
918 Name: "__omp_rtl_assume_no_nested_parallelism");
919}
920
921void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
922 ProcBindKind ProcBind,
923 SourceLocation Loc) {
924 // Nothing to do.
925}
926
927void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
928 llvm::Value *NumThreads,
929 SourceLocation Loc) {
930 // Nothing to do.
931}
932
933void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
934 const Expr *NumTeams,
935 const Expr *ThreadLimit,
936 SourceLocation Loc) {}
937
938llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
939 CodeGenFunction &CGF, const OMPExecutableDirective &D,
940 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
941 const RegionCodeGenTy &CodeGen) {
942 // Emit target region as a standalone region.
943 bool PrevIsInTTDRegion = IsInTTDRegion;
944 IsInTTDRegion = false;
945 auto *OutlinedFun =
946 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
947 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
948 IsInTTDRegion = PrevIsInTTDRegion;
949 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
950 llvm::Function *WrapperFun =
951 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
952 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
953 }
954
955 return OutlinedFun;
956}
957
958/// Get list of lastprivate variables from the teams distribute ... or
959/// teams {distribute ...} directives.
960static void
961getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 const OMPExecutableDirective *Dir = &D;
966 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
967 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
968 Ctx,
969 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
970 /*IgnoreCaptured=*/true))) {
971 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
972 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
973 Dir = nullptr;
974 }
975 }
976 if (!Dir)
977 return;
978 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
979 for (const Expr *E : C->getVarRefs())
980 Vars.push_back(getPrivateItem(E));
981 }
982}
983
984/// Get list of reduction variables from the teams ... directives.
985static void
986getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
987 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
988 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
989 "expected teams directive.");
990 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
991 for (const Expr *E : C->privates())
992 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
993 }
994}
995
996llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
997 CodeGenFunction &CGF, const OMPExecutableDirective &D,
998 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
999 const RegionCodeGenTy &CodeGen) {
1000 SourceLocation Loc = D.getBeginLoc();
1001
1002 const RecordDecl *GlobalizedRD = nullptr;
1003 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1004 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1005 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1006 // Globalize team reductions variable unconditionally in all modes.
1007 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1008 getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1009 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1010 getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1011 if (!LastPrivatesReductions.empty()) {
1012 GlobalizedRD = ::buildRecordForGlobalizedVars(
1013 C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions,
1014 MappedDeclsFields, BufSize: WarpSize);
1015 }
1016 } else if (!LastPrivatesReductions.empty()) {
1017 assert(!TeamAndReductions.first &&
1018 "Previous team declaration is not expected.");
1019 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1020 std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
1021 }
1022
1023 // Emit target region as a standalone region.
1024 class NVPTXPrePostActionTy : public PrePostActionTy {
1025 SourceLocation &Loc;
1026 const RecordDecl *GlobalizedRD;
1027 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1028 &MappedDeclsFields;
1029
1030 public:
1031 NVPTXPrePostActionTy(
1032 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1033 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1034 &MappedDeclsFields)
1035 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1036 MappedDeclsFields(MappedDeclsFields) {}
1037 void Enter(CodeGenFunction &CGF) override {
1038 auto &Rt =
1039 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1040 if (GlobalizedRD) {
1041 auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1042 I->getSecond().MappedParams =
1043 std::make_unique<CodeGenFunction::OMPMapVars>();
1044 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1045 for (const auto &Pair : MappedDeclsFields) {
1046 assert(Pair.getFirst()->isCanonicalDecl() &&
1047 "Expected canonical declaration");
1048 Data.insert(std::make_pair(x: Pair.getFirst(), y: MappedVarData()));
1049 }
1050 }
1051 Rt.emitGenericVarsProlog(CGF, Loc);
1052 }
1053 void Exit(CodeGenFunction &CGF) override {
1054 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1055 .emitGenericVarsEpilog(CGF);
1056 }
1057 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1058 CodeGen.setAction(Action);
1059 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1060 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1061
1062 return OutlinedFun;
1063}
1064
1065void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1066 SourceLocation Loc) {
1067 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1068 return;
1069
1070 CGBuilderTy &Bld = CGF.Builder;
1071
1072 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1073 if (I == FunctionGlobalizedDecls.end())
1074 return;
1075
1076 for (auto &Rec : I->getSecond().LocalVarData) {
1077 const auto *VD = cast<VarDecl>(Val: Rec.first);
1078 bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first);
1079 QualType VarTy = VD->getType();
1080
1081 // Get the local allocation of a firstprivate variable before sharing
1082 llvm::Value *ParValue;
1083 if (EscapedParam) {
1084 LValue ParLVal =
1085 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1086 ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc);
1087 }
1088
1089 // Allocate space for the variable to be globalized
1090 llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())};
1091 llvm::CallBase *VoidPtr =
1092 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1093 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1094 AllocArgs, VD->getName());
1095 // FIXME: We should use the variables actual alignment as an argument.
1096 VoidPtr->addRetAttr(llvm::Attribute::get(
1097 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1098 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1099
1100 // Cast the void pointer and get the address of the globalized variable.
1101 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(T: VarTy)->getPointerTo();
1102 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1103 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1104 LValue VarAddr =
1105 CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy);
1106 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1107 Rec.second.GlobalizedVal = VoidPtr;
1108
1109 // Assign the local allocation to the newly globalized location.
1110 if (EscapedParam) {
1111 CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr);
1112 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress(CGF));
1113 }
1114 if (auto *DI = CGF.getDebugInfo())
1115 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation()));
1116 }
1117
1118 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1119 const auto *VD = cast<VarDecl>(Val: ValueD);
1120 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1121 getKmpcAllocShared(CGF, VD);
1122 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair);
1123 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1124 CGM.getContext().getDeclAlign(VD),
1125 AlignmentSource::Decl);
1126 I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress(CGF));
1127 }
1128 I->getSecond().MappedParams->apply(CGF);
1129}
1130
1131bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1132 const VarDecl *VD) const {
1133 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1134 if (I == FunctionGlobalizedDecls.end())
1135 return false;
1136
1137 // Check variable declaration is delayed:
1138 return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD);
1139}
1140
1141std::pair<llvm::Value *, llvm::Value *>
1142CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1143 const VarDecl *VD) {
1144 CGBuilderTy &Bld = CGF.Builder;
1145
1146 // Compute size and alignment.
1147 llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType());
1148 CharUnits Align = CGM.getContext().getDeclAlign(VD);
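  // Round the size up to a multiple of the alignment so the shared-memory
  // allocation for this VLA stays properly aligned.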
1149 Size = Bld.CreateNUWAdd(
1150 LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1));
1151 llvm::Value *AlignVal =
1152 llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity());
1153 Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal);
1154 Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal);
1155
1156 // Allocate space for this VLA object to be globalized.
1157 llvm::Value *AllocArgs[] = {Size};
1158 llvm::CallBase *VoidPtr =
1159 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1160 M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared),
1161 AllocArgs, VD->getName());
1162 VoidPtr->addRetAttr(llvm::Attribute::get(
1163 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1164
1165 return std::make_pair(x&: VoidPtr, y&: Size);
1166}
1167
1168void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1169 CodeGenFunction &CGF,
1170 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1171 // Deallocate the memory for each globalized VLA object
1172 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1173 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1174 args: {AddrSizePair.first, AddrSizePair.second});
1175}
1176
1177void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1178 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1179 return;
1180
1181 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1182 if (I != FunctionGlobalizedDecls.end()) {
1183 // Deallocate the memory for each globalized VLA object that was
1184 // globalized in the prolog (i.e. emitGenericVarsProlog).
1185 for (const auto &AddrSizePair :
1186 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1187 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1188 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1189 args: {AddrSizePair.first, AddrSizePair.second});
1190 }
1191 // Deallocate the memory for each globalized value
1192 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1193 const auto *VD = cast<VarDecl>(Val: Rec.first);
1194 I->getSecond().MappedParams->restore(CGF);
1195
1196 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1197 CGF.getTypeSize(Ty: VD->getType())};
1198 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1199 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1200 FreeArgs);
1201 }
1202 }
1203}
1204
1205void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1206 const OMPExecutableDirective &D,
1207 SourceLocation Loc,
1208 llvm::Function *OutlinedFn,
1209 ArrayRef<llvm::Value *> CapturedVars) {
1210 if (!CGF.HaveInsertPoint())
1211 return;
1212
1213 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1214
1215 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1216 /*Name=*/".zero.addr");
1217 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1218 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  // We don't emit any thread id function call in a bare kernel, but because
  // the outlined function has a pointer argument, we emit a nullptr here.
1221 if (IsBareKernel)
1222 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1223 else
1224 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1225 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1226 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1227 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1228}
1229
1230void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1231 SourceLocation Loc,
1232 llvm::Function *OutlinedFn,
1233 ArrayRef<llvm::Value *> CapturedVars,
1234 const Expr *IfCond,
1235 llvm::Value *NumThreads) {
1236 if (!CGF.HaveInsertPoint())
1237 return;
1238
1239 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1240 NumThreads](CodeGenFunction &CGF,
1241 PrePostActionTy &Action) {
1242 CGBuilderTy &Bld = CGF.Builder;
1243 llvm::Value *NumThreadsVal = NumThreads;
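    // In non-SPMD mode a data-sharing wrapper was created for the outlined
    // function; when present it is passed to the runtime alongside the
    // outlined function itself.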
1244 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1245 llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy);
1246 if (WFn)
1247 ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy);
1248 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy);
1249
1250 // Create a private scope that will globalize the arguments
1251 // passed from the outside of the target region.
1252 // TODO: Is that needed?
1253 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1254
1255 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1256 Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()),
1257 Name: "captured_vars_addrs");
1258 // There's something to share.
1259 if (!CapturedVars.empty()) {
1260 // Prepare for parallel region. Indicate the outlined function.
1261 ASTContext &Ctx = CGF.getContext();
1262 unsigned Idx = 0;
1263 for (llvm::Value *V : CapturedVars) {
1264 Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx);
1265 llvm::Value *PtrV;
1266 if (V->getType()->isIntegerTy())
1267 PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy);
1268 else
1269 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy);
1270 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1271 Ctx.getPointerType(Ctx.VoidPtrTy));
1272 ++Idx;
1273 }
1274 }
1275
1276 llvm::Value *IfCondVal = nullptr;
1277 if (IfCond)
1278 IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty,
1279 /* isSigned */ false);
1280 else
1281 IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1);
1282
    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
1289 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1290 llvm::Value *Args[] = {
1291 RTLoc,
1292 getThreadID(CGF, Loc),
1293 IfCondVal,
1294 NumThreadsVal,
1295 llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1),
1296 FnPtr,
1297 ID,
1298 Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF),
1299 DestTy: CGF.VoidPtrPtrTy),
1300 llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())};
1301 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1302 M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51),
1303 Args);
1304 };
1305
1306 RegionCodeGenTy RCG(ParallelGen);
1307 RCG(CGF);
1308}
1309
1310void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1311 // Always emit simple barriers!
1312 if (!CGF.HaveInsertPoint())
1313 return;
1314 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1315 // This function does not use parameters, so we can emit just default values.
1316 llvm::Value *Args[] = {
1317 llvm::ConstantPointerNull::get(
1318 T: cast<llvm::PointerType>(getIdentTyPointerTy())),
1319 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1320 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1321 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1322 Args);
1323}
1324
1325void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1326 SourceLocation Loc,
1327 OpenMPDirectiveKind Kind, bool,
1328 bool) {
1329 // Always emit simple barriers!
1330 if (!CGF.HaveInsertPoint())
1331 return;
1332 // Build call __kmpc_cancel_barrier(loc, thread_id);
1333 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1334 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1335 getThreadID(CGF, Loc)};
1336
1337 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1339 Args);
1340}
1341
1342void CGOpenMPRuntimeGPU::emitCriticalRegion(
1343 CodeGenFunction &CGF, StringRef CriticalName,
1344 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1345 const Expr *Hint) {
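  // On the GPU the critical region is serialized by looping over all threads
  // in the team: each thread waits until the loop counter matches its thread
  // id, executes the body, and rejoins the warp-level sync before the counter
  // is incremented.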
1346 llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1347 llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1348 llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1349 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1350 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1351
1352 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1353
1354 // Get the mask of active threads in the warp.
1355 llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1356 M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1357 // Fetch team-local id of the thread.
1358 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1359
1360 // Get the width of the team.
1361 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1362
1363 // Initialize the counter variable for the loop.
1364 QualType Int32Ty =
1365 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1366 Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1367 LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1368 CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1369 /*isInit=*/true);
1370
1371 // Block checks if loop counter exceeds upper bound.
1372 CGF.EmitBlock(BB: LoopBB);
1373 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1374 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1375 CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1376
1377 // Block tests which single thread should execute region, and which threads
1378 // should go straight to synchronisation point.
1379 CGF.EmitBlock(BB: TestBB);
1380 CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1381 llvm::Value *CmpThreadToCounter =
1382 CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1383 CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1384
1385 // Block emits the body of the critical region.
1386 CGF.EmitBlock(BB: BodyBB);
1387
1388 // Output the critical statement.
1389 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1390 Hint);
1391
1392 // After the body surrounded by the critical region, the single executing
1393 // thread will jump to the synchronisation point.
1394 // Block waits for all threads in current team to finish then increments the
1395 // counter variable and returns to the loop.
1396 CGF.EmitBlock(BB: SyncBB);
1397 // Reconverge active threads in the warp.
1398 (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1399 M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1400 args: Mask);
1401
1402 llvm::Value *IncCounterVal =
1403 CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
1404 CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1405 CGF.EmitBranch(Block: LoopBB);
1406
1407 // Block that is reached when all threads in the team complete the region.
1408 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
1409}
1410
/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CastItem.withElementType(Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}
1437
1438/// This function creates calls to one of two shuffle functions to copy
1439/// variables between lanes in a warp.
1440static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1441 llvm::Value *Elem,
1442 QualType ElemType,
1443 llvm::Value *Offset,
1444 SourceLocation Loc) {
1445 CodeGenModule &CGM = CGF.CGM;
1446 CGBuilderTy &Bld = CGF.Builder;
1447 CGOpenMPRuntimeGPU &RT =
1448 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1449 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1450
1451 CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType);
1452 assert(Size.getQuantity() <= 8 &&
1453 "Unsupported bitwidth in shuffle instruction.");
1454
1455 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1456 ? OMPRTL___kmpc_shuffle_int32
1457 : OMPRTL___kmpc_shuffle_int64;
1458
1459 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1460 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1461 DestWidth: Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1462 llvm::Value *ElemCast = castValueToType(CGF, Val: Elem, ValTy: ElemType, CastTy, Loc);
1463 llvm::Value *WarpSize =
1464 Bld.CreateIntCast(V: RT.getGPUWarpSize(CGF), DestTy: CGM.Int16Ty, /*isSigned=*/true);
1465
1466 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1467 callee: OMPBuilder.getOrCreateRuntimeFunction(M&: CGM.getModule(), FnID: ShuffleFn),
1468 args: {ElemCast, Offset, WarpSize});
1469
1470 return castValueToType(CGF, Val: ShuffledVal, ValTy: CastTy, CastTy: ElemType, Loc);
1471}
1472
1473static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1474 Address DestAddr, QualType ElemType,
1475 llvm::Value *Offset, SourceLocation Loc) {
1476 CGBuilderTy &Bld = CGF.Builder;
1477
1478 CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType);
1479 // Create the loop over the big sized data.
1480 // ptr = (void*)Elem;
1481 // ptrEnd = (void*) Elem + 1;
1482 // Step = 8;
1483 // while (ptr + Step < ptrEnd)
1484 // shuffle((int64_t)*ptr);
1485 // Step = 4;
1486 // while (ptr + Step < ptrEnd)
1487 // shuffle((int32_t)*ptr);
1488 // ...
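  // For example, a 12-byte element is transferred as one 8-byte shuffle
  // followed by one 4-byte shuffle, and a 6-byte element as one 4-byte
  // shuffle followed by one 2-byte shuffle.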
1489 Address ElemPtr = DestAddr;
1490 Address Ptr = SrcAddr;
1491 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1492 Addr: Bld.CreateConstGEP(Addr: SrcAddr, Index: 1), Ty: CGF.VoidPtrTy, ElementTy: CGF.Int8Ty);
1493 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1494 if (Size < CharUnits::fromQuantity(Quantity: IntSize))
1495 continue;
1496 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1497 DestWidth: CGF.getContext().toBits(CharSize: CharUnits::fromQuantity(Quantity: IntSize)),
1498 /*Signed=*/1);
1499 llvm::Type *IntTy = CGF.ConvertTypeForMem(T: IntType);
1500 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Ptr, Ty: IntTy->getPointerTo(),
1501 ElementTy: IntTy);
1502 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1503 Addr: ElemPtr, Ty: IntTy->getPointerTo(), ElementTy: IntTy);
1504 if (Size.getQuantity() / IntSize > 1) {
1505 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(name: ".shuffle.pre_cond");
1506 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".shuffle.then");
1507 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".shuffle.exit");
1508 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1509 CGF.EmitBlock(BB: PreCondBB);
1510 llvm::PHINode *PhiSrc =
1511 Bld.CreatePHI(Ty: Ptr.getType(), /*NumReservedValues=*/2);
1512 PhiSrc->addIncoming(V: Ptr.emitRawPointer(CGF), BB: CurrentBB);
1513 llvm::PHINode *PhiDest =
1514 Bld.CreatePHI(Ty: ElemPtr.getType(), /*NumReservedValues=*/2);
1515 PhiDest->addIncoming(V: ElemPtr.emitRawPointer(CGF), BB: CurrentBB);
1516 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1517 ElemPtr =
1518 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1519 llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);
1520 llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);
1521 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1522 ElemTy: CGF.Int8Ty, LHS: PtrEndRaw,
1523 RHS: Bld.CreatePointerBitCastOrAddrSpaceCast(V: PtrRaw, DestTy: CGF.VoidPtrTy));
1524 Bld.CreateCondBr(Cond: Bld.CreateICmpSGT(LHS: PtrDiff, RHS: Bld.getInt64(C: IntSize - 1)),
1525 True: ThenBB, False: ExitBB);
1526 CGF.EmitBlock(BB: ThenBB);
1527 llvm::Value *Res = createRuntimeShuffleFunction(
1528 CGF,
1529 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1530 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1531 TBAAInfo: TBAAAccessInfo()),
1532 ElemType: IntType, Offset, Loc);
1533 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1534 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1535 TBAAInfo: TBAAAccessInfo());
1536 Address LocalPtr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1537 Address LocalElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1538 PhiSrc->addIncoming(V: LocalPtr.emitRawPointer(CGF), BB: ThenBB);
1539 PhiDest->addIncoming(V: LocalElemPtr.emitRawPointer(CGF), BB: ThenBB);
1540 CGF.EmitBranch(Block: PreCondBB);
1541 CGF.EmitBlock(BB: ExitBB);
1542 } else {
1543 llvm::Value *Res = createRuntimeShuffleFunction(
1544 CGF,
1545 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1546 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1547 TBAAInfo: TBAAAccessInfo()),
1548 ElemType: IntType, Offset, Loc);
1549 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1550 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1551 TBAAInfo: TBAAAccessInfo());
1552 Ptr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1553 ElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1554 }
1555 Size = Size % IntSize;
1556 }
1557}
1558
1559namespace {
1560enum CopyAction : unsigned {
1561 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1562 // the warp using shuffle instructions.
1563 RemoteLaneToThread,
1564 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1565 ThreadCopy,
1566};
1567} // namespace
1568
1569struct CopyOptionsTy {
1570 llvm::Value *RemoteLaneOffset;
1571 llvm::Value *ScratchpadIndex;
1572 llvm::Value *ScratchpadWidth;
1573};
1574
1575/// Emit instructions to copy a Reduce list, which contains partially
1576/// aggregated values, in the specified direction.
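/// In rough pseudocode, for Action == RemoteLaneToThread (a sketch; names are
/// illustrative):
///
///   for (i = 0; i < NumReduceElements; ++i) {
///     SrcElem     = (ElemTy *)SrcBase[i];             // load element pointer
///     RemoteElem  = alloca(ElemTy);                   // temp on the stack
///     *RemoteElem = shuffle(*SrcElem, RemoteLaneOffset);
///     DestBase[i] = (void *)RemoteElem;               // update dest list ptr
///   }
///
/// For Action == ThreadCopy the element is copied from the source list entry
/// into the already-allocated destination list entry instead.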
1577static void emitReductionListCopy(
1578 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1579 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1580 CopyOptionsTy CopyOptions = {.RemoteLaneOffset: nullptr, .ScratchpadIndex: nullptr, .ScratchpadWidth: nullptr}) {
1581
1582 CodeGenModule &CGM = CGF.CGM;
1583 ASTContext &C = CGM.getContext();
1584 CGBuilderTy &Bld = CGF.Builder;
1585
1586 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1587
1588  // Iterate, element by element, through the source Reduce list and
1589  // make a copy.
1590 unsigned Idx = 0;
1591 for (const Expr *Private : Privates) {
1592 Address SrcElementAddr = Address::invalid();
1593 Address DestElementAddr = Address::invalid();
1594 Address DestElementPtrAddr = Address::invalid();
1595 // Should we shuffle in an element from a remote lane?
1596 bool ShuffleInElement = false;
1597 // Set to true to update the pointer in the dest Reduce list to a
1598 // newly created element.
1599 bool UpdateDestListPtr = false;
1600 QualType PrivatePtrType = C.getPointerType(T: Private->getType());
1601 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(T: PrivatePtrType);
1602
1603 switch (Action) {
1604 case RemoteLaneToThread: {
1605 // Step 1.1: Get the address for the src element in the Reduce list.
1606 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1607 SrcElementAddr = CGF.EmitLoadOfPointer(
1608 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1609 PtrTy: PrivatePtrType->castAs<PointerType>());
1610
1611 // Step 1.2: Create a temporary to store the element in the destination
1612 // Reduce list.
1613 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1614 DestElementAddr =
1615 CGF.CreateMemTemp(T: Private->getType(), Name: ".omp.reduction.element");
1616 ShuffleInElement = true;
1617 UpdateDestListPtr = true;
1618 break;
1619 }
1620 case ThreadCopy: {
1621 // Step 1.1: Get the address for the src element in the Reduce list.
1622 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1623 SrcElementAddr = CGF.EmitLoadOfPointer(
1624 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1625 PtrTy: PrivatePtrType->castAs<PointerType>());
1626
1627 // Step 1.2: Get the address for dest element. The destination
1628 // element has already been created on the thread's stack.
1629 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1630 DestElementAddr = CGF.EmitLoadOfPointer(
1631 Ptr: DestElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1632 PtrTy: PrivatePtrType->castAs<PointerType>());
1633 break;
1634 }
1635 }
1636
1637    // Regardless of the src and dest of the copy, we emit the load of the
1638    // src element, as it is required in all directions.
1639 SrcElementAddr = SrcElementAddr.withElementType(
1640 ElemTy: CGF.ConvertTypeForMem(T: Private->getType()));
1641 DestElementAddr =
1642 DestElementAddr.withElementType(ElemTy: SrcElementAddr.getElementType());
1643
1644 // Now that all active lanes have read the element in the
1645 // Reduce list, shuffle over the value from the remote lane.
1646 if (ShuffleInElement) {
1647 shuffleAndStore(CGF, SrcAddr: SrcElementAddr, DestAddr: DestElementAddr, ElemType: Private->getType(),
1648 Offset: RemoteLaneOffset, Loc: Private->getExprLoc());
1649 } else {
1650 switch (CGF.getEvaluationKind(T: Private->getType())) {
1651 case TEK_Scalar: {
1652 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1653 Addr: SrcElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1654 Loc: Private->getExprLoc(), BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1655 TBAAInfo: TBAAAccessInfo());
1656 // Store the source element value to the dest element address.
1657 CGF.EmitStoreOfScalar(
1658 Value: Elem, Addr: DestElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1659 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1660 break;
1661 }
1662 case TEK_Complex: {
1663 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1664 src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1665 loc: Private->getExprLoc());
1666 CGF.EmitStoreOfComplex(
1667 V: Elem, dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1668 /*isInit=*/false);
1669 break;
1670 }
1671 case TEK_Aggregate:
1672 CGF.EmitAggregateCopy(
1673 Dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1674 Src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1675 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
1676 break;
1677 }
1678 }
1679
1680 // Step 3.1: Modify reference in dest Reduce list as needed.
1681 // Modifying the reference in Reduce list to point to the newly
1682 // created element. The element is live in the current function
1683 // scope and that of functions it invokes (i.e., reduce_function).
1684 // RemoteReduceData[i] = (void*)&RemoteElem
1685 if (UpdateDestListPtr) {
1686 CGF.EmitStoreOfScalar(
1687 Bld.CreatePointerBitCastOrAddrSpaceCast(
1688 V: DestElementAddr.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy),
1689 DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);
1690 }
1691
1692 ++Idx;
1693 }
1694}
1695
1696/// This function emits a helper that gathers Reduce lists from the first
1697/// lane of every active warp to lanes in the first warp.
1698///
1699/// void inter_warp_copy_func(void* reduce_data, int num_warps)
1700/// shared smem[warp_size];
1701/// For all data entries D in reduce_data:
1702/// sync
1703/// If (I am the first lane in each warp)
1704/// Copy my local D to smem[warp_id]
1705/// sync
1706/// if (I am the first warp)
1707/// Copy smem[thread_id] to my local D
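/// Because the shared transfer array holds 32-bit slots, each element D is
/// moved in chunks of 4, then 2, then 1 bytes; e.g. a 6-byte element takes
/// one 4-byte round followed by one 2-byte round. A sketch of the chunking:
///
///   for (chunk = 4; chunk > 0 && remaining > 0; chunk /= 2) {
///     for (i = 0; i < remaining / chunk; ++i)
///       <copy chunk i of D through smem as above>
///     remaining %= chunk;
///   }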
1708static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1709 ArrayRef<const Expr *> Privates,
1710 QualType ReductionArrayTy,
1711 SourceLocation Loc) {
1712 ASTContext &C = CGM.getContext();
1713 llvm::Module &M = CGM.getModule();
1714
1715 // ReduceList: thread local Reduce list.
1716 // At the stage of the computation when this function is called, partially
1717 // aggregated values reside in the first lane of every active warp.
1718 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1719 C.VoidPtrTy, ImplicitParamKind::Other);
1720 // NumWarps: number of warps active in the parallel region. This could
1721 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1722 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1723 C.getIntTypeForBitwidth(DestWidth: 32, /* Signed */ true),
1724 ImplicitParamKind::Other);
1725 FunctionArgList Args;
1726 Args.push_back(&ReduceListArg);
1727 Args.push_back(&NumWarpsArg);
1728
1729 const CGFunctionInfo &CGFI =
1730 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1731 auto *Fn = llvm::Function::Create(Ty: CGM.getTypes().GetFunctionType(Info: CGFI),
1732 Linkage: llvm::GlobalValue::InternalLinkage,
1733 N: "_omp_reduction_inter_warp_copy_func", M: &M);
1734 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
1735 Fn->setDoesNotRecurse();
1736 CodeGenFunction CGF(CGM);
1737 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
1738
1739 CGBuilderTy &Bld = CGF.Builder;
1740
1741 // This array is used as a medium to transfer, one reduce element at a time,
1742 // the data from the first lane of every warp to lanes in the first warp
1743 // in order to perform the final step of a reduction in a parallel region
1744 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1745 // for reduced latency, as well as to have a distinct copy for concurrently
1746  // executing target regions. The array is declared with weak linkage so
1747 // as to be shared across compilation units.
1748 StringRef TransferMediumName =
1749 "__openmp_nvptx_data_transfer_temporary_storage";
1750 llvm::GlobalVariable *TransferMedium =
1751 M.getGlobalVariable(Name: TransferMediumName);
1752 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1753 if (!TransferMedium) {
1754 auto *Ty = llvm::ArrayType::get(ElementType: CGM.Int32Ty, NumElements: WarpSize);
1755 unsigned SharedAddressSpace = C.getTargetAddressSpace(AS: LangAS::cuda_shared);
1756 TransferMedium = new llvm::GlobalVariable(
1757 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1758 llvm::UndefValue::get(T: Ty), TransferMediumName,
1759 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1760 SharedAddressSpace);
1761 CGM.addCompilerUsedGlobal(GV: TransferMedium);
1762 }
1763
1764 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1765 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1766 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1767 // nvptx_lane_id = nvptx_id % warpsize
1768 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1769 // nvptx_warp_id = nvptx_id / warpsize
1770 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1771
1772 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1773 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
1774 Address LocalReduceList(
1775 Bld.CreatePointerBitCastOrAddrSpaceCast(
1776 CGF.EmitLoadOfScalar(
1777 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1778 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1779 ElemTy->getPointerTo()),
1780 ElemTy, CGF.getPointerAlign());
1781
1782 unsigned Idx = 0;
1783 for (const Expr *Private : Privates) {
1784 //
1785 // Warp master copies reduce element to transfer medium in __shared__
1786 // memory.
1787 //
1788 unsigned RealTySize =
1789 C.getTypeSizeInChars(T: Private->getType())
1790 .alignTo(Align: C.getTypeAlignInChars(T: Private->getType()))
1791 .getQuantity();
1792    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1793 unsigned NumIters = RealTySize / TySize;
1794 if (NumIters == 0)
1795 continue;
1796 QualType CType = C.getIntTypeForBitwidth(
1797 DestWidth: C.toBits(CharSize: CharUnits::fromQuantity(Quantity: TySize)), /*Signed=*/1);
1798 llvm::Type *CopyType = CGF.ConvertTypeForMem(T: CType);
1799 CharUnits Align = CharUnits::fromQuantity(Quantity: TySize);
1800 llvm::Value *Cnt = nullptr;
1801 Address CntAddr = Address::invalid();
1802 llvm::BasicBlock *PrecondBB = nullptr;
1803 llvm::BasicBlock *ExitBB = nullptr;
1804 if (NumIters > 1) {
1805 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1806 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(Ty: CGM.IntTy), CntAddr,
1807 /*Volatile=*/false, C.IntTy);
1808 PrecondBB = CGF.createBasicBlock(name: "precond");
1809 ExitBB = CGF.createBasicBlock(name: "exit");
1810 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "body");
1811        // There is no need to emit a line number for the unconditional branch.
1812 (void)ApplyDebugLocation::CreateEmpty(CGF);
1813 CGF.EmitBlock(BB: PrecondBB);
1814 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1815 llvm::Value *Cmp =
1816 Bld.CreateICmpULT(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, V: NumIters));
1817 Bld.CreateCondBr(Cond: Cmp, True: BodyBB, False: ExitBB);
1818 CGF.EmitBlock(BB: BodyBB);
1819 }
1820 // kmpc_barrier.
1821 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1822 /*EmitChecks=*/false,
1823 /*ForceSimpleCall=*/true);
1824 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
1825 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
1826 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
1827
1828 // if (lane_id == 0)
1829 llvm::Value *IsWarpMaster = Bld.CreateIsNull(Arg: LaneID, Name: "warp_master");
1830 Bld.CreateCondBr(Cond: IsWarpMaster, True: ThenBB, False: ElseBB);
1831 CGF.EmitBlock(BB: ThenBB);
1832
1833 // Reduce element = LocalReduceList[i]
1834 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1835 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1836 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1837 // elemptr = ((CopyType*)(elemptrptr)) + I
1838 Address ElemPtr(ElemPtrPtr, CopyType, Align);
1839 if (NumIters > 1)
1840 ElemPtr = Bld.CreateGEP(CGF, Addr: ElemPtr, Index: Cnt);
1841
1842 // Get pointer to location in transfer medium.
1843 // MediumPtr = &medium[warp_id]
1844 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1845 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1846 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), WarpID});
1847 // Casting to actual data type.
1848 // MediumPtr = (CopyType*)MediumPtrAddr;
1849 Address MediumPtr(MediumPtrVal, CopyType, Align);
1850
1851 // elem = *elemptr
1852      // *MediumPtr = elem
1853 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1854 Addr: ElemPtr, /*Volatile=*/false, Ty: CType, Loc,
1855 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1856 // Store the source element value to the dest element address.
1857 CGF.EmitStoreOfScalar(Value: Elem, Addr: MediumPtr, /*Volatile=*/true, Ty: CType,
1858 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1859 TBAAInfo: TBAAAccessInfo());
1860
1861 Bld.CreateBr(Dest: MergeBB);
1862
1863 CGF.EmitBlock(BB: ElseBB);
1864 Bld.CreateBr(Dest: MergeBB);
1865
1866 CGF.EmitBlock(BB: MergeBB);
1867
1868 // kmpc_barrier.
1869 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1870 /*EmitChecks=*/false,
1871 /*ForceSimpleCall=*/true);
1872
1873 //
1874 // Warp 0 copies reduce element from transfer medium.
1875 //
1876 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock(name: "then");
1877 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock(name: "else");
1878 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock(name: "ifcont");
1879
1880 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1881 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1882 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1883
1884 // Up to 32 threads in warp 0 are active.
1885 llvm::Value *IsActiveThread =
1886 Bld.CreateICmpULT(LHS: ThreadID, RHS: NumWarpsVal, Name: "is_active_thread");
1887 Bld.CreateCondBr(Cond: IsActiveThread, True: W0ThenBB, False: W0ElseBB);
1888
1889 CGF.EmitBlock(BB: W0ThenBB);
1890
1891 // SrcMediumPtr = &medium[tid]
1892 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1893 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1894 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), ThreadID});
1895 // SrcMediumVal = *SrcMediumPtr;
1896 Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1897
1898 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1899 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1900 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1901 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1902 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1903 if (NumIters > 1)
1904 TargetElemPtr = Bld.CreateGEP(CGF, Addr: TargetElemPtr, Index: Cnt);
1905
1906 // *TargetElemPtr = SrcMediumVal;
1907 llvm::Value *SrcMediumValue =
1908 CGF.EmitLoadOfScalar(Addr: SrcMediumPtr, /*Volatile=*/true, Ty: CType, Loc);
1909 CGF.EmitStoreOfScalar(Value: SrcMediumValue, Addr: TargetElemPtr, /*Volatile=*/false,
1910 Ty: CType);
1911 Bld.CreateBr(Dest: W0MergeBB);
1912
1913 CGF.EmitBlock(BB: W0ElseBB);
1914 Bld.CreateBr(Dest: W0MergeBB);
1915
1916 CGF.EmitBlock(BB: W0MergeBB);
1917
1918 if (NumIters > 1) {
1919 Cnt = Bld.CreateNSWAdd(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, /*V=*/1));
1920 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1921 CGF.EmitBranch(Block: PrecondBB);
1922 (void)ApplyDebugLocation::CreateEmpty(CGF);
1923 CGF.EmitBlock(BB: ExitBB);
1924 }
1925 RealTySize %= TySize;
1926 }
1927 ++Idx;
1928 }
1929
1930 CGF.FinishFunction();
1931 return Fn;
1932}
1933
1934/// Emit a helper that reduces data across two OpenMP threads (lanes)
1935/// in the same warp. It uses shuffle instructions to copy over data from
1936/// a remote lane's stack. The reduction algorithm performed is specified
1937/// by the fourth parameter.
1938///
1939/// Algorithm Versions.
1940/// Full Warp Reduce (argument value 0):
1941/// This algorithm assumes that all 32 lanes are active and gathers
1942/// data from these 32 lanes, producing a single resultant value.
1943/// Contiguous Partial Warp Reduce (argument value 1):
1944/// This algorithm assumes that only a *contiguous* subset of lanes
1945/// are active. This happens for the last warp in a parallel region
1946/// when the user specified num_threads is not an integer multiple of
1947/// 32. This contiguous subset always starts with the zeroth lane.
1948/// Partial Warp Reduce (argument value 2):
1949/// This algorithm gathers data from any number of lanes at any position.
1950/// All reduced values are stored in the lowest possible lane. The set
1951///     of problems every algorithm addresses is a superset of those
1952/// addressable by algorithms with a lower version number. Overhead
1953/// increases as algorithm version increases.
1954///
1955/// Terminology
1956/// Reduce element:
1957/// Reduce element refers to the individual data field with primitive
1958/// data types to be combined and reduced across threads.
1959/// Reduce list:
1960/// Reduce list refers to a collection of local, thread-private
1961/// reduce elements.
1962/// Remote Reduce list:
1963/// Remote Reduce list refers to a collection of remote (relative to
1964/// the current thread) reduce elements.
1965///
1966/// We distinguish between three states of threads that are important to
1967/// the implementation of this function.
1968/// Alive threads:
1969/// Threads in a warp executing the SIMT instruction, as distinguished from
1970/// threads that are inactive due to divergent control flow.
1971/// Active threads:
1972/// The minimal set of threads that has to be alive upon entry to this
1973/// function. The computation is correct iff active threads are alive.
1974/// Some threads are alive but they are not active because they do not
1975/// contribute to the computation in any useful manner. Turning them off
1976/// may introduce control flow overheads without any tangible benefits.
1977/// Effective threads:
1978/// In order to comply with the argument requirements of the shuffle
1979/// function, we must keep all lanes holding data alive. But at most
1980/// half of them perform value aggregation; we refer to this half of
1981/// threads as effective. The other half is simply handing off their
1982/// data.
1983///
1984/// Procedure
1985/// Value shuffle:
1986/// In this step active threads transfer data from higher lane positions
1987/// in the warp to lower lane positions, creating Remote Reduce list.
1988/// Value aggregation:
1989/// In this step, effective threads combine their thread local Reduce list
1990/// with Remote Reduce list and store the result in the thread local
1991/// Reduce list.
1992/// Value copy:
1993/// In this step, we deal with the assumption made by algorithm 2
1994/// (i.e. contiguity assumption). When we have an odd number of lanes
1995/// active, say 2k+1, only k threads will be effective and therefore k
1996/// new values will be produced. However, the Reduce list owned by the
1997/// (2k+1)th thread is ignored in the value aggregation. Therefore
1998/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1999/// that the contiguity assumption still holds.
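/// Condensed pseudocode of the generated helper (a sketch; the actual body is
/// emitted as IR below):
///
///   void shuffle_and_reduce(void *local, short lane_id, short offset,
///                           short algo_ver) {
///     remote = <copy of local whose elements are shuffled in from
///               lane (lane_id + offset)>;              // value shuffle
///     if (algo_ver == 0 || (algo_ver == 1 && lane_id < offset) ||
///         (algo_ver == 2 && lane_id % 2 == 0 && offset > 0))
///       reduce_function(local, remote);                // value aggregation
///     if (algo_ver == 1 && lane_id >= offset)
///       <copy remote into local>;                      // value copy
///   }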
2000static llvm::Function *emitShuffleAndReduceFunction(
2001 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2002 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2003 ASTContext &C = CGM.getContext();
2004
2005 // Thread local Reduce list used to host the values of data to be reduced.
2006 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2007 C.VoidPtrTy, ImplicitParamKind::Other);
2008 // Current lane id; could be logical.
2009 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2010 ImplicitParamKind::Other);
2011 // Offset of the remote source lane relative to the current lane.
2012 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2013 C.ShortTy, ImplicitParamKind::Other);
2014 // Algorithm version. This is expected to be known at compile time.
2015 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2016 C.ShortTy, ImplicitParamKind::Other);
2017 FunctionArgList Args;
2018 Args.push_back(&ReduceListArg);
2019 Args.push_back(&LaneIDArg);
2020 Args.push_back(&RemoteLaneOffsetArg);
2021 Args.push_back(&AlgoVerArg);
2022
2023 const CGFunctionInfo &CGFI =
2024 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2025 auto *Fn = llvm::Function::Create(
2026 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2027 N: "_omp_reduction_shuffle_and_reduce_func", M: &CGM.getModule());
2028 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2029 Fn->setDoesNotRecurse();
2030
2031 CodeGenFunction CGF(CGM);
2032 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2033
2034 CGBuilderTy &Bld = CGF.Builder;
2035
2036 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2037 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2038 Address LocalReduceList(
2039 Bld.CreatePointerBitCastOrAddrSpaceCast(
2040 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2041 C.VoidPtrTy, SourceLocation()),
2042 ElemTy->getPointerTo()),
2043 ElemTy, CGF.getPointerAlign());
2044
2045 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2046 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2047 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2048
2049 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2050 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2051 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2052
2053 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2054 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2055 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2056
2057 // Create a local thread-private variable to host the Reduce list
2058 // from a remote lane.
2059 Address RemoteReduceList =
2060 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.remote_reduce_list");
2061
2062 // This loop iterates through the list of reduce elements and copies,
2063 // element by element, from a remote lane in the warp to RemoteReduceList,
2064 // hosted on the thread's stack.
2065 emitReductionListCopy(Action: RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2066 SrcBase: LocalReduceList, DestBase: RemoteReduceList,
2067 CopyOptions: {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2068 /*ScratchpadIndex=*/nullptr,
2069 /*ScratchpadWidth=*/nullptr});
2070
2071  // The actions to be performed on the Remote Reduce list depend
2072  // on the algorithm version.
2073 //
2074 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2075 // LaneId % 2 == 0 && Offset > 0):
2076 // do the reduction value aggregation
2077 //
2078 // The thread local variable Reduce list is mutated in place to host the
2079 // reduced data, which is the aggregated value produced from local and
2080 // remote lanes.
2081 //
2082 // Note that AlgoVer is expected to be a constant integer known at compile
2083 // time.
2084  // When AlgoVer==0, the first conjunction evaluates to true, making
2085  // the entire predicate true at compile time.
2086  // When AlgoVer==1, only the second part of the second conjunction is
2087  // evaluated at runtime. The other conjunctions evaluate to false
2088  // at compile time.
2089  // When AlgoVer==2, only the second part of the third conjunction is
2090  // evaluated at runtime. The other conjunctions evaluate to false
2091  // at compile time.
2092 llvm::Value *CondAlgo0 = Bld.CreateIsNull(Arg: AlgoVerArgVal);
2093
2094 llvm::Value *Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2095 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2096 LHS: Algo1, RHS: Bld.CreateICmpULT(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2097
2098 llvm::Value *Algo2 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 2));
2099 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2100 LHS: Algo2, RHS: Bld.CreateIsNull(Arg: Bld.CreateAnd(LHS: LaneIDArgVal, RHS: Bld.getInt16(C: 1))));
2101 CondAlgo2 = Bld.CreateAnd(
2102 LHS: CondAlgo2, RHS: Bld.CreateICmpSGT(LHS: RemoteLaneOffsetArgVal, RHS: Bld.getInt16(C: 0)));
2103
2104 llvm::Value *CondReduce = Bld.CreateOr(LHS: CondAlgo0, RHS: CondAlgo1);
2105 CondReduce = Bld.CreateOr(LHS: CondReduce, RHS: CondAlgo2);
2106
2107 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
2108 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
2109 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
2110 Bld.CreateCondBr(Cond: CondReduce, True: ThenBB, False: ElseBB);
2111
2112 CGF.EmitBlock(BB: ThenBB);
2113 // reduce_function(LocalReduceList, RemoteReduceList)
2114 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2115 V: LocalReduceList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy);
2116 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2117 V: RemoteReduceList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy);
2118 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2119 CGF, Loc, OutlinedFn: ReduceFn, Args: {LocalReduceListPtr, RemoteReduceListPtr});
2120 Bld.CreateBr(Dest: MergeBB);
2121
2122 CGF.EmitBlock(BB: ElseBB);
2123 Bld.CreateBr(Dest: MergeBB);
2124
2125 CGF.EmitBlock(BB: MergeBB);
2126
2127 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2128 // Reduce list.
2129 Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2130 llvm::Value *CondCopy = Bld.CreateAnd(
2131 LHS: Algo1, RHS: Bld.CreateICmpUGE(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2132
2133 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock(name: "then");
2134 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock(name: "else");
2135 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock(name: "ifcont");
2136 Bld.CreateCondBr(Cond: CondCopy, True: CpyThenBB, False: CpyElseBB);
2137
2138 CGF.EmitBlock(BB: CpyThenBB);
2139 emitReductionListCopy(Action: ThreadCopy, CGF, ReductionArrayTy, Privates,
2140 SrcBase: RemoteReduceList, DestBase: LocalReduceList);
2141 Bld.CreateBr(Dest: CpyMergeBB);
2142
2143 CGF.EmitBlock(BB: CpyElseBB);
2144 Bld.CreateBr(Dest: CpyMergeBB);
2145
2146 CGF.EmitBlock(BB: CpyMergeBB);
2147
2148 CGF.FinishFunction();
2149 return Fn;
2150}
2151
2152/// This function emits a helper that copies all the reduction variables from
2153/// the team into the provided global buffer for the reduction variables.
2154///
2155/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2156/// For all data entries D in reduce_data:
2157/// Copy local D to buffer.D[Idx]
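/// For instance, with a single reduction variable foo of type double, the
/// helper conceptually performs (a sketch, using the notation above):
///
///   buffer.foo[Idx] = *(double *)((void **)reduce_data)[0];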
2158static llvm::Value *emitListToGlobalCopyFunction(
2159 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2160 QualType ReductionArrayTy, SourceLocation Loc,
2161 const RecordDecl *TeamReductionRec,
2162 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2163 &VarFieldMap) {
2164 ASTContext &C = CGM.getContext();
2165
2166 // Buffer: global reduction buffer.
2167 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2168 C.VoidPtrTy, ImplicitParamKind::Other);
2169 // Idx: index of the buffer.
2170 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2171 ImplicitParamKind::Other);
2172 // ReduceList: thread local Reduce list.
2173 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2174 C.VoidPtrTy, ImplicitParamKind::Other);
2175 FunctionArgList Args;
2176 Args.push_back(&BufferArg);
2177 Args.push_back(&IdxArg);
2178 Args.push_back(&ReduceListArg);
2179
2180 const CGFunctionInfo &CGFI =
2181 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2182 auto *Fn = llvm::Function::Create(
2183 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2184 N: "_omp_reduction_list_to_global_copy_func", M: &CGM.getModule());
2185 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2186 Fn->setDoesNotRecurse();
2187 CodeGenFunction CGF(CGM);
2188 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2189
2190 CGBuilderTy &Bld = CGF.Builder;
2191
2192 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2193 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2194 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2195 Address LocalReduceList(
2196 Bld.CreatePointerBitCastOrAddrSpaceCast(
2197 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2198 C.VoidPtrTy, Loc),
2199 ElemTy->getPointerTo()),
2200 ElemTy, CGF.getPointerAlign());
2201 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2202 llvm::Type *LLVMReductionsBufferTy =
2203 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2204 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2205 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2206 LLVMReductionsBufferTy->getPointerTo());
2207 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2208 /*Volatile=*/false, C.IntTy,
2209 Loc)};
2210 unsigned Idx = 0;
2211 for (const Expr *Private : Privates) {
2212 // Reduce element = LocalReduceList[i]
2213 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2214 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2215 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2216 // elemptr = ((CopyType*)(elemptrptr)) + I
2217 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2218 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2219 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2220 Address ElemPtr =
2221 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2222 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2223 // Global = Buffer.VD[Idx];
2224 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2225 llvm::Value *BufferPtr =
2226 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2227 LValue GlobLVal = CGF.EmitLValueForField(
2228 Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2229 Address GlobAddr = GlobLVal.getAddress(CGF);
2230 GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2231 CGF.ConvertTypeForMem(T: Private->getType()),
2232 GlobAddr.getAlignment()));
2233 switch (CGF.getEvaluationKind(T: Private->getType())) {
2234 case TEK_Scalar: {
2235 llvm::Value *V = CGF.EmitLoadOfScalar(
2236 Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(), Loc,
2237 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
2238 CGF.EmitStoreOfScalar(value: V, lvalue: GlobLVal);
2239 break;
2240 }
2241 case TEK_Complex: {
2242 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2243 src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), loc: Loc);
2244 CGF.EmitStoreOfComplex(V, dest: GlobLVal, /*isInit=*/false);
2245 break;
2246 }
2247 case TEK_Aggregate:
2248 CGF.EmitAggregateCopy(Dest: GlobLVal,
2249 Src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2250 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
2251 break;
2252 }
2253 ++Idx;
2254 }
2255
2256 CGF.FinishFunction();
2257 return Fn;
2258}
2259
2260/// This function emits a helper that reduces all the reduction variables from
2261/// the team into the provided global buffer for the reduction variables.
2262///
2263/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2264/// void *GlobPtrs[];
2265/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2266/// ...
2267/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2268/// reduce_function(GlobPtrs, reduce_data);
2269static llvm::Value *emitListToGlobalReduceFunction(
2270 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2271 QualType ReductionArrayTy, SourceLocation Loc,
2272 const RecordDecl *TeamReductionRec,
2273 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2274 &VarFieldMap,
2275 llvm::Function *ReduceFn) {
2276 ASTContext &C = CGM.getContext();
2277
2278 // Buffer: global reduction buffer.
2279 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2280 C.VoidPtrTy, ImplicitParamKind::Other);
2281 // Idx: index of the buffer.
2282 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2283 ImplicitParamKind::Other);
2284 // ReduceList: thread local Reduce list.
2285 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2286 C.VoidPtrTy, ImplicitParamKind::Other);
2287 FunctionArgList Args;
2288 Args.push_back(&BufferArg);
2289 Args.push_back(&IdxArg);
2290 Args.push_back(&ReduceListArg);
2291
2292 const CGFunctionInfo &CGFI =
2293 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2294 auto *Fn = llvm::Function::Create(
2295 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2296 N: "_omp_reduction_list_to_global_reduce_func", M: &CGM.getModule());
2297 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2298 Fn->setDoesNotRecurse();
2299 CodeGenFunction CGF(CGM);
2300 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2301
2302 CGBuilderTy &Bld = CGF.Builder;
2303
2304 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2305 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2306 llvm::Type *LLVMReductionsBufferTy =
2307 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2308 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2309 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2310 LLVMReductionsBufferTy->getPointerTo());
2311
2312 // 1. Build a list of reduction variables.
2313 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2314 RawAddress ReductionList =
2315 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2316 auto IPriv = Privates.begin();
2317 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2318 /*Volatile=*/false, C.IntTy,
2319 Loc)};
2320 unsigned Idx = 0;
2321 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2322 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2323 // Global = Buffer.VD[Idx];
2324 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2325 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2326 llvm::Value *BufferPtr =
2327 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2328 LValue GlobLVal = CGF.EmitLValueForField(
2329 Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2330 Address GlobAddr = GlobLVal.getAddress(CGF);
2331 CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2332 /*Volatile=*/false, C.VoidPtrTy);
2333 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2334 // Store array size.
2335 ++Idx;
2336 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2337 llvm::Value *Size = CGF.Builder.CreateIntCast(
2338 V: CGF.getVLASize(
2339 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2340 .NumElts,
2341 DestTy: CGF.SizeTy, /*isSigned=*/false);
2342 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2343 Addr: Elem);
2344 }
2345 }
2346
2347 // Call reduce_function(GlobalReduceList, ReduceList)
2348 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2349 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2350 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2351 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2352 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2353 CGF, Loc, OutlinedFn: ReduceFn, Args: {GlobalReduceList, ReducedPtr});
2354 CGF.FinishFunction();
2355 return Fn;
2356}
2357
2358/// This function emits a helper that copies all the reduction variables from
2359/// the provided global buffer into the local reduce_data.
2360///
2361/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2362/// For all data entries D in reduce_data:
2363/// Copy buffer.D[Idx] to local D;
2364static llvm::Value *emitGlobalToListCopyFunction(
2365 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2366 QualType ReductionArrayTy, SourceLocation Loc,
2367 const RecordDecl *TeamReductionRec,
2368 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2369 &VarFieldMap) {
2370 ASTContext &C = CGM.getContext();
2371
2372 // Buffer: global reduction buffer.
2373 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2374 C.VoidPtrTy, ImplicitParamKind::Other);
2375 // Idx: index of the buffer.
2376 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2377 ImplicitParamKind::Other);
2378 // ReduceList: thread local Reduce list.
2379 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2380 C.VoidPtrTy, ImplicitParamKind::Other);
2381 FunctionArgList Args;
2382 Args.push_back(&BufferArg);
2383 Args.push_back(&IdxArg);
2384 Args.push_back(&ReduceListArg);
2385
2386 const CGFunctionInfo &CGFI =
2387 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2388 auto *Fn = llvm::Function::Create(
2389 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2390 N: "_omp_reduction_global_to_list_copy_func", M: &CGM.getModule());
2391 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2392 Fn->setDoesNotRecurse();
2393 CodeGenFunction CGF(CGM);
2394 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2395
2396 CGBuilderTy &Bld = CGF.Builder;
2397
2398 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2399 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2400 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2401 Address LocalReduceList(
2402 Bld.CreatePointerBitCastOrAddrSpaceCast(
2403 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2404 C.VoidPtrTy, Loc),
2405 ElemTy->getPointerTo()),
2406 ElemTy, CGF.getPointerAlign());
2407 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2408 llvm::Type *LLVMReductionsBufferTy =
2409 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2410 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2411 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2412 LLVMReductionsBufferTy->getPointerTo());
2413
2414 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2415 /*Volatile=*/false, C.IntTy,
2416 Loc)};
2417 unsigned Idx = 0;
2418 for (const Expr *Private : Privates) {
2419 // Reduce element = LocalReduceList[i]
2420 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2421 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2422 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2423 // elemptr = ((CopyType*)(elemptrptr)) + I
2424 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2425 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2426 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2427 Address ElemPtr =
2428 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2429 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2430 // Global = Buffer.VD[Idx];
2431 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2432 llvm::Value *BufferPtr =
2433 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2434 LValue GlobLVal = CGF.EmitLValueForField(
2435 Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2436 Address GlobAddr = GlobLVal.getAddress(CGF);
2437 GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2438 CGF.ConvertTypeForMem(T: Private->getType()),
2439 GlobAddr.getAlignment()));
2440 switch (CGF.getEvaluationKind(T: Private->getType())) {
2441 case TEK_Scalar: {
2442 llvm::Value *V = CGF.EmitLoadOfScalar(lvalue: GlobLVal, Loc);
2443 CGF.EmitStoreOfScalar(Value: V, Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(),
2444 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
2445 TBAAInfo: TBAAAccessInfo());
2446 break;
2447 }
2448 case TEK_Complex: {
2449 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(src: GlobLVal, loc: Loc);
2450 CGF.EmitStoreOfComplex(V, dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2451 /*isInit=*/false);
2452 break;
2453 }
2454 case TEK_Aggregate:
2455 CGF.EmitAggregateCopy(Dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2456 Src: GlobLVal, EltTy: Private->getType(),
2457 MayOverlap: AggValueSlot::DoesNotOverlap);
2458 break;
2459 }
2460 ++Idx;
2461 }
2462
2463 CGF.FinishFunction();
2464 return Fn;
2465}
2466
2467/// This function emits a helper that reduces all the reduction variables in
2468/// the provided global buffer into the local reduce_data.
2469///
2470/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2471/// void *GlobPtrs[];
2472/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2473/// ...
2474/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2475/// reduce_function(reduce_data, GlobPtrs);
2476static llvm::Value *emitGlobalToListReduceFunction(
2477 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2478 QualType ReductionArrayTy, SourceLocation Loc,
2479 const RecordDecl *TeamReductionRec,
2480 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2481 &VarFieldMap,
2482 llvm::Function *ReduceFn) {
2483 ASTContext &C = CGM.getContext();
2484
2485 // Buffer: global reduction buffer.
2486 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2487 C.VoidPtrTy, ImplicitParamKind::Other);
2488 // Idx: index of the buffer.
2489 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2490 ImplicitParamKind::Other);
2491 // ReduceList: thread local Reduce list.
2492 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2493 C.VoidPtrTy, ImplicitParamKind::Other);
2494 FunctionArgList Args;
2495 Args.push_back(&BufferArg);
2496 Args.push_back(&IdxArg);
2497 Args.push_back(&ReduceListArg);
2498
2499 const CGFunctionInfo &CGFI =
2500 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2501 auto *Fn = llvm::Function::Create(
2502 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2503 N: "_omp_reduction_global_to_list_reduce_func", M: &CGM.getModule());
2504 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2505 Fn->setDoesNotRecurse();
2506 CodeGenFunction CGF(CGM);
2507 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2508
2509 CGBuilderTy &Bld = CGF.Builder;
2510
2511 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2512 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2513 llvm::Type *LLVMReductionsBufferTy =
2514 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2515 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2516 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2517 LLVMReductionsBufferTy->getPointerTo());
2518
2519 // 1. Build a list of reduction variables.
2520 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2521 Address ReductionList =
2522 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2523 auto IPriv = Privates.begin();
2524 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2525 /*Volatile=*/false, C.IntTy,
2526 Loc)};
2527 unsigned Idx = 0;
2528 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2529 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2530 // Global = Buffer.VD[Idx];
2531 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2532 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2533 llvm::Value *BufferPtr =
2534 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2535 LValue GlobLVal = CGF.EmitLValueForField(
2536 Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2537 Address GlobAddr = GlobLVal.getAddress(CGF);
2538 CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2539 /*Volatile=*/false, C.VoidPtrTy);
2540 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2541 // Store array size.
2542 ++Idx;
2543 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2544 llvm::Value *Size = CGF.Builder.CreateIntCast(
2545 V: CGF.getVLASize(
2546 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2547 .NumElts,
2548 DestTy: CGF.SizeTy, /*isSigned=*/false);
2549 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2550 Addr: Elem);
2551 }
2552 }
2553
2554 // Call reduce_function(ReduceList, GlobalReduceList)
2555 llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);
2556 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2557 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2558 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2559 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2560 CGF, Loc, OutlinedFn: ReduceFn, Args: {ReducedPtr, GlobalReduceList});
2561 CGF.FinishFunction();
2562 return Fn;
2563}
2564
2565///
2566/// Design of OpenMP reductions on the GPU
2567///
2568/// Consider a typical OpenMP program with one or more reduction
2569/// clauses:
2570///
2571/// float foo;
2572/// double bar;
2573/// #pragma omp target teams distribute parallel for \
2574/// reduction(+:foo) reduction(*:bar)
2575/// for (int i = 0; i < N; i++) {
2576/// foo += A[i]; bar *= B[i];
2577/// }
2578///
2579/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2580/// all teams. In our OpenMP implementation on the NVPTX device an
2581/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2582/// within a team are mapped to CUDA threads within a threadblock.
2583/// Our goal is to efficiently aggregate values across all OpenMP
2584/// threads such that:
2585///
2586/// - the compiler and runtime are logically concise, and
2587/// - the reduction is performed efficiently in a hierarchical
2588/// manner as follows: within OpenMP threads in the same warp,
2589/// across warps in a threadblock, and finally across teams on
2590/// the NVPTX device.
2591///
2592/// Introduction to Decoupling
2593///
2594/// We would like to decouple the compiler and the runtime so that the
2595/// latter is ignorant of the reduction variables (number, data types)
2596/// and the reduction operators. This allows a simpler interface
2597/// and implementation while still attaining good performance.
2598///
2599/// Pseudocode for the aforementioned OpenMP program generated by the
2600/// compiler is as follows:
2601///
2602/// 1. Create private copies of reduction variables on each OpenMP
2603/// thread: 'foo_private', 'bar_private'
2604/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2605/// to it and writes the result in 'foo_private' and 'bar_private'
2606/// respectively.
2607/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2608/// and store the result on the team master:
2609///
2610/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2611/// reduceData, shuffleReduceFn, interWarpCpyFn)
2612///
2613/// where:
2614/// struct ReduceData {
2615///        float *foo;
2616/// double *bar;
2617/// } reduceData
2618/// reduceData.foo = &foo_private
2619/// reduceData.bar = &bar_private
2620///
2621/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2622/// auxiliary functions generated by the compiler that operate on
2623///    variables of type 'ReduceData'. They help the runtime perform
2624/// algorithmic steps in a data agnostic manner.
2625///
2626/// 'shuffleReduceFn' is a pointer to a function that reduces data
2627/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2628/// same warp. It takes the following arguments as input:
2629///
2630/// a. variable of type 'ReduceData' on the calling lane,
2631/// b. its lane_id,
2632/// c. an offset relative to the current lane_id to generate a
2633/// remote_lane_id. The remote lane contains the second
2634/// variable of type 'ReduceData' that is to be reduced.
2635/// d. an algorithm version parameter determining which reduction
2636/// algorithm to use.
2637///
2638/// 'shuffleReduceFn' retrieves data from the remote lane using
2639/// efficient GPU shuffle intrinsics and reduces, using the
2640/// algorithm specified by the 4th parameter, the two operands
2641/// element-wise. The result is written to the first operand.
2642///
2643/// Different reduction algorithms are implemented in different
2644/// runtime functions, all calling 'shuffleReduceFn' to perform
2645/// the essential reduction step. Therefore, based on the 4th
2646/// parameter, this function behaves slightly differently to
2647/// cooperate with the runtime to ensure correctness under
2648/// different circumstances.
2649///
2650/// 'InterWarpCpyFn' is a pointer to a function that transfers
2651/// reduced variables across warps. It tunnels, through CUDA
2652/// shared memory, the thread-private data of type 'ReduceData'
2653/// from lane 0 of each warp to a lane in the first warp.
2654/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2655/// The last team writes the global reduced value to memory.
2656///
2657/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2658/// reduceData, shuffleReduceFn, interWarpCpyFn,
2659/// scratchpadCopyFn, loadAndReduceFn)
2660///
2661/// 'scratchpadCopyFn' is a helper that stores reduced
2662/// data from the team master to a scratchpad array in
2663/// global memory.
2664///
2665/// 'loadAndReduceFn' is a helper that loads data from
2666/// the scratchpad array and reduces it with the input
2667/// operand.
2668///
2669/// These compiler generated functions hide address
2670/// calculation and alignment information from the runtime.
2671/// 5. if ret == 1:
2672/// The team master of the last team stores the reduced
2673/// result to the globals in memory.
2674/// foo += reduceData.foo; bar *= reduceData.bar
2675///
2676///
2677/// Warp Reduction Algorithms
2678///
2679/// On the warp level, we have three algorithms implemented in the
2680/// OpenMP runtime depending on the number of active lanes:
2681///
2682/// Full Warp Reduction
2683///
2684/// The reduce algorithm within a warp where all lanes are active
2685/// is implemented in the runtime as follows:
2686///
2687/// full_warp_reduce(void *reduce_data,
2688/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2689/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2690/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2691/// }
2692///
2693/// The algorithm completes in log(2, WARPSIZE) steps.
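/// For WARPSIZE == 32 this is 5 steps, with shuffle offsets 16, 8, 4, 2, 1.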
2694///
2695/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2696/// not used; we therefore save instructions by not retrieving lane_id
2697/// from the corresponding special registers. The 4th parameter, which
2698/// represents the version of the algorithm being used, is set to 0 to
2699/// signify full warp reduction.
2700///
2701/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2702///
2703/// #reduce_elem refers to an element in the local lane's data structure
2704/// #remote_elem is retrieved from a remote lane
2705/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2706/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2707///
2708/// Contiguous Partial Warp Reduction
2709///
2710/// This reduce algorithm is used within a warp where only the first
2711/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2712/// number of OpenMP threads in a parallel region is not a multiple of
2713/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2714///
2715/// void
2716/// contiguous_partial_reduce(void *reduce_data,
2717/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2718/// int size, int lane_id) {
2719/// int curr_size;
2720/// int offset;
2721/// curr_size = size;
2722///   offset = curr_size/2;
2723/// while (offset>0) {
2724/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2725/// curr_size = (curr_size+1)/2;
2726/// offset = curr_size/2;
2727/// }
2728/// }
2729///
2730/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2731///
2732/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2733/// if (lane_id < offset)
2734/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2735/// else
2736/// reduce_elem = remote_elem
2737///
2738/// This algorithm assumes that the data to be reduced are located in a
2739/// contiguous subset of lanes starting from the first. When there is
2740/// an odd number of active lanes, the data in the last lane is not
2741/// aggregated with any other lane's data but is instead copied over.
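///
/// An illustrative trace with n = 5 active lanes (values v0..v4): the loop
/// above runs with offset = 2, 1, 1 and leaves the full result in lane 0
/// after three steps:
///
///   offset = 2 : lane 0 = v0 OP v2, lane 1 = v1 OP v3, lane 2 = copy of v4
///   offset = 1 : lane 0 = (v0 OP v2) OP (v1 OP v3), lane 1 = copy of v4
///   offset = 1 : lane 0 = lane 0 OP lane 1, i.e. v0..v4 all combined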
2742///
2743/// Dispersed Partial Warp Reduction
2744///
2745/// This algorithm is used within a warp when any discontiguous subset of
2746/// lanes is active. It is used to implement the reduction operation
2747/// across lanes in an OpenMP simd region or in a nested parallel region.
2748///
2749/// void
2750/// dispersed_partial_reduce(void *reduce_data,
2751/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2752/// int size, remote_id;
2753/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2754/// do {
2755/// remote_id = next_active_lane_id_right_after_me();
2756///       # the above function returns 0 if no active lane
2757/// # is present right after the current lane.
2758/// size = number_of_active_lanes_in_this_warp();
2759/// logical_lane_id /= 2;
2760/// ShuffleReduceFn(reduce_data, logical_lane_id,
2761/// remote_id-1-threadIdx.x, 2);
2762/// } while (logical_lane_id % 2 == 0 && size > 1);
2763/// }
2764///
2765/// There is no assumption made about the initial state of the reduction.
2766/// Any number of lanes (>=1) could be active at any position. The reduction
2767/// result is returned in the first active lane.
2768///
2769/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2770///
2771/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2772/// if (lane_id % 2 == 0 && offset > 0)
2773/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2774/// else
2775/// reduce_elem = remote_elem
2776///
2777///
2778/// Intra-Team Reduction
2779///
2780/// This function, as implemented in the runtime call
2781/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2782/// threads in a team. It first reduces within a warp using the
2783/// aforementioned algorithms, and then gathers all such warp-reduced
2784/// values at the first warp.
2785///
2786/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2787/// data from each "warp master" (the zeroth lane of each warp, where
2788/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2789/// a mathematical sense) the problem of reduction across warp masters in
2790/// a block to the problem of warp reduction.
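///
/// A simplified sketch of what the compiler-generated 'InterWarpCpyFn' does
/// per element (the names are illustrative only; the real helper also splits
/// larger elements into 32-bit chunks and iterates over them):
///
///   __shared__ uint32_t transfer_medium[MAX_WARPS_PER_BLOCK];
///   if (lane_id == 0)                    // each warp master publishes its
///     transfer_medium[warp_id] = elem;   // warp-reduced element
///   __syncthreads();
///   if (thread_id < num_active_warps)    // lanes of the first warp collect
///     elem = transfer_medium[thread_id]; // one copy each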
2791///
2792///
2793/// Inter-Team Reduction
2794///
2795/// Once a team has reduced its data to a single value, it is stored in
2796/// a global scratchpad array. Since each team has a distinct slot, this
2797/// can be done without locking.
2798///
2799/// The last team to write to the scratchpad array proceeds to reduce the
2800/// scratchpad array. One or more workers in the last team use the helper
2801/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2802/// the k'th worker reduces every k'th element.
2803///
2804/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2805/// reduce across workers and compute a globally reduced value.
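///
/// A rough sketch of that flow (illustrative pseudocode only; the actual
/// entry point is '__kmpc_nvptx_teams_reduce_nowait_v2' and also receives
/// the copy/reduce helpers described above):
///
///   scratchpad[team_id] = team_reduced_value;   // distinct slots, no lock
///   if (atomic_increment(&teams_done) == num_teams - 1) {
///     // Last team: worker k reduces elements k, k + num_threads, ...
///     for (int i = thread_id; i < num_teams; i += num_threads)
///       local = local REDUCE_OP scratchpad[i];
///     // ...then reduce 'local' across this team's workers as in the
///     // Intra-Team Reduction section above.
///   }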
2806///
2807void CGOpenMPRuntimeGPU::emitReduction(
2808 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2809 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2810 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2811 if (!CGF.HaveInsertPoint())
2812 return;
2813
2814 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2815#ifndef NDEBUG
2816 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2817#endif
2818
2819 if (Options.SimpleReduction) {
2820 assert(!TeamsReduction && !ParallelReduction &&
2821 "Invalid reduction selection in emitReduction.");
2822 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2823 ReductionOps, Options);
2824 return;
2825 }
2826
2827 assert((TeamsReduction || ParallelReduction) &&
2828 "Invalid reduction selection in emitReduction.");
2829
2830 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2831 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2832 int Cnt = 0;
2833 for (const Expr *DRE : Privates) {
2834 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
2835 ++Cnt;
2836 }
2837
2838 ASTContext &C = CGM.getContext();
2839 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2840 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1);
2841
2842  // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <dataSize>,
2843  //     RedList, shuffle_reduce_func, interwarp_copy_func);
2844  // or, for teams reductions,
2845  // res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <buffer>, <bufNum>, ...);
2846 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2847
2848 llvm::Value *Res;
2849 // 1. Build a list of reduction variables.
2850 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2851 auto Size = RHSExprs.size();
2852 for (const Expr *E : Privates) {
2853 if (E->getType()->isVariablyModifiedType())
2854 // Reserve place for array size.
2855 ++Size;
2856 }
2857 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2858 QualType ReductionArrayTy = C.getConstantArrayType(
2859 EltTy: C.VoidPtrTy, ArySize: ArraySize, SizeExpr: nullptr, ASM: ArraySizeModifier::Normal,
2860 /*IndexTypeQuals=*/0);
2861 Address ReductionList =
2862 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2863 auto IPriv = Privates.begin();
2864 unsigned Idx = 0;
2865 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2866 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2867 CGF.Builder.CreateStore(
2868 Val: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2869 V: CGF.EmitLValue(E: RHSExprs[I]).getPointer(CGF), DestTy: CGF.VoidPtrTy),
2870 Addr: Elem);
2871 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2872 // Store array size.
2873 ++Idx;
2874 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2875 llvm::Value *Size = CGF.Builder.CreateIntCast(
2876 V: CGF.getVLASize(
2877 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2878 .NumElts,
2879 DestTy: CGF.SizeTy, /*isSigned=*/false);
2880 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2881 Addr: Elem);
2882 }
2883 }
2884
2885 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2886 V: ReductionList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy);
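  // Emit the element-wise reduction function and the GPU-specific helpers
  // (shuffle-and-reduce for intra-warp reduction, inter-warp copy) that are
  // passed to the runtime entry points below.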
2887 llvm::Function *ReductionFn = emitReductionFunction(
2888 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(T: ReductionArrayTy),
2889 Privates, LHSExprs, RHSExprs, ReductionOps);
2890 llvm::Value *ReductionDataSize =
2891 CGF.getTypeSize(Ty: C.getRecordType(Decl: ReductionRec));
2892 ReductionDataSize =
2893 CGF.Builder.CreateSExtOrTrunc(V: ReductionDataSize, DestTy: CGF.Int64Ty);
2894 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2895 CGM, Privates, ReductionArrayTy, ReduceFn: ReductionFn, Loc);
2896 llvm::Value *InterWarpCopyFn =
2897 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2898
2899 if (ParallelReduction) {
2900 llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2901 InterWarpCopyFn};
2902
2903 Res = CGF.EmitRuntimeCall(
2904 callee: OMPBuilder.getOrCreateRuntimeFunction(
2905 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2906 args: Args);
2907 } else {
2908 assert(TeamsReduction && "expected teams reduction.");
2909 TeamsReductions.push_back(Elt: ReductionRec);
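    // Teams reduction goes through a fixed-size global buffer provided by
    // the runtime; emit the four helpers that copy and reduce between the
    // private reduction list and that buffer (see the Inter-Team Reduction
    // notes above).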
2910 auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2911 callee: OMPBuilder.getOrCreateRuntimeFunction(
2912 M&: CGM.getModule(), FnID: OMPRTL___kmpc_reduction_get_fixed_buffer),
2913 args: {}, name: "_openmp_teams_reductions_buffer_$_$ptr");
2914 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2915 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2916 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2917 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2918 ReduceFn: ReductionFn);
2919 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2920 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2921 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2922 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2923 ReduceFn: ReductionFn);
2924
2925 llvm::Value *Args[] = {
2926 RTLoc,
2927 KernelTeamsReductionPtr,
2928 CGF.Builder.getInt32(C: C.getLangOpts().OpenMPCUDAReductionBufNum),
2929 ReductionDataSize,
2930 RL,
2931 ShuffleAndReduceFn,
2932 InterWarpCopyFn,
2933 GlobalToBufferCpyFn,
2934 GlobalToBufferRedFn,
2935 BufferToGlobalCpyFn,
2936 BufferToGlobalRedFn};
2937
2938 Res = CGF.EmitRuntimeCall(
2939 callee: OMPBuilder.getOrCreateRuntimeFunction(
2940 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2941 args: Args);
2942 }
2943
2944 // 5. Build if (res == 1)
2945 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".omp.reduction.done");
2946 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".omp.reduction.then");
2947 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2948 LHS: Res, RHS: llvm::ConstantInt::get(Ty: CGM.Int32Ty, /*V=*/1));
2949 CGF.Builder.CreateCondBr(Cond, True: ThenBB, False: ExitBB);
2950
2951 // 6. Build then branch: where we have reduced values in the master
2952 // thread in each team.
2953  //    <LHSExprs> = <LHSExprs> REDUCE_OP <RHSExprs>;
2955 CGF.EmitBlock(BB: ThenBB);
2956
2957  // Combine the privately reduced values into the original reduction variables.
2958 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2959 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2960 auto IPriv = Privates.begin();
2961 auto ILHS = LHSExprs.begin();
2962 auto IRHS = RHSExprs.begin();
2963 for (const Expr *E : ReductionOps) {
2964 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(Val: *ILHS),
2965 cast<DeclRefExpr>(Val: *IRHS));
2966 ++IPriv;
2967 ++ILHS;
2968 ++IRHS;
2969 }
2970 };
2971 RegionCodeGenTy RCG(CodeGen);
2972 RCG(CGF);
2973 // There is no need to emit line number for unconditional branch.
2974 (void)ApplyDebugLocation::CreateEmpty(CGF);
2975 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
2976}
2977
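/// Rewrite a by-reference native parameter as the parameter declaration the
/// GPU outlined function expects: a restrict-qualified pointer to the pointee
/// type (qualified with the global address space for map captures).
/// Non-reference parameters are returned unchanged.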
2978const VarDecl *
2979CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2980 const VarDecl *NativeParam) const {
2981 if (!NativeParam->getType()->isReferenceType())
2982 return NativeParam;
2983 QualType ArgType = NativeParam->getType();
2984 QualifierCollector QC;
2985 const Type *NonQualTy = QC.strip(type: ArgType);
2986 QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
2987 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2988 if (Attr->getCaptureKind() == OMPC_map) {
2989 PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
2990 AddressSpace: LangAS::opencl_global);
2991 }
2992 }
2993 ArgType = CGM.getContext().getPointerType(T: PointeeTy);
2994 QC.addRestrict();
2995 enum { NVPTX_local_addr = 5 };
2996 QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
2997 ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
2998 if (isa<ImplicitParamDecl>(Val: NativeParam))
2999 return ImplicitParamDecl::Create(
3000 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3001 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
3002 return ParmVarDecl::Create(
3003 C&: CGM.getContext(),
3004 DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
3005 StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
3006 Id: NativeParam->getIdentifier(), T: ArgType,
3007 /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
3008}
3009
3010Address
3011CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3012 const VarDecl *NativeParam,
3013 const VarDecl *TargetParam) const {
3014 assert(NativeParam != TargetParam &&
3015 NativeParam->getType()->isReferenceType() &&
3016 "Native arg must not be the same as target arg.");
3017 Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
3018 QualType NativeParamType = NativeParam->getType();
3019 QualifierCollector QC;
3020 const Type *NonQualTy = QC.strip(type: NativeParamType);
3021 QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
3022 unsigned NativePointeeAddrSpace =
3023 CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
3024 QualType TargetTy = TargetParam->getType();
3025 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
3026 Ty: TargetTy, Loc: SourceLocation());
3027 // Cast to native address space.
3028 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3029 V: TargetAddr,
3030 DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
3031 Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
3032 CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
3033 Ty: NativeParamType);
3034 return NativeParamAddr;
3035}
3036
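/// Emit a call to an outlined function, inserting address-space casts so that
/// pointer arguments match the callee's parameter types; for variadic callees
/// the trailing arguments are forwarded unchanged.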
3037void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3038 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3039 ArrayRef<llvm::Value *> Args) const {
3040 SmallVector<llvm::Value *, 4> TargetArgs;
3041 TargetArgs.reserve(N: Args.size());
3042 auto *FnType = OutlinedFn.getFunctionType();
3043 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3044 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3045 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
3046 break;
3047 }
3048 llvm::Type *TargetType = FnType->getParamType(i: I);
3049 llvm::Value *NativeArg = Args[I];
3050 if (!TargetType->isPointerTy()) {
3051 TargetArgs.emplace_back(Args&: NativeArg);
3052 continue;
3053 }
3054 TargetArgs.emplace_back(
3055 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
3056 }
3057 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3058}
3059
3060/// Emit a function that wraps the outlined parallel region
3061/// and controls the arguments which are passed to this function.
3062/// The wrapper ensures that the outlined function is called
3063/// with the correct arguments when data is shared.
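///
/// Conceptually, the generated wrapper looks like the following (illustrative
/// pseudocode, not literal IR; the names are made up):
///
///   void outlined_parallel_fn_wrapper(uint16_t parallel_level, uint32_t tid) {
///     uint32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     // Loads of loop bounds and captured variables from shared_args[i]
///     // are emitted here, then:
///     outlined_parallel_fn(&tid, &zero, /* loaded shared arguments */);
///   }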
3064llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3065 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3066 ASTContext &Ctx = CGM.getContext();
3067 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3068
3069 // Create a function that takes as argument the source thread.
3070 FunctionArgList WrapperArgs;
3071 QualType Int16QTy =
3072 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3073 QualType Int32QTy =
3074 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3075 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3076 /*Id=*/nullptr, Int16QTy,
3077 ImplicitParamKind::Other);
3078 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3079 /*Id=*/nullptr, Int32QTy,
3080 ImplicitParamKind::Other);
3081 WrapperArgs.emplace_back(Args: &ParallelLevelArg);
3082 WrapperArgs.emplace_back(Args: &WrapperArg);
3083
3084 const CGFunctionInfo &CGFI =
3085 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3086
3087 auto *Fn = llvm::Function::Create(
3088 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
3089 N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
3090
3091 // Ensure we do not inline the function. This is trivially true for the ones
3092  // passed to __kmpc_fork_call but the ones called in serialized regions
3093  // could be inlined. This is not perfect but it is closer to the invariant
3094 // we want, namely, every data environment starts with a new function.
3095 // TODO: We should pass the if condition to the runtime function and do the
3096 // handling there. Much cleaner code.
3097 Fn->addFnAttr(llvm::Attribute::NoInline);
3098
3099 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
3100 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3101 Fn->setDoesNotRecurse();
3102
3103 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3104 CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn: Fn, FnInfo: CGFI, Args: WrapperArgs,
3105 Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
3106
3107 const auto *RD = CS.getCapturedRecordDecl();
3108 auto CurField = RD->field_begin();
3109
3110 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
3111 /*Name=*/".zero.addr");
3112 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
3113 // Get the array of arguments.
3114 SmallVector<llvm::Value *, 8> Args;
3115
3116 Args.emplace_back(Args: CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
3117 Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF));
3118
3119 CGBuilderTy &Bld = CGF.Builder;
3120 auto CI = CS.capture_begin();
3121
3122 // Use global memory for data sharing.
3123 // Handle passing of global args to workers.
3124 RawAddress GlobalArgs =
3125 CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
3126 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3127 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3128 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3129 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
3130 args: DataSharingArgs);
3131
3132 // Retrieve the shared variables from the list of references returned
3133 // by the runtime. Pass the variables to the outlined function.
3134 Address SharedArgListAddress = Address::invalid();
3135 if (CS.capture_size() > 0 ||
3136 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137 SharedArgListAddress = CGF.EmitLoadOfPointer(
3138 Ptr: GlobalArgs, PtrTy: CGF.getContext()
3139 .getPointerType(CGF.getContext().VoidPtrTy)
3140 .castAs<PointerType>());
3141 }
3142 unsigned Idx = 0;
3143 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3144 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3145 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3146 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3147 llvm::Value *LB = CGF.EmitLoadOfScalar(
3148 Addr: TypedAddress,
3149 /*Volatile=*/false,
3150 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3151 Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
3152 Args.emplace_back(Args&: LB);
3153 ++Idx;
3154 Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3155 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3156 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3157 llvm::Value *UB = CGF.EmitLoadOfScalar(
3158 Addr: TypedAddress,
3159 /*Volatile=*/false,
3160 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3161 Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
3162 Args.emplace_back(Args&: UB);
3163 ++Idx;
3164 }
3165 if (CS.capture_size() > 0) {
3166 ASTContext &CGFContext = CGF.getContext();
3167 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3168 QualType ElemTy = CurField->getType();
3169 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
3170 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3171 Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
3172 ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
3173 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3174 /*Volatile=*/false,
3175 CGFContext.getPointerType(T: ElemTy),
3176 CI->getLocation());
3177 if (CI->capturesVariableByCopy() &&
3178 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3179 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3180 CI->getLocation());
3181 }
3182 Args.emplace_back(Args&: Arg);
3183 }
3184 }
3185
3186 emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
3187 CGF.FinishFunction();
3188 return Fn;
3189}
3190
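/// In generic data-sharing mode, scan the function body for local variables
/// that escape into parallel regions, record them for globalization and,
/// unless globalization must be delayed to the outlined region, emit the
/// globalization prolog and push a cleanup that emits the matching epilog.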
3191void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3192 const Decl *D) {
3193 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3194 return;
3195
3196 assert(D && "Expected function or captured|block decl.");
3197 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3198 "Function is registered already.");
3199 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3200 "Team is set but not processed.");
3201 const Stmt *Body = nullptr;
3202 bool NeedToDelayGlobalization = false;
3203 if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
3204 Body = FD->getBody();
3205 } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
3206 Body = BD->getBody();
3207 } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
3208 Body = CD->getBody();
3209 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3210 if (NeedToDelayGlobalization &&
3211 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3212 return;
3213 }
3214 if (!Body)
3215 return;
3216 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3217 VarChecker.Visit(Body);
3218 const RecordDecl *GlobalizedVarsRecord =
3219 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3220 TeamAndReductions.first = nullptr;
3221 TeamAndReductions.second.clear();
3222 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3223 VarChecker.getEscapedVariableLengthDecls();
3224 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3225 VarChecker.getDelayedVariableLengthDecls();
3226 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3227 DelayedVariableLengthDecls.empty())
3228 return;
3229 auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
3230 I->getSecond().MappedParams =
3231 std::make_unique<CodeGenFunction::OMPMapVars>();
3232 I->getSecond().EscapedParameters.insert(
3233 I: VarChecker.getEscapedParameters().begin(),
3234 E: VarChecker.getEscapedParameters().end());
3235 I->getSecond().EscapedVariableLengthDecls.append(
3236 in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
3237 I->getSecond().DelayedVariableLengthDecls.append(
3238 in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
3239 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3240 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3241 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3242 Data.insert(std::make_pair(x&: VD, y: MappedVarData()));
3243 }
3244 if (!NeedToDelayGlobalization) {
3245 emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
3246 struct GlobalizationScope final : EHScopeStack::Cleanup {
3247 GlobalizationScope() = default;
3248
3249 void Emit(CodeGenFunction &CGF, Flags flags) override {
3250 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3251 .emitGenericVarsEpilog(CGF);
3252 }
3253 };
3254 CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
3255 }
3256}
3257
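/// Return a specialized address for a local variable: variables with an
/// 'omp allocate' allocator are materialized as internal globals in the
/// corresponding memory space, and escaped locals in generic data-sharing
/// mode resolve to their globalized addresses; otherwise an invalid address
/// is returned and default local allocation is used.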
3258Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3259 const VarDecl *VD) {
3260 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3261 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3262 auto AS = LangAS::Default;
3263 switch (A->getAllocatorType()) {
3264 // Use the default allocator here as by default local vars are
3265 // threadlocal.
3266 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3267 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3268 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3269 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3270 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3271 // Follow the user decision - use default allocation.
3272 return Address::invalid();
3273 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3274    // TODO: implement support for user-defined allocators.
3275 return Address::invalid();
3276 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3277 AS = LangAS::cuda_constant;
3278 break;
3279 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3280 AS = LangAS::cuda_shared;
3281 break;
3282 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3283 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3284 break;
3285 }
3286 llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
3287 auto *GV = new llvm::GlobalVariable(
3288 CGM.getModule(), VarTy, /*isConstant=*/false,
3289 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
3290 VD->getName(),
3291 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3292 CGM.getContext().getTargetAddressSpace(AS));
3293 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3294 GV->setAlignment(Align.getAsAlign());
3295 return Address(
3296 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3297 GV, VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace(
3298 AS: VD->getType().getAddressSpace()))),
3299 VarTy, Align);
3300 }
3301
3302 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3303 return Address::invalid();
3304
3305 VD = VD->getCanonicalDecl();
3306 auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
3307 if (I == FunctionGlobalizedDecls.end())
3308 return Address::invalid();
3309 auto VDI = I->getSecond().LocalVarData.find(VD);
3310 if (VDI != I->getSecond().LocalVarData.end())
3311 return VDI->second.PrivateAddr;
3312 if (VD->hasAttrs()) {
3313 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3314 E(VD->attr_end());
3315 IT != E; ++IT) {
3316 auto VDI = I->getSecond().LocalVarData.find(
3317 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3318 ->getCanonicalDecl());
3319 if (VDI != I->getSecond().LocalVarData.end())
3320 return VDI->second.PrivateAddr;
3321 }
3322 }
3323
3324 return Address::invalid();
3325}
3326
3327void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3328 FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
3329 CGOpenMPRuntime::functionFinished(CGF);
3330}
3331
3332void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3333 CodeGenFunction &CGF, const OMPLoopDirective &S,
3334 OpenMPDistScheduleClauseKind &ScheduleKind,
3335 llvm::Value *&Chunk) const {
3336 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3337 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3338 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3339 Chunk = CGF.EmitScalarConversion(
3340 Src: RT.getGPUNumThreads(CGF),
3341 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3342 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
3343 return;
3344 }
3345 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3346 CGF, S, ScheduleKind, Chunk);
3347}
3348
3349void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3350 CodeGenFunction &CGF, const OMPLoopDirective &S,
3351 OpenMPScheduleClauseKind &ScheduleKind,
3352 const Expr *&ChunkExpr) const {
3353 ScheduleKind = OMPC_SCHEDULE_static;
3354 // Chunk size is 1 in this case.
3355 llvm::APInt ChunkSize(32, 1);
3356 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
3357 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3358 l: SourceLocation());
3359}
3360
3361void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3362 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3363 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3364 " Expected target-based directive.");
3365 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3366 for (const CapturedStmt::Capture &C : CS->captures()) {
3367 // Capture variables captured by reference in lambdas for target-based
3368 // directives.
3369 if (!C.capturesVariable())
3370 continue;
3371 const VarDecl *VD = C.getCapturedVar();
3372 const auto *RD = VD->getType()
3373 .getCanonicalType()
3374 .getNonReferenceType()
3375 ->getAsCXXRecordDecl();
3376 if (!RD || !RD->isLambda())
3377 continue;
3378 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3379 LValue VDLVal;
3380 if (VD->getType().getCanonicalType()->isReferenceType())
3381 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3382 else
3383 VDLVal = CGF.MakeAddrLValue(
3384 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3385 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3386 FieldDecl *ThisCapture = nullptr;
3387 RD->getCaptureFields(Captures, ThisCapture);
3388 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3389 LValue ThisLVal =
3390 CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
3391 llvm::Value *CXXThis = CGF.LoadCXXThis();
3392 CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
3393 }
3394 for (const LambdaCapture &LC : RD->captures()) {
3395 if (LC.getCaptureKind() != LCK_ByRef)
3396 continue;
3397 const ValueDecl *VD = LC.getCapturedVar();
3398 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3399 // capturing structured bindings in lambdas yet.
3400 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3401 continue;
3402 auto It = Captures.find(VD);
3403 assert(It != Captures.end() && "Found lambda capture without field.");
3404 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3405 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3406 if (VD->getType().getCanonicalType()->isReferenceType())
3407 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3408 VD->getType().getCanonicalType())
3409 .getAddress(CGF);
3410 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
3411 }
3412 }
3413}
3414
3415bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3416 LangAS &AS) {
3417 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3418 return false;
3419 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3420 switch(A->getAllocatorType()) {
3421 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3422 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3423 // Not supported, fallback to the default mem space.
3424 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3425 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3426 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3427 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3428 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3429 AS = LangAS::Default;
3430 return true;
3431 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3432 AS = LangAS::cuda_constant;
3433 return true;
3434 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3435 AS = LangAS::cuda_shared;
3436 return true;
3437 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3438 llvm_unreachable("Expected predefined allocator for the variables with the "
3439 "static storage.");
3440 }
3441 return false;
3442}
3443
3444// Get current CudaArch and ignore any unknown values
3445static CudaArch getCudaArch(CodeGenModule &CGM) {
3446 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
3447 return CudaArch::UNKNOWN;
3448 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3449 if (Feature.getValue()) {
3450 CudaArch Arch = StringToCudaArch(S: Feature.getKey());
3451 if (Arch != CudaArch::UNKNOWN)
3452 return Arch;
3453 }
3454 }
3455 return CudaArch::UNKNOWN;
3456}
3457
3458/// Check whether the target architecture supports unified addressing, which
3459/// is required by the OpenMP requires clause "unified_shared_memory".
3460void CGOpenMPRuntimeGPU::processRequiresDirective(
3461 const OMPRequiresDecl *D) {
3462 for (const OMPClause *Clause : D->clauselists()) {
3463 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3464 CudaArch Arch = getCudaArch(CGM);
3465 switch (Arch) {
3466 case CudaArch::SM_20:
3467 case CudaArch::SM_21:
3468 case CudaArch::SM_30:
3469 case CudaArch::SM_32_:
3470 case CudaArch::SM_35:
3471 case CudaArch::SM_37:
3472 case CudaArch::SM_50:
3473 case CudaArch::SM_52:
3474 case CudaArch::SM_53: {
3475 SmallString<256> Buffer;
3476 llvm::raw_svector_ostream Out(Buffer);
3477 Out << "Target architecture " << CudaArchToString(A: Arch)
3478 << " does not support unified addressing";
3479 CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
3480 return;
3481 }
3482 case CudaArch::SM_60:
3483 case CudaArch::SM_61:
3484 case CudaArch::SM_62:
3485 case CudaArch::SM_70:
3486 case CudaArch::SM_72:
3487 case CudaArch::SM_75:
3488 case CudaArch::SM_80:
3489 case CudaArch::SM_86:
3490 case CudaArch::SM_87:
3491 case CudaArch::SM_89:
3492 case CudaArch::SM_90:
3493 case CudaArch::SM_90a:
3494 case CudaArch::GFX600:
3495 case CudaArch::GFX601:
3496 case CudaArch::GFX602:
3497 case CudaArch::GFX700:
3498 case CudaArch::GFX701:
3499 case CudaArch::GFX702:
3500 case CudaArch::GFX703:
3501 case CudaArch::GFX704:
3502 case CudaArch::GFX705:
3503 case CudaArch::GFX801:
3504 case CudaArch::GFX802:
3505 case CudaArch::GFX803:
3506 case CudaArch::GFX805:
3507 case CudaArch::GFX810:
3508 case CudaArch::GFX900:
3509 case CudaArch::GFX902:
3510 case CudaArch::GFX904:
3511 case CudaArch::GFX906:
3512 case CudaArch::GFX908:
3513 case CudaArch::GFX909:
3514 case CudaArch::GFX90a:
3515 case CudaArch::GFX90c:
3516 case CudaArch::GFX940:
3517 case CudaArch::GFX941:
3518 case CudaArch::GFX942:
3519 case CudaArch::GFX1010:
3520 case CudaArch::GFX1011:
3521 case CudaArch::GFX1012:
3522 case CudaArch::GFX1013:
3523 case CudaArch::GFX1030:
3524 case CudaArch::GFX1031:
3525 case CudaArch::GFX1032:
3526 case CudaArch::GFX1033:
3527 case CudaArch::GFX1034:
3528 case CudaArch::GFX1035:
3529 case CudaArch::GFX1036:
3530 case CudaArch::GFX1100:
3531 case CudaArch::GFX1101:
3532 case CudaArch::GFX1102:
3533 case CudaArch::GFX1103:
3534 case CudaArch::GFX1150:
3535 case CudaArch::GFX1151:
3536 case CudaArch::GFX1200:
3537 case CudaArch::GFX1201:
3538 case CudaArch::Generic:
3539 case CudaArch::UNUSED:
3540 case CudaArch::UNKNOWN:
3541 break;
3542 case CudaArch::LAST:
3543 llvm_unreachable("Unexpected Cuda arch.");
3544 }
3545 }
3546 }
3547 CGOpenMPRuntime::processRequiresDirective(D);
3548}
3549
3550llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3551 CGBuilderTy &Bld = CGF.Builder;
3552 llvm::Module *M = &CGF.CGM.getModule();
3553 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3554 llvm::Function *F = M->getFunction(Name: LocSize);
3555 if (!F) {
3556 F = llvm::Function::Create(
3557 Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false),
3558 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule());
3559 }
3560 return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads");
3561}
3562
3563llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3564 ArrayRef<llvm::Value *> Args{};
3565 return CGF.EmitRuntimeCall(
3566 callee: OMPBuilder.getOrCreateRuntimeFunction(
3567 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
3568 args: Args);
3569}
3570
3571llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3572 ArrayRef<llvm::Value *> Args{};
3573 return CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3574 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_warp_size),
3575 args: Args);
3576}
3577
