1 | //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This provides a generalized class for OpenMP runtime code generation |
10 | // specialized by GPU targets NVPTX and AMDGCN. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "CGOpenMPRuntimeGPU.h" |
15 | #include "CodeGenFunction.h" |
16 | #include "clang/AST/Attr.h" |
17 | #include "clang/AST/DeclOpenMP.h" |
18 | #include "clang/AST/OpenMPClause.h" |
19 | #include "clang/AST/StmtOpenMP.h" |
20 | #include "clang/AST/StmtVisitor.h" |
21 | #include "clang/Basic/Cuda.h" |
22 | #include "llvm/ADT/SmallPtrSet.h" |
23 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
24 | #include "llvm/Support/MathExtras.h" |
25 | |
26 | using namespace clang; |
27 | using namespace CodeGen; |
28 | using namespace llvm::omp; |
29 | |
30 | namespace { |
31 | /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
32 | class NVPTXActionTy final : public PrePostActionTy { |
33 | llvm::FunctionCallee EnterCallee = nullptr; |
34 | ArrayRef<llvm::Value *> EnterArgs; |
35 | llvm::FunctionCallee ExitCallee = nullptr; |
36 | ArrayRef<llvm::Value *> ExitArgs; |
37 | bool Conditional = false; |
38 | llvm::BasicBlock *ContBlock = nullptr; |
39 | |
40 | public: |
41 | NVPTXActionTy(llvm::FunctionCallee EnterCallee, |
42 | ArrayRef<llvm::Value *> EnterArgs, |
43 | llvm::FunctionCallee ExitCallee, |
44 | ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) |
45 | : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
46 | ExitArgs(ExitArgs), Conditional(Conditional) {} |
47 | void Enter(CodeGenFunction &CGF) override { |
48 | llvm::Value *EnterRes = CGF.EmitRuntimeCall(callee: EnterCallee, args: EnterArgs); |
49 | if (Conditional) { |
50 | llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(Arg: EnterRes); |
51 | auto *ThenBlock = CGF.createBasicBlock(name: "omp_if.then" ); |
52 | ContBlock = CGF.createBasicBlock(name: "omp_if.end" ); |
53 | // Generate the branch (If-stmt) |
54 | CGF.Builder.CreateCondBr(Cond: CallBool, True: ThenBlock, False: ContBlock); |
55 | CGF.EmitBlock(BB: ThenBlock); |
56 | } |
57 | } |
58 | void Done(CodeGenFunction &CGF) { |
59 | // Emit the rest of blocks/branches |
60 | CGF.EmitBranch(Block: ContBlock); |
61 | CGF.EmitBlock(BB: ContBlock, IsFinished: true); |
62 | } |
63 | void Exit(CodeGenFunction &CGF) override { |
64 | CGF.EmitRuntimeCall(callee: ExitCallee, args: ExitArgs); |
65 | } |
66 | }; |
67 | |
68 | /// A class to track the execution mode when codegening directives within |
69 | /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry |
70 | /// to the target region and used by containing directives such as 'parallel' |
71 | /// to emit optimized code. |
72 | class ExecutionRuntimeModesRAII { |
73 | private: |
74 | CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = |
75 | CGOpenMPRuntimeGPU::EM_Unknown; |
76 | CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; |
77 | |
78 | public: |
79 | ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, |
80 | CGOpenMPRuntimeGPU::ExecutionMode EntryMode) |
81 | : ExecMode(ExecMode) { |
82 | SavedExecMode = ExecMode; |
83 | ExecMode = EntryMode; |
84 | } |
85 | ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; } |
86 | }; |
87 | |
88 | static const ValueDecl *getPrivateItem(const Expr *RefExpr) { |
89 | RefExpr = RefExpr->IgnoreParens(); |
90 | if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(Val: RefExpr)) { |
91 | const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); |
92 | while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base)) |
93 | Base = TempASE->getBase()->IgnoreParenImpCasts(); |
94 | RefExpr = Base; |
95 | } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(Val: RefExpr)) { |
96 | const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); |
97 | while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Val: Base)) |
98 | Base = TempOASE->getBase()->IgnoreParenImpCasts(); |
99 | while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Val: Base)) |
100 | Base = TempASE->getBase()->IgnoreParenImpCasts(); |
101 | RefExpr = Base; |
102 | } |
103 | RefExpr = RefExpr->IgnoreParenImpCasts(); |
104 | if (const auto *DE = dyn_cast<DeclRefExpr>(Val: RefExpr)) |
105 | return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); |
106 | const auto *ME = cast<MemberExpr>(Val: RefExpr); |
107 | return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); |
108 | } |
109 | |
110 | static RecordDecl *buildRecordForGlobalizedVars( |
111 | ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, |
112 | ArrayRef<const ValueDecl *> EscapedDeclsForTeams, |
113 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
114 | &MappedDeclsFields, |
115 | int BufSize) { |
116 | using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; |
117 | if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) |
118 | return nullptr; |
119 | SmallVector<VarsDataTy, 4> GlobalizedVars; |
120 | for (const ValueDecl *D : EscapedDecls) |
121 | GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D); |
122 | for (const ValueDecl *D : EscapedDeclsForTeams) |
123 | GlobalizedVars.emplace_back(Args: C.getDeclAlign(D), Args&: D); |
124 | |
125 | // Build struct _globalized_locals_ty { |
126 | // /* globalized vars */[WarSize] align (decl_align) |
127 | // /* globalized vars */ for EscapedDeclsForTeams |
128 | // }; |
129 | RecordDecl *GlobalizedRD = C.buildImplicitRecord(Name: "_globalized_locals_ty" ); |
130 | GlobalizedRD->startDefinition(); |
131 | llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped( |
132 | EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end()); |
133 | for (const auto &Pair : GlobalizedVars) { |
134 | const ValueDecl *VD = Pair.second; |
135 | QualType Type = VD->getType(); |
136 | if (Type->isLValueReferenceType()) |
137 | Type = C.getPointerType(T: Type.getNonReferenceType()); |
138 | else |
139 | Type = Type.getNonReferenceType(); |
140 | SourceLocation Loc = VD->getLocation(); |
141 | FieldDecl *Field; |
142 | if (SingleEscaped.count(Ptr: VD)) { |
143 | Field = FieldDecl::Create( |
144 | C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type, |
145 | TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()), |
146 | /*BW=*/nullptr, /*Mutable=*/false, |
147 | /*InitStyle=*/ICIS_NoInit); |
148 | Field->setAccess(AS_public); |
149 | if (VD->hasAttrs()) { |
150 | for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), |
151 | E(VD->getAttrs().end()); |
152 | I != E; ++I) |
153 | Field->addAttr(*I); |
154 | } |
155 | } else { |
156 | if (BufSize > 1) { |
157 | llvm::APInt ArraySize(32, BufSize); |
158 | Type = C.getConstantArrayType(EltTy: Type, ArySize: ArraySize, SizeExpr: nullptr, |
159 | ASM: ArraySizeModifier::Normal, IndexTypeQuals: 0); |
160 | } |
161 | Field = FieldDecl::Create( |
162 | C, DC: GlobalizedRD, StartLoc: Loc, IdLoc: Loc, Id: VD->getIdentifier(), T: Type, |
163 | TInfo: C.getTrivialTypeSourceInfo(T: Type, Loc: SourceLocation()), |
164 | /*BW=*/nullptr, /*Mutable=*/false, |
165 | /*InitStyle=*/ICIS_NoInit); |
166 | Field->setAccess(AS_public); |
167 | llvm::APInt Align(32, Pair.first.getQuantity()); |
168 | Field->addAttr(AlignedAttr::CreateImplicit( |
169 | C, /*IsAlignmentExpr=*/true, |
170 | IntegerLiteral::Create(C, Align, |
171 | C.getIntTypeForBitwidth(32, /*Signed=*/0), |
172 | SourceLocation()), |
173 | {}, AlignedAttr::GNU_aligned)); |
174 | } |
175 | GlobalizedRD->addDecl(Field); |
176 | MappedDeclsFields.try_emplace(Key: VD, Args&: Field); |
177 | } |
178 | GlobalizedRD->completeDefinition(); |
179 | return GlobalizedRD; |
180 | } |
181 | |
182 | /// Get the list of variables that can escape their declaration context. |
183 | class CheckVarsEscapingDeclContext final |
184 | : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { |
185 | CodeGenFunction &CGF; |
186 | llvm::SetVector<const ValueDecl *> EscapedDecls; |
187 | llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; |
188 | llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls; |
189 | llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; |
190 | RecordDecl *GlobalizedRD = nullptr; |
191 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
192 | bool AllEscaped = false; |
193 | bool IsForCombinedParallelRegion = false; |
194 | |
195 | void markAsEscaped(const ValueDecl *VD) { |
196 | // Do not globalize declare target variables. |
197 | if (!isa<VarDecl>(VD) || |
198 | OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) |
199 | return; |
200 | VD = cast<ValueDecl>(VD->getCanonicalDecl()); |
201 | // Use user-specified allocation. |
202 | if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) |
203 | return; |
204 | // Variables captured by value must be globalized. |
205 | bool IsCaptured = false; |
206 | if (auto *CSI = CGF.CapturedStmtInfo) { |
207 | if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) { |
208 | // Check if need to capture the variable that was already captured by |
209 | // value in the outer region. |
210 | IsCaptured = true; |
211 | if (!IsForCombinedParallelRegion) { |
212 | if (!FD->hasAttrs()) |
213 | return; |
214 | const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); |
215 | if (!Attr) |
216 | return; |
217 | if (((Attr->getCaptureKind() != OMPC_map) && |
218 | !isOpenMPPrivate(Attr->getCaptureKind())) || |
219 | ((Attr->getCaptureKind() == OMPC_map) && |
220 | !FD->getType()->isAnyPointerType())) |
221 | return; |
222 | } |
223 | if (!FD->getType()->isReferenceType()) { |
224 | assert(!VD->getType()->isVariablyModifiedType() && |
225 | "Parameter captured by value with variably modified type" ); |
226 | EscapedParameters.insert(VD); |
227 | } else if (!IsForCombinedParallelRegion) { |
228 | return; |
229 | } |
230 | } |
231 | } |
232 | if ((!CGF.CapturedStmtInfo || |
233 | (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && |
234 | VD->getType()->isReferenceType()) |
235 | // Do not globalize variables with reference type. |
236 | return; |
237 | if (VD->getType()->isVariablyModifiedType()) { |
238 | // If not captured at the target region level then mark the escaped |
239 | // variable as delayed. |
240 | if (IsCaptured) |
241 | EscapedVariableLengthDecls.insert(X: VD); |
242 | else |
243 | DelayedVariableLengthDecls.insert(X: VD); |
244 | } else |
245 | EscapedDecls.insert(X: VD); |
246 | } |
247 | |
248 | void VisitValueDecl(const ValueDecl *VD) { |
249 | if (VD->getType()->isLValueReferenceType()) |
250 | markAsEscaped(VD); |
251 | if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) { |
252 | if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) { |
253 | const bool SavedAllEscaped = AllEscaped; |
254 | AllEscaped = VD->getType()->isLValueReferenceType(); |
255 | Visit(VarD->getInit()); |
256 | AllEscaped = SavedAllEscaped; |
257 | } |
258 | } |
259 | } |
260 | void VisitOpenMPCapturedStmt(const CapturedStmt *S, |
261 | ArrayRef<OMPClause *> Clauses, |
262 | bool IsCombinedParallelRegion) { |
263 | if (!S) |
264 | return; |
265 | for (const CapturedStmt::Capture &C : S->captures()) { |
266 | if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
267 | const ValueDecl *VD = C.getCapturedVar(); |
268 | bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; |
269 | if (IsCombinedParallelRegion) { |
270 | // Check if the variable is privatized in the combined construct and |
271 | // those private copies must be shared in the inner parallel |
272 | // directive. |
273 | IsForCombinedParallelRegion = false; |
274 | for (const OMPClause *C : Clauses) { |
275 | if (!isOpenMPPrivate(C->getClauseKind()) || |
276 | C->getClauseKind() == OMPC_reduction || |
277 | C->getClauseKind() == OMPC_linear || |
278 | C->getClauseKind() == OMPC_private) |
279 | continue; |
280 | ArrayRef<const Expr *> Vars; |
281 | if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C)) |
282 | Vars = PC->getVarRefs(); |
283 | else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C)) |
284 | Vars = PC->getVarRefs(); |
285 | else |
286 | llvm_unreachable("Unexpected clause." ); |
287 | for (const auto *E : Vars) { |
288 | const Decl *D = |
289 | cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl(); |
290 | if (D == VD->getCanonicalDecl()) { |
291 | IsForCombinedParallelRegion = true; |
292 | break; |
293 | } |
294 | } |
295 | if (IsForCombinedParallelRegion) |
296 | break; |
297 | } |
298 | } |
299 | markAsEscaped(VD); |
300 | if (isa<OMPCapturedExprDecl>(Val: VD)) |
301 | VisitValueDecl(VD); |
302 | IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; |
303 | } |
304 | } |
305 | } |
306 | |
307 | void buildRecordForGlobalizedVars(bool IsInTTDRegion) { |
308 | assert(!GlobalizedRD && |
309 | "Record for globalized variables is built already." ); |
310 | ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; |
311 | unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; |
312 | if (IsInTTDRegion) |
313 | EscapedDeclsForTeams = EscapedDecls.getArrayRef(); |
314 | else |
315 | EscapedDeclsForParallel = EscapedDecls.getArrayRef(); |
316 | GlobalizedRD = ::buildRecordForGlobalizedVars( |
317 | C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams, |
318 | MappedDeclsFields, BufSize: WarpSize); |
319 | } |
320 | |
321 | public: |
322 | CheckVarsEscapingDeclContext(CodeGenFunction &CGF, |
323 | ArrayRef<const ValueDecl *> TeamsReductions) |
324 | : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { |
325 | } |
326 | virtual ~CheckVarsEscapingDeclContext() = default; |
327 | void VisitDeclStmt(const DeclStmt *S) { |
328 | if (!S) |
329 | return; |
330 | for (const Decl *D : S->decls()) |
331 | if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D)) |
332 | VisitValueDecl(VD); |
333 | } |
334 | void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { |
335 | if (!D) |
336 | return; |
337 | if (!D->hasAssociatedStmt()) |
338 | return; |
339 | if (const auto *S = |
340 | dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) { |
341 | // Do not analyze directives that do not actually require capturing, |
342 | // like `omp for` or `omp simd` directives. |
343 | llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; |
344 | getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); |
345 | if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { |
346 | VisitStmt(S: S->getCapturedStmt()); |
347 | return; |
348 | } |
349 | VisitOpenMPCapturedStmt( |
350 | S, D->clauses(), |
351 | CaptureRegions.back() == OMPD_parallel && |
352 | isOpenMPDistributeDirective(D->getDirectiveKind())); |
353 | } |
354 | } |
355 | void VisitCapturedStmt(const CapturedStmt *S) { |
356 | if (!S) |
357 | return; |
358 | for (const CapturedStmt::Capture &C : S->captures()) { |
359 | if (C.capturesVariable() && !C.capturesVariableByCopy()) { |
360 | const ValueDecl *VD = C.getCapturedVar(); |
361 | markAsEscaped(VD); |
362 | if (isa<OMPCapturedExprDecl>(Val: VD)) |
363 | VisitValueDecl(VD); |
364 | } |
365 | } |
366 | } |
367 | void VisitLambdaExpr(const LambdaExpr *E) { |
368 | if (!E) |
369 | return; |
370 | for (const LambdaCapture &C : E->captures()) { |
371 | if (C.capturesVariable()) { |
372 | if (C.getCaptureKind() == LCK_ByRef) { |
373 | const ValueDecl *VD = C.getCapturedVar(); |
374 | markAsEscaped(VD); |
375 | if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD)) |
376 | VisitValueDecl(VD); |
377 | } |
378 | } |
379 | } |
380 | } |
381 | void VisitBlockExpr(const BlockExpr *E) { |
382 | if (!E) |
383 | return; |
384 | for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { |
385 | if (C.isByRef()) { |
386 | const VarDecl *VD = C.getVariable(); |
387 | markAsEscaped(VD); |
388 | if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture()) |
389 | VisitValueDecl(VD); |
390 | } |
391 | } |
392 | } |
393 | void VisitCallExpr(const CallExpr *E) { |
394 | if (!E) |
395 | return; |
396 | for (const Expr *Arg : E->arguments()) { |
397 | if (!Arg) |
398 | continue; |
399 | if (Arg->isLValue()) { |
400 | const bool SavedAllEscaped = AllEscaped; |
401 | AllEscaped = true; |
402 | Visit(Arg); |
403 | AllEscaped = SavedAllEscaped; |
404 | } else { |
405 | Visit(Arg); |
406 | } |
407 | } |
408 | Visit(E->getCallee()); |
409 | } |
410 | void VisitDeclRefExpr(const DeclRefExpr *E) { |
411 | if (!E) |
412 | return; |
413 | const ValueDecl *VD = E->getDecl(); |
414 | if (AllEscaped) |
415 | markAsEscaped(VD); |
416 | if (isa<OMPCapturedExprDecl>(Val: VD)) |
417 | VisitValueDecl(VD); |
418 | else if (VD->isInitCapture()) |
419 | VisitValueDecl(VD); |
420 | } |
421 | void VisitUnaryOperator(const UnaryOperator *E) { |
422 | if (!E) |
423 | return; |
424 | if (E->getOpcode() == UO_AddrOf) { |
425 | const bool SavedAllEscaped = AllEscaped; |
426 | AllEscaped = true; |
427 | Visit(E->getSubExpr()); |
428 | AllEscaped = SavedAllEscaped; |
429 | } else { |
430 | Visit(E->getSubExpr()); |
431 | } |
432 | } |
433 | void VisitImplicitCastExpr(const ImplicitCastExpr *E) { |
434 | if (!E) |
435 | return; |
436 | if (E->getCastKind() == CK_ArrayToPointerDecay) { |
437 | const bool SavedAllEscaped = AllEscaped; |
438 | AllEscaped = true; |
439 | Visit(E->getSubExpr()); |
440 | AllEscaped = SavedAllEscaped; |
441 | } else { |
442 | Visit(E->getSubExpr()); |
443 | } |
444 | } |
445 | void VisitExpr(const Expr *E) { |
446 | if (!E) |
447 | return; |
448 | bool SavedAllEscaped = AllEscaped; |
449 | if (!E->isLValue()) |
450 | AllEscaped = false; |
451 | for (const Stmt *Child : E->children()) |
452 | if (Child) |
453 | Visit(Child); |
454 | AllEscaped = SavedAllEscaped; |
455 | } |
456 | void VisitStmt(const Stmt *S) { |
457 | if (!S) |
458 | return; |
459 | for (const Stmt *Child : S->children()) |
460 | if (Child) |
461 | Visit(Child); |
462 | } |
463 | |
464 | /// Returns the record that handles all the escaped local variables and used |
465 | /// instead of their original storage. |
466 | const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { |
467 | if (!GlobalizedRD) |
468 | buildRecordForGlobalizedVars(IsInTTDRegion); |
469 | return GlobalizedRD; |
470 | } |
471 | |
472 | /// Returns the field in the globalized record for the escaped variable. |
473 | const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { |
474 | assert(GlobalizedRD && |
475 | "Record for globalized variables must be generated already." ); |
476 | return MappedDeclsFields.lookup(Val: VD); |
477 | } |
478 | |
479 | /// Returns the list of the escaped local variables/parameters. |
480 | ArrayRef<const ValueDecl *> getEscapedDecls() const { |
481 | return EscapedDecls.getArrayRef(); |
482 | } |
483 | |
484 | /// Checks if the escaped local variable is actually a parameter passed by |
485 | /// value. |
486 | const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { |
487 | return EscapedParameters; |
488 | } |
489 | |
490 | /// Returns the list of the escaped variables with the variably modified |
491 | /// types. |
492 | ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { |
493 | return EscapedVariableLengthDecls.getArrayRef(); |
494 | } |
495 | |
496 | /// Returns the list of the delayed variables with the variably modified |
497 | /// types. |
498 | ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const { |
499 | return DelayedVariableLengthDecls.getArrayRef(); |
500 | } |
501 | }; |
502 | } // anonymous namespace |
503 | |
504 | /// Get the id of the warp in the block. |
505 | /// We assume that the warp size is 32, which is always the case |
506 | /// on the NVPTX device, to generate more efficient code. |
507 | static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { |
508 | CGBuilderTy &Bld = CGF.Builder; |
509 | unsigned LaneIDBits = |
510 | llvm::Log2_32(Value: CGF.getTarget().getGridValue().GV_Warp_Size); |
511 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
512 | return Bld.CreateAShr(LHS: RT.getGPUThreadID(CGF), RHS: LaneIDBits, Name: "nvptx_warp_id" ); |
513 | } |
514 | |
515 | /// Get the id of the current lane in the Warp. |
516 | /// We assume that the warp size is 32, which is always the case |
517 | /// on the NVPTX device, to generate more efficient code. |
518 | static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { |
519 | CGBuilderTy &Bld = CGF.Builder; |
520 | unsigned LaneIDBits = |
521 | llvm::Log2_32(Value: CGF.getTarget().getGridValue().GV_Warp_Size); |
522 | assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device." ); |
523 | unsigned LaneIDMask = ~0u >> (32u - LaneIDBits); |
524 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
525 | return Bld.CreateAnd(LHS: RT.getGPUThreadID(CGF), RHS: Bld.getInt32(C: LaneIDMask), |
526 | Name: "nvptx_lane_id" ); |
527 | } |
528 | |
529 | CGOpenMPRuntimeGPU::ExecutionMode |
530 | CGOpenMPRuntimeGPU::getExecutionMode() const { |
531 | return CurrentExecutionMode; |
532 | } |
533 | |
534 | CGOpenMPRuntimeGPU::DataSharingMode |
535 | CGOpenMPRuntimeGPU::getDataSharingMode() const { |
536 | return CurrentDataSharingMode; |
537 | } |
538 | |
539 | /// Check for inner (nested) SPMD construct, if any |
540 | static bool hasNestedSPMDDirective(ASTContext &Ctx, |
541 | const OMPExecutableDirective &D) { |
542 | const auto *CS = D.getInnermostCapturedStmt(); |
543 | const auto *Body = |
544 | CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); |
545 | const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
546 | |
547 | if (const auto *NestedDir = |
548 | dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) { |
549 | OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); |
550 | switch (D.getDirectiveKind()) { |
551 | case OMPD_target: |
552 | if (isOpenMPParallelDirective(DKind)) |
553 | return true; |
554 | if (DKind == OMPD_teams) { |
555 | Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( |
556 | /*IgnoreCaptured=*/true); |
557 | if (!Body) |
558 | return false; |
559 | ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); |
560 | if (const auto *NND = |
561 | dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) { |
562 | DKind = NND->getDirectiveKind(); |
563 | if (isOpenMPParallelDirective(DKind)) |
564 | return true; |
565 | } |
566 | } |
567 | return false; |
568 | case OMPD_target_teams: |
569 | return isOpenMPParallelDirective(DKind); |
570 | case OMPD_target_simd: |
571 | case OMPD_target_parallel: |
572 | case OMPD_target_parallel_for: |
573 | case OMPD_target_parallel_for_simd: |
574 | case OMPD_target_teams_distribute: |
575 | case OMPD_target_teams_distribute_simd: |
576 | case OMPD_target_teams_distribute_parallel_for: |
577 | case OMPD_target_teams_distribute_parallel_for_simd: |
578 | case OMPD_parallel: |
579 | case OMPD_for: |
580 | case OMPD_parallel_for: |
581 | case OMPD_parallel_master: |
582 | case OMPD_parallel_sections: |
583 | case OMPD_for_simd: |
584 | case OMPD_parallel_for_simd: |
585 | case OMPD_cancel: |
586 | case OMPD_cancellation_point: |
587 | case OMPD_ordered: |
588 | case OMPD_threadprivate: |
589 | case OMPD_allocate: |
590 | case OMPD_task: |
591 | case OMPD_simd: |
592 | case OMPD_sections: |
593 | case OMPD_section: |
594 | case OMPD_single: |
595 | case OMPD_master: |
596 | case OMPD_critical: |
597 | case OMPD_taskyield: |
598 | case OMPD_barrier: |
599 | case OMPD_taskwait: |
600 | case OMPD_taskgroup: |
601 | case OMPD_atomic: |
602 | case OMPD_flush: |
603 | case OMPD_depobj: |
604 | case OMPD_scan: |
605 | case OMPD_teams: |
606 | case OMPD_target_data: |
607 | case OMPD_target_exit_data: |
608 | case OMPD_target_enter_data: |
609 | case OMPD_distribute: |
610 | case OMPD_distribute_simd: |
611 | case OMPD_distribute_parallel_for: |
612 | case OMPD_distribute_parallel_for_simd: |
613 | case OMPD_teams_distribute: |
614 | case OMPD_teams_distribute_simd: |
615 | case OMPD_teams_distribute_parallel_for: |
616 | case OMPD_teams_distribute_parallel_for_simd: |
617 | case OMPD_target_update: |
618 | case OMPD_declare_simd: |
619 | case OMPD_declare_variant: |
620 | case OMPD_begin_declare_variant: |
621 | case OMPD_end_declare_variant: |
622 | case OMPD_declare_target: |
623 | case OMPD_end_declare_target: |
624 | case OMPD_declare_reduction: |
625 | case OMPD_declare_mapper: |
626 | case OMPD_taskloop: |
627 | case OMPD_taskloop_simd: |
628 | case OMPD_master_taskloop: |
629 | case OMPD_master_taskloop_simd: |
630 | case OMPD_parallel_master_taskloop: |
631 | case OMPD_parallel_master_taskloop_simd: |
632 | case OMPD_requires: |
633 | case OMPD_unknown: |
634 | default: |
635 | llvm_unreachable("Unexpected directive." ); |
636 | } |
637 | } |
638 | |
639 | return false; |
640 | } |
641 | |
642 | static bool supportsSPMDExecutionMode(ASTContext &Ctx, |
643 | const OMPExecutableDirective &D) { |
644 | OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
645 | switch (DirectiveKind) { |
646 | case OMPD_target: |
647 | case OMPD_target_teams: |
648 | return hasNestedSPMDDirective(Ctx, D); |
649 | case OMPD_target_parallel_loop: |
650 | case OMPD_target_parallel: |
651 | case OMPD_target_parallel_for: |
652 | case OMPD_target_parallel_for_simd: |
653 | case OMPD_target_teams_distribute_parallel_for: |
654 | case OMPD_target_teams_distribute_parallel_for_simd: |
655 | case OMPD_target_simd: |
656 | case OMPD_target_teams_distribute_simd: |
657 | return true; |
658 | case OMPD_target_teams_distribute: |
659 | return false; |
660 | case OMPD_target_teams_loop: |
661 | // Whether this is true or not depends on how the directive will |
662 | // eventually be emitted. |
663 | if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(Val: &D)) |
664 | return TTLD->canBeParallelFor(); |
665 | return false; |
666 | case OMPD_parallel: |
667 | case OMPD_for: |
668 | case OMPD_parallel_for: |
669 | case OMPD_parallel_master: |
670 | case OMPD_parallel_sections: |
671 | case OMPD_for_simd: |
672 | case OMPD_parallel_for_simd: |
673 | case OMPD_cancel: |
674 | case OMPD_cancellation_point: |
675 | case OMPD_ordered: |
676 | case OMPD_threadprivate: |
677 | case OMPD_allocate: |
678 | case OMPD_task: |
679 | case OMPD_simd: |
680 | case OMPD_sections: |
681 | case OMPD_section: |
682 | case OMPD_single: |
683 | case OMPD_master: |
684 | case OMPD_critical: |
685 | case OMPD_taskyield: |
686 | case OMPD_barrier: |
687 | case OMPD_taskwait: |
688 | case OMPD_taskgroup: |
689 | case OMPD_atomic: |
690 | case OMPD_flush: |
691 | case OMPD_depobj: |
692 | case OMPD_scan: |
693 | case OMPD_teams: |
694 | case OMPD_target_data: |
695 | case OMPD_target_exit_data: |
696 | case OMPD_target_enter_data: |
697 | case OMPD_distribute: |
698 | case OMPD_distribute_simd: |
699 | case OMPD_distribute_parallel_for: |
700 | case OMPD_distribute_parallel_for_simd: |
701 | case OMPD_teams_distribute: |
702 | case OMPD_teams_distribute_simd: |
703 | case OMPD_teams_distribute_parallel_for: |
704 | case OMPD_teams_distribute_parallel_for_simd: |
705 | case OMPD_target_update: |
706 | case OMPD_declare_simd: |
707 | case OMPD_declare_variant: |
708 | case OMPD_begin_declare_variant: |
709 | case OMPD_end_declare_variant: |
710 | case OMPD_declare_target: |
711 | case OMPD_end_declare_target: |
712 | case OMPD_declare_reduction: |
713 | case OMPD_declare_mapper: |
714 | case OMPD_taskloop: |
715 | case OMPD_taskloop_simd: |
716 | case OMPD_master_taskloop: |
717 | case OMPD_master_taskloop_simd: |
718 | case OMPD_parallel_master_taskloop: |
719 | case OMPD_parallel_master_taskloop_simd: |
720 | case OMPD_requires: |
721 | case OMPD_unknown: |
722 | default: |
723 | break; |
724 | } |
725 | llvm_unreachable( |
726 | "Unknown programming model for OpenMP directive on NVPTX target." ); |
727 | } |
728 | |
729 | void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, |
730 | StringRef ParentName, |
731 | llvm::Function *&OutlinedFn, |
732 | llvm::Constant *&OutlinedFnID, |
733 | bool IsOffloadEntry, |
734 | const RegionCodeGenTy &CodeGen) { |
735 | ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD); |
736 | EntryFunctionState EST; |
737 | WrapperFunctionsMap.clear(); |
738 | |
739 | [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
740 | assert(!IsBareKernel && "bare kernel should not be at generic mode" ); |
741 | |
742 | // Emit target region as a standalone region. |
743 | class NVPTXPrePostActionTy : public PrePostActionTy { |
744 | CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
745 | const OMPExecutableDirective &D; |
746 | |
747 | public: |
748 | NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
749 | const OMPExecutableDirective &D) |
750 | : EST(EST), D(D) {} |
751 | void Enter(CodeGenFunction &CGF) override { |
752 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
753 | RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); |
754 | // Skip target region initialization. |
755 | RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
756 | } |
757 | void Exit(CodeGenFunction &CGF) override { |
758 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
759 | RT.clearLocThreadIdInsertPt(CGF); |
760 | RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); |
761 | } |
762 | } Action(EST, D); |
763 | CodeGen.setAction(Action); |
764 | IsInTTDRegion = true; |
765 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
766 | IsOffloadEntry, CodeGen); |
767 | IsInTTDRegion = false; |
768 | } |
769 | |
770 | void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, |
771 | CodeGenFunction &CGF, |
772 | EntryFunctionState &EST, bool IsSPMD) { |
773 | int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1, |
774 | MaxTeamsVal = -1; |
775 | computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal, |
776 | MinTeamsVal, MaxTeamsVal); |
777 | |
778 | CGBuilderTy &Bld = CGF.Builder; |
779 | Bld.restoreIP(IP: OMPBuilder.createTargetInit( |
780 | Loc: Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal)); |
781 | if (!IsSPMD) |
782 | emitGenericVarsProlog(CGF, Loc: EST.Loc); |
783 | } |
784 | |
785 | void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, |
786 | EntryFunctionState &EST, |
787 | bool IsSPMD) { |
788 | if (!IsSPMD) |
789 | emitGenericVarsEpilog(CGF); |
790 | |
791 | // This is temporary until we remove the fixed sized buffer. |
792 | ASTContext &C = CGM.getContext(); |
793 | RecordDecl *StaticRD = C.buildImplicitRecord( |
794 | Name: "_openmp_teams_reduction_type_$_" , TK: RecordDecl::TagKind::Union); |
795 | StaticRD->startDefinition(); |
796 | for (const RecordDecl *TeamReductionRec : TeamsReductions) { |
797 | QualType RecTy = C.getRecordType(Decl: TeamReductionRec); |
798 | auto *Field = FieldDecl::Create( |
799 | C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, |
800 | C.getTrivialTypeSourceInfo(T: RecTy, Loc: SourceLocation()), |
801 | /*BW=*/nullptr, /*Mutable=*/false, |
802 | /*InitStyle=*/ICIS_NoInit); |
803 | Field->setAccess(AS_public); |
804 | StaticRD->addDecl(D: Field); |
805 | } |
806 | StaticRD->completeDefinition(); |
807 | QualType StaticTy = C.getRecordType(Decl: StaticRD); |
808 | llvm::Type *LLVMReductionsBufferTy = |
809 | CGM.getTypes().ConvertTypeForMem(T: StaticTy); |
810 | const auto &DL = CGM.getModule().getDataLayout(); |
811 | uint64_t ReductionDataSize = |
812 | TeamsReductions.empty() |
813 | ? 0 |
814 | : DL.getTypeAllocSize(Ty: LLVMReductionsBufferTy).getFixedValue(); |
815 | CGBuilderTy &Bld = CGF.Builder; |
816 | OMPBuilder.createTargetDeinit(Loc: Bld, TeamsReductionDataSize: ReductionDataSize, |
817 | TeamsReductionBufferLength: C.getLangOpts().OpenMPCUDAReductionBufNum); |
818 | TeamsReductions.clear(); |
819 | } |
820 | |
821 | void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, |
822 | StringRef ParentName, |
823 | llvm::Function *&OutlinedFn, |
824 | llvm::Constant *&OutlinedFnID, |
825 | bool IsOffloadEntry, |
826 | const RegionCodeGenTy &CodeGen) { |
827 | ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); |
828 | EntryFunctionState EST; |
829 | |
830 | bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
831 | |
832 | // Emit target region as a standalone region. |
833 | class NVPTXPrePostActionTy : public PrePostActionTy { |
834 | CGOpenMPRuntimeGPU &RT; |
835 | CGOpenMPRuntimeGPU::EntryFunctionState &EST; |
836 | bool IsBareKernel; |
837 | DataSharingMode Mode; |
838 | const OMPExecutableDirective &D; |
839 | |
840 | public: |
841 | NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, |
842 | CGOpenMPRuntimeGPU::EntryFunctionState &EST, |
843 | bool IsBareKernel, const OMPExecutableDirective &D) |
844 | : RT(RT), EST(EST), IsBareKernel(IsBareKernel), |
845 | Mode(RT.CurrentDataSharingMode), D(D) {} |
846 | void Enter(CodeGenFunction &CGF) override { |
847 | if (IsBareKernel) { |
848 | RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; |
849 | return; |
850 | } |
851 | RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); |
852 | // Skip target region initialization. |
853 | RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); |
854 | } |
855 | void Exit(CodeGenFunction &CGF) override { |
856 | if (IsBareKernel) { |
857 | RT.CurrentDataSharingMode = Mode; |
858 | return; |
859 | } |
860 | RT.clearLocThreadIdInsertPt(CGF); |
861 | RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); |
862 | } |
863 | } Action(*this, EST, IsBareKernel, D); |
864 | CodeGen.setAction(Action); |
865 | IsInTTDRegion = true; |
866 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
867 | IsOffloadEntry, CodeGen); |
868 | IsInTTDRegion = false; |
869 | } |
870 | |
871 | void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( |
872 | const OMPExecutableDirective &D, StringRef ParentName, |
873 | llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
874 | bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
875 | if (!IsOffloadEntry) // Nothing to do. |
876 | return; |
877 | |
878 | assert(!ParentName.empty() && "Invalid target region parent name!" ); |
879 | |
880 | bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D); |
881 | bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
882 | if (Mode || IsBareKernel) |
883 | emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
884 | CodeGen); |
885 | else |
886 | emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
887 | CodeGen); |
888 | } |
889 | |
890 | CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) |
891 | : CGOpenMPRuntime(CGM) { |
892 | llvm::OpenMPIRBuilderConfig Config( |
893 | CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), |
894 | CGM.getLangOpts().OpenMPOffloadMandatory, |
895 | /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, |
896 | hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); |
897 | OMPBuilder.setConfig(Config); |
898 | |
899 | if (!CGM.getLangOpts().OpenMPIsTargetDevice) |
900 | llvm_unreachable("OpenMP can only handle device code." ); |
901 | |
902 | if (CGM.getLangOpts().OpenMPCUDAMode) |
903 | CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; |
904 | |
905 | llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); |
906 | if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) |
907 | return; |
908 | |
909 | OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug, |
910 | Name: "__omp_rtl_debug_kind" ); |
911 | OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription, |
912 | Name: "__omp_rtl_assume_teams_oversubscription" ); |
913 | OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription, |
914 | Name: "__omp_rtl_assume_threads_oversubscription" ); |
915 | OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState, |
916 | Name: "__omp_rtl_assume_no_thread_state" ); |
917 | OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism, |
918 | Name: "__omp_rtl_assume_no_nested_parallelism" ); |
919 | } |
920 | |
921 | void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, |
922 | ProcBindKind ProcBind, |
923 | SourceLocation Loc) { |
924 | // Nothing to do. |
925 | } |
926 | |
927 | void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, |
928 | llvm::Value *NumThreads, |
929 | SourceLocation Loc) { |
930 | // Nothing to do. |
931 | } |
932 | |
933 | void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, |
934 | const Expr *NumTeams, |
935 | const Expr *ThreadLimit, |
936 | SourceLocation Loc) {} |
937 | |
938 | llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( |
939 | CodeGenFunction &CGF, const OMPExecutableDirective &D, |
940 | const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, |
941 | const RegionCodeGenTy &CodeGen) { |
942 | // Emit target region as a standalone region. |
943 | bool PrevIsInTTDRegion = IsInTTDRegion; |
944 | IsInTTDRegion = false; |
945 | auto *OutlinedFun = |
946 | cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( |
947 | CGF, D, ThreadIDVar, InnermostKind, CodeGen)); |
948 | IsInTTDRegion = PrevIsInTTDRegion; |
949 | if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) { |
950 | llvm::Function *WrapperFun = |
951 | createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D); |
952 | WrapperFunctionsMap[OutlinedFun] = WrapperFun; |
953 | } |
954 | |
955 | return OutlinedFun; |
956 | } |
957 | |
958 | /// Get list of lastprivate variables from the teams distribute ... or |
959 | /// teams {distribute ...} directives. |
960 | static void |
961 | getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
962 | llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
963 | assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
964 | "expected teams directive." ); |
965 | const OMPExecutableDirective *Dir = &D; |
966 | if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { |
967 | if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( |
968 | Ctx, |
969 | Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( |
970 | /*IgnoreCaptured=*/true))) { |
971 | Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S); |
972 | if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) |
973 | Dir = nullptr; |
974 | } |
975 | } |
976 | if (!Dir) |
977 | return; |
978 | for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { |
979 | for (const Expr *E : C->getVarRefs()) |
980 | Vars.push_back(getPrivateItem(E)); |
981 | } |
982 | } |
983 | |
984 | /// Get list of reduction variables from the teams ... directives. |
985 | static void |
986 | getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, |
987 | llvm::SmallVectorImpl<const ValueDecl *> &Vars) { |
988 | assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && |
989 | "expected teams directive." ); |
990 | for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { |
991 | for (const Expr *E : C->privates()) |
992 | Vars.push_back(Elt: getPrivateItem(RefExpr: E)); |
993 | } |
994 | } |
995 | |
996 | llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( |
997 | CodeGenFunction &CGF, const OMPExecutableDirective &D, |
998 | const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, |
999 | const RegionCodeGenTy &CodeGen) { |
1000 | SourceLocation Loc = D.getBeginLoc(); |
1001 | |
1002 | const RecordDecl *GlobalizedRD = nullptr; |
1003 | llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; |
1004 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; |
1005 | unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; |
1006 | // Globalize team reductions variable unconditionally in all modes. |
1007 | if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) |
1008 | getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions); |
1009 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
1010 | getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions); |
1011 | if (!LastPrivatesReductions.empty()) { |
1012 | GlobalizedRD = ::buildRecordForGlobalizedVars( |
1013 | C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions, |
1014 | MappedDeclsFields, BufSize: WarpSize); |
1015 | } |
1016 | } else if (!LastPrivatesReductions.empty()) { |
1017 | assert(!TeamAndReductions.first && |
1018 | "Previous team declaration is not expected." ); |
1019 | TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); |
1020 | std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions); |
1021 | } |
1022 | |
1023 | // Emit target region as a standalone region. |
1024 | class NVPTXPrePostActionTy : public PrePostActionTy { |
1025 | SourceLocation &Loc; |
1026 | const RecordDecl *GlobalizedRD; |
1027 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
1028 | &MappedDeclsFields; |
1029 | |
1030 | public: |
1031 | NVPTXPrePostActionTy( |
1032 | SourceLocation &Loc, const RecordDecl *GlobalizedRD, |
1033 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
1034 | &MappedDeclsFields) |
1035 | : Loc(Loc), GlobalizedRD(GlobalizedRD), |
1036 | MappedDeclsFields(MappedDeclsFields) {} |
1037 | void Enter(CodeGenFunction &CGF) override { |
1038 | auto &Rt = |
1039 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1040 | if (GlobalizedRD) { |
1041 | auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first; |
1042 | I->getSecond().MappedParams = |
1043 | std::make_unique<CodeGenFunction::OMPMapVars>(); |
1044 | DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
1045 | for (const auto &Pair : MappedDeclsFields) { |
1046 | assert(Pair.getFirst()->isCanonicalDecl() && |
1047 | "Expected canonical declaration" ); |
1048 | Data.insert(std::make_pair(x: Pair.getFirst(), y: MappedVarData())); |
1049 | } |
1050 | } |
1051 | Rt.emitGenericVarsProlog(CGF, Loc); |
1052 | } |
1053 | void Exit(CodeGenFunction &CGF) override { |
1054 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
1055 | .emitGenericVarsEpilog(CGF); |
1056 | } |
1057 | } Action(Loc, GlobalizedRD, MappedDeclsFields); |
1058 | CodeGen.setAction(Action); |
1059 | llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( |
1060 | CGF, D, ThreadIDVar, InnermostKind, CodeGen); |
1061 | |
1062 | return OutlinedFun; |
1063 | } |
1064 | |
1065 | void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, |
1066 | SourceLocation Loc) { |
1067 | if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
1068 | return; |
1069 | |
1070 | CGBuilderTy &Bld = CGF.Builder; |
1071 | |
1072 | const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn); |
1073 | if (I == FunctionGlobalizedDecls.end()) |
1074 | return; |
1075 | |
1076 | for (auto &Rec : I->getSecond().LocalVarData) { |
1077 | const auto *VD = cast<VarDecl>(Val: Rec.first); |
1078 | bool EscapedParam = I->getSecond().EscapedParameters.count(Ptr: Rec.first); |
1079 | QualType VarTy = VD->getType(); |
1080 | |
1081 | // Get the local allocation of a firstprivate variable before sharing |
1082 | llvm::Value *ParValue; |
1083 | if (EscapedParam) { |
1084 | LValue ParLVal = |
1085 | CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); |
1086 | ParValue = CGF.EmitLoadOfScalar(lvalue: ParLVal, Loc); |
1087 | } |
1088 | |
1089 | // Allocate space for the variable to be globalized |
1090 | llvm::Value *AllocArgs[] = {CGF.getTypeSize(Ty: VD->getType())}; |
1091 | llvm::CallBase *VoidPtr = |
1092 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1093 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared), |
1094 | AllocArgs, VD->getName()); |
1095 | // FIXME: We should use the variables actual alignment as an argument. |
1096 | VoidPtr->addRetAttr(llvm::Attribute::get( |
1097 | CGM.getLLVMContext(), llvm::Attribute::Alignment, |
1098 | CGM.getContext().getTargetInfo().getNewAlign() / 8)); |
1099 | |
1100 | // Cast the void pointer and get the address of the globalized variable. |
1101 | llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(T: VarTy)->getPointerTo(); |
1102 | llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
1103 | VoidPtr, VarPtrTy, VD->getName() + "_on_stack" ); |
1104 | LValue VarAddr = |
1105 | CGF.MakeNaturalAlignPointeeRawAddrLValue(V: CastedVoidPtr, T: VarTy); |
1106 | Rec.second.PrivateAddr = VarAddr.getAddress(CGF); |
1107 | Rec.second.GlobalizedVal = VoidPtr; |
1108 | |
1109 | // Assign the local allocation to the newly globalized location. |
1110 | if (EscapedParam) { |
1111 | CGF.EmitStoreOfScalar(value: ParValue, lvalue: VarAddr); |
1112 | I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: VarAddr.getAddress(CGF)); |
1113 | } |
1114 | if (auto *DI = CGF.getDebugInfo()) |
1115 | VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(Loc: VD->getLocation())); |
1116 | } |
1117 | |
1118 | for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) { |
1119 | const auto *VD = cast<VarDecl>(Val: ValueD); |
1120 | std::pair<llvm::Value *, llvm::Value *> AddrSizePair = |
1121 | getKmpcAllocShared(CGF, VD); |
1122 | I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(Args&: AddrSizePair); |
1123 | LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(), |
1124 | CGM.getContext().getDeclAlign(VD), |
1125 | AlignmentSource::Decl); |
1126 | I->getSecond().MappedParams->setVarAddr(CGF, LocalVD: VD, TempAddr: Base.getAddress(CGF)); |
1127 | } |
1128 | I->getSecond().MappedParams->apply(CGF); |
1129 | } |
1130 | |
1131 | bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF, |
1132 | const VarDecl *VD) const { |
1133 | const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn); |
1134 | if (I == FunctionGlobalizedDecls.end()) |
1135 | return false; |
1136 | |
1137 | // Check variable declaration is delayed: |
1138 | return llvm::is_contained(Range: I->getSecond().DelayedVariableLengthDecls, Element: VD); |
1139 | } |
1140 | |
1141 | std::pair<llvm::Value *, llvm::Value *> |
1142 | CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, |
1143 | const VarDecl *VD) { |
1144 | CGBuilderTy &Bld = CGF.Builder; |
1145 | |
1146 | // Compute size and alignment. |
1147 | llvm::Value *Size = CGF.getTypeSize(Ty: VD->getType()); |
1148 | CharUnits Align = CGM.getContext().getDeclAlign(VD); |
1149 | Size = Bld.CreateNUWAdd( |
1150 | LHS: Size, RHS: llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity() - 1)); |
1151 | llvm::Value *AlignVal = |
1152 | llvm::ConstantInt::get(Ty: CGF.SizeTy, V: Align.getQuantity()); |
1153 | Size = Bld.CreateUDiv(LHS: Size, RHS: AlignVal); |
1154 | Size = Bld.CreateNUWMul(LHS: Size, RHS: AlignVal); |
1155 | |
1156 | // Allocate space for this VLA object to be globalized. |
1157 | llvm::Value *AllocArgs[] = {Size}; |
1158 | llvm::CallBase *VoidPtr = |
1159 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1160 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_alloc_shared), |
1161 | AllocArgs, VD->getName()); |
1162 | VoidPtr->addRetAttr(llvm::Attribute::get( |
1163 | CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity())); |
1164 | |
1165 | return std::make_pair(x&: VoidPtr, y&: Size); |
1166 | } |
1167 | |
1168 | void CGOpenMPRuntimeGPU::getKmpcFreeShared( |
1169 | CodeGenFunction &CGF, |
1170 | const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) { |
1171 | // Deallocate the memory for each globalized VLA object |
1172 | CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
1173 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared), |
1174 | args: {AddrSizePair.first, AddrSizePair.second}); |
1175 | } |
1176 | |
1177 | void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { |
1178 | if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
1179 | return; |
1180 | |
1181 | const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn); |
1182 | if (I != FunctionGlobalizedDecls.end()) { |
1183 | // Deallocate the memory for each globalized VLA object that was |
1184 | // globalized in the prolog (i.e. emitGenericVarsProlog). |
1185 | for (const auto &AddrSizePair : |
1186 | llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) { |
1187 | CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
1188 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared), |
1189 | args: {AddrSizePair.first, AddrSizePair.second}); |
1190 | } |
1191 | // Deallocate the memory for each globalized value |
1192 | for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) { |
1193 | const auto *VD = cast<VarDecl>(Val: Rec.first); |
1194 | I->getSecond().MappedParams->restore(CGF); |
1195 | |
1196 | llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, |
1197 | CGF.getTypeSize(Ty: VD->getType())}; |
1198 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1199 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared), |
1200 | FreeArgs); |
1201 | } |
1202 | } |
1203 | } |
1204 | |
1205 | void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, |
1206 | const OMPExecutableDirective &D, |
1207 | SourceLocation Loc, |
1208 | llvm::Function *OutlinedFn, |
1209 | ArrayRef<llvm::Value *> CapturedVars) { |
1210 | if (!CGF.HaveInsertPoint()) |
1211 | return; |
1212 | |
1213 | bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); |
1214 | |
1215 | RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty, |
1216 | /*Name=*/".zero.addr" ); |
1217 | CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr); |
1218 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
1219 | // We don't emit any thread id function call in bare kernel, but because the |
1220 | // outlined function has a pointer argument, we emit a nullptr here. |
1221 | if (IsBareKernel) |
1222 | OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy)); |
1223 | else |
1224 | OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF)); |
1225 | OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer()); |
1226 | OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end()); |
1227 | emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs); |
1228 | } |
1229 | |
1230 | void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, |
1231 | SourceLocation Loc, |
1232 | llvm::Function *OutlinedFn, |
1233 | ArrayRef<llvm::Value *> CapturedVars, |
1234 | const Expr *IfCond, |
1235 | llvm::Value *NumThreads) { |
1236 | if (!CGF.HaveInsertPoint()) |
1237 | return; |
1238 | |
1239 | auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, |
1240 | NumThreads](CodeGenFunction &CGF, |
1241 | PrePostActionTy &Action) { |
1242 | CGBuilderTy &Bld = CGF.Builder; |
1243 | llvm::Value *NumThreadsVal = NumThreads; |
1244 | llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; |
1245 | llvm::Value *ID = llvm::ConstantPointerNull::get(T: CGM.Int8PtrTy); |
1246 | if (WFn) |
1247 | ID = Bld.CreateBitOrPointerCast(V: WFn, DestTy: CGM.Int8PtrTy); |
1248 | llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(V: OutlinedFn, DestTy: CGM.Int8PtrTy); |
1249 | |
1250 | // Create a private scope that will globalize the arguments |
1251 | // passed from the outside of the target region. |
1252 | // TODO: Is that needed? |
1253 | CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); |
1254 | |
1255 | Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( |
1256 | Ty: llvm::ArrayType::get(ElementType: CGM.VoidPtrTy, NumElements: CapturedVars.size()), |
1257 | Name: "captured_vars_addrs" ); |
1258 | // There's something to share. |
1259 | if (!CapturedVars.empty()) { |
1260 | // Prepare for parallel region. Indicate the outlined function. |
1261 | ASTContext &Ctx = CGF.getContext(); |
1262 | unsigned Idx = 0; |
1263 | for (llvm::Value *V : CapturedVars) { |
1264 | Address Dst = Bld.CreateConstArrayGEP(Addr: CapturedVarsAddrs, Index: Idx); |
1265 | llvm::Value *PtrV; |
1266 | if (V->getType()->isIntegerTy()) |
1267 | PtrV = Bld.CreateIntToPtr(V, DestTy: CGF.VoidPtrTy); |
1268 | else |
1269 | PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, DestTy: CGF.VoidPtrTy); |
1270 | CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, |
1271 | Ctx.getPointerType(Ctx.VoidPtrTy)); |
1272 | ++Idx; |
1273 | } |
1274 | } |
1275 | |
1276 | llvm::Value *IfCondVal = nullptr; |
1277 | if (IfCond) |
1278 | IfCondVal = Bld.CreateIntCast(V: CGF.EvaluateExprAsBool(E: IfCond), DestTy: CGF.Int32Ty, |
1279 | /* isSigned */ false); |
1280 | else |
1281 | IfCondVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: 1); |
1282 | |
1283 | if (!NumThreadsVal) |
1284 | NumThreadsVal = llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1); |
1285 | else |
1286 | NumThreadsVal = Bld.CreateZExtOrTrunc(V: NumThreadsVal, DestTy: CGF.Int32Ty), |
1287 | |
1288 | assert(IfCondVal && "Expected a value" ); |
1289 | llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
1290 | llvm::Value *Args[] = { |
1291 | RTLoc, |
1292 | getThreadID(CGF, Loc), |
1293 | IfCondVal, |
1294 | NumThreadsVal, |
1295 | llvm::ConstantInt::get(Ty: CGF.Int32Ty, V: -1), |
1296 | FnPtr, |
1297 | ID, |
1298 | Bld.CreateBitOrPointerCast(V: CapturedVarsAddrs.emitRawPointer(CGF), |
1299 | DestTy: CGF.VoidPtrPtrTy), |
1300 | llvm::ConstantInt::get(Ty: CGM.SizeTy, V: CapturedVars.size())}; |
1301 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1302 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_parallel_51), |
1303 | Args); |
1304 | }; |
1305 | |
1306 | RegionCodeGenTy RCG(ParallelGen); |
1307 | RCG(CGF); |
1308 | } |
1309 | |
1310 | void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { |
1311 | // Always emit simple barriers! |
1312 | if (!CGF.HaveInsertPoint()) |
1313 | return; |
1314 | // Build call __kmpc_barrier_simple_spmd(nullptr, 0); |
1315 | // This function does not use parameters, so we can emit just default values. |
1316 | llvm::Value *Args[] = { |
1317 | llvm::ConstantPointerNull::get( |
1318 | T: cast<llvm::PointerType>(getIdentTyPointerTy())), |
1319 | llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)}; |
1320 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1321 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd), |
1322 | Args); |
1323 | } |
1324 | |
1325 | void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, |
1326 | SourceLocation Loc, |
1327 | OpenMPDirectiveKind Kind, bool, |
1328 | bool) { |
1329 | // Always emit simple barriers! |
1330 | if (!CGF.HaveInsertPoint()) |
1331 | return; |
1332 | // Build call __kmpc_cancel_barrier(loc, thread_id); |
1333 | unsigned Flags = getDefaultFlagsForBarriers(Kind); |
1334 | llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), |
1335 | getThreadID(CGF, Loc)}; |
1336 | |
1337 | CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( |
1338 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier), |
1339 | Args); |
1340 | } |
1341 | |
1342 | void CGOpenMPRuntimeGPU::emitCriticalRegion( |
1343 | CodeGenFunction &CGF, StringRef CriticalName, |
1344 | const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, |
1345 | const Expr *Hint) { |
1346 | llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop" ); |
1347 | llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test" ); |
1348 | llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync" ); |
1349 | llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body" ); |
1350 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit" ); |
1351 | |
1352 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1353 | |
1354 | // Get the mask of active threads in the warp. |
1355 | llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
1356 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask)); |
1357 | // Fetch team-local id of the thread. |
1358 | llvm::Value *ThreadID = RT.getGPUThreadID(CGF); |
1359 | |
1360 | // Get the width of the team. |
1361 | llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); |
1362 | |
1363 | // Initialize the counter variable for the loop. |
1364 | QualType Int32Ty = |
1365 | CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); |
1366 | Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter" ); |
1367 | LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty); |
1368 | CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal, |
1369 | /*isInit=*/true); |
1370 | |
1371 | // Block checks if loop counter exceeds upper bound. |
1372 | CGF.EmitBlock(BB: LoopBB); |
1373 | llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc); |
1374 | llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth); |
1375 | CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB); |
1376 | |
  // Block tests which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
1379 | CGF.EmitBlock(BB: TestBB); |
1380 | CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc); |
1381 | llvm::Value *CmpThreadToCounter = |
1382 | CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal); |
1383 | CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB); |
1384 | |
1385 | // Block emits the body of the critical region. |
1386 | CGF.EmitBlock(BB: BodyBB); |
1387 | |
1388 | // Output the critical statement. |
1389 | CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, |
1390 | Hint); |
1391 | |
1392 | // After the body surrounded by the critical region, the single executing |
1393 | // thread will jump to the synchronisation point. |
  // Block waits for all threads in the current team to finish, then
  // increments the counter variable and returns to the loop.
1396 | CGF.EmitBlock(BB: SyncBB); |
1397 | // Reconverge active threads in the warp. |
1398 | (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
1399 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp), |
1400 | args: Mask); |
1401 | |
1402 | llvm::Value *IncCounterVal = |
1403 | CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1)); |
1404 | CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal); |
1405 | CGF.EmitBranch(Block: LoopBB); |
1406 | |
1407 | // Block that is reached when all threads in the team complete the region. |
1408 | CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true); |
1409 | } |
1410 | |
1411 | /// Cast value to the specified type. |
1412 | static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, |
1413 | QualType ValTy, QualType CastTy, |
1414 | SourceLocation Loc) { |
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
1419 | llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(T: CastTy); |
1420 | if (ValTy == CastTy) |
1421 | return Val; |
1422 | if (CGF.getContext().getTypeSizeInChars(T: ValTy) == |
1423 | CGF.getContext().getTypeSizeInChars(T: CastTy)) |
1424 | return CGF.Builder.CreateBitCast(V: Val, DestTy: LLVMCastTy); |
1425 | if (CastTy->isIntegerType() && ValTy->isIntegerType()) |
1426 | return CGF.Builder.CreateIntCast(V: Val, DestTy: LLVMCastTy, |
1427 | isSigned: CastTy->hasSignedIntegerRepresentation()); |
1428 | Address CastItem = CGF.CreateMemTemp(T: CastTy); |
1429 | Address ValCastItem = CastItem.withElementType(ElemTy: Val->getType()); |
1430 | CGF.EmitStoreOfScalar(Value: Val, Addr: ValCastItem, /*Volatile=*/false, Ty: ValTy, |
1431 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1432 | TBAAInfo: TBAAAccessInfo()); |
1433 | return CGF.EmitLoadOfScalar(Addr: CastItem, /*Volatile=*/false, Ty: CastTy, Loc, |
1434 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1435 | TBAAInfo: TBAAAccessInfo()); |
1436 | } |
1437 | |
1438 | /// This function creates calls to one of two shuffle functions to copy |
1439 | /// variables between lanes in a warp. |
1440 | static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, |
1441 | llvm::Value *Elem, |
1442 | QualType ElemType, |
1443 | llvm::Value *Offset, |
1444 | SourceLocation Loc) { |
1445 | CodeGenModule &CGM = CGF.CGM; |
1446 | CGBuilderTy &Bld = CGF.Builder; |
1447 | CGOpenMPRuntimeGPU &RT = |
1448 | *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime())); |
1449 | llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder(); |
1450 | |
1451 | CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType); |
1452 | assert(Size.getQuantity() <= 8 && |
1453 | "Unsupported bitwidth in shuffle instruction." ); |
1454 | |
1455 | RuntimeFunction ShuffleFn = Size.getQuantity() <= 4 |
1456 | ? OMPRTL___kmpc_shuffle_int32 |
1457 | : OMPRTL___kmpc_shuffle_int64; |
1458 | |
1459 | // Cast all types to 32- or 64-bit values before calling shuffle routines. |
1460 | QualType CastTy = CGF.getContext().getIntTypeForBitwidth( |
1461 | DestWidth: Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1); |
1462 | llvm::Value *ElemCast = castValueToType(CGF, Val: Elem, ValTy: ElemType, CastTy, Loc); |
1463 | llvm::Value *WarpSize = |
1464 | Bld.CreateIntCast(V: RT.getGPUWarpSize(CGF), DestTy: CGM.Int16Ty, /*isSigned=*/true); |
1465 | |
1466 | llvm::Value *ShuffledVal = CGF.EmitRuntimeCall( |
1467 | callee: OMPBuilder.getOrCreateRuntimeFunction(M&: CGM.getModule(), FnID: ShuffleFn), |
1468 | args: {ElemCast, Offset, WarpSize}); |
1469 | |
1470 | return castValueToType(CGF, Val: ShuffledVal, ValTy: CastTy, CastTy: ElemType, Loc); |
1471 | } |
1472 | |
1473 | static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, |
1474 | Address DestAddr, QualType ElemType, |
1475 | llvm::Value *Offset, SourceLocation Loc) { |
1476 | CGBuilderTy &Bld = CGF.Builder; |
1477 | |
1478 | CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType); |
  // Create the loop over the big-sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*)(Elem + 1);
  // Step = 8;
  // while (ptr + Step <= ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step <= ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
1489 | Address ElemPtr = DestAddr; |
1490 | Address Ptr = SrcAddr; |
1491 | Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast( |
1492 | Addr: Bld.CreateConstGEP(Addr: SrcAddr, Index: 1), Ty: CGF.VoidPtrTy, ElementTy: CGF.Int8Ty); |
1493 | for (int IntSize = 8; IntSize >= 1; IntSize /= 2) { |
1494 | if (Size < CharUnits::fromQuantity(Quantity: IntSize)) |
1495 | continue; |
1496 | QualType IntType = CGF.getContext().getIntTypeForBitwidth( |
1497 | DestWidth: CGF.getContext().toBits(CharSize: CharUnits::fromQuantity(Quantity: IntSize)), |
1498 | /*Signed=*/1); |
1499 | llvm::Type *IntTy = CGF.ConvertTypeForMem(T: IntType); |
1500 | Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Ptr, Ty: IntTy->getPointerTo(), |
1501 | ElementTy: IntTy); |
1502 | ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
1503 | Addr: ElemPtr, Ty: IntTy->getPointerTo(), ElementTy: IntTy); |
1504 | if (Size.getQuantity() / IntSize > 1) { |
1505 | llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(name: ".shuffle.pre_cond" ); |
1506 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".shuffle.then" ); |
1507 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".shuffle.exit" ); |
1508 | llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock(); |
1509 | CGF.EmitBlock(BB: PreCondBB); |
1510 | llvm::PHINode *PhiSrc = |
1511 | Bld.CreatePHI(Ty: Ptr.getType(), /*NumReservedValues=*/2); |
1512 | PhiSrc->addIncoming(V: Ptr.emitRawPointer(CGF), BB: CurrentBB); |
1513 | llvm::PHINode *PhiDest = |
1514 | Bld.CreatePHI(Ty: ElemPtr.getType(), /*NumReservedValues=*/2); |
1515 | PhiDest->addIncoming(V: ElemPtr.emitRawPointer(CGF), BB: CurrentBB); |
1516 | Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment()); |
1517 | ElemPtr = |
1518 | Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment()); |
1519 | llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF); |
1520 | llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF); |
1521 | llvm::Value *PtrDiff = Bld.CreatePtrDiff( |
1522 | ElemTy: CGF.Int8Ty, LHS: PtrEndRaw, |
1523 | RHS: Bld.CreatePointerBitCastOrAddrSpaceCast(V: PtrRaw, DestTy: CGF.VoidPtrTy)); |
1524 | Bld.CreateCondBr(Cond: Bld.CreateICmpSGT(LHS: PtrDiff, RHS: Bld.getInt64(C: IntSize - 1)), |
1525 | True: ThenBB, False: ExitBB); |
1526 | CGF.EmitBlock(BB: ThenBB); |
1527 | llvm::Value *Res = createRuntimeShuffleFunction( |
1528 | CGF, |
1529 | Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc, |
1530 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1531 | TBAAInfo: TBAAAccessInfo()), |
1532 | ElemType: IntType, Offset, Loc); |
1533 | CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType, |
1534 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1535 | TBAAInfo: TBAAAccessInfo()); |
1536 | Address LocalPtr = Bld.CreateConstGEP(Addr: Ptr, Index: 1); |
1537 | Address LocalElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1); |
1538 | PhiSrc->addIncoming(V: LocalPtr.emitRawPointer(CGF), BB: ThenBB); |
1539 | PhiDest->addIncoming(V: LocalElemPtr.emitRawPointer(CGF), BB: ThenBB); |
1540 | CGF.EmitBranch(Block: PreCondBB); |
1541 | CGF.EmitBlock(BB: ExitBB); |
1542 | } else { |
1543 | llvm::Value *Res = createRuntimeShuffleFunction( |
1544 | CGF, |
1545 | Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc, |
1546 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1547 | TBAAInfo: TBAAAccessInfo()), |
1548 | ElemType: IntType, Offset, Loc); |
1549 | CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType, |
1550 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1551 | TBAAInfo: TBAAAccessInfo()); |
1552 | Ptr = Bld.CreateConstGEP(Addr: Ptr, Index: 1); |
1553 | ElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1); |
1554 | } |
1555 | Size = Size % IntSize; |
1556 | } |
1557 | } |
1558 | |
1559 | namespace { |
1560 | enum CopyAction : unsigned { |
1561 | // RemoteLaneToThread: Copy over a Reduce list from a remote lane in |
1562 | // the warp using shuffle instructions. |
1563 | RemoteLaneToThread, |
1564 | // ThreadCopy: Make a copy of a Reduce list on the thread's stack. |
1565 | ThreadCopy, |
1566 | }; |
1567 | } // namespace |
1568 | |
1569 | struct CopyOptionsTy { |
1570 | llvm::Value *RemoteLaneOffset; |
1571 | llvm::Value *ScratchpadIndex; |
1572 | llvm::Value *ScratchpadWidth; |
1573 | }; |
1574 | |
1575 | /// Emit instructions to copy a Reduce list, which contains partially |
1576 | /// aggregated values, in the specified direction. |
1577 | static void emitReductionListCopy( |
1578 | CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, |
1579 | ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase, |
1580 | CopyOptionsTy CopyOptions = {.RemoteLaneOffset: nullptr, .ScratchpadIndex: nullptr, .ScratchpadWidth: nullptr}) { |
1581 | |
1582 | CodeGenModule &CGM = CGF.CGM; |
1583 | ASTContext &C = CGM.getContext(); |
1584 | CGBuilderTy &Bld = CGF.Builder; |
1585 | |
1586 | llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; |
1587 | |
  // Iterates, element by element, through the source Reduce list and
  // makes a copy.
1590 | unsigned Idx = 0; |
1591 | for (const Expr *Private : Privates) { |
1592 | Address SrcElementAddr = Address::invalid(); |
1593 | Address DestElementAddr = Address::invalid(); |
1594 | Address DestElementPtrAddr = Address::invalid(); |
1595 | // Should we shuffle in an element from a remote lane? |
1596 | bool ShuffleInElement = false; |
1597 | // Set to true to update the pointer in the dest Reduce list to a |
1598 | // newly created element. |
1599 | bool UpdateDestListPtr = false; |
1600 | QualType PrivatePtrType = C.getPointerType(T: Private->getType()); |
1601 | llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(T: PrivatePtrType); |
1602 | |
1603 | switch (Action) { |
1604 | case RemoteLaneToThread: { |
1605 | // Step 1.1: Get the address for the src element in the Reduce list. |
1606 | Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx); |
1607 | SrcElementAddr = CGF.EmitLoadOfPointer( |
1608 | Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType), |
1609 | PtrTy: PrivatePtrType->castAs<PointerType>()); |
1610 | |
1611 | // Step 1.2: Create a temporary to store the element in the destination |
1612 | // Reduce list. |
1613 | DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx); |
1614 | DestElementAddr = |
1615 | CGF.CreateMemTemp(T: Private->getType(), Name: ".omp.reduction.element" ); |
1616 | ShuffleInElement = true; |
1617 | UpdateDestListPtr = true; |
1618 | break; |
1619 | } |
1620 | case ThreadCopy: { |
1621 | // Step 1.1: Get the address for the src element in the Reduce list. |
1622 | Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx); |
1623 | SrcElementAddr = CGF.EmitLoadOfPointer( |
1624 | Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType), |
1625 | PtrTy: PrivatePtrType->castAs<PointerType>()); |
1626 | |
1627 | // Step 1.2: Get the address for dest element. The destination |
1628 | // element has already been created on the thread's stack. |
1629 | DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx); |
1630 | DestElementAddr = CGF.EmitLoadOfPointer( |
1631 | Ptr: DestElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType), |
1632 | PtrTy: PrivatePtrType->castAs<PointerType>()); |
1633 | break; |
1634 | } |
1635 | } |
1636 | |
    // Regardless of the copy's source and destination, we emit the load of
    // the src element, as it is required for all copy directions.
1639 | SrcElementAddr = SrcElementAddr.withElementType( |
1640 | ElemTy: CGF.ConvertTypeForMem(T: Private->getType())); |
1641 | DestElementAddr = |
1642 | DestElementAddr.withElementType(ElemTy: SrcElementAddr.getElementType()); |
1643 | |
1644 | // Now that all active lanes have read the element in the |
1645 | // Reduce list, shuffle over the value from the remote lane. |
1646 | if (ShuffleInElement) { |
1647 | shuffleAndStore(CGF, SrcAddr: SrcElementAddr, DestAddr: DestElementAddr, ElemType: Private->getType(), |
1648 | Offset: RemoteLaneOffset, Loc: Private->getExprLoc()); |
1649 | } else { |
1650 | switch (CGF.getEvaluationKind(T: Private->getType())) { |
1651 | case TEK_Scalar: { |
1652 | llvm::Value *Elem = CGF.EmitLoadOfScalar( |
1653 | Addr: SrcElementAddr, /*Volatile=*/false, Ty: Private->getType(), |
1654 | Loc: Private->getExprLoc(), BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1655 | TBAAInfo: TBAAAccessInfo()); |
1656 | // Store the source element value to the dest element address. |
1657 | CGF.EmitStoreOfScalar( |
1658 | Value: Elem, Addr: DestElementAddr, /*Volatile=*/false, Ty: Private->getType(), |
1659 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo()); |
1660 | break; |
1661 | } |
1662 | case TEK_Complex: { |
1663 | CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( |
1664 | src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()), |
1665 | loc: Private->getExprLoc()); |
1666 | CGF.EmitStoreOfComplex( |
1667 | V: Elem, dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()), |
1668 | /*isInit=*/false); |
1669 | break; |
1670 | } |
1671 | case TEK_Aggregate: |
1672 | CGF.EmitAggregateCopy( |
1673 | Dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()), |
1674 | Src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()), |
1675 | EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap); |
1676 | break; |
1677 | } |
1678 | } |
1679 | |
1680 | // Step 3.1: Modify reference in dest Reduce list as needed. |
1681 | // Modifying the reference in Reduce list to point to the newly |
1682 | // created element. The element is live in the current function |
1683 | // scope and that of functions it invokes (i.e., reduce_function). |
1684 | // RemoteReduceData[i] = (void*)&RemoteElem |
1685 | if (UpdateDestListPtr) { |
1686 | CGF.EmitStoreOfScalar( |
1687 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
1688 | V: DestElementAddr.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy), |
1689 | DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy); |
1690 | } |
1691 | |
1692 | ++Idx; |
1693 | } |
1694 | } |
1695 | |
1696 | /// This function emits a helper that gathers Reduce lists from the first |
1697 | /// lane of every active warp to lanes in the first warp. |
1698 | /// |
/// void inter_warp_copy_func(void *reduce_data, int num_warps)
1700 | /// shared smem[warp_size]; |
1701 | /// For all data entries D in reduce_data: |
1702 | /// sync |
1703 | /// If (I am the first lane in each warp) |
1704 | /// Copy my local D to smem[warp_id] |
1705 | /// sync |
1706 | /// if (I am the first warp) |
1707 | /// Copy smem[thread_id] to my local D |
1708 | static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, |
1709 | ArrayRef<const Expr *> Privates, |
1710 | QualType ReductionArrayTy, |
1711 | SourceLocation Loc) { |
1712 | ASTContext &C = CGM.getContext(); |
1713 | llvm::Module &M = CGM.getModule(); |
1714 | |
1715 | // ReduceList: thread local Reduce list. |
1716 | // At the stage of the computation when this function is called, partially |
1717 | // aggregated values reside in the first lane of every active warp. |
1718 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
1719 | C.VoidPtrTy, ImplicitParamKind::Other); |
1720 | // NumWarps: number of warps active in the parallel region. This could |
1721 | // be smaller than 32 (max warps in a CTA) for partial block reduction. |
1722 | ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
1723 | C.getIntTypeForBitwidth(DestWidth: 32, /* Signed */ true), |
1724 | ImplicitParamKind::Other); |
1725 | FunctionArgList Args; |
1726 | Args.push_back(&ReduceListArg); |
1727 | Args.push_back(&NumWarpsArg); |
1728 | |
1729 | const CGFunctionInfo &CGFI = |
1730 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
1731 | auto *Fn = llvm::Function::Create(Ty: CGM.getTypes().GetFunctionType(Info: CGFI), |
1732 | Linkage: llvm::GlobalValue::InternalLinkage, |
1733 | N: "_omp_reduction_inter_warp_copy_func" , M: &M); |
1734 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
1735 | Fn->setDoesNotRecurse(); |
1736 | CodeGenFunction CGF(CGM); |
1737 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
1738 | |
1739 | CGBuilderTy &Bld = CGF.Builder; |
1740 | |
1741 | // This array is used as a medium to transfer, one reduce element at a time, |
1742 | // the data from the first lane of every warp to lanes in the first warp |
1743 | // in order to perform the final step of a reduction in a parallel region |
1744 | // (reduction across warps). The array is placed in NVPTX __shared__ memory |
1745 | // for reduced latency, as well as to have a distinct copy for concurrently |
  // executing target regions. The array is declared with weak linkage so
  // as to be shared across compilation units.
1748 | StringRef TransferMediumName = |
1749 | "__openmp_nvptx_data_transfer_temporary_storage" ; |
1750 | llvm::GlobalVariable *TransferMedium = |
1751 | M.getGlobalVariable(Name: TransferMediumName); |
1752 | unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; |
1753 | if (!TransferMedium) { |
1754 | auto *Ty = llvm::ArrayType::get(ElementType: CGM.Int32Ty, NumElements: WarpSize); |
1755 | unsigned SharedAddressSpace = C.getTargetAddressSpace(AS: LangAS::cuda_shared); |
1756 | TransferMedium = new llvm::GlobalVariable( |
1757 | M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, |
1758 | llvm::UndefValue::get(T: Ty), TransferMediumName, |
1759 | /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, |
1760 | SharedAddressSpace); |
1761 | CGM.addCompilerUsedGlobal(GV: TransferMedium); |
1762 | } |
1763 | |
1764 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
1765 | // Get the CUDA thread id of the current OpenMP thread on the GPU. |
1766 | llvm::Value *ThreadID = RT.getGPUThreadID(CGF); |
1767 | // nvptx_lane_id = nvptx_id % warpsize |
1768 | llvm::Value *LaneID = getNVPTXLaneID(CGF); |
1769 | // nvptx_warp_id = nvptx_id / warpsize |
1770 | llvm::Value *WarpID = getNVPTXWarpID(CGF); |
1771 | |
1772 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
1773 | llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy); |
1774 | Address LocalReduceList( |
1775 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
1776 | CGF.EmitLoadOfScalar( |
1777 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, |
1778 | LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), |
1779 | ElemTy->getPointerTo()), |
1780 | ElemTy, CGF.getPointerAlign()); |
1781 | |
1782 | unsigned Idx = 0; |
1783 | for (const Expr *Private : Privates) { |
1784 | // |
1785 | // Warp master copies reduce element to transfer medium in __shared__ |
1786 | // memory. |
1787 | // |
1788 | unsigned RealTySize = |
1789 | C.getTypeSizeInChars(T: Private->getType()) |
1790 | .alignTo(Align: C.getTypeAlignInChars(T: Private->getType())) |
1791 | .getQuantity(); |
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1793 | unsigned NumIters = RealTySize / TySize; |
1794 | if (NumIters == 0) |
1795 | continue; |
1796 | QualType CType = C.getIntTypeForBitwidth( |
1797 | DestWidth: C.toBits(CharSize: CharUnits::fromQuantity(Quantity: TySize)), /*Signed=*/1); |
1798 | llvm::Type *CopyType = CGF.ConvertTypeForMem(T: CType); |
1799 | CharUnits Align = CharUnits::fromQuantity(Quantity: TySize); |
1800 | llvm::Value *Cnt = nullptr; |
1801 | Address CntAddr = Address::invalid(); |
1802 | llvm::BasicBlock *PrecondBB = nullptr; |
1803 | llvm::BasicBlock *ExitBB = nullptr; |
1804 | if (NumIters > 1) { |
1805 | CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr" ); |
1806 | CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(Ty: CGM.IntTy), CntAddr, |
1807 | /*Volatile=*/false, C.IntTy); |
1808 | PrecondBB = CGF.createBasicBlock(name: "precond" ); |
1809 | ExitBB = CGF.createBasicBlock(name: "exit" ); |
1810 | llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "body" ); |
        // There is no need to emit a line number for the unconditional branch.
1812 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
1813 | CGF.EmitBlock(BB: PrecondBB); |
1814 | Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); |
1815 | llvm::Value *Cmp = |
1816 | Bld.CreateICmpULT(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, V: NumIters)); |
1817 | Bld.CreateCondBr(Cond: Cmp, True: BodyBB, False: ExitBB); |
1818 | CGF.EmitBlock(BB: BodyBB); |
1819 | } |
1820 | // kmpc_barrier. |
1821 | CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, |
1822 | /*EmitChecks=*/false, |
1823 | /*ForceSimpleCall=*/true); |
1824 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then" ); |
1825 | llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else" ); |
1826 | llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont" ); |
1827 | |
1828 | // if (lane_id == 0) |
1829 | llvm::Value *IsWarpMaster = Bld.CreateIsNull(Arg: LaneID, Name: "warp_master" ); |
1830 | Bld.CreateCondBr(Cond: IsWarpMaster, True: ThenBB, False: ElseBB); |
1831 | CGF.EmitBlock(BB: ThenBB); |
1832 | |
1833 | // Reduce element = LocalReduceList[i] |
1834 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx); |
1835 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
1836 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
1837 | // elemptr = ((CopyType*)(elemptrptr)) + I |
1838 | Address ElemPtr(ElemPtrPtr, CopyType, Align); |
1839 | if (NumIters > 1) |
1840 | ElemPtr = Bld.CreateGEP(CGF, Addr: ElemPtr, Index: Cnt); |
1841 | |
1842 | // Get pointer to location in transfer medium. |
1843 | // MediumPtr = &medium[warp_id] |
1844 | llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( |
1845 | Ty: TransferMedium->getValueType(), Ptr: TransferMedium, |
1846 | IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), WarpID}); |
1847 | // Casting to actual data type. |
1848 | // MediumPtr = (CopyType*)MediumPtrAddr; |
1849 | Address MediumPtr(MediumPtrVal, CopyType, Align); |
1850 | |
      // elem = *elemptr
      // *MediumPtr = elem
1853 | llvm::Value *Elem = CGF.EmitLoadOfScalar( |
1854 | Addr: ElemPtr, /*Volatile=*/false, Ty: CType, Loc, |
1855 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo()); |
1856 | // Store the source element value to the dest element address. |
1857 | CGF.EmitStoreOfScalar(Value: Elem, Addr: MediumPtr, /*Volatile=*/true, Ty: CType, |
1858 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
1859 | TBAAInfo: TBAAAccessInfo()); |
1860 | |
1861 | Bld.CreateBr(Dest: MergeBB); |
1862 | |
1863 | CGF.EmitBlock(BB: ElseBB); |
1864 | Bld.CreateBr(Dest: MergeBB); |
1865 | |
1866 | CGF.EmitBlock(BB: MergeBB); |
1867 | |
1868 | // kmpc_barrier. |
1869 | CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, |
1870 | /*EmitChecks=*/false, |
1871 | /*ForceSimpleCall=*/true); |
1872 | |
1873 | // |
1874 | // Warp 0 copies reduce element from transfer medium. |
1875 | // |
1876 | llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock(name: "then" ); |
1877 | llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock(name: "else" ); |
1878 | llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock(name: "ifcont" ); |
1879 | |
1880 | Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); |
1881 | llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( |
1882 | AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); |
1883 | |
1884 | // Up to 32 threads in warp 0 are active. |
1885 | llvm::Value *IsActiveThread = |
1886 | Bld.CreateICmpULT(LHS: ThreadID, RHS: NumWarpsVal, Name: "is_active_thread" ); |
1887 | Bld.CreateCondBr(Cond: IsActiveThread, True: W0ThenBB, False: W0ElseBB); |
1888 | |
1889 | CGF.EmitBlock(BB: W0ThenBB); |
1890 | |
1891 | // SrcMediumPtr = &medium[tid] |
1892 | llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( |
1893 | Ty: TransferMedium->getValueType(), Ptr: TransferMedium, |
1894 | IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), ThreadID}); |
1895 | // SrcMediumVal = *SrcMediumPtr; |
1896 | Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align); |
1897 | |
1898 | // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I |
1899 | Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx); |
1900 | llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( |
1901 | TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); |
1902 | Address TargetElemPtr(TargetElemPtrVal, CopyType, Align); |
1903 | if (NumIters > 1) |
1904 | TargetElemPtr = Bld.CreateGEP(CGF, Addr: TargetElemPtr, Index: Cnt); |
1905 | |
1906 | // *TargetElemPtr = SrcMediumVal; |
1907 | llvm::Value *SrcMediumValue = |
1908 | CGF.EmitLoadOfScalar(Addr: SrcMediumPtr, /*Volatile=*/true, Ty: CType, Loc); |
1909 | CGF.EmitStoreOfScalar(Value: SrcMediumValue, Addr: TargetElemPtr, /*Volatile=*/false, |
1910 | Ty: CType); |
1911 | Bld.CreateBr(Dest: W0MergeBB); |
1912 | |
1913 | CGF.EmitBlock(BB: W0ElseBB); |
1914 | Bld.CreateBr(Dest: W0MergeBB); |
1915 | |
1916 | CGF.EmitBlock(BB: W0MergeBB); |
1917 | |
1918 | if (NumIters > 1) { |
1919 | Cnt = Bld.CreateNSWAdd(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, /*V=*/1)); |
1920 | CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); |
1921 | CGF.EmitBranch(Block: PrecondBB); |
1922 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
1923 | CGF.EmitBlock(BB: ExitBB); |
1924 | } |
1925 | RealTySize %= TySize; |
1926 | } |
1927 | ++Idx; |
1928 | } |
1929 | |
1930 | CGF.FinishFunction(); |
1931 | return Fn; |
1932 | } |
1933 | |
1934 | /// Emit a helper that reduces data across two OpenMP threads (lanes) |
1935 | /// in the same warp. It uses shuffle instructions to copy over data from |
1936 | /// a remote lane's stack. The reduction algorithm performed is specified |
1937 | /// by the fourth parameter. |
1938 | /// |
1939 | /// Algorithm Versions. |
1940 | /// Full Warp Reduce (argument value 0): |
1941 | /// This algorithm assumes that all 32 lanes are active and gathers |
1942 | /// data from these 32 lanes, producing a single resultant value. |
1943 | /// Contiguous Partial Warp Reduce (argument value 1): |
1944 | /// This algorithm assumes that only a *contiguous* subset of lanes |
1945 | /// are active. This happens for the last warp in a parallel region |
1946 | /// when the user specified num_threads is not an integer multiple of |
1947 | /// 32. This contiguous subset always starts with the zeroth lane. |
1948 | /// Partial Warp Reduce (argument value 2): |
1949 | /// This algorithm gathers data from any number of lanes at any position. |
1950 | /// All reduced values are stored in the lowest possible lane. The set |
///   of problems every algorithm addresses is a superset of those
///   addressable by algorithms with a lower version number. Overhead
///   increases as the algorithm version increases.
1954 | /// |
1955 | /// Terminology |
1956 | /// Reduce element: |
1957 | /// Reduce element refers to the individual data field with primitive |
1958 | /// data types to be combined and reduced across threads. |
1959 | /// Reduce list: |
1960 | /// Reduce list refers to a collection of local, thread-private |
1961 | /// reduce elements. |
1962 | /// Remote Reduce list: |
1963 | /// Remote Reduce list refers to a collection of remote (relative to |
1964 | /// the current thread) reduce elements. |
1965 | /// |
1966 | /// We distinguish between three states of threads that are important to |
1967 | /// the implementation of this function. |
1968 | /// Alive threads: |
1969 | /// Threads in a warp executing the SIMT instruction, as distinguished from |
1970 | /// threads that are inactive due to divergent control flow. |
1971 | /// Active threads: |
1972 | /// The minimal set of threads that has to be alive upon entry to this |
1973 | /// function. The computation is correct iff active threads are alive. |
1974 | /// Some threads are alive but they are not active because they do not |
1975 | /// contribute to the computation in any useful manner. Turning them off |
1976 | /// may introduce control flow overheads without any tangible benefits. |
1977 | /// Effective threads: |
1978 | /// In order to comply with the argument requirements of the shuffle |
1979 | /// function, we must keep all lanes holding data alive. But at most |
1980 | /// half of them perform value aggregation; we refer to this half of |
1981 | /// threads as effective. The other half is simply handing off their |
1982 | /// data. |
1983 | /// |
1984 | /// Procedure |
1985 | /// Value shuffle: |
1986 | /// In this step active threads transfer data from higher lane positions |
1987 | /// in the warp to lower lane positions, creating Remote Reduce list. |
1988 | /// Value aggregation: |
1989 | /// In this step, effective threads combine their thread local Reduce list |
1990 | /// with Remote Reduce list and store the result in the thread local |
1991 | /// Reduce list. |
1992 | /// Value copy: |
1993 | /// In this step, we deal with the assumption made by algorithm 2 |
1994 | /// (i.e. contiguity assumption). When we have an odd number of lanes |
1995 | /// active, say 2k+1, only k threads will be effective and therefore k |
1996 | /// new values will be produced. However, the Reduce list owned by the |
1997 | /// (2k+1)th thread is ignored in the value aggregation. Therefore |
1998 | /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so |
1999 | /// that the contiguity assumption still holds. |
2000 | static llvm::Function *emitShuffleAndReduceFunction( |
2001 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2002 | QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { |
2003 | ASTContext &C = CGM.getContext(); |
2004 | |
2005 | // Thread local Reduce list used to host the values of data to be reduced. |
2006 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2007 | C.VoidPtrTy, ImplicitParamKind::Other); |
2008 | // Current lane id; could be logical. |
2009 | ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, |
2010 | ImplicitParamKind::Other); |
2011 | // Offset of the remote source lane relative to the current lane. |
2012 | ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2013 | C.ShortTy, ImplicitParamKind::Other); |
2014 | // Algorithm version. This is expected to be known at compile time. |
2015 | ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2016 | C.ShortTy, ImplicitParamKind::Other); |
2017 | FunctionArgList Args; |
2018 | Args.push_back(&ReduceListArg); |
2019 | Args.push_back(&LaneIDArg); |
2020 | Args.push_back(&RemoteLaneOffsetArg); |
2021 | Args.push_back(&AlgoVerArg); |
2022 | |
2023 | const CGFunctionInfo &CGFI = |
2024 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2025 | auto *Fn = llvm::Function::Create( |
2026 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
2027 | N: "_omp_reduction_shuffle_and_reduce_func" , M: &CGM.getModule()); |
2028 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
2029 | Fn->setDoesNotRecurse(); |
2030 | |
2031 | CodeGenFunction CGF(CGM); |
2032 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
2033 | |
2034 | CGBuilderTy &Bld = CGF.Builder; |
2035 | |
2036 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2037 | llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy); |
2038 | Address LocalReduceList( |
2039 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2040 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2041 | C.VoidPtrTy, SourceLocation()), |
2042 | ElemTy->getPointerTo()), |
2043 | ElemTy, CGF.getPointerAlign()); |
2044 | |
2045 | Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); |
2046 | llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( |
2047 | AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2048 | |
2049 | Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); |
2050 | llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( |
2051 | AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2052 | |
2053 | Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); |
2054 | llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( |
2055 | AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); |
2056 | |
2057 | // Create a local thread-private variable to host the Reduce list |
2058 | // from a remote lane. |
2059 | Address RemoteReduceList = |
2060 | CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.remote_reduce_list" ); |
2061 | |
2062 | // This loop iterates through the list of reduce elements and copies, |
2063 | // element by element, from a remote lane in the warp to RemoteReduceList, |
2064 | // hosted on the thread's stack. |
2065 | emitReductionListCopy(Action: RemoteLaneToThread, CGF, ReductionArrayTy, Privates, |
2066 | SrcBase: LocalReduceList, DestBase: RemoteReduceList, |
2067 | CopyOptions: {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, |
2068 | /*ScratchpadIndex=*/nullptr, |
2069 | /*ScratchpadWidth=*/nullptr}); |
2070 | |
  // The action to be performed on the Remote Reduce list depends on the
  // algorithm version.
2073 | // |
2074 | // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && |
2075 | // LaneId % 2 == 0 && Offset > 0): |
2076 | // do the reduction value aggregation |
2077 | // |
2078 | // The thread local variable Reduce list is mutated in place to host the |
2079 | // reduced data, which is the aggregated value produced from local and |
2080 | // remote lanes. |
2081 | // |
2082 | // Note that AlgoVer is expected to be a constant integer known at compile |
2083 | // time. |
  // When AlgoVer==0, the first conjunction evaluates to true, making
  // the entire predicate true at compile time.
  // When AlgoVer==1, only the second part of the second conjunction needs
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
  // When AlgoVer==2, only the second part of the third conjunction needs
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
2092 | llvm::Value *CondAlgo0 = Bld.CreateIsNull(Arg: AlgoVerArgVal); |
2093 | |
2094 | llvm::Value *Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1)); |
2095 | llvm::Value *CondAlgo1 = Bld.CreateAnd( |
2096 | LHS: Algo1, RHS: Bld.CreateICmpULT(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal)); |
2097 | |
2098 | llvm::Value *Algo2 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 2)); |
2099 | llvm::Value *CondAlgo2 = Bld.CreateAnd( |
2100 | LHS: Algo2, RHS: Bld.CreateIsNull(Arg: Bld.CreateAnd(LHS: LaneIDArgVal, RHS: Bld.getInt16(C: 1)))); |
2101 | CondAlgo2 = Bld.CreateAnd( |
2102 | LHS: CondAlgo2, RHS: Bld.CreateICmpSGT(LHS: RemoteLaneOffsetArgVal, RHS: Bld.getInt16(C: 0))); |
2103 | |
2104 | llvm::Value *CondReduce = Bld.CreateOr(LHS: CondAlgo0, RHS: CondAlgo1); |
2105 | CondReduce = Bld.CreateOr(LHS: CondReduce, RHS: CondAlgo2); |
2106 | |
2107 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then" ); |
2108 | llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else" ); |
2109 | llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont" ); |
2110 | Bld.CreateCondBr(Cond: CondReduce, True: ThenBB, False: ElseBB); |
2111 | |
2112 | CGF.EmitBlock(BB: ThenBB); |
2113 | // reduce_function(LocalReduceList, RemoteReduceList) |
2114 | llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2115 | V: LocalReduceList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy); |
2116 | llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2117 | V: RemoteReduceList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy); |
2118 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2119 | CGF, Loc, OutlinedFn: ReduceFn, Args: {LocalReduceListPtr, RemoteReduceListPtr}); |
2120 | Bld.CreateBr(Dest: MergeBB); |
2121 | |
2122 | CGF.EmitBlock(BB: ElseBB); |
2123 | Bld.CreateBr(Dest: MergeBB); |
2124 | |
2125 | CGF.EmitBlock(BB: MergeBB); |
2126 | |
2127 | // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local |
2128 | // Reduce list. |
2129 | Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1)); |
2130 | llvm::Value *CondCopy = Bld.CreateAnd( |
2131 | LHS: Algo1, RHS: Bld.CreateICmpUGE(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal)); |
2132 | |
2133 | llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock(name: "then" ); |
2134 | llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock(name: "else" ); |
2135 | llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock(name: "ifcont" ); |
2136 | Bld.CreateCondBr(Cond: CondCopy, True: CpyThenBB, False: CpyElseBB); |
2137 | |
2138 | CGF.EmitBlock(BB: CpyThenBB); |
2139 | emitReductionListCopy(Action: ThreadCopy, CGF, ReductionArrayTy, Privates, |
2140 | SrcBase: RemoteReduceList, DestBase: LocalReduceList); |
2141 | Bld.CreateBr(Dest: CpyMergeBB); |
2142 | |
2143 | CGF.EmitBlock(BB: CpyElseBB); |
2144 | Bld.CreateBr(Dest: CpyMergeBB); |
2145 | |
2146 | CGF.EmitBlock(BB: CpyMergeBB); |
2147 | |
2148 | CGF.FinishFunction(); |
2149 | return Fn; |
2150 | } |
2151 | |
2152 | /// This function emits a helper that copies all the reduction variables from |
2153 | /// the team into the provided global buffer for the reduction variables. |
2154 | /// |
2155 | /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) |
2156 | /// For all data entries D in reduce_data: |
2157 | /// Copy local D to buffer.D[Idx] |
2158 | static llvm::Value *emitListToGlobalCopyFunction( |
2159 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2160 | QualType ReductionArrayTy, SourceLocation Loc, |
2161 | const RecordDecl *TeamReductionRec, |
2162 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2163 | &VarFieldMap) { |
2164 | ASTContext &C = CGM.getContext(); |
2165 | |
2166 | // Buffer: global reduction buffer. |
2167 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2168 | C.VoidPtrTy, ImplicitParamKind::Other); |
2169 | // Idx: index of the buffer. |
2170 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2171 | ImplicitParamKind::Other); |
2172 | // ReduceList: thread local Reduce list. |
2173 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2174 | C.VoidPtrTy, ImplicitParamKind::Other); |
2175 | FunctionArgList Args; |
2176 | Args.push_back(&BufferArg); |
2177 | Args.push_back(&IdxArg); |
2178 | Args.push_back(&ReduceListArg); |
2179 | |
2180 | const CGFunctionInfo &CGFI = |
2181 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2182 | auto *Fn = llvm::Function::Create( |
2183 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
2184 | N: "_omp_reduction_list_to_global_copy_func" , M: &CGM.getModule()); |
2185 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
2186 | Fn->setDoesNotRecurse(); |
2187 | CodeGenFunction CGF(CGM); |
2188 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
2189 | |
2190 | CGBuilderTy &Bld = CGF.Builder; |
2191 | |
2192 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2193 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2194 | llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy); |
2195 | Address LocalReduceList( |
2196 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2197 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2198 | C.VoidPtrTy, Loc), |
2199 | ElemTy->getPointerTo()), |
2200 | ElemTy, CGF.getPointerAlign()); |
2201 | QualType StaticTy = C.getRecordType(Decl: TeamReductionRec); |
2202 | llvm::Type *LLVMReductionsBufferTy = |
2203 | CGM.getTypes().ConvertTypeForMem(T: StaticTy); |
2204 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2205 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2206 | LLVMReductionsBufferTy->getPointerTo()); |
2207 | llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2208 | /*Volatile=*/false, C.IntTy, |
2209 | Loc)}; |
2210 | unsigned Idx = 0; |
2211 | for (const Expr *Private : Privates) { |
2212 | // Reduce element = LocalReduceList[i] |
2213 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx); |
2214 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
2215 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
2216 | // elemptr = ((CopyType*)(elemptrptr)) + I |
2217 | ElemTy = CGF.ConvertTypeForMem(T: Private->getType()); |
2218 | ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2219 | V: ElemPtrPtr, DestTy: ElemTy->getPointerTo()); |
2220 | Address ElemPtr = |
2221 | Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType())); |
2222 | const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl(); |
2223 | // Global = Buffer.VD[Idx]; |
2224 | const FieldDecl *FD = VarFieldMap.lookup(Val: VD); |
2225 | llvm::Value *BufferPtr = |
2226 | Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); |
2227 | LValue GlobLVal = CGF.EmitLValueForField( |
2228 | Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD); |
2229 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2230 | GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF), |
2231 | CGF.ConvertTypeForMem(T: Private->getType()), |
2232 | GlobAddr.getAlignment())); |
2233 | switch (CGF.getEvaluationKind(T: Private->getType())) { |
2234 | case TEK_Scalar: { |
2235 | llvm::Value *V = CGF.EmitLoadOfScalar( |
2236 | Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(), Loc, |
2237 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo()); |
2238 | CGF.EmitStoreOfScalar(value: V, lvalue: GlobLVal); |
2239 | break; |
2240 | } |
2241 | case TEK_Complex: { |
2242 | CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( |
2243 | src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), loc: Loc); |
2244 | CGF.EmitStoreOfComplex(V, dest: GlobLVal, /*isInit=*/false); |
2245 | break; |
2246 | } |
2247 | case TEK_Aggregate: |
2248 | CGF.EmitAggregateCopy(Dest: GlobLVal, |
2249 | Src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), |
2250 | EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap); |
2251 | break; |
2252 | } |
2253 | ++Idx; |
2254 | } |
2255 | |
2256 | CGF.FinishFunction(); |
2257 | return Fn; |
2258 | } |
2259 | |
2260 | /// This function emits a helper that reduces all the reduction variables from |
2261 | /// the team into the provided global buffer for the reduction variables. |
2262 | /// |
2263 | /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) |
2264 | /// void *GlobPtrs[]; |
2265 | /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; |
2266 | /// ... |
2267 | /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; |
2268 | /// reduce_function(GlobPtrs, reduce_data); |
2269 | static llvm::Value *emitListToGlobalReduceFunction( |
2270 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2271 | QualType ReductionArrayTy, SourceLocation Loc, |
2272 | const RecordDecl *TeamReductionRec, |
2273 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2274 | &VarFieldMap, |
2275 | llvm::Function *ReduceFn) { |
2276 | ASTContext &C = CGM.getContext(); |
2277 | |
2278 | // Buffer: global reduction buffer. |
2279 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2280 | C.VoidPtrTy, ImplicitParamKind::Other); |
2281 | // Idx: index of the buffer. |
2282 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2283 | ImplicitParamKind::Other); |
2284 | // ReduceList: thread local Reduce list. |
2285 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2286 | C.VoidPtrTy, ImplicitParamKind::Other); |
2287 | FunctionArgList Args; |
2288 | Args.push_back(&BufferArg); |
2289 | Args.push_back(&IdxArg); |
2290 | Args.push_back(&ReduceListArg); |
2291 | |
2292 | const CGFunctionInfo &CGFI = |
2293 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2294 | auto *Fn = llvm::Function::Create( |
2295 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
2296 | N: "_omp_reduction_list_to_global_reduce_func" , M: &CGM.getModule()); |
2297 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
2298 | Fn->setDoesNotRecurse(); |
2299 | CodeGenFunction CGF(CGM); |
2300 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
2301 | |
2302 | CGBuilderTy &Bld = CGF.Builder; |
2303 | |
2304 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2305 | QualType StaticTy = C.getRecordType(Decl: TeamReductionRec); |
2306 | llvm::Type *LLVMReductionsBufferTy = |
2307 | CGM.getTypes().ConvertTypeForMem(T: StaticTy); |
2308 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2309 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2310 | LLVMReductionsBufferTy->getPointerTo()); |
2311 | |
2312 | // 1. Build a list of reduction variables. |
2313 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
2314 | RawAddress ReductionList = |
2315 | CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list" ); |
2316 | auto IPriv = Privates.begin(); |
2317 | llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2318 | /*Volatile=*/false, C.IntTy, |
2319 | Loc)}; |
2320 | unsigned Idx = 0; |
2321 | for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { |
2322 | Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2323 | // Global = Buffer.VD[Idx]; |
2324 | const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl(); |
2325 | const FieldDecl *FD = VarFieldMap.lookup(Val: VD); |
2326 | llvm::Value *BufferPtr = |
2327 | Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); |
2328 | LValue GlobLVal = CGF.EmitLValueForField( |
2329 | Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD); |
2330 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2331 | CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem, |
2332 | /*Volatile=*/false, C.VoidPtrTy); |
2333 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
2334 | // Store array size. |
2335 | ++Idx; |
2336 | Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2337 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
2338 | V: CGF.getVLASize( |
2339 | vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType())) |
2340 | .NumElts, |
2341 | DestTy: CGF.SizeTy, /*isSigned=*/false); |
2342 | CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy), |
2343 | Addr: Elem); |
2344 | } |
2345 | } |
2346 | |
2347 | // Call reduce_function(GlobalReduceList, ReduceList) |
2348 | llvm::Value *GlobalReduceList = ReductionList.getPointer(); |
2349 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2350 | llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( |
2351 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); |
2352 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2353 | CGF, Loc, OutlinedFn: ReduceFn, Args: {GlobalReduceList, ReducedPtr}); |
2354 | CGF.FinishFunction(); |
2355 | return Fn; |
2356 | } |
2357 | |
/// This function emits a helper that copies all the reduction variables from
/// the provided global buffer into the team's reduce_data list.
///
/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2362 | /// For all data entries D in reduce_data: |
2363 | /// Copy buffer.D[Idx] to local D; |
2364 | static llvm::Value *emitGlobalToListCopyFunction( |
2365 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2366 | QualType ReductionArrayTy, SourceLocation Loc, |
2367 | const RecordDecl *TeamReductionRec, |
2368 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2369 | &VarFieldMap) { |
2370 | ASTContext &C = CGM.getContext(); |
2371 | |
2372 | // Buffer: global reduction buffer. |
2373 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2374 | C.VoidPtrTy, ImplicitParamKind::Other); |
2375 | // Idx: index of the buffer. |
2376 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2377 | ImplicitParamKind::Other); |
2378 | // ReduceList: thread local Reduce list. |
2379 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2380 | C.VoidPtrTy, ImplicitParamKind::Other); |
2381 | FunctionArgList Args; |
2382 | Args.push_back(&BufferArg); |
2383 | Args.push_back(&IdxArg); |
2384 | Args.push_back(&ReduceListArg); |
2385 | |
2386 | const CGFunctionInfo &CGFI = |
2387 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2388 | auto *Fn = llvm::Function::Create( |
2389 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
2390 | N: "_omp_reduction_global_to_list_copy_func" , M: &CGM.getModule()); |
2391 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
2392 | Fn->setDoesNotRecurse(); |
2393 | CodeGenFunction CGF(CGM); |
2394 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
2395 | |
2396 | CGBuilderTy &Bld = CGF.Builder; |
2397 | |
2398 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2399 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2400 | llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy); |
2401 | Address LocalReduceList( |
2402 | Bld.CreatePointerBitCastOrAddrSpaceCast( |
2403 | CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, |
2404 | C.VoidPtrTy, Loc), |
2405 | ElemTy->getPointerTo()), |
2406 | ElemTy, CGF.getPointerAlign()); |
2407 | QualType StaticTy = C.getRecordType(Decl: TeamReductionRec); |
2408 | llvm::Type *LLVMReductionsBufferTy = |
2409 | CGM.getTypes().ConvertTypeForMem(T: StaticTy); |
2410 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2411 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2412 | LLVMReductionsBufferTy->getPointerTo()); |
2413 | |
2414 | llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2415 | /*Volatile=*/false, C.IntTy, |
2416 | Loc)}; |
2417 | unsigned Idx = 0; |
2418 | for (const Expr *Private : Privates) { |
2419 | // Reduce element = LocalReduceList[i] |
2420 | Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx); |
2421 | llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( |
2422 | ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); |
2423 | // elemptr = ((CopyType*)(elemptrptr)) + I |
2424 | ElemTy = CGF.ConvertTypeForMem(T: Private->getType()); |
2425 | ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2426 | V: ElemPtrPtr, DestTy: ElemTy->getPointerTo()); |
2427 | Address ElemPtr = |
2428 | Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType())); |
2429 | const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl(); |
2430 | // Global = Buffer.VD[Idx]; |
2431 | const FieldDecl *FD = VarFieldMap.lookup(Val: VD); |
2432 | llvm::Value *BufferPtr = |
2433 | Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); |
2434 | LValue GlobLVal = CGF.EmitLValueForField( |
2435 | Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD); |
2436 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2437 | GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF), |
2438 | CGF.ConvertTypeForMem(T: Private->getType()), |
2439 | GlobAddr.getAlignment())); |
2440 | switch (CGF.getEvaluationKind(T: Private->getType())) { |
2441 | case TEK_Scalar: { |
2442 | llvm::Value *V = CGF.EmitLoadOfScalar(lvalue: GlobLVal, Loc); |
2443 | CGF.EmitStoreOfScalar(Value: V, Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(), |
2444 | BaseInfo: LValueBaseInfo(AlignmentSource::Type), |
2445 | TBAAInfo: TBAAAccessInfo()); |
2446 | break; |
2447 | } |
2448 | case TEK_Complex: { |
2449 | CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(src: GlobLVal, loc: Loc); |
2450 | CGF.EmitStoreOfComplex(V, dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), |
2451 | /*isInit=*/false); |
2452 | break; |
2453 | } |
2454 | case TEK_Aggregate: |
2455 | CGF.EmitAggregateCopy(Dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), |
2456 | Src: GlobLVal, EltTy: Private->getType(), |
2457 | MayOverlap: AggValueSlot::DoesNotOverlap); |
2458 | break; |
2459 | } |
2460 | ++Idx; |
2461 | } |
2462 | |
2463 | CGF.FinishFunction(); |
2464 | return Fn; |
2465 | } |
2466 | |
/// This function emits a helper that reduces the data in the provided global
/// buffer into the team's reduce_data list.
2469 | /// |
2470 | /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) |
2471 | /// void *GlobPtrs[]; |
2472 | /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; |
2473 | /// ... |
2474 | /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; |
2475 | /// reduce_function(reduce_data, GlobPtrs); |
2476 | static llvm::Value *emitGlobalToListReduceFunction( |
2477 | CodeGenModule &CGM, ArrayRef<const Expr *> Privates, |
2478 | QualType ReductionArrayTy, SourceLocation Loc, |
2479 | const RecordDecl *TeamReductionRec, |
2480 | const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> |
2481 | &VarFieldMap, |
2482 | llvm::Function *ReduceFn) { |
2483 | ASTContext &C = CGM.getContext(); |
2484 | |
2485 | // Buffer: global reduction buffer. |
2486 | ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2487 | C.VoidPtrTy, ImplicitParamKind::Other); |
2488 | // Idx: index of the buffer. |
2489 | ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, |
2490 | ImplicitParamKind::Other); |
2491 | // ReduceList: thread local Reduce list. |
2492 | ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, |
2493 | C.VoidPtrTy, ImplicitParamKind::Other); |
2494 | FunctionArgList Args; |
2495 | Args.push_back(&BufferArg); |
2496 | Args.push_back(&IdxArg); |
2497 | Args.push_back(&ReduceListArg); |
2498 | |
2499 | const CGFunctionInfo &CGFI = |
2500 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); |
2501 | auto *Fn = llvm::Function::Create( |
2502 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
2503 | N: "_omp_reduction_global_to_list_reduce_func" , M: &CGM.getModule()); |
2504 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
2505 | Fn->setDoesNotRecurse(); |
2506 | CodeGenFunction CGF(CGM); |
2507 | CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc); |
2508 | |
2509 | CGBuilderTy &Bld = CGF.Builder; |
2510 | |
2511 | Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); |
2512 | QualType StaticTy = C.getRecordType(Decl: TeamReductionRec); |
2513 | llvm::Type *LLVMReductionsBufferTy = |
2514 | CGM.getTypes().ConvertTypeForMem(T: StaticTy); |
2515 | llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( |
2516 | CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), |
2517 | LLVMReductionsBufferTy->getPointerTo()); |
2518 | |
2519 | // 1. Build a list of reduction variables. |
2520 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
2521 | Address ReductionList = |
2522 | CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list" ); |
2523 | auto IPriv = Privates.begin(); |
2524 | llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), |
2525 | /*Volatile=*/false, C.IntTy, |
2526 | Loc)}; |
2527 | unsigned Idx = 0; |
2528 | for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { |
2529 | Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2530 | // Global = Buffer.VD[Idx]; |
2531 | const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl(); |
2532 | const FieldDecl *FD = VarFieldMap.lookup(Val: VD); |
2533 | llvm::Value *BufferPtr = |
2534 | Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs); |
2535 | LValue GlobLVal = CGF.EmitLValueForField( |
2536 | Base: CGF.MakeNaturalAlignRawAddrLValue(V: BufferPtr, T: StaticTy), Field: FD); |
2537 | Address GlobAddr = GlobLVal.getAddress(CGF); |
2538 | CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem, |
2539 | /*Volatile=*/false, C.VoidPtrTy); |
2540 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
2541 | // Store array size. |
2542 | ++Idx; |
2543 | Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2544 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
2545 | V: CGF.getVLASize( |
2546 | vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType())) |
2547 | .NumElts, |
2548 | DestTy: CGF.SizeTy, /*isSigned=*/false); |
2549 | CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy), |
2550 | Addr: Elem); |
2551 | } |
2552 | } |
2553 | |
2554 | // Call reduce_function(ReduceList, GlobalReduceList) |
2555 | llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF); |
2556 | Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); |
2557 | llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( |
2558 | AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); |
2559 | CGM.getOpenMPRuntime().emitOutlinedFunctionCall( |
2560 | CGF, Loc, OutlinedFn: ReduceFn, Args: {ReducedPtr, GlobalReduceList}); |
2561 | CGF.FinishFunction(); |
2562 | return Fn; |
2563 | } |
2564 | |
2565 | /// |
2566 | /// Design of OpenMP reductions on the GPU |
2567 | /// |
2568 | /// Consider a typical OpenMP program with one or more reduction |
2569 | /// clauses: |
2570 | /// |
2571 | /// float foo; |
2572 | /// double bar; |
2573 | /// #pragma omp target teams distribute parallel for \ |
2574 | /// reduction(+:foo) reduction(*:bar) |
2575 | /// for (int i = 0; i < N; i++) { |
2576 | /// foo += A[i]; bar *= B[i]; |
2577 | /// } |
2578 | /// |
2579 | /// where 'foo' and 'bar' are reduced across all OpenMP threads in |
2580 | /// all teams. In our OpenMP implementation on the NVPTX device an |
2581 | /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads |
2582 | /// within a team are mapped to CUDA threads within a threadblock. |
2583 | /// Our goal is to efficiently aggregate values across all OpenMP |
2584 | /// threads such that: |
2585 | /// |
2586 | /// - the compiler and runtime are logically concise, and |
2587 | /// - the reduction is performed efficiently in a hierarchical |
2588 | /// manner as follows: within OpenMP threads in the same warp, |
2589 | /// across warps in a threadblock, and finally across teams on |
2590 | /// the NVPTX device. |
2591 | /// |
2592 | /// Introduction to Decoupling |
2593 | /// |
2594 | /// We would like to decouple the compiler and the runtime so that the |
2595 | /// latter is ignorant of the reduction variables (number, data types) |
2596 | /// and the reduction operators. This allows a simpler interface |
2597 | /// and implementation while still attaining good performance. |
2598 | /// |
2599 | /// Pseudocode for the aforementioned OpenMP program generated by the |
2600 | /// compiler is as follows: |
2601 | /// |
2602 | /// 1. Create private copies of reduction variables on each OpenMP |
2603 | /// thread: 'foo_private', 'bar_private' |
2604 | /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned |
2605 | /// to it and writes the result in 'foo_private' and 'bar_private' |
2606 | /// respectively. |
2607 | /// 3. Call the OpenMP runtime on the GPU to reduce within a team |
2608 | /// and store the result on the team master: |
2609 | /// |
2610 | /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., |
2611 | /// reduceData, shuffleReduceFn, interWarpCpyFn) |
2612 | /// |
2613 | /// where: |
2614 | /// struct ReduceData { |
2615 | /// double *foo; |
2616 | /// double *bar; |
2617 | /// } reduceData |
2618 | /// reduceData.foo = &foo_private |
2619 | /// reduceData.bar = &bar_private |
2620 | /// |
2621 | /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two |
2622 | /// auxiliary functions generated by the compiler that operate on |
2623 | /// variables of type 'ReduceData'. They aid the runtime perform |
2624 | /// algorithmic steps in a data agnostic manner. |
2625 | /// |
2626 | /// 'shuffleReduceFn' is a pointer to a function that reduces data |
2627 | /// of type 'ReduceData' across two OpenMP threads (lanes) in the |
2628 | /// same warp. It takes the following arguments as input: |
2629 | /// |
2630 | /// a. variable of type 'ReduceData' on the calling lane, |
2631 | /// b. its lane_id, |
2632 | /// c. an offset relative to the current lane_id to generate a |
2633 | /// remote_lane_id. The remote lane contains the second |
2634 | /// variable of type 'ReduceData' that is to be reduced. |
2635 | /// d. an algorithm version parameter determining which reduction |
2636 | /// algorithm to use. |
2637 | /// |
2638 | /// 'shuffleReduceFn' retrieves data from the remote lane using |
2639 | /// efficient GPU shuffle intrinsics and reduces, using the |
2640 | /// algorithm specified by the 4th parameter, the two operands |
2641 | /// element-wise. The result is written to the first operand. |
2642 | /// |
2643 | /// Different reduction algorithms are implemented in different |
2644 | /// runtime functions, all calling 'shuffleReduceFn' to perform |
2645 | /// the essential reduction step. Therefore, based on the 4th |
2646 | /// parameter, this function behaves slightly differently to |
2647 | /// cooperate with the runtime to ensure correctness under |
2648 | /// different circumstances. |
2649 | /// |
2650 | /// 'InterWarpCpyFn' is a pointer to a function that transfers |
2651 | /// reduced variables across warps. It tunnels, through CUDA |
2652 | /// shared memory, the thread-private data of type 'ReduceData' |
2653 | /// from lane 0 of each warp to a lane in the first warp. |
2654 | /// 4. Call the OpenMP runtime on the GPU to reduce across teams. |
2655 | /// The last team writes the global reduced value to memory. |
2656 | /// |
2657 | /// ret = __kmpc_nvptx_teams_reduce_nowait(..., |
2658 | /// reduceData, shuffleReduceFn, interWarpCpyFn, |
2659 | /// scratchpadCopyFn, loadAndReduceFn) |
2660 | /// |
2661 | /// 'scratchpadCopyFn' is a helper that stores reduced |
2662 | /// data from the team master to a scratchpad array in |
2663 | /// global memory. |
2664 | /// |
2665 | /// 'loadAndReduceFn' is a helper that loads data from |
2666 | /// the scratchpad array and reduces it with the input |
2667 | /// operand. |
2668 | /// |
2669 | /// These compiler generated functions hide address |
2670 | /// calculation and alignment information from the runtime. |
2671 | /// 5. if ret == 1: |
2672 | /// The team master of the last team stores the reduced |
2673 | /// result to the globals in memory. |
2674 | /// foo += reduceData.foo; bar *= reduceData.bar |
2675 | /// |
2676 | /// |
2677 | /// Warp Reduction Algorithms |
2678 | /// |
2679 | /// On the warp level, we have three algorithms implemented in the |
2680 | /// OpenMP runtime depending on the number of active lanes: |
2681 | /// |
2682 | /// Full Warp Reduction |
2683 | /// |
2684 | /// The reduce algorithm within a warp where all lanes are active |
2685 | /// is implemented in the runtime as follows: |
2686 | /// |
2687 | /// full_warp_reduce(void *reduce_data, |
2688 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
2689 | /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) |
2690 | /// ShuffleReduceFn(reduce_data, 0, offset, 0); |
2691 | /// } |
2692 | /// |
2693 | /// The algorithm completes in log(2, WARPSIZE) steps. |
2694 | /// |
2695 | /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is |
2696 | /// not used therefore we save instructions by not retrieving lane_id |
2697 | /// from the corresponding special registers. The 4th parameter, which |
2698 | /// represents the version of the algorithm being used, is set to 0 to |
2699 | /// signify full warp reduction. |
2700 | /// |
2701 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
2702 | /// |
2703 | /// #reduce_elem refers to an element in the local lane's data structure |
2704 | /// #remote_elem is retrieved from a remote lane |
2705 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
2706 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem; |
2707 | /// |
2708 | /// Contiguous Partial Warp Reduction |
2709 | /// |
2710 | /// This reduce algorithm is used within a warp where only the first |
2711 | /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the |
2712 | /// number of OpenMP threads in a parallel region is not a multiple of |
2713 | /// WARPSIZE. The algorithm is implemented in the runtime as follows: |
2714 | /// |
2715 | /// void |
2716 | /// contiguous_partial_reduce(void *reduce_data, |
2717 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn, |
2718 | /// int size, int lane_id) { |
2719 | /// int curr_size; |
2720 | /// int offset; |
2721 | /// curr_size = size; |
2722 | /// mask = curr_size/2; |
2723 | /// while (offset>0) { |
2724 | /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); |
2725 | /// curr_size = (curr_size+1)/2; |
2726 | /// offset = curr_size/2; |
2727 | /// } |
2728 | /// } |
2729 | /// |
2730 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
2731 | /// |
2732 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
2733 | /// if (lane_id < offset) |
2734 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
2735 | /// else |
2736 | /// reduce_elem = remote_elem |
2737 | /// |
2738 | /// This algorithm assumes that the data to be reduced are located in a |
2739 | /// contiguous subset of lanes starting from the first. When there is |
2740 | /// an odd number of active lanes, the data in the last lane is not |
2741 | /// aggregated with any other lane's dat but is instead copied over. |
2742 | /// |
2743 | /// Dispersed Partial Warp Reduction |
2744 | /// |
2745 | /// This algorithm is used within a warp when any discontiguous subset of |
2746 | /// lanes are active. It is used to implement the reduction operation |
2747 | /// across lanes in an OpenMP simd region or in a nested parallel region. |
2748 | /// |
2749 | /// void |
2750 | /// dispersed_partial_reduce(void *reduce_data, |
2751 | /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { |
2752 | /// int size, remote_id; |
2753 | /// int logical_lane_id = number_of_active_lanes_before_me() * 2; |
2754 | /// do { |
2755 | /// remote_id = next_active_lane_id_right_after_me(); |
2756 | /// # the above function returns 0 of no active lane |
2757 | /// # is present right after the current lane. |
2758 | /// size = number_of_active_lanes_in_this_warp(); |
2759 | /// logical_lane_id /= 2; |
2760 | /// ShuffleReduceFn(reduce_data, logical_lane_id, |
2761 | /// remote_id-1-threadIdx.x, 2); |
2762 | /// } while (logical_lane_id % 2 == 0 && size > 1); |
2763 | /// } |
2764 | /// |
2765 | /// There is no assumption made about the initial state of the reduction. |
2766 | /// Any number of lanes (>=1) could be active at any position. The reduction |
2767 | /// result is returned in the first active lane. |
2768 | /// |
2769 | /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: |
2770 | /// |
2771 | /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); |
2772 | /// if (lane_id % 2 == 0 && offset > 0) |
2773 | /// reduce_elem = reduce_elem REDUCE_OP remote_elem |
2774 | /// else |
2775 | /// reduce_elem = remote_elem |
2776 | /// |
2777 | /// |
2778 | /// Intra-Team Reduction |
2779 | /// |
2780 | /// This function, as implemented in the runtime call |
2781 | /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP |
2782 | /// threads in a team. It first reduces within a warp using the |
2783 | /// aforementioned algorithms. We then proceed to gather all such |
2784 | /// reduced values at the first warp. |
2785 | /// |
2786 | /// The runtime makes use of the function 'InterWarpCpyFn', which copies |
2787 | /// data from each of the "warp master" (zeroth lane of each warp, where |
2788 | /// warp-reduced data is held) to the zeroth warp. This step reduces (in |
2789 | /// a mathematical sense) the problem of reduction across warp masters in |
2790 | /// a block to the problem of warp reduction. |
2791 | /// |
2792 | /// |
2793 | /// Inter-Team Reduction |
2794 | /// |
2795 | /// Once a team has reduced its data to a single value, it is stored in |
2796 | /// a global scratchpad array. Since each team has a distinct slot, this |
2797 | /// can be done without locking. |
2798 | /// |
2799 | /// The last team to write to the scratchpad array proceeds to reduce the |
2800 | /// scratchpad array. One or more workers in the last team use the helper |
2801 | /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., |
2802 | /// the k'th worker reduces every k'th element. |
2803 | /// |
2804 | /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to |
2805 | /// reduce across workers and compute a globally reduced value. |
2806 | /// |
2807 | void CGOpenMPRuntimeGPU::emitReduction( |
2808 | CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, |
2809 | ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, |
2810 | ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { |
2811 | if (!CGF.HaveInsertPoint()) |
2812 | return; |
2813 | |
2814 | bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); |
2815 | #ifndef NDEBUG |
2816 | bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); |
2817 | #endif |
2818 | |
2819 | if (Options.SimpleReduction) { |
2820 | assert(!TeamsReduction && !ParallelReduction && |
2821 | "Invalid reduction selection in emitReduction." ); |
2822 | CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, |
2823 | ReductionOps, Options); |
2824 | return; |
2825 | } |
2826 | |
2827 | assert((TeamsReduction || ParallelReduction) && |
2828 | "Invalid reduction selection in emitReduction." ); |
2829 | |
2830 | llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; |
2831 | llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); |
2832 | int Cnt = 0; |
2833 | for (const Expr *DRE : Privates) { |
2834 | PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl(); |
2835 | ++Cnt; |
2836 | } |
2837 | |
2838 | ASTContext &C = CGM.getContext(); |
2839 | const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( |
2840 | C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1); |
2841 | |
  // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <size>, RedList,
  //                 shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <buffer>,
  //                 <num_bufs>, <size>, RedList, shuffle_reduce_func,
  //                 interwarp_copy_func, list_to_global_copy_func,
  //                 list_to_global_reduce_func, global_to_list_copy_func,
  //                 global_to_list_reduce_func);
2846 | llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); |
2847 | |
2848 | llvm::Value *Res; |
2849 | // 1. Build a list of reduction variables. |
2850 | // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; |
2851 | auto Size = RHSExprs.size(); |
2852 | for (const Expr *E : Privates) { |
2853 | if (E->getType()->isVariablyModifiedType()) |
2854 | // Reserve place for array size. |
2855 | ++Size; |
2856 | } |
2857 | llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); |
2858 | QualType ReductionArrayTy = C.getConstantArrayType( |
2859 | EltTy: C.VoidPtrTy, ArySize: ArraySize, SizeExpr: nullptr, ASM: ArraySizeModifier::Normal, |
2860 | /*IndexTypeQuals=*/0); |
2861 | Address ReductionList = |
2862 | CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list" ); |
2863 | auto IPriv = Privates.begin(); |
2864 | unsigned Idx = 0; |
2865 | for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { |
2866 | Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2867 | CGF.Builder.CreateStore( |
2868 | Val: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
2869 | V: CGF.EmitLValue(E: RHSExprs[I]).getPointer(CGF), DestTy: CGF.VoidPtrTy), |
2870 | Addr: Elem); |
2871 | if ((*IPriv)->getType()->isVariablyModifiedType()) { |
2872 | // Store array size. |
2873 | ++Idx; |
2874 | Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx); |
2875 | llvm::Value *Size = CGF.Builder.CreateIntCast( |
2876 | V: CGF.getVLASize( |
2877 | vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType())) |
2878 | .NumElts, |
2879 | DestTy: CGF.SizeTy, /*isSigned=*/false); |
2880 | CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy), |
2881 | Addr: Elem); |
2882 | } |
2883 | } |
2884 | |
2885 | llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
2886 | V: ReductionList.emitRawPointer(CGF), DestTy: CGF.VoidPtrTy); |
2887 | llvm::Function *ReductionFn = emitReductionFunction( |
2888 | CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(T: ReductionArrayTy), |
2889 | Privates, LHSExprs, RHSExprs, ReductionOps); |
2890 | llvm::Value *ReductionDataSize = |
2891 | CGF.getTypeSize(Ty: C.getRecordType(Decl: ReductionRec)); |
2892 | ReductionDataSize = |
2893 | CGF.Builder.CreateSExtOrTrunc(V: ReductionDataSize, DestTy: CGF.Int64Ty); |
2894 | llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( |
2895 | CGM, Privates, ReductionArrayTy, ReduceFn: ReductionFn, Loc); |
2896 | llvm::Value *InterWarpCopyFn = |
2897 | emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); |
2898 | |
2899 | if (ParallelReduction) { |
2900 | llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn, |
2901 | InterWarpCopyFn}; |
2902 | |
2903 | Res = CGF.EmitRuntimeCall( |
2904 | callee: OMPBuilder.getOrCreateRuntimeFunction( |
2905 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2), |
2906 | args: Args); |
2907 | } else { |
2908 | assert(TeamsReduction && "expected teams reduction." ); |
2909 | TeamsReductions.push_back(Elt: ReductionRec); |
2910 | auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall( |
2911 | callee: OMPBuilder.getOrCreateRuntimeFunction( |
2912 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_reduction_get_fixed_buffer), |
2913 | args: {}, name: "_openmp_teams_reductions_buffer_$_$ptr" ); |
2914 | llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( |
2915 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap); |
2916 | llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( |
2917 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap, |
2918 | ReduceFn: ReductionFn); |
2919 | llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( |
2920 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap); |
2921 | llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( |
2922 | CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap, |
2923 | ReduceFn: ReductionFn); |
2924 | |
2925 | llvm::Value *Args[] = { |
2926 | RTLoc, |
2927 | KernelTeamsReductionPtr, |
2928 | CGF.Builder.getInt32(C: C.getLangOpts().OpenMPCUDAReductionBufNum), |
2929 | ReductionDataSize, |
2930 | RL, |
2931 | ShuffleAndReduceFn, |
2932 | InterWarpCopyFn, |
2933 | GlobalToBufferCpyFn, |
2934 | GlobalToBufferRedFn, |
2935 | BufferToGlobalCpyFn, |
2936 | BufferToGlobalRedFn}; |
2937 | |
2938 | Res = CGF.EmitRuntimeCall( |
2939 | callee: OMPBuilder.getOrCreateRuntimeFunction( |
2940 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2), |
2941 | args: Args); |
2942 | } |
2943 | |
2944 | // 5. Build if (res == 1) |
2945 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".omp.reduction.done" ); |
2946 | llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".omp.reduction.then" ); |
2947 | llvm::Value *Cond = CGF.Builder.CreateICmpEQ( |
2948 | LHS: Res, RHS: llvm::ConstantInt::get(Ty: CGM.Int32Ty, /*V=*/1)); |
2949 | CGF.Builder.CreateCondBr(Cond, True: ThenBB, False: ExitBB); |
2950 | |
  // 6. Build then branch: where we have reduced values in the master
  //    thread in each team.
  CGF.EmitBlock(BB: ThenBB);

  // Emit the final combiners: <LHS> = <LHS> op <RHS> for each reduction
  // variable, using the combiner expressions in 'ReductionOps'.
2958 | auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, |
2959 | this](CodeGenFunction &CGF, PrePostActionTy &Action) { |
2960 | auto IPriv = Privates.begin(); |
2961 | auto ILHS = LHSExprs.begin(); |
2962 | auto IRHS = RHSExprs.begin(); |
2963 | for (const Expr *E : ReductionOps) { |
2964 | emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(Val: *ILHS), |
2965 | cast<DeclRefExpr>(Val: *IRHS)); |
2966 | ++IPriv; |
2967 | ++ILHS; |
2968 | ++IRHS; |
2969 | } |
2970 | }; |
2971 | RegionCodeGenTy RCG(CodeGen); |
2972 | RCG(CGF); |
2973 | // There is no need to emit line number for unconditional branch. |
2974 | (void)ApplyDebugLocation::CreateEmpty(CGF); |
2975 | CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true); |
2976 | } |
2977 | |
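/// Translate a reference parameter of the native outlined function into the
/// pointer form expected on the device. A rough sketch of the effect
/// (illustrative names; the '__global' qualifier applies only to map-captured
/// fields):
///
///   // native:  void outlined(int &x);
///   // target:  void outlined(__global int *__restrict x);
///   //          (the pointer itself is placed in the NVPTX local
///   //           address space)
///
/// Parameters of non-reference type are returned unchanged.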
2978 | const VarDecl * |
2979 | CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, |
2980 | const VarDecl *NativeParam) const { |
2981 | if (!NativeParam->getType()->isReferenceType()) |
2982 | return NativeParam; |
2983 | QualType ArgType = NativeParam->getType(); |
2984 | QualifierCollector QC; |
2985 | const Type *NonQualTy = QC.strip(type: ArgType); |
2986 | QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType(); |
2987 | if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { |
2988 | if (Attr->getCaptureKind() == OMPC_map) { |
2989 | PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy, |
2990 | AddressSpace: LangAS::opencl_global); |
2991 | } |
2992 | } |
2993 | ArgType = CGM.getContext().getPointerType(T: PointeeTy); |
2994 | QC.addRestrict(); |
2995 | enum { NVPTX_local_addr = 5 }; |
2996 | QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr)); |
2997 | ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType); |
2998 | if (isa<ImplicitParamDecl>(Val: NativeParam)) |
2999 | return ImplicitParamDecl::Create( |
3000 | CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), |
3001 | NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); |
3002 | return ParmVarDecl::Create( |
3003 | C&: CGM.getContext(), |
3004 | DC: const_cast<DeclContext *>(NativeParam->getDeclContext()), |
3005 | StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(), |
3006 | Id: NativeParam->getIdentifier(), T: ArgType, |
3007 | /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr); |
3008 | } |
3009 | |
3010 | Address |
3011 | CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, |
3012 | const VarDecl *NativeParam, |
3013 | const VarDecl *TargetParam) const { |
3014 | assert(NativeParam != TargetParam && |
3015 | NativeParam->getType()->isReferenceType() && |
3016 | "Native arg must not be the same as target arg." ); |
3017 | Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam); |
3018 | QualType NativeParamType = NativeParam->getType(); |
3019 | QualifierCollector QC; |
3020 | const Type *NonQualTy = QC.strip(type: NativeParamType); |
3021 | QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType(); |
3022 | unsigned NativePointeeAddrSpace = |
3023 | CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy); |
3024 | QualType TargetTy = TargetParam->getType(); |
3025 | llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false, |
3026 | Ty: TargetTy, Loc: SourceLocation()); |
3027 | // Cast to native address space. |
3028 | TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3029 | V: TargetAddr, |
3030 | DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace)); |
3031 | Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType); |
3032 | CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false, |
3033 | Ty: NativeParamType); |
3034 | return NativeParamAddr; |
3035 | } |
3036 | |
3037 | void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( |
3038 | CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, |
3039 | ArrayRef<llvm::Value *> Args) const { |
3040 | SmallVector<llvm::Value *, 4> TargetArgs; |
3041 | TargetArgs.reserve(N: Args.size()); |
3042 | auto *FnType = OutlinedFn.getFunctionType(); |
3043 | for (unsigned I = 0, E = Args.size(); I < E; ++I) { |
3044 | if (FnType->isVarArg() && FnType->getNumParams() <= I) { |
3045 | TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end()); |
3046 | break; |
3047 | } |
3048 | llvm::Type *TargetType = FnType->getParamType(i: I); |
3049 | llvm::Value *NativeArg = Args[I]; |
3050 | if (!TargetType->isPointerTy()) { |
3051 | TargetArgs.emplace_back(Args&: NativeArg); |
3052 | continue; |
3053 | } |
3054 | TargetArgs.emplace_back( |
3055 | Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType)); |
3056 | } |
3057 | CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); |
3058 | } |
3059 | |
3060 | /// Emit function which wraps the outline parallel region |
3061 | /// and controls the arguments which are passed to this function. |
3062 | /// The wrapper ensures that the outlined function is called |
3063 | /// with the correct arguments when data is shared. |
3064 | llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( |
3065 | llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { |
3066 | ASTContext &Ctx = CGM.getContext(); |
3067 | const auto &CS = *D.getCapturedStmt(OMPD_parallel); |
3068 | |
3069 | // Create a function that takes as argument the source thread. |
3070 | FunctionArgList WrapperArgs; |
3071 | QualType Int16QTy = |
3072 | Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); |
3073 | QualType Int32QTy = |
3074 | Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); |
3075 | ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
3076 | /*Id=*/nullptr, Int16QTy, |
3077 | ImplicitParamKind::Other); |
3078 | ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), |
3079 | /*Id=*/nullptr, Int32QTy, |
3080 | ImplicitParamKind::Other); |
3081 | WrapperArgs.emplace_back(Args: &ParallelLevelArg); |
3082 | WrapperArgs.emplace_back(Args: &WrapperArg); |
3083 | |
3084 | const CGFunctionInfo &CGFI = |
3085 | CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); |
3086 | |
3087 | auto *Fn = llvm::Function::Create( |
3088 | Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage, |
3089 | N: Twine(OutlinedParallelFn->getName(), "_wrapper" ), M: &CGM.getModule()); |
3090 | |
  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call, but the ones called in serialized regions
  // could be inlined. This is not perfect, but it is closer to the invariant
  // we want, namely, that every data environment starts with a new function.
3095 | // TODO: We should pass the if condition to the runtime function and do the |
3096 | // handling there. Much cleaner code. |
3097 | Fn->addFnAttr(llvm::Attribute::NoInline); |
3098 | |
3099 | CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI); |
3100 | Fn->setLinkage(llvm::GlobalValue::InternalLinkage); |
3101 | Fn->setDoesNotRecurse(); |
3102 | |
3103 | CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
3104 | CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn: Fn, FnInfo: CGFI, Args: WrapperArgs, |
3105 | Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc()); |
3106 | |
3107 | const auto *RD = CS.getCapturedRecordDecl(); |
3108 | auto CurField = RD->field_begin(); |
3109 | |
3110 | Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty, |
3111 | /*Name=*/".zero.addr" ); |
3112 | CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr); |
3113 | // Get the array of arguments. |
3114 | SmallVector<llvm::Value *, 8> Args; |
3115 | |
3116 | Args.emplace_back(Args: CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF)); |
3117 | Args.emplace_back(Args: ZeroAddr.emitRawPointer(CGF)); |
3118 | |
3119 | CGBuilderTy &Bld = CGF.Builder; |
3120 | auto CI = CS.capture_begin(); |
3121 | |
3122 | // Use global memory for data sharing. |
3123 | // Handle passing of global args to workers. |
3124 | RawAddress GlobalArgs = |
3125 | CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args" ); |
3126 | llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); |
3127 | llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; |
3128 | CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
3129 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables), |
3130 | args: DataSharingArgs); |
3131 | |
3132 | // Retrieve the shared variables from the list of references returned |
3133 | // by the runtime. Pass the variables to the outlined function. |
3134 | Address SharedArgListAddress = Address::invalid(); |
3135 | if (CS.capture_size() > 0 || |
3136 | isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
3137 | SharedArgListAddress = CGF.EmitLoadOfPointer( |
3138 | Ptr: GlobalArgs, PtrTy: CGF.getContext() |
3139 | .getPointerType(CGF.getContext().VoidPtrTy) |
3140 | .castAs<PointerType>()); |
3141 | } |
3142 | unsigned Idx = 0; |
3143 | if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { |
3144 | Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx); |
3145 | Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3146 | Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy); |
3147 | llvm::Value *LB = CGF.EmitLoadOfScalar( |
3148 | Addr: TypedAddress, |
3149 | /*Volatile=*/false, |
3150 | Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()), |
3151 | Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc()); |
3152 | Args.emplace_back(Args&: LB); |
3153 | ++Idx; |
3154 | Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx); |
3155 | TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3156 | Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy); |
3157 | llvm::Value *UB = CGF.EmitLoadOfScalar( |
3158 | Addr: TypedAddress, |
3159 | /*Volatile=*/false, |
3160 | Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()), |
3161 | Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc()); |
3162 | Args.emplace_back(Args&: UB); |
3163 | ++Idx; |
3164 | } |
3165 | if (CS.capture_size() > 0) { |
3166 | ASTContext &CGFContext = CGF.getContext(); |
3167 | for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { |
3168 | QualType ElemTy = CurField->getType(); |
3169 | Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx); |
3170 | Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( |
3171 | Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)), |
3172 | ElementTy: CGF.ConvertTypeForMem(T: ElemTy)); |
3173 | llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, |
3174 | /*Volatile=*/false, |
3175 | CGFContext.getPointerType(T: ElemTy), |
3176 | CI->getLocation()); |
3177 | if (CI->capturesVariableByCopy() && |
3178 | !CI->getCapturedVar()->getType()->isAnyPointerType()) { |
3179 | Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), |
3180 | CI->getLocation()); |
3181 | } |
3182 | Args.emplace_back(Args&: Arg); |
3183 | } |
3184 | } |
3185 | |
3186 | emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args); |
3187 | CGF.FinishFunction(); |
3188 | return Fn; |
3189 | } |
3190 | |
3191 | void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, |
3192 | const Decl *D) { |
3193 | if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
3194 | return; |
3195 | |
3196 | assert(D && "Expected function or captured|block decl." ); |
3197 | assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && |
3198 | "Function is registered already." ); |
3199 | assert((!TeamAndReductions.first || TeamAndReductions.first == D) && |
3200 | "Team is set but not processed." ); |
3201 | const Stmt *Body = nullptr; |
3202 | bool NeedToDelayGlobalization = false; |
3203 | if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) { |
3204 | Body = FD->getBody(); |
3205 | } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) { |
3206 | Body = BD->getBody(); |
3207 | } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) { |
3208 | Body = CD->getBody(); |
3209 | NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; |
3210 | if (NeedToDelayGlobalization && |
3211 | getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) |
3212 | return; |
3213 | } |
3214 | if (!Body) |
3215 | return; |
3216 | CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); |
3217 | VarChecker.Visit(Body); |
3218 | const RecordDecl *GlobalizedVarsRecord = |
3219 | VarChecker.getGlobalizedRecord(IsInTTDRegion); |
3220 | TeamAndReductions.first = nullptr; |
3221 | TeamAndReductions.second.clear(); |
3222 | ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = |
3223 | VarChecker.getEscapedVariableLengthDecls(); |
3224 | ArrayRef<const ValueDecl *> DelayedVariableLengthDecls = |
3225 | VarChecker.getDelayedVariableLengthDecls(); |
3226 | if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() && |
3227 | DelayedVariableLengthDecls.empty()) |
3228 | return; |
3229 | auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first; |
3230 | I->getSecond().MappedParams = |
3231 | std::make_unique<CodeGenFunction::OMPMapVars>(); |
3232 | I->getSecond().EscapedParameters.insert( |
3233 | I: VarChecker.getEscapedParameters().begin(), |
3234 | E: VarChecker.getEscapedParameters().end()); |
3235 | I->getSecond().EscapedVariableLengthDecls.append( |
3236 | in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end()); |
3237 | I->getSecond().DelayedVariableLengthDecls.append( |
3238 | in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end()); |
3239 | DeclToAddrMapTy &Data = I->getSecond().LocalVarData; |
3240 | for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { |
3241 | assert(VD->isCanonicalDecl() && "Expected canonical declaration" ); |
3242 | Data.insert(std::make_pair(x&: VD, y: MappedVarData())); |
3243 | } |
3244 | if (!NeedToDelayGlobalization) { |
3245 | emitGenericVarsProlog(CGF, Loc: D->getBeginLoc()); |
3246 | struct GlobalizationScope final : EHScopeStack::Cleanup { |
3247 | GlobalizationScope() = default; |
3248 | |
3249 | void Emit(CodeGenFunction &CGF, Flags flags) override { |
3250 | static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) |
3251 | .emitGenericVarsEpilog(CGF); |
3252 | } |
3253 | }; |
3254 | CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup); |
3255 | } |
3256 | } |
3257 | |
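/// Return the address of the given local variable, honoring any
/// OMPAllocateDeclAttr attached to it. For example (illustrative source):
///
///   int x;
///   #pragma omp allocate(x) allocator(omp_pteam_mem_alloc)
///
/// places 'x' in an internal global in the CUDA shared address space rather
/// than in a stack slot; variables without an allocate directive are looked
/// up in the globalized-declarations map instead.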
3258 | Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, |
3259 | const VarDecl *VD) { |
3260 | if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { |
3261 | const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
3262 | auto AS = LangAS::Default; |
3263 | switch (A->getAllocatorType()) { |
3264 | // Use the default allocator here as by default local vars are |
3265 | // threadlocal. |
3266 | case OMPAllocateDeclAttr::OMPNullMemAlloc: |
3267 | case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
3268 | case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
3269 | case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
3270 | case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
3271 | // Follow the user decision - use default allocation. |
3272 | return Address::invalid(); |
3273 | case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
      // TODO: implement support for user-defined allocators.
3275 | return Address::invalid(); |
3276 | case OMPAllocateDeclAttr::OMPConstMemAlloc: |
3277 | AS = LangAS::cuda_constant; |
3278 | break; |
3279 | case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
3280 | AS = LangAS::cuda_shared; |
3281 | break; |
3282 | case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
3283 | case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
3284 | break; |
3285 | } |
3286 | llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType()); |
3287 | auto *GV = new llvm::GlobalVariable( |
3288 | CGM.getModule(), VarTy, /*isConstant=*/false, |
3289 | llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy), |
3290 | VD->getName(), |
3291 | /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
3292 | CGM.getContext().getTargetAddressSpace(AS)); |
3293 | CharUnits Align = CGM.getContext().getDeclAlign(VD); |
3294 | GV->setAlignment(Align.getAsAlign()); |
3295 | return Address( |
3296 | CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( |
3297 | GV, VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace( |
3298 | AS: VD->getType().getAddressSpace()))), |
3299 | VarTy, Align); |
3300 | } |
3301 | |
3302 | if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) |
3303 | return Address::invalid(); |
3304 | |
3305 | VD = VD->getCanonicalDecl(); |
3306 | auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn); |
3307 | if (I == FunctionGlobalizedDecls.end()) |
3308 | return Address::invalid(); |
3309 | auto VDI = I->getSecond().LocalVarData.find(VD); |
3310 | if (VDI != I->getSecond().LocalVarData.end()) |
3311 | return VDI->second.PrivateAddr; |
3312 | if (VD->hasAttrs()) { |
3313 | for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()), |
3314 | E(VD->attr_end()); |
3315 | IT != E; ++IT) { |
3316 | auto VDI = I->getSecond().LocalVarData.find( |
3317 | cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl()) |
3318 | ->getCanonicalDecl()); |
3319 | if (VDI != I->getSecond().LocalVarData.end()) |
3320 | return VDI->second.PrivateAddr; |
3321 | } |
3322 | } |
3323 | |
3324 | return Address::invalid(); |
3325 | } |
3326 | |
3327 | void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { |
3328 | FunctionGlobalizedDecls.erase(Val: CGF.CurFn); |
3329 | CGOpenMPRuntime::functionFinished(CGF); |
3330 | } |
3331 | |
3332 | void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( |
3333 | CodeGenFunction &CGF, const OMPLoopDirective &S, |
3334 | OpenMPDistScheduleClauseKind &ScheduleKind, |
3335 | llvm::Value *&Chunk) const { |
3336 | auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); |
3337 | if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { |
3338 | ScheduleKind = OMPC_DIST_SCHEDULE_static; |
3339 | Chunk = CGF.EmitScalarConversion( |
3340 | Src: RT.getGPUNumThreads(CGF), |
3341 | SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0), |
3342 | DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc()); |
3343 | return; |
3344 | } |
3345 | CGOpenMPRuntime::getDefaultDistScheduleAndChunk( |
3346 | CGF, S, ScheduleKind, Chunk); |
3347 | } |
3348 | |
3349 | void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( |
3350 | CodeGenFunction &CGF, const OMPLoopDirective &S, |
3351 | OpenMPScheduleClauseKind &ScheduleKind, |
3352 | const Expr *&ChunkExpr) const { |
3353 | ScheduleKind = OMPC_SCHEDULE_static; |
3354 | // Chunk size is 1 in this case. |
3355 | llvm::APInt ChunkSize(32, 1); |
3356 | ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize, |
3357 | type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0), |
3358 | l: SourceLocation()); |
3359 | } |
3360 | |
3361 | void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( |
3362 | CodeGenFunction &CGF, const OMPExecutableDirective &D) const { |
3363 | assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && |
3364 | " Expected target-based directive." ); |
3365 | const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); |
3366 | for (const CapturedStmt::Capture &C : CS->captures()) { |
3367 | // Capture variables captured by reference in lambdas for target-based |
3368 | // directives. |
3369 | if (!C.capturesVariable()) |
3370 | continue; |
3371 | const VarDecl *VD = C.getCapturedVar(); |
3372 | const auto *RD = VD->getType() |
3373 | .getCanonicalType() |
3374 | .getNonReferenceType() |
3375 | ->getAsCXXRecordDecl(); |
3376 | if (!RD || !RD->isLambda()) |
3377 | continue; |
3378 | Address VDAddr = CGF.GetAddrOfLocalVar(VD); |
3379 | LValue VDLVal; |
3380 | if (VD->getType().getCanonicalType()->isReferenceType()) |
3381 | VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); |
3382 | else |
3383 | VDLVal = CGF.MakeAddrLValue( |
3384 | VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); |
3385 | llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; |
3386 | FieldDecl *ThisCapture = nullptr; |
3387 | RD->getCaptureFields(Captures, ThisCapture); |
3388 | if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { |
3389 | LValue ThisLVal = |
3390 | CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture); |
3391 | llvm::Value *CXXThis = CGF.LoadCXXThis(); |
3392 | CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal); |
3393 | } |
3394 | for (const LambdaCapture &LC : RD->captures()) { |
3395 | if (LC.getCaptureKind() != LCK_ByRef) |
3396 | continue; |
3397 | const ValueDecl *VD = LC.getCapturedVar(); |
3398 | // FIXME: For now VD is always a VarDecl because OpenMP does not support |
3399 | // capturing structured bindings in lambdas yet. |
3400 | if (!CS->capturesVariable(cast<VarDecl>(VD))) |
3401 | continue; |
3402 | auto It = Captures.find(VD); |
3403 | assert(It != Captures.end() && "Found lambda capture without field." ); |
3404 | LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); |
3405 | Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); |
3406 | if (VD->getType().getCanonicalType()->isReferenceType()) |
3407 | VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, |
3408 | VD->getType().getCanonicalType()) |
3409 | .getAddress(CGF); |
3410 | CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); |
3411 | } |
3412 | } |
3413 | } |
3414 | |
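/// A sketch of the allocator-to-address-space mapping performed below
/// (illustrative directives on globals with static storage duration):
///
///   static int A;
///   #pragma omp allocate(A) allocator(omp_const_mem_alloc)  // cuda_constant
///   static int B;
///   #pragma omp allocate(B) allocator(omp_pteam_mem_alloc)  // cuda_shared
///   static int C;
///   #pragma omp allocate(C) allocator(omp_cgroup_mem_alloc) // LangAS::Default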
3415 | bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, |
3416 | LangAS &AS) { |
3417 | if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) |
3418 | return false; |
3419 | const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); |
  switch (A->getAllocatorType()) {
3421 | case OMPAllocateDeclAttr::OMPNullMemAlloc: |
3422 | case OMPAllocateDeclAttr::OMPDefaultMemAlloc: |
3423 | // Not supported, fallback to the default mem space. |
3424 | case OMPAllocateDeclAttr::OMPThreadMemAlloc: |
3425 | case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: |
3426 | case OMPAllocateDeclAttr::OMPCGroupMemAlloc: |
3427 | case OMPAllocateDeclAttr::OMPHighBWMemAlloc: |
3428 | case OMPAllocateDeclAttr::OMPLowLatMemAlloc: |
3429 | AS = LangAS::Default; |
3430 | return true; |
3431 | case OMPAllocateDeclAttr::OMPConstMemAlloc: |
3432 | AS = LangAS::cuda_constant; |
3433 | return true; |
3434 | case OMPAllocateDeclAttr::OMPPTeamMemAlloc: |
3435 | AS = LangAS::cuda_shared; |
3436 | return true; |
3437 | case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: |
3438 | llvm_unreachable("Expected predefined allocator for the variables with the " |
3439 | "static storage." ); |
3440 | } |
3441 | return false; |
3442 | } |
3443 | |
// Get the current CudaArch, ignoring any unknown values.
3445 | static CudaArch getCudaArch(CodeGenModule &CGM) { |
3446 | if (!CGM.getTarget().hasFeature(Feature: "ptx" )) |
3447 | return CudaArch::UNKNOWN; |
3448 | for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { |
3449 | if (Feature.getValue()) { |
3450 | CudaArch Arch = StringToCudaArch(S: Feature.getKey()); |
3451 | if (Arch != CudaArch::UNKNOWN) |
3452 | return Arch; |
3453 | } |
3454 | } |
3455 | return CudaArch::UNKNOWN; |
3456 | } |
3457 | |
3458 | /// Check to see if target architecture supports unified addressing which is |
3459 | /// a restriction for OpenMP requires clause "unified_shared_memory". |
3460 | void CGOpenMPRuntimeGPU::processRequiresDirective( |
3461 | const OMPRequiresDecl *D) { |
3462 | for (const OMPClause *Clause : D->clauselists()) { |
3463 | if (Clause->getClauseKind() == OMPC_unified_shared_memory) { |
3464 | CudaArch Arch = getCudaArch(CGM); |
3465 | switch (Arch) { |
3466 | case CudaArch::SM_20: |
3467 | case CudaArch::SM_21: |
3468 | case CudaArch::SM_30: |
3469 | case CudaArch::SM_32_: |
3470 | case CudaArch::SM_35: |
3471 | case CudaArch::SM_37: |
3472 | case CudaArch::SM_50: |
3473 | case CudaArch::SM_52: |
3474 | case CudaArch::SM_53: { |
3475 | SmallString<256> Buffer; |
3476 | llvm::raw_svector_ostream Out(Buffer); |
3477 | Out << "Target architecture " << CudaArchToString(A: Arch) |
3478 | << " does not support unified addressing" ; |
3479 | CGM.Error(loc: Clause->getBeginLoc(), error: Out.str()); |
3480 | return; |
3481 | } |
3482 | case CudaArch::SM_60: |
3483 | case CudaArch::SM_61: |
3484 | case CudaArch::SM_62: |
3485 | case CudaArch::SM_70: |
3486 | case CudaArch::SM_72: |
3487 | case CudaArch::SM_75: |
3488 | case CudaArch::SM_80: |
3489 | case CudaArch::SM_86: |
3490 | case CudaArch::SM_87: |
3491 | case CudaArch::SM_89: |
3492 | case CudaArch::SM_90: |
3493 | case CudaArch::SM_90a: |
3494 | case CudaArch::GFX600: |
3495 | case CudaArch::GFX601: |
3496 | case CudaArch::GFX602: |
3497 | case CudaArch::GFX700: |
3498 | case CudaArch::GFX701: |
3499 | case CudaArch::GFX702: |
3500 | case CudaArch::GFX703: |
3501 | case CudaArch::GFX704: |
3502 | case CudaArch::GFX705: |
3503 | case CudaArch::GFX801: |
3504 | case CudaArch::GFX802: |
3505 | case CudaArch::GFX803: |
3506 | case CudaArch::GFX805: |
3507 | case CudaArch::GFX810: |
3508 | case CudaArch::GFX900: |
3509 | case CudaArch::GFX902: |
3510 | case CudaArch::GFX904: |
3511 | case CudaArch::GFX906: |
3512 | case CudaArch::GFX908: |
3513 | case CudaArch::GFX909: |
3514 | case CudaArch::GFX90a: |
3515 | case CudaArch::GFX90c: |
3516 | case CudaArch::GFX940: |
3517 | case CudaArch::GFX941: |
3518 | case CudaArch::GFX942: |
3519 | case CudaArch::GFX1010: |
3520 | case CudaArch::GFX1011: |
3521 | case CudaArch::GFX1012: |
3522 | case CudaArch::GFX1013: |
3523 | case CudaArch::GFX1030: |
3524 | case CudaArch::GFX1031: |
3525 | case CudaArch::GFX1032: |
3526 | case CudaArch::GFX1033: |
3527 | case CudaArch::GFX1034: |
3528 | case CudaArch::GFX1035: |
3529 | case CudaArch::GFX1036: |
3530 | case CudaArch::GFX1100: |
3531 | case CudaArch::GFX1101: |
3532 | case CudaArch::GFX1102: |
3533 | case CudaArch::GFX1103: |
3534 | case CudaArch::GFX1150: |
3535 | case CudaArch::GFX1151: |
3536 | case CudaArch::GFX1200: |
3537 | case CudaArch::GFX1201: |
3538 | case CudaArch::Generic: |
3539 | case CudaArch::UNUSED: |
3540 | case CudaArch::UNKNOWN: |
3541 | break; |
3542 | case CudaArch::LAST: |
3543 | llvm_unreachable("Unexpected Cuda arch." ); |
3544 | } |
3545 | } |
3546 | } |
3547 | CGOpenMPRuntime::processRequiresDirective(D); |
3548 | } |
3549 | |
3550 | llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { |
3551 | CGBuilderTy &Bld = CGF.Builder; |
3552 | llvm::Module *M = &CGF.CGM.getModule(); |
3553 | const char *LocSize = "__kmpc_get_hardware_num_threads_in_block" ; |
3554 | llvm::Function *F = M->getFunction(Name: LocSize); |
3555 | if (!F) { |
3556 | F = llvm::Function::Create( |
3557 | Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false), |
3558 | Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule()); |
3559 | } |
3560 | return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads" ); |
3561 | } |
3562 | |
3563 | llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { |
3564 | ArrayRef<llvm::Value *> Args{}; |
3565 | return CGF.EmitRuntimeCall( |
3566 | callee: OMPBuilder.getOrCreateRuntimeFunction( |
3567 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block), |
3568 | args: Args); |
3569 | } |
3570 | |
3571 | llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) { |
3572 | ArrayRef<llvm::Value *> Args{}; |
3573 | return CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction( |
3574 | M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_warp_size), |
3575 | args: Args); |
3576 | } |
3577 | |