//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/TargetParser/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
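
// A minimal sketch of the bitmask idiom used with these enums throughout this
// pass (the local variable is illustrative only): membership is tested by
// masking and comparing against NONE, e.g.
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // ... the operation touches the global address space ...
//   }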

class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;
  bool IsLastUse = false;

  SIMemOpInfo(
      AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
      SIAtomicScope Scope = SIAtomicScope::SYSTEM,
      SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
      SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
      bool IsCrossAddressSpaceOrdering = true,
      AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
      bool IsVolatile = false, bool IsNonTemporal = false,
      bool IsLastUse = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
        OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
        IsLastUse(IsLastUse) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }
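
  // Worked example of the narrowing above: an atomic that only accesses LDS
  // and scratch requested at SIAtomicScope::SYSTEM is limited to
  // SIAtomicScope::WORKGROUP, since LDS is shared at most within a work-group
  // and scratch is private to a thread.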

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is last use, false otherwise.
  bool isLastUse() const { return IsLastUse; }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to the LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  std::optional<SIMemOpInfo>
  constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getLoadInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "std::nullopt"
  /// otherwise.
  std::optional<SIMemOpInfo>
  getStoreInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "std::nullopt" otherwise.
  std::optional<SIMemOpInfo>
  getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or
  /// nontemporal/last-use. Return true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal,
                                              bool IsLastUse = false) const = 0;

  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
    return false;
  }

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions were inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions were inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions were inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

  virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) const {
    return false;
  }
};
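
// Roughly, the legalizer composes the SICacheControl hooks per the AMDGPU
// memory model (see the AMDGPUUsage link in the file header): an acquire gets
// a wait plus a cache invalidate after the access (insertWait +
// insertAcquire), a release gets a wait (and on some targets a cache
// write-back) before it (insertRelease), and the enable*CacheBypass /
// enableVolatileAndOrNonTemporal hooks rewrite the access itself.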

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::GLC);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SLC);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

};

class SIGfx90ACacheControl : public SIGfx7CacheControl {
public:

  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;
};

class SIGfx940CacheControl : public SIGfx90ACacheControl {
protected:

  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC0);
  }

  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::SC1);
  }

  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::NT);
  }

public:

  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                              SIAtomicScope Scope,
                              SIAtomicAddrSpace AddrSpace) const override;

  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
                     Position Pos) const override;

  bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
                           MachineBasicBlock::iterator &MI) const override {
    bool Changed = false;
    if (ST.hasForceStoreSC0SC1() &&
        (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
                                    SIAtomicAddrSpace::GLOBAL |
                                    SIAtomicAddrSpace::OTHER)) !=
            SIAtomicAddrSpace::NONE) {
      Changed |= enableSC0Bit(MI);
      Changed |= enableSC1Bit(MI);
    }
    return Changed;
  }
};

class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:

  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit(MI, AMDGPU::CPol::DLC);
  }

public:

  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI,
                     SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace,
                     Position Pos) const override;
};

class SIGfx11CacheControl : public SIGfx10CacheControl {
public:
  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;
};

class SIGfx12CacheControl : public SIGfx11CacheControl {
protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setTH(const MachineBasicBlock::iterator MI,
             AMDGPU::CPol::CPol Value) const;
  // Sets Scope policy to \p Value if CPol operand is present in instruction \p
  // MI. \returns Returns true if \p MI is modified, false otherwise.
  bool setScope(const MachineBasicBlock::iterator MI,
                AMDGPU::CPol::CPol Value) const;

  // Stores with system scope (SCOPE_SYS) need to wait for:
  // - loads or atomics (returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
  // - non-returning-atomics - wait for STORECNT==0
  // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
  // since it does not distinguish atomics-with-return from regular stores.
  // There is no need to wait if memory is cached (mtype != UC).
  bool
  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;

public:
  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}

  bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering, Position Pos) const override;

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;

  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
                                      bool IsLastUse) const override;

  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrAddrSpace) const {
  if (SSID == SyncScope::System)
    return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getAgentSSID())
    return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getWavefrontSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == SyncScope::SingleThread)
    return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
                      true);
  if (SSID == MMI->getSystemOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SYSTEM,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getAgentOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::AGENT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WORKGROUP,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::WAVEFRONT,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
    return std::tuple(SIAtomicScope::SINGLETHREAD,
                      SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
  return std::nullopt;
}
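
// For example, an atomic marked syncscope("workgroup") maps to
// (WORKGROUP, ATOMIC, true) above, while the one-address-space variant
// syncscope("workgroup-one-as") maps to
// (WORKGROUP, ATOMIC & InstrAddrSpace, false): it only orders the address
// spaces the instruction itself accesses and never requires
// cross-address-space ordering.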

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;
  bool IsLastUse = false;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    IsLastUse |= MMO->getFlags() & MOLastUse;
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(
            MI, "Unsupported non-inclusive atomic synchronization scope");
        return std::nullopt;
      }

      SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return std::nullopt;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        *ScopeOrNone;
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return std::nullopt;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal, IsLastUse);
}
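
// Note the conservative merging in the loop above: when an instruction
// carries several memory operands, the result uses the strongest
// success/failure orderings (getMergedAtomicOrdering), the most inclusive
// synchronization scope, and the union of the accessed address spaces, while
// volatile and last-use are OR'd and nontemporal is AND'd across operands.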

std::optional<SIMemOpInfo>
SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

std::optional<SIMemOpInfo>
SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return std::nullopt;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return std::nullopt;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      *ScopeOrNone;

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return std::nullopt;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return std::nullopt;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX940Insts())
    return std::make_unique<SIGfx940CacheControl>(ST);
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX11)
    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
    return std::make_unique<SIGfx11CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
}
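
// Note on the dispatch order in create() above: hasGFX940Insts() and
// hasGFX90AInsts() are checked before the generation ladder because those
// targets report a GFX9 generation and would otherwise fall through to
// SIGfx7CacheControl.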

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set L1 cache policy to MISS_EVICT.
      // Note: there is no L2 cache bypass policy at the ISA level.
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through so does not need to be bypassed. There is no
  /// bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
  /// bypassed, and the GLC bit is instead used to indicate if they are
  /// return or no-return.
  /// Note: there is no L2 cache coherent bypass control at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, and handling them
  // here would pessimize all atomics. They also do not support the
  // nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    // Set L1 cache policy to be MISS_EVICT for load instructions
    // and MISS_LRU for store instructions.
    // Note: there is no L2 cache bypass policy at the ISA level.
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
    // for both loads and stores, and the L2 cache policy to STREAM.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
        .addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
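
// Worked example of the encoding above: for a release at agent scope whose
// ordering address space covers both global and LDS with cross-address-space
// ordering, VMCnt and LGKMCnt are both set, so the pass builds an
// S_WAITCNT_soft whose immediate encodes the equivalent of
//
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//
// while the expcnt field is left at its "no wait" bit mask.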

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}
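
// On GFX6 a release needs no cache write-back: the L1 is write-through (see
// enableStoreCacheBypass above), so making earlier writes visible only
// requires waiting for them to complete, which is why insertRelease reduces
// to insertWait for both loads and stores.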
1173 | |
1174 | bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1175 | SIAtomicScope Scope, |
1176 | SIAtomicAddrSpace AddrSpace, |
1177 | Position Pos) const { |
1178 | if (!InsertCacheInv) |
1179 | return false; |
1180 | |
1181 | bool Changed = false; |
1182 | |
1183 | MachineBasicBlock &MBB = *MI->getParent(); |
1184 | DebugLoc DL = MI->getDebugLoc(); |
1185 | |
1186 | const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); |
1187 | |
1188 | const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS() |
1189 | ? AMDGPU::BUFFER_WBINVL1 |
1190 | : AMDGPU::BUFFER_WBINVL1_VOL; |
1191 | |
1192 | if (Pos == Position::AFTER) |
1193 | ++MI; |
1194 | |
1195 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1196 | switch (Scope) { |
1197 | case SIAtomicScope::SYSTEM: |
1198 | case SIAtomicScope::AGENT: |
1199 | BuildMI(MBB, MI, DL, TII->get(InvalidateL1)); |
1200 | Changed = true; |
1201 | break; |
1202 | case SIAtomicScope::WORKGROUP: |
1203 | case SIAtomicScope::WAVEFRONT: |
1204 | case SIAtomicScope::SINGLETHREAD: |
1205 | // No cache to invalidate. |
1206 | break; |
1207 | default: |
1208 | llvm_unreachable("Unsupported synchronization scope" ); |
1209 | } |
1210 | } |
1211 | |
1212 | /// The scratch address space does not need the global memory cache |
1213 | /// to be flushed as all memory operations by the same thread are |
1214 | /// sequentially consistent, and no other thread can access scratch |
1215 | /// memory. |
1216 | |
1217 | /// Other address spaces do not have a cache. |
1218 | |
1219 | if (Pos == Position::AFTER) |
1220 | --MI; |
1221 | |
1222 | return Changed; |
1223 | } |
1224 | |
1225 | bool SIGfx90ACacheControl::enableLoadCacheBypass( |
1226 | const MachineBasicBlock::iterator &MI, |
1227 | SIAtomicScope Scope, |
1228 | SIAtomicAddrSpace AddrSpace) const { |
1229 | assert(MI->mayLoad() && !MI->mayStore()); |
1230 | bool Changed = false; |
1231 | |
1232 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1233 | switch (Scope) { |
1234 | case SIAtomicScope::SYSTEM: |
1235 | case SIAtomicScope::AGENT: |
1236 | // Set the L1 cache policy to MISS_LRU. |
1237 | // Note: there is no L2 cache bypass policy at the ISA level. |
1238 | Changed |= enableGLCBit(MI); |
1239 | break; |
1240 | case SIAtomicScope::WORKGROUP: |
1241 | // In threadgroup split mode the waves of a work-group can be executing on |
1242 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1243 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1244 | // on the same CU, and so the L1 does not need to be bypassed. |
1245 | if (ST.isTgSplitEnabled()) |
1246 | Changed |= enableGLCBit(MI); |
1247 | break; |
1248 | case SIAtomicScope::WAVEFRONT: |
1249 | case SIAtomicScope::SINGLETHREAD: |
1250 | // No cache to bypass. |
1251 | break; |
1252 | default: |
1253 | llvm_unreachable("Unsupported synchronization scope" ); |
1254 | } |
1255 | } |
1256 | |
1257 | /// The scratch address space does not need the global memory caches |
1258 | /// to be bypassed as all memory operations by the same thread are |
1259 | /// sequentially consistent, and no other thread can access scratch |
1260 | /// memory. |
1261 | |
1262 | /// Other address spaces do not have a cache. |
1263 | |
1264 | return Changed; |
1265 | } |
1266 | |
1267 | bool SIGfx90ACacheControl::enableStoreCacheBypass( |
1268 | const MachineBasicBlock::iterator &MI, |
1269 | SIAtomicScope Scope, |
1270 | SIAtomicAddrSpace AddrSpace) const { |
1271 | assert(!MI->mayLoad() && MI->mayStore()); |
1272 | bool Changed = false; |
1273 | |
1274 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1275 | switch (Scope) { |
1276 | case SIAtomicScope::SYSTEM: |
1277 | case SIAtomicScope::AGENT: |
1278 | /// Do not set glc for store atomic operations as they implicitly write |
1279 | /// through the L1 cache. |
1280 | break; |
1281 | case SIAtomicScope::WORKGROUP: |
1282 | case SIAtomicScope::WAVEFRONT: |
1283 | case SIAtomicScope::SINGLETHREAD: |
1284 | // No cache to bypass. Store atomics implicitly write through the L1 |
1285 | // cache. |
1286 | break; |
1287 | default: |
1288 | llvm_unreachable("Unsupported synchronization scope" ); |
1289 | } |
1290 | } |
1291 | |
1292 | /// The scratch address space does not need the global memory caches |
1293 | /// to be bypassed as all memory operations by the same thread are |
1294 | /// sequentially consistent, and no other thread can access scratch |
1295 | /// memory. |
1296 | |
1297 | /// Other address spaces do not have a cache. |
1298 | |
1299 | return Changed; |
1300 | } |
1301 | |
1302 | bool SIGfx90ACacheControl::enableRMWCacheBypass( |
1303 | const MachineBasicBlock::iterator &MI, |
1304 | SIAtomicScope Scope, |
1305 | SIAtomicAddrSpace AddrSpace) const { |
1306 | assert(MI->mayLoad() && MI->mayStore()); |
1307 | bool Changed = false; |
1308 | |
1309 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1310 | switch (Scope) { |
1311 | case SIAtomicScope::SYSTEM: |
1312 | case SIAtomicScope::AGENT: |
1313 | /// Do not set glc for RMW atomic operations as they implicitly bypass |
1314 | /// the L1 cache, and the glc bit is instead used to indicate if they are |
1315 | /// return or no-return. |
1316 | break; |
1317 | case SIAtomicScope::WORKGROUP: |
1318 | case SIAtomicScope::WAVEFRONT: |
1319 | case SIAtomicScope::SINGLETHREAD: |
1320 | // No cache to bypass. RMW atomics implicitly bypass the L1 cache. |
1321 | break; |
1322 | default: |
1323 | llvm_unreachable("Unsupported synchronization scope" ); |
1324 | } |
1325 | } |
1326 | |
1327 | return Changed; |
1328 | } |
1329 | |
1330 | bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( |
1331 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1332 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
1333 | // Only handle load and store, not atomic read-modify-write insructions. The |
1334 | // latter use glc to indicate if the atomic returns a result and so must not |
1335 | // be used for cache control. |
1336 | assert(MI->mayLoad() ^ MI->mayStore()); |
1337 | |
1338 | // Only update load and store, not LLVM IR atomic read-modify-write |
1339 | // instructions. The latter are always marked as volatile so cannot sensibly |
1340 | // handle it as do not want to pessimize all atomics. Also they do not support |
1341 | // the nontemporal attribute. |
1342 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1343 | |
1344 | bool Changed = false; |
1345 | |
1346 | if (IsVolatile) { |
1347 | // Set L1 cache policy to be MISS_EVICT for load instructions |
1348 | // and MISS_LRU for store instructions. |
1349 | // Note: there is no L2 cache bypass policy at the ISA level. |
1350 | if (Op == SIMemOp::LOAD) |
1351 | Changed |= enableGLCBit(MI); |
1352 | |
1353 | // Ensure operation has completed at system scope to cause all volatile |
1354 | // operations to be visible outside the program in a global order. Do not |
1355 | // request cross address space as only the global address space can be |
1356 | // observable outside the program, so no need to cause a waitcnt for LDS |
1357 | // address space operations. |
1358 | Changed |= insertWait(MI, Scope: SIAtomicScope::SYSTEM, AddrSpace, Op, IsCrossAddrSpaceOrdering: false, |
1359 | Pos: Position::AFTER); |
1360 | |
1361 | return Changed; |
1362 | } |
1363 | |
1364 | if (IsNonTemporal) { |
1365 | // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT |
1366 | // for both loads and stores, and the L2 cache policy to STREAM. |
1367 | Changed |= enableGLCBit(MI); |
1368 | Changed |= enableSLCBit(MI); |
1369 | return Changed; |
1370 | } |
1371 | |
1372 | return Changed; |
1373 | } |
1374 | |
1375 | bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1376 | SIAtomicScope Scope, |
1377 | SIAtomicAddrSpace AddrSpace, |
1378 | SIMemOp Op, |
1379 | bool IsCrossAddrSpaceOrdering, |
1380 | Position Pos) const { |
1381 | if (ST.isTgSplitEnabled()) { |
1382 | // In threadgroup split mode the waves of a work-group can be executing on |
1383 | // different CUs. Therefore need to wait for global or GDS memory operations |
1384 | // to complete to ensure they are visible to waves in the other CUs. |
1385 | // Otherwise in non-threadgroup split mode all waves of a work-group are on |
1386 | // the same CU, so no need to wait for global memory as all waves in the |
1387 | // work-group access the same the L1, nor wait for GDS as access are ordered |
1388 | // on a CU. |
1389 | if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | |
1390 | SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && |
1391 | (Scope == SIAtomicScope::WORKGROUP)) { |
1392 | // Same as GFX7 using agent scope. |
1393 | Scope = SIAtomicScope::AGENT; |
1394 | } |
1395 | // In threadgroup split mode LDS cannot be allocated so no need to wait for |
1396 | // LDS memory operations. |
1397 | AddrSpace &= ~SIAtomicAddrSpace::LDS; |
1398 | } |
1399 | return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, |
1400 | IsCrossAddrSpaceOrdering, Pos); |
1401 | } |
1402 | |
1403 | bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1404 | SIAtomicScope Scope, |
1405 | SIAtomicAddrSpace AddrSpace, |
1406 | Position Pos) const { |
1407 | if (!InsertCacheInv) |
1408 | return false; |
1409 | |
1410 | bool Changed = false; |
1411 | |
1412 | MachineBasicBlock &MBB = *MI->getParent(); |
1413 | DebugLoc DL = MI->getDebugLoc(); |
1414 | |
1415 | if (Pos == Position::AFTER) |
1416 | ++MI; |
1417 | |
1418 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1419 | switch (Scope) { |
1420 | case SIAtomicScope::SYSTEM: |
1421 | // Ensures that following loads will not see stale remote VMEM data or |
1422 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1423 | // CC will never be stale due to the local memory probes. |
1424 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); |
1425 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1426 | // hardware does not reorder memory operations by the same wave with |
1427 | // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to |
1428 | // remove any cache lines of earlier writes by the same wave and ensures |
1429 | // later reads by the same wave will refetch the cache lines. |
1430 | Changed = true; |
1431 | break; |
1432 | case SIAtomicScope::AGENT: |
1433 | // Same as GFX7. |
1434 | break; |
1435 | case SIAtomicScope::WORKGROUP: |
1436 | // In threadgroup split mode the waves of a work-group can be executing on |
1437 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1438 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1439 | // on the same CU, and so the L1 does not need to be invalidated. |
1440 | if (ST.isTgSplitEnabled()) { |
1441 | // Same as GFX7 using agent scope. |
1442 | Scope = SIAtomicScope::AGENT; |
1443 | } |
1444 | break; |
1445 | case SIAtomicScope::WAVEFRONT: |
1446 | case SIAtomicScope::SINGLETHREAD: |
1447 | // Same as GFX7. |
1448 | break; |
1449 | default: |
1450 | llvm_unreachable("Unsupported synchronization scope" ); |
1451 | } |
1452 | } |
1453 | |
1454 | /// The scratch address space does not need the global memory cache |
1455 | /// to be flushed as all memory operations by the same thread are |
1456 | /// sequentially consistent, and no other thread can access scratch |
1457 | /// memory. |
1458 | |
1459 | /// Other address spaces do not have a cache. |
1460 | |
1461 | if (Pos == Position::AFTER) |
1462 | --MI; |
1463 | |
1464 | Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); |
1465 | |
1466 | return Changed; |
1467 | } |
1468 | |
1469 | bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1470 | SIAtomicScope Scope, |
1471 | SIAtomicAddrSpace AddrSpace, |
1472 | bool IsCrossAddrSpaceOrdering, |
1473 | Position Pos) const { |
1474 | bool Changed = false; |
1475 | |
1476 | MachineBasicBlock &MBB = *MI->getParent(); |
1477 | const DebugLoc &DL = MI->getDebugLoc(); |
1478 | |
1479 | if (Pos == Position::AFTER) |
1480 | ++MI; |
1481 | |
1482 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1483 | switch (Scope) { |
1484 | case SIAtomicScope::SYSTEM: |
1485 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1486 | // hardware does not reorder memory operations by the same wave with |
1487 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1488 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1489 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1490 | // writeback has completed. |
1491 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) |
1492 | // Set SC bits to indicate system scope. |
1493 | .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1494 | // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT |
1495 | // vmcnt(0)" needed by the "BUFFER_WBL2". |
1496 | Changed = true; |
1497 | break; |
1498 | case SIAtomicScope::AGENT: |
1499 | case SIAtomicScope::WORKGROUP: |
1500 | case SIAtomicScope::WAVEFRONT: |
1501 | case SIAtomicScope::SINGLETHREAD: |
1502 | // Same as GFX7. |
1503 | break; |
1504 | default: |
1505 | llvm_unreachable("Unsupported synchronization scope" ); |
1506 | } |
1507 | } |
1508 | |
1509 | if (Pos == Position::AFTER) |
1510 | --MI; |
1511 | |
1512 | Changed |= |
1513 | SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, |
1514 | IsCrossAddrSpaceOrdering, Pos); |
1515 | |
1516 | return Changed; |
1517 | } |
1518 | |
1519 | bool SIGfx940CacheControl::enableLoadCacheBypass( |
1520 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1521 | SIAtomicAddrSpace AddrSpace) const { |
1522 | assert(MI->mayLoad() && !MI->mayStore()); |
1523 | bool Changed = false; |
1524 | |
1525 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1526 | switch (Scope) { |
1527 | case SIAtomicScope::SYSTEM: |
1528 | // Set SC bits to indicate system scope. |
1529 | Changed |= enableSC0Bit(MI); |
1530 | Changed |= enableSC1Bit(MI); |
1531 | break; |
1532 | case SIAtomicScope::AGENT: |
1533 | // Set SC bits to indicate agent scope. |
1534 | Changed |= enableSC1Bit(MI); |
1535 | break; |
1536 | case SIAtomicScope::WORKGROUP: |
1537 | // In threadgroup split mode the waves of a work-group can be executing on |
1538 | // different CUs. Therefore need to bypass the L1 which is per CU. |
1539 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1540 | // on the same CU, and so the L1 does not need to be bypassed. Setting SC |
1541 | // bits to indicate work-group scope will do this automatically. |
1542 | Changed |= enableSC0Bit(MI); |
1543 | break; |
1544 | case SIAtomicScope::WAVEFRONT: |
1545 | case SIAtomicScope::SINGLETHREAD: |
1546 | // Leave SC bits unset to indicate wavefront scope. |
1547 | break; |
1548 | default: |
1549 | llvm_unreachable("Unsupported synchronization scope" ); |
1550 | } |
1551 | } |
1552 | |
1553 | /// The scratch address space does not need the global memory caches |
1554 | /// to be bypassed as all memory operations by the same thread are |
1555 | /// sequentially consistent, and no other thread can access scratch |
1556 | /// memory. |
1557 | |
1558 | /// Other address spaces do not have a cache. |
1559 | |
1560 | return Changed; |
1561 | } |
1562 | |
1563 | bool SIGfx940CacheControl::enableStoreCacheBypass( |
1564 | const MachineBasicBlock::iterator &MI, |
1565 | SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const { |
1566 | assert(!MI->mayLoad() && MI->mayStore()); |
1567 | bool Changed = false; |
1568 | |
1569 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1570 | switch (Scope) { |
1571 | case SIAtomicScope::SYSTEM: |
1572 | // Set SC bits to indicate system scope. |
1573 | Changed |= enableSC0Bit(MI); |
1574 | Changed |= enableSC1Bit(MI); |
1575 | break; |
1576 | case SIAtomicScope::AGENT: |
1577 | // Set SC bits to indicate agent scope. |
1578 | Changed |= enableSC1Bit(MI); |
1579 | break; |
1580 | case SIAtomicScope::WORKGROUP: |
1581 | // Set SC bits to indicate workgroup scope. |
1582 | Changed |= enableSC0Bit(MI); |
1583 | break; |
1584 | case SIAtomicScope::WAVEFRONT: |
1585 | case SIAtomicScope::SINGLETHREAD: |
1586 | // Leave SC bits unset to indicate wavefront scope. |
1587 | break; |
1588 | default: |
1589 | llvm_unreachable("Unsupported synchronization scope" ); |
1590 | } |
1591 | } |
1592 | |
1593 | /// The scratch address space does not need the global memory caches |
1594 | /// to be bypassed as all memory operations by the same thread are |
1595 | /// sequentially consistent, and no other thread can access scratch |
1596 | /// memory. |
1597 | |
1598 | /// Other address spaces do not have a cache. |
1599 | |
1600 | return Changed; |
1601 | } |
1602 | |
1603 | bool SIGfx940CacheControl::enableRMWCacheBypass( |
1604 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
1605 | SIAtomicAddrSpace AddrSpace) const { |
1606 | assert(MI->mayLoad() && MI->mayStore()); |
1607 | bool Changed = false; |
1608 | |
1609 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1610 | switch (Scope) { |
1611 | case SIAtomicScope::SYSTEM: |
1612 | // Set SC1 bit to indicate system scope. |
1613 | Changed |= enableSC1Bit(MI); |
1614 | break; |
1615 | case SIAtomicScope::AGENT: |
1616 | case SIAtomicScope::WORKGROUP: |
1617 | case SIAtomicScope::WAVEFRONT: |
1618 | case SIAtomicScope::SINGLETHREAD: |
1619 | // RMW atomic operations implicitly bypass the L1 cache and only use SC1 |
1620 | // to indicate system or agent scope. The SC0 bit is used to indicate if |
1621 | // they are return or no-return. Leave SC1 bit unset to indicate agent |
1622 | // scope. |
1623 | break; |
1624 | default: |
1625 | llvm_unreachable("Unsupported synchronization scope" ); |
1626 | } |
1627 | } |
1628 | |
1629 | return Changed; |
1630 | } |
1631 | |
1632 | bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal( |
1633 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1634 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
1638 | assert(MI->mayLoad() ^ MI->mayStore()); |
1639 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
1644 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1645 | |
1646 | bool Changed = false; |
1647 | |
1648 | if (IsVolatile) { |
1649 | // Set SC bits to indicate system scope. |
1650 | Changed |= enableSC0Bit(MI); |
1651 | Changed |= enableSC1Bit(MI); |
1652 | |
1653 | // Ensure operation has completed at system scope to cause all volatile |
1654 | // operations to be visible outside the program in a global order. Do not |
1655 | // request cross address space as only the global address space can be |
1656 | // observable outside the program, so no need to cause a waitcnt for LDS |
1657 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
1660 | |
1661 | return Changed; |
1662 | } |
1663 | |
1664 | if (IsNonTemporal) { |
1665 | Changed |= enableNTBit(MI); |
1666 | return Changed; |
1667 | } |
1668 | |
1669 | return Changed; |
1670 | } |
1671 | |
1672 | bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
1673 | SIAtomicScope Scope, |
1674 | SIAtomicAddrSpace AddrSpace, |
1675 | Position Pos) const { |
1676 | if (!InsertCacheInv) |
1677 | return false; |
1678 | |
1679 | bool Changed = false; |
1680 | |
1681 | MachineBasicBlock &MBB = *MI->getParent(); |
1682 | DebugLoc DL = MI->getDebugLoc(); |
1683 | |
1684 | if (Pos == Position::AFTER) |
1685 | ++MI; |
1686 | |
1687 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1688 | switch (Scope) { |
1689 | case SIAtomicScope::SYSTEM: |
1690 | // Ensures that following loads will not see stale remote VMEM data or |
1691 | // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and |
1692 | // CC will never be stale due to the local memory probes. |
1693 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) |
1694 | // Set SC bits to indicate system scope. |
1695 | .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1696 | // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the |
1697 | // hardware does not reorder memory operations by the same wave with |
1698 | // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to |
1699 | // remove any cache lines of earlier writes by the same wave and ensures |
1700 | // later reads by the same wave will refetch the cache lines. |
1701 | Changed = true; |
1702 | break; |
1703 | case SIAtomicScope::AGENT: |
      // Ensures that following loads will not see stale remote data or local
      // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
      // due to the memory probes.
1707 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) |
1708 | // Set SC bits to indicate agent scope. |
1709 | .addImm(AMDGPU::CPol::SC1); |
1710 | // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware |
1711 | // does not reorder memory operations with respect to preceeding buffer |
1712 | // invalidate. The invalidate is guaranteed to remove any cache lines of |
1713 | // earlier writes and ensures later writes will refetch the cache lines. |
1714 | Changed = true; |
1715 | break; |
1716 | case SIAtomicScope::WORKGROUP: |
1717 | // In threadgroup split mode the waves of a work-group can be executing on |
1718 | // different CUs. Therefore need to invalidate the L1 which is per CU. |
1719 | // Otherwise in non-threadgroup split mode all waves of a work-group are |
1720 | // on the same CU, and so the L1 does not need to be invalidated. |
1721 | if (ST.isTgSplitEnabled()) { |
        // Ensures L1 is invalidated if in threadgroup split mode. In
        // non-threadgroup split mode it is a NOP, but there is no point
        // generating it in that case when we know we are not in that mode.
1725 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV)) |
1726 | // Set SC bits to indicate work-group scope. |
1727 | .addImm(AMDGPU::CPol::SC0); |
1728 | // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware |
1729 | // does not reorder memory operations with respect to preceeding buffer |
1730 | // invalidate. The invalidate is guaranteed to remove any cache lines of |
1731 | // earlier writes and ensures later writes will refetch the cache lines. |
1732 | Changed = true; |
1733 | } |
1734 | break; |
1735 | case SIAtomicScope::WAVEFRONT: |
1736 | case SIAtomicScope::SINGLETHREAD: |
1737 | // Could generate "BUFFER_INV" but it would do nothing as there are no |
1738 | // caches to invalidate. |
1739 | break; |
1740 | default: |
1741 | llvm_unreachable("Unsupported synchronization scope" ); |
1742 | } |
1743 | } |
1744 | |
1745 | /// The scratch address space does not need the global memory cache |
1746 | /// to be flushed as all memory operations by the same thread are |
1747 | /// sequentially consistent, and no other thread can access scratch |
1748 | /// memory. |
1749 | |
1750 | /// Other address spaces do not have a cache. |
1751 | |
1752 | if (Pos == Position::AFTER) |
1753 | --MI; |
1754 | |
1755 | return Changed; |
1756 | } |
1757 | |
1758 | bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, |
1759 | SIAtomicScope Scope, |
1760 | SIAtomicAddrSpace AddrSpace, |
1761 | bool IsCrossAddrSpaceOrdering, |
1762 | Position Pos) const { |
1763 | bool Changed = false; |
1764 | |
1765 | MachineBasicBlock &MBB = *MI->getParent(); |
1766 | DebugLoc DL = MI->getDebugLoc(); |
1767 | |
1768 | if (Pos == Position::AFTER) |
1769 | ++MI; |
1770 | |
1771 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1772 | switch (Scope) { |
1773 | case SIAtomicScope::SYSTEM: |
1774 | // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the |
1775 | // hardware does not reorder memory operations by the same wave with |
1776 | // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed |
1777 | // to initiate writeback of any dirty cache lines of earlier writes by the |
1778 | // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the |
1779 | // writeback has completed. |
1780 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) |
1781 | // Set SC bits to indicate system scope. |
1782 | .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1); |
1783 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1784 | // SIAtomicScope::SYSTEM, the following insertWait will generate the |
1785 | // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2". |
1786 | Changed = true; |
1787 | break; |
1788 | case SIAtomicScope::AGENT: |
1789 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)) |
1790 | // Set SC bits to indicate agent scope. |
1791 | .addImm(AMDGPU::CPol::SC1); |
1792 | |
1793 | // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is |
1794 | // SIAtomicScope::AGENT, the following insertWait will generate the |
1795 | // required "S_WAITCNT vmcnt(0)". |
1796 | Changed = true; |
1797 | break; |
1798 | case SIAtomicScope::WORKGROUP: |
1799 | case SIAtomicScope::WAVEFRONT: |
1800 | case SIAtomicScope::SINGLETHREAD: |
1801 | // Do not generate "BUFFER_WBL2" as there are no caches it would |
1802 | // writeback, and would require an otherwise unnecessary |
1803 | // "S_WAITCNT vmcnt(0)". |
1804 | break; |
1805 | default: |
1806 | llvm_unreachable("Unsupported synchronization scope" ); |
1807 | } |
1808 | } |
1809 | |
1810 | if (Pos == Position::AFTER) |
1811 | --MI; |
1812 | |
  // Insert the "S_WAITCNT" needed by any "BUFFER_WBL2" emitted above, as well
  // as any other required "S_WAITCNT".
  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                        IsCrossAddrSpaceOrdering, Pos);
1817 | |
1818 | return Changed; |
1819 | } |
1820 | |
1821 | bool SIGfx10CacheControl::enableLoadCacheBypass( |
1822 | const MachineBasicBlock::iterator &MI, |
1823 | SIAtomicScope Scope, |
1824 | SIAtomicAddrSpace AddrSpace) const { |
1825 | assert(MI->mayLoad() && !MI->mayStore()); |
1826 | bool Changed = false; |
1827 | |
1828 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
1829 | switch (Scope) { |
1830 | case SIAtomicScope::SYSTEM: |
1831 | case SIAtomicScope::AGENT: |
1832 | // Set the L0 and L1 cache policies to MISS_EVICT. |
1833 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1834 | Changed |= enableGLCBit(MI); |
1835 | Changed |= enableDLCBit(MI); |
1836 | break; |
1837 | case SIAtomicScope::WORKGROUP: |
1838 | // In WGP mode the waves of a work-group can be executing on either CU of |
1839 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
1840 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
1841 | // does not need to be bypassed. |
1842 | if (!ST.isCuModeEnabled()) |
1843 | Changed |= enableGLCBit(MI); |
1844 | break; |
1845 | case SIAtomicScope::WAVEFRONT: |
1846 | case SIAtomicScope::SINGLETHREAD: |
1847 | // No cache to bypass. |
1848 | break; |
1849 | default: |
1850 | llvm_unreachable("Unsupported synchronization scope" ); |
1851 | } |
1852 | } |
1853 | |
1854 | /// The scratch address space does not need the global memory caches |
1855 | /// to be bypassed as all memory operations by the same thread are |
1856 | /// sequentially consistent, and no other thread can access scratch |
1857 | /// memory. |
1858 | |
1859 | /// Other address spaces do not have a cache. |
1860 | |
1861 | return Changed; |
1862 | } |
1863 | |
1864 | bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal( |
1865 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
1866 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
1867 | |
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
1871 | assert(MI->mayLoad() ^ MI->mayStore()); |
1872 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
1877 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
1878 | |
1879 | bool Changed = false; |
1880 | |
1881 | if (IsVolatile) { |
1882 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
1883 | // and MISS_LRU for store instructions. |
1884 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
1885 | if (Op == SIMemOp::LOAD) { |
1886 | Changed |= enableGLCBit(MI); |
1887 | Changed |= enableDLCBit(MI); |
1888 | } |
1889 | |
1890 | // Ensure operation has completed at system scope to cause all volatile |
1891 | // operations to be visible outside the program in a global order. Do not |
1892 | // request cross address space as only the global address space can be |
1893 | // observable outside the program, so no need to cause a waitcnt for LDS |
1894 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
1897 | return Changed; |
1898 | } |
1899 | |
1900 | if (IsNonTemporal) { |
1901 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
1902 | // and L2 cache policy to STREAM. |
1903 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
1904 | // to MISS_EVICT and the L2 cache policy to STREAM. |
1905 | if (Op == SIMemOp::STORE) |
1906 | Changed |= enableGLCBit(MI); |
1907 | Changed |= enableSLCBit(MI); |
1908 | |
1909 | return Changed; |
1910 | } |
1911 | |
1912 | return Changed; |
1913 | } |
1914 | |
1915 | bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
1916 | SIAtomicScope Scope, |
1917 | SIAtomicAddrSpace AddrSpace, |
1918 | SIMemOp Op, |
1919 | bool IsCrossAddrSpaceOrdering, |
1920 | Position Pos) const { |
1921 | bool Changed = false; |
1922 | |
1923 | MachineBasicBlock &MBB = *MI->getParent(); |
1924 | DebugLoc DL = MI->getDebugLoc(); |
1925 | |
1926 | if (Pos == Position::AFTER) |
1927 | ++MI; |
1928 | |
1929 | bool VMCnt = false; |
1930 | bool VSCnt = false; |
1931 | bool LGKMCnt = false; |
1932 | |
1933 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
1934 | SIAtomicAddrSpace::NONE) { |
1935 | switch (Scope) { |
1936 | case SIAtomicScope::SYSTEM: |
1937 | case SIAtomicScope::AGENT: |
1938 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
1939 | VMCnt |= true; |
1940 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
1941 | VSCnt |= true; |
1942 | break; |
1943 | case SIAtomicScope::WORKGROUP: |
1944 | // In WGP mode the waves of a work-group can be executing on either CU of |
1945 | // the WGP. Therefore need to wait for operations to complete to ensure |
1946 | // they are visible to waves in the other CU as the L0 is per CU. |
      // Otherwise in CU mode all waves of a work-group are on the same CU
1948 | // which shares the same L0. |
1949 | if (!ST.isCuModeEnabled()) { |
1950 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
1951 | VMCnt |= true; |
1952 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
1953 | VSCnt |= true; |
1954 | } |
1955 | break; |
1956 | case SIAtomicScope::WAVEFRONT: |
1957 | case SIAtomicScope::SINGLETHREAD: |
1958 | // The L0 cache keeps all memory operations in order for |
1959 | // work-items in the same wavefront. |
1960 | break; |
1961 | default: |
1962 | llvm_unreachable("Unsupported synchronization scope" ); |
1963 | } |
1964 | } |
1965 | |
1966 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
1967 | switch (Scope) { |
1968 | case SIAtomicScope::SYSTEM: |
1969 | case SIAtomicScope::AGENT: |
1970 | case SIAtomicScope::WORKGROUP: |
1971 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
1972 | // not needed as LDS operations for all waves are executed in a total |
1973 | // global ordering as observed by all waves. Required if also |
1974 | // synchronizing with global/GDS memory as LDS operations could be |
1975 | // reordered with respect to later global/GDS memory operations of the |
1976 | // same wave. |
1977 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
1978 | break; |
1979 | case SIAtomicScope::WAVEFRONT: |
1980 | case SIAtomicScope::SINGLETHREAD: |
1981 | // The LDS keeps all memory operations in order for |
1982 | // the same wavefront. |
1983 | break; |
1984 | default: |
1985 | llvm_unreachable("Unsupported synchronization scope" ); |
1986 | } |
1987 | } |
1988 | |
1989 | if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { |
1990 | switch (Scope) { |
1991 | case SIAtomicScope::SYSTEM: |
1992 | case SIAtomicScope::AGENT: |
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1994 | // is not needed as GDS operations for all waves are executed in a total |
1995 | // global ordering as observed by all waves. Required if also |
1996 | // synchronizing with global/LDS memory as GDS operations could be |
1997 | // reordered with respect to later global/LDS memory operations of the |
1998 | // same wave. |
1999 | LGKMCnt |= IsCrossAddrSpaceOrdering; |
2000 | break; |
2001 | case SIAtomicScope::WORKGROUP: |
2002 | case SIAtomicScope::WAVEFRONT: |
2003 | case SIAtomicScope::SINGLETHREAD: |
2004 | // The GDS keeps all memory operations in order for |
2005 | // the same work-group. |
2006 | break; |
2007 | default: |
2008 | llvm_unreachable("Unsupported synchronization scope" ); |
2009 | } |
2010 | } |
2011 | |
2012 | if (VMCnt || LGKMCnt) { |
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2018 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft)) |
2019 | .addImm(WaitCntImmediate); |
2020 | Changed = true; |
2021 | } |
2022 | |
2023 | if (VSCnt) { |
2024 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft)) |
2025 | .addReg(AMDGPU::SGPR_NULL, RegState::Undef) |
2026 | .addImm(0); |
2027 | Changed = true; |
2028 | } |
2029 | |
2030 | if (Pos == Position::AFTER) |
2031 | --MI; |
2032 | |
2033 | return Changed; |
2034 | } |
2035 | |
2036 | bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2037 | SIAtomicScope Scope, |
2038 | SIAtomicAddrSpace AddrSpace, |
2039 | Position Pos) const { |
2040 | if (!InsertCacheInv) |
2041 | return false; |
2042 | |
2043 | bool Changed = false; |
2044 | |
2045 | MachineBasicBlock &MBB = *MI->getParent(); |
2046 | DebugLoc DL = MI->getDebugLoc(); |
2047 | |
2048 | if (Pos == Position::AFTER) |
2049 | ++MI; |
2050 | |
2051 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2052 | switch (Scope) { |
2053 | case SIAtomicScope::SYSTEM: |
2054 | case SIAtomicScope::AGENT: |
      // The order of invalidates matters here. We must invalidate "outer in"
2056 | // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is |
2057 | // invalidated. |
2058 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV)); |
2059 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); |
2060 | Changed = true; |
2061 | break; |
2062 | case SIAtomicScope::WORKGROUP: |
2063 | // In WGP mode the waves of a work-group can be executing on either CU of |
2064 | // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise |
      // in CU mode all waves of a work-group are on the same CU, and so the
2066 | // L0 does not need to be invalidated. |
2067 | if (!ST.isCuModeEnabled()) { |
2068 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV)); |
2069 | Changed = true; |
2070 | } |
2071 | break; |
2072 | case SIAtomicScope::WAVEFRONT: |
2073 | case SIAtomicScope::SINGLETHREAD: |
2074 | // No cache to invalidate. |
2075 | break; |
2076 | default: |
2077 | llvm_unreachable("Unsupported synchronization scope" ); |
2078 | } |
2079 | } |
2080 | |
2081 | /// The scratch address space does not need the global memory cache |
2082 | /// to be flushed as all memory operations by the same thread are |
2083 | /// sequentially consistent, and no other thread can access scratch |
2084 | /// memory. |
2085 | |
2086 | /// Other address spaces do not have a cache. |
2087 | |
2088 | if (Pos == Position::AFTER) |
2089 | --MI; |
2090 | |
2091 | return Changed; |
2092 | } |
2093 | |
2094 | bool SIGfx11CacheControl::enableLoadCacheBypass( |
2095 | const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, |
2096 | SIAtomicAddrSpace AddrSpace) const { |
2097 | assert(MI->mayLoad() && !MI->mayStore()); |
2098 | bool Changed = false; |
2099 | |
2100 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
2101 | switch (Scope) { |
2102 | case SIAtomicScope::SYSTEM: |
2103 | case SIAtomicScope::AGENT: |
2104 | // Set the L0 and L1 cache policies to MISS_EVICT. |
2105 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2106 | Changed |= enableGLCBit(MI); |
2107 | break; |
2108 | case SIAtomicScope::WORKGROUP: |
2109 | // In WGP mode the waves of a work-group can be executing on either CU of |
2110 | // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in |
2111 | // CU mode all waves of a work-group are on the same CU, and so the L0 |
2112 | // does not need to be bypassed. |
2113 | if (!ST.isCuModeEnabled()) |
2114 | Changed |= enableGLCBit(MI); |
2115 | break; |
2116 | case SIAtomicScope::WAVEFRONT: |
2117 | case SIAtomicScope::SINGLETHREAD: |
2118 | // No cache to bypass. |
2119 | break; |
2120 | default: |
2121 | llvm_unreachable("Unsupported synchronization scope" ); |
2122 | } |
2123 | } |
2124 | |
2125 | /// The scratch address space does not need the global memory caches |
2126 | /// to be bypassed as all memory operations by the same thread are |
2127 | /// sequentially consistent, and no other thread can access scratch |
2128 | /// memory. |
2129 | |
2130 | /// Other address spaces do not have a cache. |
2131 | |
2132 | return Changed; |
2133 | } |
2134 | |
2135 | bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( |
2136 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2137 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2138 | |
  // Only handle load and store, not atomic read-modify-write instructions. The
  // latter use glc to indicate if the atomic returns a result and so must not
  // be used for cache control.
2142 | assert(MI->mayLoad() ^ MI->mayStore()); |
2143 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
2148 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2149 | |
2150 | bool Changed = false; |
2151 | |
2152 | if (IsVolatile) { |
2153 | // Set L0 and L1 cache policy to be MISS_EVICT for load instructions |
2154 | // and MISS_LRU for store instructions. |
2155 | // Note: there is no L2 cache coherent bypass control at the ISA level. |
2156 | if (Op == SIMemOp::LOAD) |
2157 | Changed |= enableGLCBit(MI); |
2158 | |
2159 | // Set MALL NOALLOC for load and store instructions. |
2160 | Changed |= enableDLCBit(MI); |
2161 | |
2162 | // Ensure operation has completed at system scope to cause all volatile |
2163 | // operations to be visible outside the program in a global order. Do not |
2164 | // request cross address space as only the global address space can be |
2165 | // observable outside the program, so no need to cause a waitcnt for LDS |
2166 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
2169 | return Changed; |
2170 | } |
2171 | |
2172 | if (IsNonTemporal) { |
2173 | // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT |
2174 | // and L2 cache policy to STREAM. |
2175 | // For stores setting both GLC and SLC configures L0 and L1 cache policy |
2176 | // to MISS_EVICT and the L2 cache policy to STREAM. |
2177 | if (Op == SIMemOp::STORE) |
2178 | Changed |= enableGLCBit(MI); |
2179 | Changed |= enableSLCBit(MI); |
2180 | |
2181 | // Set MALL NOALLOC for load and store instructions. |
2182 | Changed |= enableDLCBit(MI); |
2183 | return Changed; |
2184 | } |
2185 | |
2186 | return Changed; |
2187 | } |
2188 | |
2189 | bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI, |
2190 | AMDGPU::CPol::CPol Value) const { |
2191 | MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); |
2192 | if (!CPol) |
2193 | return false; |
2194 | |
2195 | uint64_t NewTH = Value & AMDGPU::CPol::TH; |
2196 | if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) { |
2197 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH); |
2198 | return true; |
2199 | } |
2200 | |
2201 | return false; |
2202 | } |
2203 | |
2204 | bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, |
2205 | AMDGPU::CPol::CPol Value) const { |
2206 | MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); |
2207 | if (!CPol) |
2208 | return false; |
2209 | |
2210 | uint64_t NewScope = Value & AMDGPU::CPol::SCOPE; |
2211 | if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) { |
2212 | CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope); |
2213 | return true; |
2214 | } |
2215 | |
2216 | return false; |
2217 | } |
2218 | |
2219 | bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( |
2220 | const MachineBasicBlock::iterator MI) const { |
2221 | // TODO: implement flag for frontend to give us a hint not to insert waits. |
2222 | |
2223 | MachineBasicBlock &MBB = *MI->getParent(); |
2224 | const DebugLoc &DL = MI->getDebugLoc(); |
2225 | |
2226 | BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); |
2227 | BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); |
2228 | BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); |
2229 | BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); |
2230 | BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); |
2231 | |
2232 | return true; |
2233 | } |
2234 | |
2235 | bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, |
2236 | SIAtomicScope Scope, |
2237 | SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2238 | bool IsCrossAddrSpaceOrdering, |
2239 | Position Pos) const { |
2240 | bool Changed = false; |
2241 | |
2242 | MachineBasicBlock &MBB = *MI->getParent(); |
2243 | DebugLoc DL = MI->getDebugLoc(); |
2244 | |
2245 | bool LOADCnt = false; |
2246 | bool DSCnt = false; |
2247 | bool STORECnt = false; |
2248 | |
2249 | if (Pos == Position::AFTER) |
2250 | ++MI; |
2251 | |
2252 | if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) != |
2253 | SIAtomicAddrSpace::NONE) { |
2254 | switch (Scope) { |
2255 | case SIAtomicScope::SYSTEM: |
2256 | case SIAtomicScope::AGENT: |
2257 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2258 | LOADCnt |= true; |
2259 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2260 | STORECnt |= true; |
2261 | break; |
2262 | case SIAtomicScope::WORKGROUP: |
2263 | // In WGP mode the waves of a work-group can be executing on either CU of |
2264 | // the WGP. Therefore need to wait for operations to complete to ensure |
2265 | // they are visible to waves in the other CU as the L0 is per CU. |
      // Otherwise in CU mode all waves of a work-group are on the same CU
2267 | // which shares the same L0. |
2268 | if (!ST.isCuModeEnabled()) { |
2269 | if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) |
2270 | LOADCnt |= true; |
2271 | if ((Op & SIMemOp::STORE) != SIMemOp::NONE) |
2272 | STORECnt |= true; |
2273 | } |
2274 | break; |
2275 | case SIAtomicScope::WAVEFRONT: |
2276 | case SIAtomicScope::SINGLETHREAD: |
2277 | // The L0 cache keeps all memory operations in order for |
2278 | // work-items in the same wavefront. |
2279 | break; |
2280 | default: |
2281 | llvm_unreachable("Unsupported synchronization scope" ); |
2282 | } |
2283 | } |
2284 | |
2285 | if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { |
2286 | switch (Scope) { |
2287 | case SIAtomicScope::SYSTEM: |
2288 | case SIAtomicScope::AGENT: |
2289 | case SIAtomicScope::WORKGROUP: |
2290 | // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is |
2291 | // not needed as LDS operations for all waves are executed in a total |
2292 | // global ordering as observed by all waves. Required if also |
2293 | // synchronizing with global/GDS memory as LDS operations could be |
2294 | // reordered with respect to later global/GDS memory operations of the |
2295 | // same wave. |
2296 | DSCnt |= IsCrossAddrSpaceOrdering; |
2297 | break; |
2298 | case SIAtomicScope::WAVEFRONT: |
2299 | case SIAtomicScope::SINGLETHREAD: |
2300 | // The LDS keeps all memory operations in order for |
2301 | // the same wavefront. |
2302 | break; |
2303 | default: |
2304 | llvm_unreachable("Unsupported synchronization scope" ); |
2305 | } |
2306 | } |
2307 | |
2308 | if (LOADCnt) { |
2309 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); |
2310 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); |
2311 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0); |
2312 | Changed = true; |
2313 | } |
2314 | |
2315 | if (STORECnt) { |
2316 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0); |
2317 | Changed = true; |
2318 | } |
2319 | |
2320 | if (DSCnt) { |
2321 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0); |
2322 | Changed = true; |
2323 | } |
2324 | |
2325 | if (Pos == Position::AFTER) |
2326 | --MI; |
2327 | |
2328 | return Changed; |
2329 | } |
2330 | |
2331 | bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, |
2332 | SIAtomicScope Scope, |
2333 | SIAtomicAddrSpace AddrSpace, |
2334 | Position Pos) const { |
2335 | if (!InsertCacheInv) |
2336 | return false; |
2337 | |
2338 | MachineBasicBlock &MBB = *MI->getParent(); |
2339 | DebugLoc DL = MI->getDebugLoc(); |
2340 | |
2341 | /// The scratch address space does not need the global memory cache |
2342 | /// to be flushed as all memory operations by the same thread are |
2343 | /// sequentially consistent, and no other thread can access scratch |
2344 | /// memory. |
2345 | |
2346 | /// Other address spaces do not have a cache. |
2347 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) |
2348 | return false; |
2349 | |
2350 | AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2351 | switch (Scope) { |
2352 | case SIAtomicScope::SYSTEM: |
2353 | ScopeImm = AMDGPU::CPol::SCOPE_SYS; |
2354 | break; |
2355 | case SIAtomicScope::AGENT: |
2356 | ScopeImm = AMDGPU::CPol::SCOPE_DEV; |
2357 | break; |
2358 | case SIAtomicScope::WORKGROUP: |
2359 | // In WGP mode the waves of a work-group can be executing on either CU of |
2360 | // the WGP. Therefore we need to invalidate the L0 which is per CU. |
2361 | // Otherwise in CU mode all waves of a work-group are on the same CU, and so |
2362 | // the L0 does not need to be invalidated. |
2363 | if (ST.isCuModeEnabled()) |
2364 | return false; |
2365 | |
2366 | ScopeImm = AMDGPU::CPol::SCOPE_SE; |
2367 | break; |
2368 | case SIAtomicScope::WAVEFRONT: |
2369 | case SIAtomicScope::SINGLETHREAD: |
2370 | // No cache to invalidate. |
2371 | return false; |
2372 | default: |
2373 | llvm_unreachable("Unsupported synchronization scope" ); |
2374 | } |
2375 | |
2376 | if (Pos == Position::AFTER) |
2377 | ++MI; |
2378 | |
2379 | BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); |
2380 | |
2381 | if (Pos == Position::AFTER) |
2382 | --MI; |
2383 | |
2384 | return true; |
2385 | } |
2386 | |
2387 | bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( |
2388 | MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, |
2389 | bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const { |
2390 | |
2391 | // Only handle load and store, not atomic read-modify-write instructions. |
2392 | assert(MI->mayLoad() ^ MI->mayStore()); |
2393 | |
  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
2398 | assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE); |
2399 | |
2400 | bool Changed = false; |
2401 | |
2402 | if (IsLastUse) { |
2403 | // Set last-use hint. |
    Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2405 | } else if (IsNonTemporal) { |
2406 | // Set non-temporal hint for all cache levels. |
    Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2408 | } |
2409 | |
2410 | if (IsVolatile) { |
    Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2412 | |
2413 | if (Op == SIMemOp::STORE) |
2414 | Changed |= insertWaitsBeforeSystemScopeStore(MI); |
2415 | |
2416 | // Ensure operation has completed at system scope to cause all volatile |
2417 | // operations to be visible outside the program in a global order. Do not |
2418 | // request cross address space as only the global address space can be |
2419 | // observable outside the program, so no need to cause a waitcnt for LDS |
2420 | // address space operations. |
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
2423 | } |
2424 | |
2425 | return Changed; |
2426 | } |
2427 | |
2428 | bool SIGfx12CacheControl::expandSystemScopeStore( |
2429 | MachineBasicBlock::iterator &MI) const { |
2430 | MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); |
2431 | if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) |
2432 | return insertWaitsBeforeSystemScopeStore(MI); |
2433 | |
2434 | return false; |
2435 | } |
2436 | |
2437 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { |
2438 | if (AtomicPseudoMIs.empty()) |
2439 | return false; |
2440 | |
2441 | for (auto &MI : AtomicPseudoMIs) |
2442 | MI->eraseFromParent(); |
2443 | |
2444 | AtomicPseudoMIs.clear(); |
2445 | return true; |
2446 | } |
2447 | |
2448 | bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, |
2449 | MachineBasicBlock::iterator &MI) { |
2450 | assert(MI->mayLoad() && !MI->mayStore()); |
2451 | |
2452 | bool Changed = false; |
2453 | |
2454 | if (MOI.isAtomic()) { |
2455 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2456 | MOI.getOrdering() == AtomicOrdering::Acquire || |
2457 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
2460 | } |
2461 | |
2462 | if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);
2468 | |
2469 | if (MOI.getOrdering() == AtomicOrdering::Acquire || |
2470 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
2479 | } |
2480 | |
2481 | return Changed; |
2482 | } |
2483 | |
2484 | // Atomic instructions already bypass caches to the scope specified by the |
2485 | // SyncScope operand. Only non-atomic volatile and nontemporal/last-use |
2486 | // instructions need additional treatment. |
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
      MOI.isNonTemporal(), MOI.isLastUse());
2490 | |
2491 | return Changed; |
2492 | } |
2493 | |
2494 | bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, |
2495 | MachineBasicBlock::iterator &MI) { |
2496 | assert(!MI->mayLoad() && MI->mayStore()); |
2497 | |
2498 | bool Changed = false; |
2499 | |
2500 | if (MOI.isAtomic()) { |
2501 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2502 | MOI.getOrdering() == AtomicOrdering::Release || |
2503 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
      Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
                                            MOI.getOrderingAddrSpace());
2506 | } |
2507 | |
2508 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2509 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);
2514 | |
2515 | return Changed; |
2516 | } |
2517 | |
2518 | // Atomic instructions already bypass caches to the scope specified by the |
2519 | // SyncScope operand. Only non-atomic volatile and nontemporal instructions |
2520 | // need additional treatment. |
  Changed |= CC->enableVolatileAndOrNonTemporal(
      MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
      MOI.isNonTemporal());
2524 | |
  // GFX12 specific: the scope (the desired coherence domain in the cache
  // hierarchy) is an instruction field; do not confuse it with the atomic
  // scope.
2527 | Changed |= CC->expandSystemScopeStore(MI); |
2528 | return Changed; |
2529 | } |
2530 | |
2531 | bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, |
2532 | MachineBasicBlock::iterator &MI) { |
2533 | assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); |
2534 | |
  AtomicPseudoMIs.push_back(MI);
2536 | bool Changed = false; |
2537 | |
2538 | if (MOI.isAtomic()) { |
2539 | if (MOI.getOrdering() == AtomicOrdering::Acquire) |
      Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);
2544 | |
2545 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2546 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2547 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If the barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding S_WAITCNT before a S_BARRIER.
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);
2559 | |
2560 | // TODO: If both release and invalidate are happening they could be combined |
2561 | // to use the single "BUFFER_WBINV*" instruction. This could be done by |
2562 | // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to |
2563 | // track cache invalidate and write back instructions. |
2564 | |
2565 | if (MOI.getOrdering() == AtomicOrdering::Acquire || |
2566 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2567 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) |
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::BEFORE);
2571 | |
2572 | return Changed; |
2573 | } |
2574 | |
2575 | return Changed; |
2576 | } |
2577 | |
2578 | bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, |
2579 | MachineBasicBlock::iterator &MI) { |
2580 | assert(MI->mayLoad() && MI->mayStore()); |
2581 | |
2582 | bool Changed = false; |
2583 | |
2584 | if (MOI.isAtomic()) { |
2585 | if (MOI.getOrdering() == AtomicOrdering::Monotonic || |
2586 | MOI.getOrdering() == AtomicOrdering::Acquire || |
2587 | MOI.getOrdering() == AtomicOrdering::Release || |
2588 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2589 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { |
      Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
                                          MOI.getInstrAddrSpace());
2592 | } |
2593 | |
2594 | if (MOI.getOrdering() == AtomicOrdering::Release || |
2595 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2596 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || |
2597 | MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) |
      Changed |= CC->insertRelease(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   MOI.getIsCrossAddressSpaceOrdering(),
                                   Position::BEFORE);
2602 | |
2603 | if (MOI.getOrdering() == AtomicOrdering::Acquire || |
2604 | MOI.getOrdering() == AtomicOrdering::AcquireRelease || |
2605 | MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || |
2606 | MOI.getFailureOrdering() == AtomicOrdering::Acquire || |
2607 | MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { |
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD
                                                 : SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertAcquire(MI, MOI.getScope(),
                                   MOI.getOrderingAddrSpace(),
                                   Position::AFTER);
2617 | } |
2618 | |
2619 | return Changed; |
2620 | } |
2621 | |
2622 | return Changed; |
2623 | } |
2624 | |
2625 | bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { |
2626 | bool Changed = false; |
2627 | |
2628 | SIMemOpAccess MOA(MF); |
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2630 | |
2631 | for (auto &MBB : MF) { |
2632 | for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { |
2633 | |
2634 | // Unbundle instructions after the post-RA scheduler. |
2635 | if (MI->isBundle() && MI->mayLoadOrStore()) { |
2636 | MachineBasicBlock::instr_iterator II(MI->getIterator()); |
2637 | for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end(); |
2638 | I != E && I->isBundledWithPred(); ++I) { |
2639 | I->unbundleFromPred(); |
2640 | for (MachineOperand &MO : I->operands()) |
2641 | if (MO.isReg()) |
2642 | MO.setIsInternalRead(false); |
2643 | } |
2644 | |
2645 | MI->eraseFromParent(); |
2646 | MI = II->getIterator(); |
2647 | } |
2648 | |
2649 | if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) |
2650 | continue; |
2651 | |
2652 | if (const auto &MOI = MOA.getLoadInfo(MI)) |
2653 | Changed |= expandLoad(MOI: *MOI, MI); |
2654 | else if (const auto &MOI = MOA.getStoreInfo(MI)) { |
2655 | Changed |= expandStore(MOI: *MOI, MI); |
2656 | Changed |= CC->tryForceStoreSC0SC1(MOI: *MOI, MI); |
2657 | } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) |
2658 | Changed |= expandAtomicFence(MOI: *MOI, MI); |
2659 | else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) |
2660 | Changed |= expandAtomicCmpxchgOrRmw(MOI: *MOI, MI); |
2661 | } |
2662 | } |
2663 | |
2664 | Changed |= removeAtomicPseudoMIs(); |
2665 | return Changed; |
2666 | } |
2667 | |
2668 | INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) |
2669 | |
2670 | char SIMemoryLegalizer::ID = 0; |
2671 | char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; |
2672 | |
2673 | FunctionPass *llvm::createSIMemoryLegalizerPass() { |
2674 | return new SIMemoryLegalizer(); |
2675 | } |
2676 | |