AtomicExpandPass.cpp source code [llvm/lib/CodeGen/AtomicExpandPass.cpp]

//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass (at IR level) to replace atomic instructions with
// __atomic_* library calls, or target specific instruction which implement the
// same semantics in a way which better fits the target backend.  This can
// include the use of (intrinsic-based) load-linked/store-conditional loops,
// AtomicCmpXchg, or type coercions.
//
//===----------------------------------------------------------------------===//
16
17	#include "llvm/ADT/ArrayRef.h"
18	#include "llvm/ADT/STLFunctionalExtras.h"
19	#include "llvm/ADT/SmallVector.h"
20	#include "llvm/Analysis/InstSimplifyFolder.h"
21	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
22	#include "llvm/CodeGen/AtomicExpand.h"
23	#include "llvm/CodeGen/AtomicExpandUtils.h"
24	#include "llvm/CodeGen/RuntimeLibcalls.h"
25	#include "llvm/CodeGen/TargetLowering.h"
26	#include "llvm/CodeGen/TargetPassConfig.h"
27	#include "llvm/CodeGen/TargetSubtargetInfo.h"
28	#include "llvm/CodeGen/ValueTypes.h"
29	#include "llvm/IR/Attributes.h"
30	#include "llvm/IR/BasicBlock.h"
31	#include "llvm/IR/Constant.h"
32	#include "llvm/IR/Constants.h"
33	#include "llvm/IR/DataLayout.h"
34	#include "llvm/IR/DerivedTypes.h"
35	#include "llvm/IR/Function.h"
36	#include "llvm/IR/IRBuilder.h"
37	#include "llvm/IR/InstIterator.h"
38	#include "llvm/IR/Instruction.h"
39	#include "llvm/IR/Instructions.h"
40	#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
41	#include "llvm/IR/Module.h"
42	#include "llvm/IR/Type.h"
43	#include "llvm/IR/User.h"
44	#include "llvm/IR/Value.h"
45	#include "llvm/InitializePasses.h"
46	#include "llvm/Pass.h"
47	#include "llvm/Support/AtomicOrdering.h"
48	#include "llvm/Support/Casting.h"
49	#include "llvm/Support/Debug.h"
50	#include "llvm/Support/ErrorHandling.h"
51	#include "llvm/Support/raw_ostream.h"
52	#include "llvm/Target/TargetMachine.h"
53	#include "llvm/Transforms/Utils/LowerAtomic.h"
54	#include <cassert>
55	#include <cstdint>
56	#include <iterator>
57
58	using namespace llvm;
59
60	#define DEBUG_TYPE "atomic-expand"
61
62	namespace {
63
64	class AtomicExpandImpl {
65	const TargetLowering TLI = nullptr*;
66	const DataLayout DL = nullptr*;
67
68	private:
69	bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
70	IntegerType getCorrespondingIntegerType(Type T, const DataLayout &DL);
71	LoadInst convertAtomicLoadToIntegerType(LoadInst LI);
72	bool tryExpandAtomicLoad(LoadInst *LI);
73	bool expandAtomicLoadToLL(LoadInst *LI);
74	bool expandAtomicLoadToCmpXchg(LoadInst *LI);
75	StoreInst convertAtomicStoreToIntegerType(StoreInst SI);
76	bool tryExpandAtomicStore(StoreInst *SI);
77	void expandAtomicStore(StoreInst *SI);
78	bool tryExpandAtomicRMW(AtomicRMWInst *AI);
79	AtomicRMWInst convertAtomicXchgToIntegerType(AtomicRMWInst RMWI);
80	Value *
81	insertRMWLLSCLoop(IRBuilderBase &Builder, Type ResultTy, Value Addr,
82	Align AddrAlign, AtomicOrdering MemOpOrder,
83	function_ref<Value (IRBuilderBase &, Value )> PerformOp);
84	void expandAtomicOpToLLSC(
85	Instruction I, Type ResultTy, Value *Addr, Align AddrAlign,
86	AtomicOrdering MemOpOrder,
87	function_ref<Value (IRBuilderBase &, Value )> PerformOp);
88	void expandPartwordAtomicRMW(
89	AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
90	AtomicRMWInst widenPartwordAtomicRMW(AtomicRMWInst AI);
91	bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
92	void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
93	void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
94
95	AtomicCmpXchgInst convertCmpXchgToIntegerType(AtomicCmpXchgInst CI);
96	static Value *insertRMWCmpXchgLoop(
97	IRBuilderBase &Builder, Type ResultType, Value Addr, Align AddrAlign,
98	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
99	function_ref<Value (IRBuilderBase &, Value )> PerformOp,
100	CreateCmpXchgInstFun CreateCmpXchg);
101	bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
102
103	bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
104	bool isIdempotentRMW(AtomicRMWInst *RMWI);
105	bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
106
107	bool expandAtomicOpToLibcall(Instruction I, unsigned* Size, Align Alignment,
108	Value PointerOperand, Value ValueOperand,
109	Value *CASExpected, AtomicOrdering Ordering,
110	AtomicOrdering Ordering2,
111	ArrayRef<RTLIB::Libcall> Libcalls);
112	void expandAtomicLoadToLibcall(LoadInst *LI);
113	void expandAtomicStoreToLibcall(StoreInst *LI);
114	void expandAtomicRMWToLibcall(AtomicRMWInst *I);
115	void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
116
117	friend bool
118	llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
119	CreateCmpXchgInstFun CreateCmpXchg);
120
121	public:
122	bool run(Function &F, const TargetMachine *TM);
123	};
124
125	class AtomicExpandLegacy : public FunctionPass {
126	public:
127	static char ID; // Pass identification, replacement for typeid
128
129	AtomicExpandLegacy() : FunctionPass (ID) {
130	initializeAtomicExpandLegacyPass(*PassRegistry::getPassRegistry());
131	}
132
133	bool runOnFunction(Function &F) override;
134	};
135
136	// IRBuilder to be used for replacement atomic instructions.
137	struct ReplacementIRBuilder
138	: IRBuilder<InstSimplifyFolder, IRBuilderCallbackInserter> {
139	MDNode MMRAMD = nullptr*;
140
141	// Preserves the DebugLoc from I, and preserves still valid metadata.
142	// Enable StrictFP builder mode when appropriate.
143	explicit ReplacementIRBuilder(Instruction I, const* DataLayout &DL)
144	: IRBuilder (I->getContext(), DL,
145	IRBuilderCallbackInserter (
146	[this](Instruction *I) { addMMRAMD(I); })) {
147	SetInsertPoint(I);
148	this->CollectMetadataToCopy(Src: I, MetadataKinds: {LLVMContext::MD_pcsections});
149	if (BB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP))
150	this->setIsFPConstrained(true);
151
152	MMRAMD = I->getMetadata(KindID: LLVMContext::MD_mmra);
153	}
154
155	void addMMRAMD(Instruction *I) {
156	if (canInstructionHaveMMRAs(I: *I))
157	I->setMetadata(KindID: LLVMContext::MD_mmra, Node: MMRAMD);
158	}
159	};
160
161	} // end anonymous namespace
162
163	char AtomicExpandLegacy::ID = `0`;
164
165	char &llvm::AtomicExpandID = AtomicExpandLegacy::ID;
166
167	INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE,
168	"Expand Atomic instructions", false, false)
169	INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
170	INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE,
171	"Expand Atomic instructions", false, false)
172
173	// Helper functions to retrieve the size of atomic instructions.
174	static unsigned getAtomicOpSize(LoadInst *LI) {
175	const DataLayout &DL = LI->getModule()->getDataLayout();
176	return DL.getTypeStoreSize(Ty: LI->getType());
177	}
178
179	static unsigned getAtomicOpSize(StoreInst *SI) {
180	const DataLayout &DL = SI->getModule()->getDataLayout();
181	return DL.getTypeStoreSize(Ty: SI->getValueOperand()->getType());
182	}
183
184	static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
185	const DataLayout &DL = RMWI->getModule()->getDataLayout();
186	return DL.getTypeStoreSize(Ty: RMWI->getValOperand()->getType());
187	}
188
189	static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
190	const DataLayout &DL = CASI->getModule()->getDataLayout();
191	return DL.getTypeStoreSize(Ty: CASI->getCompareOperand()->getType());
192	}
193
194	// Determine if a particular atomic operation has a supported size,
195	// and is of appropriate alignment, to be passed through for target
196	// lowering. (Versus turning into a __atomic libcall)
197	template <typename Inst>
198	static bool atomicSizeSupported(const TargetLowering TLI, Inst I) {
199	unsigned Size = getAtomicOpSize(I);
200	Align Alignment = I->getAlign();
201	return Alignment >= Size &&
202	Size <= TLI->getMaxAtomicSizeInBitsSupported() / `8`;
203	}
204
205	bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
206	const auto *Subtarget = TM->getSubtargetImpl(F);
207	if (!Subtarget->enableAtomicExpand())
208	return false;
209	TLI = Subtarget->getTargetLowering();
210	DL = &F.getParent()->getDataLayout();
211
212	SmallVector<Instruction *, `1`> AtomicInsts;
213
214	// Changing control-flow while iterating through it is a bad idea, so gather a
215	// list of all atomic instructions before we start.
216	for (Instruction &I : instructions(F))
217	if (I.isAtomic() && !isa<FenceInst>(Val: &I))
218	AtomicInsts.push_back(Elt: &I);
219
220	bool MadeChange = false;
221	for (auto *I : AtomicInsts) {
222	auto LI = dyn_cast<LoadInst>(Val: I);
223	auto SI = dyn_cast<StoreInst>(Val: I);
224	auto RMWI = dyn_cast<AtomicRMWInst>(Val: I);
225	auto CASI = dyn_cast<AtomicCmpXchgInst>(Val: I);
226	assert((LI \|\| SI \|\| RMWI \|\| CASI) && "Unknown atomic instruction");
227
228	// If the Size/Alignment is not supported, replace with a libcall.
229	if (LI) {
230	if (!atomicSizeSupported(TLI, I: LI)) {
231	expandAtomicLoadToLibcall(LI);
232	MadeChange = true;
233	continue;
234	}
235	} else if (SI) {
236	if (!atomicSizeSupported(TLI, I: SI)) {
237	expandAtomicStoreToLibcall(LI: SI);
238	MadeChange = true;
239	continue;
240	}
241	} else if (RMWI) {
242	if (!atomicSizeSupported(TLI, I: RMWI)) {
243	expandAtomicRMWToLibcall(I: RMWI);
244	MadeChange = true;
245	continue;
246	}
247	} else if (CASI) {
248	if (!atomicSizeSupported(TLI, I: CASI)) {
249	expandAtomicCASToLibcall(I: CASI);
250	MadeChange = true;
251	continue;
252	}
253	}
254
255	if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==
256	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
257	I = LI = convertAtomicLoadToIntegerType(LI);
258	MadeChange = true;
259	} else if (SI &&
260	TLI->shouldCastAtomicStoreInIR(SI) ==
261	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
262	I = SI = convertAtomicStoreToIntegerType(SI);
263	MadeChange = true;
264	} else if (RMWI &&
265	TLI->shouldCastAtomicRMWIInIR(RMWI) ==
266	TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
267	I = RMWI = convertAtomicXchgToIntegerType(RMWI);
268	MadeChange = true;
269	} else if (CASI) {
270	// TODO: when we're ready to make the change at the IR level, we can
271	// extend convertCmpXchgToInteger for floating point too.
272	if (CASI->getCompareOperand()->getType()->isPointerTy()) {
273	// TODO: add a TLI hook to control this so that each target can
274	// convert to lowering the original type one at a time.
275	I = CASI = convertCmpXchgToIntegerType(CI: CASI);
276	MadeChange = true;
277	}
278	}
279
280	if (TLI->shouldInsertFencesForAtomic(I)) {
281	auto FenceOrdering = AtomicOrdering::Monotonic;
282	if (LI && isAcquireOrStronger(AO: LI->getOrdering())) {
283	FenceOrdering = LI->getOrdering();
284	LI->setOrdering(AtomicOrdering::Monotonic);
285	} else if (SI && isReleaseOrStronger(AO: SI->getOrdering())) {
286	FenceOrdering = SI->getOrdering();
287	SI->setOrdering(AtomicOrdering::Monotonic);
288	} else if (RMWI && (isReleaseOrStronger(AO: RMWI->getOrdering()) \|\|
289	isAcquireOrStronger(AO: RMWI->getOrdering()))) {
290	FenceOrdering = RMWI->getOrdering();
291	RMWI->setOrdering(AtomicOrdering::Monotonic);
292	} else if (CASI &&
293	TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI) ==
294	TargetLoweringBase::AtomicExpansionKind::None &&
295	(isReleaseOrStronger(AO: CASI->getSuccessOrdering()) \|\|
296	isAcquireOrStronger(AO: CASI->getSuccessOrdering()) \|\|
297	isAcquireOrStronger(AO: CASI->getFailureOrdering()))) {
298	// If a compare and swap is lowered to LL/SC, we can do smarter fence
299	// insertion, with a stronger one on the success path than on the
300	// failure path. As a result, fence insertion is directly done by
301	// expandAtomicCmpXchg in that case.
302	FenceOrdering = CASI->getMergedOrdering();
303	CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
304	CASI->setFailureOrdering(AtomicOrdering::Monotonic);
305	}
306
307	if (FenceOrdering != AtomicOrdering::Monotonic) {
308	MadeChange \|= bracketInstWithFences(I, Order: FenceOrdering);
309	}
310	} else if (I->hasAtomicStore() &&
311	TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
312	auto FenceOrdering = AtomicOrdering::Monotonic;
313	if (SI)
314	FenceOrdering = SI->getOrdering();
315	else if (RMWI)
316	FenceOrdering = RMWI->getOrdering();
317	else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(AI: CASI) !=
318	TargetLoweringBase::AtomicExpansionKind::LLSC)
319	// LLSC is handled in expandAtomicCmpXchg().
320	FenceOrdering = CASI->getSuccessOrdering();
321
322	IRBuilder Builder(I);
323	if (auto TrailingFence =
324	TLI->emitTrailingFence(Builder, Inst: I, Ord: FenceOrdering)) {
325	TrailingFence->moveAfter(MovePos: I);
326	MadeChange = true;
327	}
328	}
329
330	if (LI)
331	MadeChange \|= tryExpandAtomicLoad(LI);
332	else if (SI)
333	MadeChange \|= tryExpandAtomicStore(SI);
334	else if (RMWI) {
335	// There are two different ways of expanding RMW instructions:
336	// - into a load if it is idempotent
337	// - into a Cmpxchg/LL-SC loop otherwise
338	// we try them in that order.
339
340	if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
341	MadeChange = true;
342	} else {
343	MadeChange \|= tryExpandAtomicRMW(AI: RMWI);
344	}
345	} else if (CASI)
346	MadeChange \|= tryExpandAtomicCmpXchg(CI: CASI);
347	}
348	return MadeChange;
349	}
350
351	bool AtomicExpandLegacy::runOnFunction(Function &F) {
352
353	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
354	if (!TPC)
355	return false;
356	auto *TM = &TPC->getTM<TargetMachine>();
357	AtomicExpandImpl AE;
358	return AE.run(F, TM);
359	}
360
361	FunctionPass *llvm::createAtomicExpandLegacyPass() {
362	return new AtomicExpandLegacy ();
363	}
364
365	PreservedAnalyses AtomicExpandPass::run(Function &F,
366	FunctionAnalysisManager &AM) {
367	AtomicExpandImpl AE;
368
369	bool Changed = AE.run(F, TM);
370	if (!Changed)
371	return PreservedAnalyses::all();
372
373	return PreservedAnalyses::none();
374	}
375
376	bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
377	AtomicOrdering Order) {
378	ReplacementIRBuilder Builder(I, *DL);
379
380	auto LeadingFence = TLI->emitLeadingFence(Builder, Inst: I, Ord: Order);
381
382	auto TrailingFence = TLI->emitTrailingFence(Builder, Inst: I, Ord: Order);
383	// We have a guard here because not every atomic operation generates a
384	// trailing fence.
385	if (TrailingFence)
386	TrailingFence->moveAfter(MovePos: I);
387
388	return (LeadingFence \|\| TrailingFence);
389	}
390
391	/// Get the iX type with the same bitwidth as T.
392	IntegerType *
393	AtomicExpandImpl::getCorrespondingIntegerType(Type T, const* DataLayout &DL) {
394	EVT VT = TLI->getMemValueType(DL, Ty: T);
395	unsigned BitWidth = VT.getStoreSizeInBits();
396	assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
397	return IntegerType::get(C&: T->getContext(), NumBits: BitWidth);
398	}
399
400	/// Convert an atomic load of a non-integral type to an integer load of the
401	/// equivalent bitwidth. See the function comment on
402	/// convertAtomicStoreToIntegerType for background.
403	LoadInst AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst LI) {
404	auto *M = LI->getModule();
405	Type *NewTy = getCorrespondingIntegerType(T: LI->getType(), DL: M->getDataLayout());
406
407	ReplacementIRBuilder Builder(LI, *DL);
408
409	Value *Addr = LI->getPointerOperand();
410
411	auto *NewLI = Builder.CreateLoad(Ty: NewTy, Ptr: Addr);
412	NewLI->setAlignment(LI->getAlign());
413	NewLI->setVolatile(LI->isVolatile());
414	NewLI->setAtomic(Ordering: LI->getOrdering(), SSID: LI->getSyncScopeID());
415	LLVM_DEBUG(dbgs() << "Replaced " << LI << " with " << NewLI << "\n");
416
417	Value *NewVal = Builder.CreateBitCast(V: NewLI, DestTy: LI->getType());
418	LI->replaceAllUsesWith(V: NewVal);
419	LI->eraseFromParent();
420	return NewLI;
421	}
422
423	AtomicRMWInst *
424	AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
425	auto *M = RMWI->getModule();
426	Type *NewTy =
427	getCorrespondingIntegerType(T: RMWI->getType(), DL: M->getDataLayout());
428
429	ReplacementIRBuilder Builder(RMWI, *DL);
430
431	Value *Addr = RMWI->getPointerOperand();
432	Value *Val = RMWI->getValOperand();
433	Value *NewVal = Val->getType()->isPointerTy()
434	? Builder.CreatePtrToInt(V: Val, DestTy: NewTy)
435	: Builder.CreateBitCast(V: Val, DestTy: NewTy);
436
437	auto *NewRMWI = Builder.CreateAtomicRMW(Op: AtomicRMWInst::Xchg, Ptr: Addr, Val: NewVal,
438	Align: RMWI->getAlign(), Ordering: RMWI->getOrdering(),
439	SSID: RMWI->getSyncScopeID());
440	NewRMWI->setVolatile(RMWI->isVolatile());
441	LLVM_DEBUG(dbgs() << "Replaced " << RMWI << " with " << NewRMWI << "\n");
442
443	Value *NewRVal = RMWI->getType()->isPointerTy()
444	? Builder.CreateIntToPtr(V: NewRMWI, DestTy: RMWI->getType())
445	: Builder.CreateBitCast(V: NewRMWI, DestTy: RMWI->getType());
446	RMWI->replaceAllUsesWith(V: NewRVal);
447	RMWI->eraseFromParent();
448	return NewRMWI;
449	}
450
451	bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
452	switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
453	case TargetLoweringBase::AtomicExpansionKind::None:
454	return false;
455	case TargetLoweringBase::AtomicExpansionKind::LLSC:
456	expandAtomicOpToLLSC(
457	I: LI, ResultTy: LI->getType(), Addr: LI->getPointerOperand(), AddrAlign: LI->getAlign(),
458	MemOpOrder: LI->getOrdering(),
459	PerformOp: [](IRBuilderBase &Builder, Value Loaded) { return* Loaded; });
460	return true;
461	case TargetLoweringBase::AtomicExpansionKind::LLOnly:
462	return expandAtomicLoadToLL(LI);
463	case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
464	return expandAtomicLoadToCmpXchg(LI);
465	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
466	LI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
467	return true;
468	default:
469	llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
470	}
471	}
472
473	bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
474	switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
475	case TargetLoweringBase::AtomicExpansionKind::None:
476	return false;
477	case TargetLoweringBase::AtomicExpansionKind::Expand:
478	expandAtomicStore(SI);
479	return true;
480	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
481	SI->setAtomic(Ordering: AtomicOrdering::NotAtomic);
482	return true;
483	default:
484	llvm_unreachable("Unhandled case in tryExpandAtomicStore");
485	}
486	}
487
488	bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) {
489	ReplacementIRBuilder Builder(LI, *DL);
490
491	// On some architectures, load-linked instructions are atomic for larger
492	// sizes than normal loads. For example, the only 64-bit load guaranteed
493	// to be single-copy atomic by ARM is an ldrexd (A3.5.3).
494	Value *Val = TLI->emitLoadLinked(Builder, ValueTy: LI->getType(),
495	Addr: LI->getPointerOperand(), Ord: LI->getOrdering());
496	TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
497
498	LI->replaceAllUsesWith(V: Val);
499	LI->eraseFromParent();
500
501	return true;
502	}
503
504	bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) {
505	ReplacementIRBuilder Builder(LI, *DL);
506	AtomicOrdering Order = LI->getOrdering();
507	if (Order == AtomicOrdering::Unordered)
508	Order = AtomicOrdering::Monotonic;
509
510	Value *Addr = LI->getPointerOperand();
511	Type *Ty = LI->getType();
512	Constant *DummyVal = Constant::getNullValue(Ty);
513
514	Value *Pair = Builder.CreateAtomicCmpXchg(
515	Ptr: Addr, Cmp: DummyVal, New: DummyVal, Align: LI->getAlign(), SuccessOrdering: Order,
516	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: Order));
517	Value *Loaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "loaded");
518
519	LI->replaceAllUsesWith(V: Loaded);
520	LI->eraseFromParent();
521
522	return true;
523	}
524
525	/// Convert an atomic store of a non-integral type to an integer store of the
526	/// equivalent bitwidth. We used to not support floating point or vector
527	/// atomics in the IR at all. The backends learned to deal with the bitcast
528	/// idiom because that was the only way of expressing the notion of a atomic
529	/// float or vector store. The long term plan is to teach each backend to
530	/// instruction select from the original atomic store, but as a migration
531	/// mechanism, we convert back to the old format which the backends understand.
532	/// Each backend will need individual work to recognize the new format.
533	StoreInst AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst SI) {
534	ReplacementIRBuilder Builder(SI, *DL);
535	auto *M = SI->getModule();
536	Type *NewTy = getCorrespondingIntegerType(T: SI->getValueOperand()->getType(),
537	DL: M->getDataLayout());
538	Value *NewVal = Builder.CreateBitCast(V: SI->getValueOperand(), DestTy: NewTy);
539
540	Value *Addr = SI->getPointerOperand();
541
542	StoreInst *NewSI = Builder.CreateStore(Val: NewVal, Ptr: Addr);
543	NewSI->setAlignment(SI->getAlign());
544	NewSI->setVolatile(SI->isVolatile());
545	NewSI->setAtomic(Ordering: SI->getOrdering(), SSID: SI->getSyncScopeID());
546	LLVM_DEBUG(dbgs() << "Replaced " << SI << " with " << NewSI << "\n");
547	SI->eraseFromParent();
548	return NewSI;
549	}
550
551	void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
552	// This function is only called on atomic stores that are too large to be
553	// atomic if implemented as a native store. So we replace them by an
554	// atomic swap, that can be implemented for example as a ldrex/strex on ARM
555	// or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
556	// It is the responsibility of the target to only signal expansion via
557	// shouldExpandAtomicRMW in cases where this is required and possible.
558	ReplacementIRBuilder Builder(SI, *DL);
559	AtomicOrdering Ordering = SI->getOrdering();
560	assert(Ordering != AtomicOrdering::NotAtomic);
561	AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
562	? AtomicOrdering::Monotonic
563	: Ordering;
564	AtomicRMWInst *AI = Builder.CreateAtomicRMW(
565	Op: AtomicRMWInst::Xchg, Ptr: SI->getPointerOperand(), Val: SI->getValueOperand(),
566	Align: SI->getAlign(), Ordering: RMWOrdering);
567	SI->eraseFromParent();
568
569	// Now we have an appropriate swap instruction, lower it as usual.
570	tryExpandAtomicRMW(AI);
571	}
572
573	static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
574	Value Loaded, Value NewVal, Align AddrAlign,
575	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
576	Value &Success, Value &NewLoaded) {
577	Type *OrigTy = NewVal->getType();
578
579	// This code can go away when cmpxchg supports FP and vector types.
580	assert(!OrigTy->isPointerTy());
581	bool NeedBitcast = OrigTy->isFloatingPointTy() \|\| OrigTy->isVectorTy();
582	if (NeedBitcast) {
583	IntegerType *IntTy = Builder.getIntNTy(N: OrigTy->getPrimitiveSizeInBits());
584	NewVal = Builder.CreateBitCast(V: NewVal, DestTy: IntTy);
585	Loaded = Builder.CreateBitCast(V: Loaded, DestTy: IntTy);
586	}
587
588	Value *Pair = Builder.CreateAtomicCmpXchg(
589	Ptr: Addr, Cmp: Loaded, New: NewVal, Align: AddrAlign, SuccessOrdering: MemOpOrder,
590	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
591	Success = Builder.CreateExtractValue(Agg: Pair, Idxs: `1`, Name: "success");
592	NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "newloaded");
593
594	if (NeedBitcast)
595	NewLoaded = Builder.CreateBitCast(V: NewLoaded, DestTy: OrigTy);
596	}
597
598	bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
599	LLVMContext &Ctx = AI->getModule()->getContext();
600	TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(RMW: AI);
601	switch (Kind) {
602	case TargetLoweringBase::AtomicExpansionKind::None:
603	return false;
604	case TargetLoweringBase::AtomicExpansionKind::LLSC: {
605	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
606	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
607	if (ValueSize < MinCASSize) {
608	expandPartwordAtomicRMW(I: AI,
609	ExpansionKind: TargetLoweringBase::AtomicExpansionKind::LLSC);
610	} else {
611	auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
612	return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
613	Val: AI->getValOperand());
614	};
615	expandAtomicOpToLLSC(I: AI, ResultTy: AI->getType(), Addr: AI->getPointerOperand(),
616	AddrAlign: AI->getAlign(), MemOpOrder: AI->getOrdering(), PerformOp);
617	}
618	return true;
619	}
620	case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
621	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
622	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
623	if (ValueSize < MinCASSize) {
624	expandPartwordAtomicRMW(I: AI,
625	ExpansionKind: TargetLoweringBase::AtomicExpansionKind::CmpXChg);
626	} else {
627	SmallVector<StringRef> SSNs;
628	Ctx.getSyncScopeNames(SSNs);
629	auto MemScope = SSNs [AI->getSyncScopeID()].empty()
630	? "system"
631	: SSNs [AI->getSyncScopeID()];
632	OptimizationRemarkEmitter ORE(AI->getFunction());
633	ORE.emit(RemarkBuilder: [&]() {
634	return OptimizationRemark (DEBUG_TYPE, "Passed", AI)
635	<< "A compare and swap loop was generated for an atomic "
636	<< AI->getOperationName(Op: AI->getOperation()) << " operation at "
637	<< MemScope << " memory scope";
638	});
639	expandAtomicRMWToCmpXchg(AI, CreateCmpXchg: createCmpXchgInstFun);
640	}
641	return true;
642	}
643	case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
644	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
645	unsigned ValueSize = getAtomicOpSize(RMWI: AI);
646	if (ValueSize < MinCASSize) {
647	AtomicRMWInst::BinOp Op = AI->getOperation();
648	// Widen And/Or/Xor and give the target another chance at expanding it.
649	if (Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
650	Op == AtomicRMWInst::And) {
651	tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
652	return true;
653	}
654	}
655	expandAtomicRMWToMaskedIntrinsic(AI);
656	return true;
657	}
658	case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
659	TLI->emitBitTestAtomicRMWIntrinsic(AI);
660	return true;
661	}
662	case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
663	TLI->emitCmpArithAtomicRMWIntrinsic(AI);
664	return true;
665	}
666	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
667	return lowerAtomicRMWInst(RMWI: AI);
668	case TargetLoweringBase::AtomicExpansionKind::Expand:
669	TLI->emitExpandAtomicRMW(AI);
670	return true;
671	default:
672	llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
673	}
674	}
675
676	namespace {
677
678	struct PartwordMaskValues {
679	// These three fields are guaranteed to be set by createMaskInstrs.
680	Type WordType = nullptr*;
681	Type ValueType = nullptr*;
682	Type IntValueType = nullptr*;
683	Value AlignedAddr = nullptr*;
684	Align AlignedAddrAlignment;
685	// The remaining fields can be null.
686	Value ShiftAmt = nullptr*;
687	Value Mask = nullptr*;
688	Value Inv_Mask = nullptr*;
689	};
690
691	LLVM_ATTRIBUTE_UNUSED
692	raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
693	auto PrintObj = [&O](auto *V) {
694	if (V)
695	O << *V;
696	else
697	O << "nullptr";
698	O << `'\n'`;
699	};
700	O << "PartwordMaskValues {\n";
701	O << " WordType: ";
702	PrintObj (PMV.WordType);
703	O << " ValueType: ";
704	PrintObj (PMV.ValueType);
705	O << " AlignedAddr: ";
706	PrintObj (PMV.AlignedAddr);
707	O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << `'\n'`;
708	O << " ShiftAmt: ";
709	PrintObj (PMV.ShiftAmt);
710	O << " Mask: ";
711	PrintObj (PMV.Mask);
712	O << " Inv_Mask: ";
713	PrintObj (PMV.Inv_Mask);
714	O << "}\n";
715	return O;
716	}
717
718	} // end anonymous namespace
719
720	/// This is a helper function which builds instructions to provide
721	/// values necessary for partword atomic operations. It takes an
722	/// incoming address, Addr, and ValueType, and constructs the address,
723	/// shift-amounts and masks needed to work with a larger value of size
724	/// WordSize.
725	///
726	/// AlignedAddr: Addr rounded down to a multiple of WordSize
727	///
728	/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
729	/// from AlignAddr for it to have the same value as if
730	/// ValueType was loaded from Addr.
731	///
732	/// Mask: Value to mask with the value loaded from AlignAddr to
733	/// include only the part that would've been loaded from Addr.
734	///
735	/// Inv_Mask: The inverse of Mask.
736	static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
737	Instruction I, Type ValueType,
738	Value *Addr, Align AddrAlign,
739	unsigned MinWordSize) {
740	PartwordMaskValues PMV;
741
742	Module *M = I->getModule();
743	LLVMContext &Ctx = M->getContext();
744	const DataLayout &DL = M->getDataLayout();
745	unsigned ValueSize = DL.getTypeStoreSize(Ty: ValueType);
746
747	PMV.ValueType = PMV.IntValueType = ValueType;
748	if (PMV.ValueType->isFloatingPointTy() \|\| PMV.ValueType->isVectorTy())
749	PMV.IntValueType =
750	Type::getIntNTy(C&: Ctx, N: ValueType->getPrimitiveSizeInBits());
751
752	PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(C&: Ctx, N: MinWordSize * `8`)
753	: ValueType;
754	if (PMV.ValueType == PMV.WordType) {
755	PMV.AlignedAddr = Addr;
756	PMV.AlignedAddrAlignment = AddrAlign;
757	PMV.ShiftAmt = ConstantInt::get(Ty: PMV.ValueType, V: `0`);
758	PMV.Mask = ConstantInt::get(Ty: PMV.ValueType, V: ~`0`, /isSigned/ IsSigned: true);
759	return PMV;
760	}
761
762	PMV.AlignedAddrAlignment = Align (MinWordSize);
763
764	assert(ValueSize < MinWordSize);
765
766	PointerType *PtrTy = cast<PointerType>(Val: Addr->getType());
767	IntegerType *IntTy = DL.getIntPtrType(C&: Ctx, AddressSpace: PtrTy->getAddressSpace());
768	Value *PtrLSB;
769
770	if (AddrAlign < MinWordSize) {
771	PMV.AlignedAddr = Builder.CreateIntrinsic(
772	Intrinsic::ptrmask, {PtrTy, IntTy},
773	{Addr, ConstantInt::get(Ty: IntTy, V: ~(uint64_t)(MinWordSize - `1`))}, nullptr,
774	"AlignedAddr");
775
776	Value *AddrInt = Builder.CreatePtrToInt(V: Addr, DestTy: IntTy);
777	PtrLSB = Builder.CreateAnd(LHS: AddrInt, RHS: MinWordSize - `1`, Name: "PtrLSB");
778	} else {
779	// If the alignment is high enough, the LSB are known 0.
780	PMV.AlignedAddr = Addr;
781	PtrLSB = ConstantInt::getNullValue(Ty: IntTy);
782	}
783
784	if (DL.isLittleEndian()) {
785	// turn bytes into bits
786	PMV.ShiftAmt = Builder.CreateShl(LHS: PtrLSB, RHS: `3`);
787	} else {
788	// turn bytes into bits, and count from the other side.
789	PMV.ShiftAmt = Builder.CreateShl(
790	LHS: Builder.CreateXor(LHS: PtrLSB, RHS: MinWordSize - ValueSize), RHS: `3`);
791	}
792
793	PMV.ShiftAmt = Builder.CreateTrunc(V: PMV.ShiftAmt, DestTy: PMV.WordType, Name: "ShiftAmt");
794	PMV.Mask = Builder.CreateShl(
795	LHS: ConstantInt::get(Ty: PMV.WordType, V: (`1` << (ValueSize * `8`)) - `1`), RHS: PMV.ShiftAmt,
796	Name: "Mask");
797
798	PMV.Inv_Mask = Builder.CreateNot(V: PMV.Mask, Name: "Inv_Mask");
799
800	return PMV;
801	}
802
803	static Value extractMaskedValue(IRBuilderBase &Builder, Value WideWord,
804	const PartwordMaskValues &PMV) {
805	assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
806	if (PMV.WordType == PMV.ValueType)
807	return WideWord;
808
809	Value *Shift = Builder.CreateLShr(LHS: WideWord, RHS: PMV.ShiftAmt, Name: "shifted");
810	Value *Trunc = Builder.CreateTrunc(V: Shift, DestTy: PMV.IntValueType, Name: "extracted");
811	return Builder.CreateBitCast(V: Trunc, DestTy: PMV.ValueType);
812	}
813
814	static Value insertMaskedValue(IRBuilderBase &Builder, Value WideWord,
815	Value Updated, const* PartwordMaskValues &PMV) {
816	assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
817	assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
818	if (PMV.WordType == PMV.ValueType)
819	return Updated;
820
821	Updated = Builder.CreateBitCast(V: Updated, DestTy: PMV.IntValueType);
822
823	Value *ZExt = Builder.CreateZExt(V: Updated, DestTy: PMV.WordType, Name: "extended");
824	Value *Shift =
825	Builder.CreateShl(LHS: ZExt, RHS: PMV.ShiftAmt, Name: "shifted", /HasNUW/ true);
826	Value *And = Builder.CreateAnd(LHS: WideWord, RHS: PMV.Inv_Mask, Name: "unmasked");
827	Value *Or = Builder.CreateOr(LHS: And, RHS: Shift, Name: "inserted");
828	return Or;
829	}
830
831	/// Emit IR to implement a masked version of a given atomicrmw
832	/// operation. (That is, only the bits under the Mask should be
833	/// affected by the operation)
834	static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
835	IRBuilderBase &Builder, Value *Loaded,
836	Value Shifted_Inc, Value Inc,
837	const PartwordMaskValues &PMV) {
838	// TODO: update to use
839	// https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
840	// to merge bits from two values without requiring PMV.Inv_Mask.
841	switch (Op) {
842	case AtomicRMWInst::Xchg: {
843	Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
844	Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Shifted_Inc);
845	return FinalVal;
846	}
847	case AtomicRMWInst::Or:
848	case AtomicRMWInst::Xor:
849	case AtomicRMWInst::And:
850	llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
851	case AtomicRMWInst::Add:
852	case AtomicRMWInst::Sub:
853	case AtomicRMWInst::Nand: {
854	// The other arithmetic ops need to be masked into place.
855	Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Val: Shifted_Inc);
856	Value *NewVal_Masked = Builder.CreateAnd(LHS: NewVal, RHS: PMV.Mask);
857	Value *Loaded_MaskOut = Builder.CreateAnd(LHS: Loaded, RHS: PMV.Inv_Mask);
858	Value *FinalVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Masked);
859	return FinalVal;
860	}
861	case AtomicRMWInst::Max:
862	case AtomicRMWInst::Min:
863	case AtomicRMWInst::UMax:
864	case AtomicRMWInst::UMin:
865	case AtomicRMWInst::FAdd:
866	case AtomicRMWInst::FSub:
867	case AtomicRMWInst::FMin:
868	case AtomicRMWInst::FMax:
869	case AtomicRMWInst::UIncWrap:
870	case AtomicRMWInst::UDecWrap: {
871	// Finally, other ops will operate on the full value, so truncate down to
872	// the original size, and expand out again after doing the
873	// operation. Bitcasts will be inserted for FP values.
874	Value *Loaded_Extract = extractMaskedValue(Builder, WideWord: Loaded, PMV);
875	Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded: Loaded_Extract, Val: Inc);
876	Value *FinalVal = insertMaskedValue(Builder, WideWord: Loaded, Updated: NewVal, PMV);
877	return FinalVal;
878	}
879	default:
880	llvm_unreachable("Unknown atomic op");
881	}
882	}
883
884	/// Expand a sub-word atomicrmw operation into an appropriate
885	/// word-sized operation.
886	///
887	/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
888	/// way as a typical atomicrmw expansion. The only difference here is
889	/// that the operation inside of the loop may operate upon only a
890	/// part of the value.
891	void AtomicExpandImpl::expandPartwordAtomicRMW(
892	AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
893	// Widen And/Or/Xor and give the target another chance at expanding it.
894	AtomicRMWInst::BinOp Op = AI->getOperation();
895	if (Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
896	Op == AtomicRMWInst::And) {
897	tryExpandAtomicRMW(AI: widenPartwordAtomicRMW(AI));
898	return;
899	}
900	AtomicOrdering MemOpOrder = AI->getOrdering();
901	SyncScope::ID SSID = AI->getSyncScopeID();
902
903	ReplacementIRBuilder Builder(AI, *DL);
904
905	PartwordMaskValues PMV =
906	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
907	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
908
909	Value ValOperand_Shifted = nullptr*;
910	if (Op == AtomicRMWInst::Xchg \|\| Op == AtomicRMWInst::Add \|\|
911	Op == AtomicRMWInst::Sub \|\| Op == AtomicRMWInst::Nand) {
912	ValOperand_Shifted =
913	Builder.CreateShl(LHS: Builder.CreateZExt(V: AI->getValOperand(), DestTy: PMV.WordType),
914	RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
915	}
916
917	auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
918	return performMaskedAtomicOp(Op, Builder, Loaded, Shifted_Inc: ValOperand_Shifted,
919	Inc: AI->getValOperand(), PMV);
920	};
921
922	Value *OldResult;
923	if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
924	OldResult = insertRMWCmpXchgLoop(Builder, ResultType: PMV.WordType, Addr: PMV.AlignedAddr,
925	AddrAlign: PMV.AlignedAddrAlignment, MemOpOrder, SSID,
926	PerformOp: PerformPartwordOp, CreateCmpXchg: createCmpXchgInstFun);
927	} else {
928	assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
929	OldResult = insertRMWLLSCLoop(Builder, ResultTy: PMV.WordType, Addr: PMV.AlignedAddr,
930	AddrAlign: PMV.AlignedAddrAlignment, MemOpOrder,
931	PerformOp: PerformPartwordOp);
932	}
933
934	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
935	AI->replaceAllUsesWith(V: FinalOldResult);
936	AI->eraseFromParent();
937	}
938
939	// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
940	AtomicRMWInst AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst AI) {
941	ReplacementIRBuilder Builder(AI, *DL);
942	AtomicRMWInst::BinOp Op = AI->getOperation();
943
944	assert((Op == AtomicRMWInst::Or \|\| Op == AtomicRMWInst::Xor \|\|
945	Op == AtomicRMWInst::And) &&
946	"Unable to widen operation");
947
948	PartwordMaskValues PMV =
949	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
950	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
951
952	Value *ValOperand_Shifted =
953	Builder.CreateShl(LHS: Builder.CreateZExt(V: AI->getValOperand(), DestTy: PMV.WordType),
954	RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
955
956	Value *NewOperand;
957
958	if (Op == AtomicRMWInst::And)
959	NewOperand =
960	Builder.CreateOr(LHS: ValOperand_Shifted, RHS: PMV.Inv_Mask, Name: "AndOperand");
961	else
962	NewOperand = ValOperand_Shifted;
963
964	AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
965	Op, Ptr: PMV.AlignedAddr, Val: NewOperand, Align: PMV.AlignedAddrAlignment,
966	Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());
967	// TODO: Preserve metadata
968
969	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: NewAI, PMV);
970	AI->replaceAllUsesWith(V: FinalOldResult);
971	AI->eraseFromParent();
972	return NewAI;
973	}
974
975	bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
976	// The basic idea here is that we're expanding a cmpxchg of a
977	// smaller memory size up to a word-sized cmpxchg. To do this, we
978	// need to add a retry-loop for strong cmpxchg, so that
979	// modifications to other parts of the word don't cause a spurious
980	// failure.
981
982	// This generates code like the following:
983	// [[Setup mask values PMV.]]*
984	// %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
985	// %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
986	// %InitLoaded = load i32 %addr*
987	// %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
988	// br partword.cmpxchg.loop
989	// partword.cmpxchg.loop:
990	// %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
991	// [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
992	// %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
993	// %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
994	// %NewCI = cmpxchg i32 %PMV.AlignedAddr, i32 %FullWord_Cmp,*
995	// i32 %FullWord_NewVal success_ordering failure_ordering
996	// %OldVal = extractvalue { i32, i1 } %NewCI, 0
997	// %Success = extractvalue { i32, i1 } %NewCI, 1
998	// br i1 %Success, label %partword.cmpxchg.end,
999	// label %partword.cmpxchg.failure
1000	// partword.cmpxchg.failure:
1001	// %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
1002	// %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
1003	// br i1 %ShouldContinue, label %partword.cmpxchg.loop,
1004	// label %partword.cmpxchg.end
1005	// partword.cmpxchg.end:
1006	// %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
1007	// %FinalOldVal = trunc i32 %tmp1 to i8
1008	// %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
1009	// %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
1010
1011	Value *Addr = CI->getPointerOperand();
1012	Value *Cmp = CI->getCompareOperand();
1013	Value *NewVal = CI->getNewValOperand();
1014
1015	BasicBlock *BB = CI->getParent();
1016	Function *F = BB->getParent();
1017	ReplacementIRBuilder Builder(CI, *DL);
1018	LLVMContext &Ctx = Builder.getContext();
1019
1020	BasicBlock *EndBB =
1021	BB->splitBasicBlock(I: CI->getIterator(), BBName: "partword.cmpxchg.end");
1022	auto FailureBB =
1023	BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.failure", Parent: F, InsertBefore: EndBB);
1024	auto LoopBB = BasicBlock::Create(Context&: Ctx, Name: "partword.cmpxchg.loop", Parent: F, InsertBefore: FailureBB);
1025
1026	// The split call above "helpfully" added a branch at the end of BB
1027	// (to the wrong place).
1028	std::prev(x: BB->end())->eraseFromParent();
1029	Builder.SetInsertPoint(BB);
1030
1031	PartwordMaskValues PMV =
1032	createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1033	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1034
1035	// Shift the incoming values over, into the right location in the word.
1036	Value *NewVal_Shifted =
1037	Builder.CreateShl(LHS: Builder.CreateZExt(V: NewVal, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
1038	Value *Cmp_Shifted =
1039	Builder.CreateShl(LHS: Builder.CreateZExt(V: Cmp, DestTy: PMV.WordType), RHS: PMV.ShiftAmt);
1040
1041	// Load the entire current word, and mask into place the expected and new
1042	// values
1043	LoadInst *InitLoaded = Builder.CreateLoad(Ty: PMV.WordType, Ptr: PMV.AlignedAddr);
1044	InitLoaded->setVolatile(CI->isVolatile());
1045	Value *InitLoaded_MaskOut = Builder.CreateAnd(LHS: InitLoaded, RHS: PMV.Inv_Mask);
1046	Builder.CreateBr(Dest: LoopBB);
1047
1048	// partword.cmpxchg.loop:
1049	Builder.SetInsertPoint(LoopBB);
1050	PHINode *Loaded_MaskOut = Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: `2`);
1051	Loaded_MaskOut->addIncoming(V: InitLoaded_MaskOut, BB);
1052
1053	// Mask/Or the expected and new values into place in the loaded word.
1054	Value *FullWord_NewVal = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: NewVal_Shifted);
1055	Value *FullWord_Cmp = Builder.CreateOr(LHS: Loaded_MaskOut, RHS: Cmp_Shifted);
1056	AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
1057	Ptr: PMV.AlignedAddr, Cmp: FullWord_Cmp, New: FullWord_NewVal, Align: PMV.AlignedAddrAlignment,
1058	SuccessOrdering: CI->getSuccessOrdering(), FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
1059	NewCI->setVolatile(CI->isVolatile());
1060	// When we're building a strong cmpxchg, we need a loop, so you
1061	// might think we could use a weak cmpxchg inside. But, using strong
1062	// allows the below comparison for ShouldContinue, and we're
1063	// expecting the underlying cmpxchg to be a machine instruction,
1064	// which is strong anyways.
1065	NewCI->setWeak(CI->isWeak());
1066
1067	Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: `0`);
1068	Value *Success = Builder.CreateExtractValue(Agg: NewCI, Idxs: `1`);
1069
1070	if (CI->isWeak())
1071	Builder.CreateBr(Dest: EndBB);
1072	else
1073	Builder.CreateCondBr(Cond: Success, True: EndBB, False: FailureBB);
1074
1075	// partword.cmpxchg.failure:
1076	Builder.SetInsertPoint(FailureBB);
1077	// Upon failure, verify that the masked-out part of the loaded value
1078	// has been modified. If it didn't, abort the cmpxchg, since the
1079	// masked-in part must've.
1080	Value *OldVal_MaskOut = Builder.CreateAnd(LHS: OldVal, RHS: PMV.Inv_Mask);
1081	Value *ShouldContinue = Builder.CreateICmpNE(LHS: Loaded_MaskOut, RHS: OldVal_MaskOut);
1082	Builder.CreateCondBr(Cond: ShouldContinue, True: LoopBB, False: EndBB);
1083
1084	// Add the second value to the phi from above
1085	Loaded_MaskOut->addIncoming(V: OldVal_MaskOut, BB: FailureBB);
1086
1087	// partword.cmpxchg.end:
1088	Builder.SetInsertPoint(CI);
1089
1090	Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
1091	Value *Res = PoisonValue::get(T: CI->getType());
1092	Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: `0`);
1093	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1094
1095	CI->replaceAllUsesWith(V: Res);
1096	CI->eraseFromParent();
1097	return true;
1098	}
1099
1100	void AtomicExpandImpl::expandAtomicOpToLLSC(
1101	Instruction I, Type ResultType, Value *Addr, Align AddrAlign,
1102	AtomicOrdering MemOpOrder,
1103	function_ref<Value (IRBuilderBase &, Value )> PerformOp) {
1104	ReplacementIRBuilder Builder(I, *DL);
1105	Value *Loaded = insertRMWLLSCLoop(Builder, ResultTy: ResultType, Addr, AddrAlign,
1106	MemOpOrder, PerformOp);
1107
1108	I->replaceAllUsesWith(V: Loaded);
1109	I->eraseFromParent();
1110	}
1111
1112	void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
1113	ReplacementIRBuilder Builder(AI, *DL);
1114
1115	PartwordMaskValues PMV =
1116	createMaskInstrs(Builder, I: AI, ValueType: AI->getType(), Addr: AI->getPointerOperand(),
1117	AddrAlign: AI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1118
1119	// The value operand must be sign-extended for signed min/max so that the
1120	// target's signed comparison instructions can be used. Otherwise, just
1121	// zero-ext.
1122	Instruction::CastOps CastOp = Instruction::ZExt;
1123	AtomicRMWInst::BinOp RMWOp = AI->getOperation();
1124	if (RMWOp == AtomicRMWInst::Max \|\| RMWOp == AtomicRMWInst::Min)
1125	CastOp = Instruction::SExt;
1126
1127	Value *ValOperand_Shifted = Builder.CreateShl(
1128	LHS: Builder.CreateCast(Op: CastOp, V: AI->getValOperand(), DestTy: PMV.WordType),
1129	RHS: PMV.ShiftAmt, Name: "ValOperand_Shifted");
1130	Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
1131	Builder, AI, AlignedAddr: PMV.AlignedAddr, Incr: ValOperand_Shifted, Mask: PMV.Mask, ShiftAmt: PMV.ShiftAmt,
1132	Ord: AI->getOrdering());
1133	Value *FinalOldResult = extractMaskedValue(Builder, WideWord: OldResult, PMV);
1134	AI->replaceAllUsesWith(V: FinalOldResult);
1135	AI->eraseFromParent();
1136	}
1137
1138	void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic(
1139	AtomicCmpXchgInst *CI) {
1140	ReplacementIRBuilder Builder(CI, *DL);
1141
1142	PartwordMaskValues PMV = createMaskInstrs(
1143	Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr: CI->getPointerOperand(),
1144	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1145
1146	Value *CmpVal_Shifted = Builder.CreateShl(
1147	LHS: Builder.CreateZExt(V: CI->getCompareOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
1148	Name: "CmpVal_Shifted");
1149	Value *NewVal_Shifted = Builder.CreateShl(
1150	LHS: Builder.CreateZExt(V: CI->getNewValOperand(), DestTy: PMV.WordType), RHS: PMV.ShiftAmt,
1151	Name: "NewVal_Shifted");
1152	Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
1153	Builder, CI, AlignedAddr: PMV.AlignedAddr, CmpVal: CmpVal_Shifted, NewVal: NewVal_Shifted, Mask: PMV.Mask,
1154	Ord: CI->getMergedOrdering());
1155	Value *FinalOldVal = extractMaskedValue(Builder, WideWord: OldVal, PMV);
1156	Value *Res = PoisonValue::get(T: CI->getType());
1157	Res = Builder.CreateInsertValue(Agg: Res, Val: FinalOldVal, Idxs: `0`);
1158	Value *Success = Builder.CreateICmpEQ(
1159	LHS: CmpVal_Shifted, RHS: Builder.CreateAnd(LHS: OldVal, RHS: PMV.Mask), Name: "Success");
1160	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1161
1162	CI->replaceAllUsesWith(V: Res);
1163	CI->eraseFromParent();
1164	}
1165
1166	Value *AtomicExpandImpl::insertRMWLLSCLoop(
1167	IRBuilderBase &Builder, Type ResultTy, Value Addr, Align AddrAlign,
1168	AtomicOrdering MemOpOrder,
1169	function_ref<Value (IRBuilderBase &, Value )> PerformOp) {
1170	LLVMContext &Ctx = Builder.getContext();
1171	BasicBlock *BB = Builder.GetInsertBlock();
1172	Function *F = BB->getParent();
1173
1174	assert(AddrAlign >=
1175	F->getParent()->getDataLayout().getTypeStoreSize(ResultTy) &&
1176	"Expected at least natural alignment at this point.");
1177
1178	// Given: atomicrmw some_op iN %addr, iN %incr ordering*
1179	//
1180	// The standard expansion we produce is:
1181	// [...]
1182	// atomicrmw.start:
1183	// %loaded = @load.linked(%addr)
1184	// %new = some_op iN %loaded, %incr
1185	// %stored = @store_conditional(%new, %addr)
1186	// %try_again = icmp i32 ne %stored, 0
1187	// br i1 %try_again, label %loop, label %atomicrmw.end
1188	// atomicrmw.end:
1189	// [...]
1190	BasicBlock *ExitBB =
1191	BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
1192	BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);
1193
1194	// The split call above "helpfully" added a branch at the end of BB (to the
1195	// wrong place).
1196	std::prev(x: BB->end())->eraseFromParent();
1197	Builder.SetInsertPoint(BB);
1198	Builder.CreateBr(Dest: LoopBB);
1199
1200	// Start the main loop block now that we've taken care of the preliminaries.
1201	Builder.SetInsertPoint(LoopBB);
1202	Value *Loaded = TLI->emitLoadLinked(Builder, ValueTy: ResultTy, Addr, Ord: MemOpOrder);
1203
1204	Value *NewVal = PerformOp (Builder, Loaded);
1205
1206	Value *StoreSuccess =
1207	TLI->emitStoreConditional(Builder, Val: NewVal, Addr, Ord: MemOpOrder);
1208	Value *TryAgain = Builder.CreateICmpNE(
1209	LHS: StoreSuccess, RHS: ConstantInt::get(Ty: IntegerType::get(C&: Ctx, NumBits: `32`), V: `0`), Name: "tryagain");
1210	Builder.CreateCondBr(Cond: TryAgain, True: LoopBB, False: ExitBB);
1211
1212	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1213	return Loaded;
1214	}
1215
1216	/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
1217	/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
1218	/// IR. As a migration step, we convert back to what use to be the standard
1219	/// way to represent a pointer cmpxchg so that we can update backends one by
1220	/// one.
1221	AtomicCmpXchgInst *
1222	AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
1223	auto *M = CI->getModule();
1224	Type *NewTy = getCorrespondingIntegerType(T: CI->getCompareOperand()->getType(),
1225	DL: M->getDataLayout());
1226
1227	ReplacementIRBuilder Builder(CI, *DL);
1228
1229	Value *Addr = CI->getPointerOperand();
1230
1231	Value *NewCmp = Builder.CreatePtrToInt(V: CI->getCompareOperand(), DestTy: NewTy);
1232	Value *NewNewVal = Builder.CreatePtrToInt(V: CI->getNewValOperand(), DestTy: NewTy);
1233
1234	auto *NewCI = Builder.CreateAtomicCmpXchg(
1235	Ptr: Addr, Cmp: NewCmp, New: NewNewVal, Align: CI->getAlign(), SuccessOrdering: CI->getSuccessOrdering(),
1236	FailureOrdering: CI->getFailureOrdering(), SSID: CI->getSyncScopeID());
1237	NewCI->setVolatile(CI->isVolatile());
1238	NewCI->setWeak(CI->isWeak());
1239	LLVM_DEBUG(dbgs() << "Replaced " << CI << " with " << NewCI << "\n");
1240
1241	Value *OldVal = Builder.CreateExtractValue(Agg: NewCI, Idxs: `0`);
1242	Value *Succ = Builder.CreateExtractValue(Agg: NewCI, Idxs: `1`);
1243
1244	OldVal = Builder.CreateIntToPtr(V: OldVal, DestTy: CI->getCompareOperand()->getType());
1245
1246	Value *Res = PoisonValue::get(T: CI->getType());
1247	Res = Builder.CreateInsertValue(Agg: Res, Val: OldVal, Idxs: `0`);
1248	Res = Builder.CreateInsertValue(Agg: Res, Val: Succ, Idxs: `1`);
1249
1250	CI->replaceAllUsesWith(V: Res);
1251	CI->eraseFromParent();
1252	return NewCI;
1253	}
1254
1255	bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1256	AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
1257	AtomicOrdering FailureOrder = CI->getFailureOrdering();
1258	Value *Addr = CI->getPointerOperand();
1259	BasicBlock *BB = CI->getParent();
1260	Function *F = BB->getParent();
1261	LLVMContext &Ctx = F->getContext();
1262	// If shouldInsertFencesForAtomic() returns true, then the target does not
1263	// want to deal with memory orders, and emitLeading/TrailingFence should take
1264	// care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
1265	// should preserve the ordering.
1266	bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(I: CI);
1267	AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
1268	? AtomicOrdering::Monotonic
1269	: CI->getMergedOrdering();
1270
1271	// In implementations which use a barrier to achieve release semantics, we can
1272	// delay emitting this barrier until we know a store is actually going to be
1273	// attempted. The cost of this delay is that we need 2 copies of the block
1274	// emitting the load-linked, affecting code size.
1275	//
1276	// Ideally, this logic would be unconditional except for the minsize check
1277	// since in other cases the extra blocks naturally collapse down to the
1278	// minimal loop. Unfortunately, this puts too much stress on later
1279	// optimisations so we avoid emitting the extra logic in those cases too.
1280	bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
1281	SuccessOrder != AtomicOrdering::Monotonic &&
1282	SuccessOrder != AtomicOrdering::Acquire &&
1283	!F->hasMinSize();
1284
1285	// There's no overhead for sinking the release barrier in a weak cmpxchg, so
1286	// do it even on minsize.
1287	bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();
1288
1289	// Given: cmpxchg some_op iN %addr, iN %desired, iN %new success_ord fail_ord*
1290	//
1291	// The full expansion we produce is:
1292	// [...]
1293	// %aligned.addr = ...
1294	// cmpxchg.start:
1295	// %unreleasedload = @load.linked(%aligned.addr)
1296	// %unreleasedload.extract = extract value from %unreleasedload
1297	// %should_store = icmp eq %unreleasedload.extract, %desired
1298	// br i1 %should_store, label %cmpxchg.releasingstore,
1299	// label %cmpxchg.nostore
1300	// cmpxchg.releasingstore:
1301	// fence?
1302	// br label cmpxchg.trystore
1303	// cmpxchg.trystore:
1304	// %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
1305	// [%releasedload, %cmpxchg.releasedload]
1306	// %updated.new = insert %new into %loaded.trystore
1307	// %stored = @store_conditional(%updated.new, %aligned.addr)
1308	// %success = icmp eq i32 %stored, 0
1309	// br i1 %success, label %cmpxchg.success,
1310	// label %cmpxchg.releasedload/%cmpxchg.failure
1311	// cmpxchg.releasedload:
1312	// %releasedload = @load.linked(%aligned.addr)
1313	// %releasedload.extract = extract value from %releasedload
1314	// %should_store = icmp eq %releasedload.extract, %desired
1315	// br i1 %should_store, label %cmpxchg.trystore,
1316	// label %cmpxchg.failure
1317	// cmpxchg.success:
1318	// fence?
1319	// br label %cmpxchg.end
1320	// cmpxchg.nostore:
1321	// %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1322	// [%releasedload,
1323	// %cmpxchg.releasedload/%cmpxchg.trystore]
1324	// @load_linked_fail_balance()?
1325	// br label %cmpxchg.failure
1326	// cmpxchg.failure:
1327	// fence?
1328	// br label %cmpxchg.end
1329	// cmpxchg.end:
1330	// %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
1331	// [%loaded.trystore, %cmpxchg.trystore]
1332	// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1333	// %loaded = extract value from %loaded.exit
1334	// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1335	// %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1336	// [...]
1337	BasicBlock *ExitBB = BB->splitBasicBlock(I: CI->getIterator(), BBName: "cmpxchg.end");
1338	auto FailureBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.failure", Parent: F, InsertBefore: ExitBB);
1339	auto NoStoreBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.nostore", Parent: F, InsertBefore: FailureBB);
1340	auto SuccessBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.success", Parent: F, InsertBefore: NoStoreBB);
1341	auto ReleasedLoadBB =
1342	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.releasedload", Parent: F, InsertBefore: SuccessBB);
1343	auto TryStoreBB =
1344	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.trystore", Parent: F, InsertBefore: ReleasedLoadBB);
1345	auto ReleasingStoreBB =
1346	BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.fencedstore", Parent: F, InsertBefore: TryStoreBB);
1347	auto StartBB = BasicBlock::Create(Context&: Ctx, Name: "cmpxchg.start", Parent: F, InsertBefore: ReleasingStoreBB);
1348
1349	ReplacementIRBuilder Builder(CI, *DL);
1350
1351	// The split call above "helpfully" added a branch at the end of BB (to the
1352	// wrong place), but we might want a fence too. It's easiest to just remove
1353	// the branch entirely.
1354	std::prev(x: BB->end())->eraseFromParent();
1355	Builder.SetInsertPoint(BB);
1356	if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1357	TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1358
1359	PartwordMaskValues PMV =
1360	createMaskInstrs(Builder, I: CI, ValueType: CI->getCompareOperand()->getType(), Addr,
1361	AddrAlign: CI->getAlign(), MinWordSize: TLI->getMinCmpXchgSizeInBits() / `8`);
1362	Builder.CreateBr(Dest: StartBB);
1363
1364	// Start the main loop block now that we've taken care of the preliminaries.
1365	Builder.SetInsertPoint(StartBB);
1366	Value *UnreleasedLoad =
1367	TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1368	Value *UnreleasedLoadExtract =
1369	extractMaskedValue(Builder, WideWord: UnreleasedLoad, PMV);
1370	Value *ShouldStore = Builder.CreateICmpEQ(
1371	LHS: UnreleasedLoadExtract, RHS: CI->getCompareOperand(), Name: "should_store");
1372
1373	// If the cmpxchg doesn't actually need any ordering when it fails, we can
1374	// jump straight past that fence instruction (if it exists).
1375	Builder.CreateCondBr(Cond: ShouldStore, True: ReleasingStoreBB, False: NoStoreBB);
1376
1377	Builder.SetInsertPoint(ReleasingStoreBB);
1378	if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1379	TLI->emitLeadingFence(Builder, Inst: CI, Ord: SuccessOrder);
1380	Builder.CreateBr(Dest: TryStoreBB);
1381
1382	Builder.SetInsertPoint(TryStoreBB);
1383	PHINode *LoadedTryStore =
1384	Builder.CreatePHI(Ty: PMV.WordType, NumReservedValues: `2`, Name: "loaded.trystore");
1385	LoadedTryStore->addIncoming(V: UnreleasedLoad, BB: ReleasingStoreBB);
1386	Value *NewValueInsert =
1387	insertMaskedValue(Builder, WideWord: LoadedTryStore, Updated: CI->getNewValOperand(), PMV);
1388	Value *StoreSuccess = TLI->emitStoreConditional(Builder, Val: NewValueInsert,
1389	Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1390	StoreSuccess = Builder.CreateICmpEQ(
1391	LHS: StoreSuccess, RHS: ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: `0`), Name: "success");
1392	BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1393	Builder.CreateCondBr(Cond: StoreSuccess, True: SuccessBB,
1394	False: CI->isWeak() ? FailureBB : RetryBB);
1395
1396	Builder.SetInsertPoint(ReleasedLoadBB);
1397	Value *SecondLoad;
1398	if (HasReleasedLoadBB) {
1399	SecondLoad =
1400	TLI->emitLoadLinked(Builder, ValueTy: PMV.WordType, Addr: PMV.AlignedAddr, Ord: MemOpOrder);
1401	Value *SecondLoadExtract = extractMaskedValue(Builder, WideWord: SecondLoad, PMV);
1402	ShouldStore = Builder.CreateICmpEQ(LHS: SecondLoadExtract,
1403	RHS: CI->getCompareOperand(), Name: "should_store");
1404
1405	// If the cmpxchg doesn't actually need any ordering when it fails, we can
1406	// jump straight past that fence instruction (if it exists).
1407	Builder.CreateCondBr(Cond: ShouldStore, True: TryStoreBB, False: NoStoreBB);
1408	// Update PHI node in TryStoreBB.
1409	LoadedTryStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1410	} else
1411	Builder.CreateUnreachable();
1412
1413	// Make sure later instructions don't get reordered with a fence if
1414	// necessary.
1415	Builder.SetInsertPoint(SuccessBB);
1416	if (ShouldInsertFencesForAtomic \|\|
1417	TLI->shouldInsertTrailingFenceForAtomicStore(I: CI))
1418	TLI->emitTrailingFence(Builder, Inst: CI, Ord: SuccessOrder);
1419	Builder.CreateBr(Dest: ExitBB);
1420
1421	Builder.SetInsertPoint(NoStoreBB);
1422	PHINode *LoadedNoStore =
1423	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.nostore");
1424	LoadedNoStore->addIncoming(V: UnreleasedLoad, BB: StartBB);
1425	if (HasReleasedLoadBB)
1426	LoadedNoStore->addIncoming(V: SecondLoad, BB: ReleasedLoadBB);
1427
1428	// In the failing case, where we don't execute the store-conditional, the
1429	// target might want to balance out the load-linked with a dedicated
1430	// instruction (e.g., on ARM, clearing the exclusive monitor).
1431	TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1432	Builder.CreateBr(Dest: FailureBB);
1433
1434	Builder.SetInsertPoint(FailureBB);
1435	PHINode *LoadedFailure =
1436	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.failure");
1437	LoadedFailure->addIncoming(V: LoadedNoStore, BB: NoStoreBB);
1438	if (CI->isWeak())
1439	LoadedFailure->addIncoming(V: LoadedTryStore, BB: TryStoreBB);
1440	if (ShouldInsertFencesForAtomic)
1441	TLI->emitTrailingFence(Builder, Inst: CI, Ord: FailureOrder);
1442	Builder.CreateBr(Dest: ExitBB);
1443
1444	// Finally, we have control-flow based knowledge of whether the cmpxchg
1445	// succeeded or not. We expose this to later passes by converting any
1446	// subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1447	// PHI.
1448	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1449	PHINode *LoadedExit =
1450	Builder.CreatePHI(Ty: UnreleasedLoad->getType(), NumReservedValues: `2`, Name: "loaded.exit");
1451	LoadedExit->addIncoming(V: LoadedTryStore, BB: SuccessBB);
1452	LoadedExit->addIncoming(V: LoadedFailure, BB: FailureBB);
1453	PHINode *Success = Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Ctx), NumReservedValues: `2`, Name: "success");
1454	Success->addIncoming(V: ConstantInt::getTrue(Context&: Ctx), BB: SuccessBB);
1455	Success->addIncoming(V: ConstantInt::getFalse(Context&: Ctx), BB: FailureBB);
1456
1457	// This is the "exit value" from the cmpxchg expansion. It may be of
1458	// a type wider than the one in the cmpxchg instruction.
1459	Value *LoadedFull = LoadedExit;
1460
1461	Builder.SetInsertPoint(TheBB: ExitBB, IP: std::next(x: Success->getIterator()));
1462	Value *Loaded = extractMaskedValue(Builder, WideWord: LoadedFull, PMV);
1463
1464	// Look for any users of the cmpxchg that are just comparing the loaded value
1465	// against the desired one, and replace them with the CFG-derived version.
1466	SmallVector<ExtractValueInst *, `2`> PrunedInsts;
1467	for (auto *User : CI->users()) {
1468	ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val: User);
1469	if (!EV)
1470	continue;
1471
1472	assert(EV->getNumIndices() == `1` && EV->getIndices()[`0`] <= `1` &&
1473	"weird extraction from { iN, i1 }");
1474
1475	if (EV->getIndices()[`0`] == `0`)
1476	EV->replaceAllUsesWith(V: Loaded);
1477	else
1478	EV->replaceAllUsesWith(V: Success);
1479
1480	PrunedInsts.push_back(Elt: EV);
1481	}
1482
1483	// We can remove the instructions now we're no longer iterating through them.
1484	for (auto *EV : PrunedInsts)
1485	EV->eraseFromParent();
1486
1487	if (!CI->use_empty()) {
1488	// Some use of the full struct return that we don't understand has happened,
1489	// so we've got to reconstruct it properly.
1490	Value *Res;
1491	Res = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CI->getType()), Val: Loaded, Idxs: `0`);
1492	Res = Builder.CreateInsertValue(Agg: Res, Val: Success, Idxs: `1`);
1493
1494	CI->replaceAllUsesWith(V: Res);
1495	}
1496
1497	CI->eraseFromParent();
1498	return true;
1499	}
1500
1501	bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) {
1502	auto C = dyn_cast<ConstantInt>(Val: RMWI->getValOperand());
1503	if (!C)
1504	return false;
1505
1506	AtomicRMWInst::BinOp Op = RMWI->getOperation();
1507	switch (Op) {
1508	case AtomicRMWInst::Add:
1509	case AtomicRMWInst::Sub:
1510	case AtomicRMWInst::Or:
1511	case AtomicRMWInst::Xor:
1512	return C->isZero();
1513	case AtomicRMWInst::And:
1514	return C->isMinusOne();
1515	// FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
1516	default:
1517	return false;
1518	}
1519	}
1520
1521	bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
1522	if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1523	tryExpandAtomicLoad(LI: ResultingLoad);
1524	return true;
1525	}
1526	return false;
1527	}
1528
1529	Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
1530	IRBuilderBase &Builder, Type ResultTy, Value Addr, Align AddrAlign,
1531	AtomicOrdering MemOpOrder, SyncScope::ID SSID,
1532	function_ref<Value (IRBuilderBase &, Value )> PerformOp,
1533	CreateCmpXchgInstFun CreateCmpXchg) {
1534	LLVMContext &Ctx = Builder.getContext();
1535	BasicBlock *BB = Builder.GetInsertBlock();
1536	Function *F = BB->getParent();
1537
1538	// Given: atomicrmw some_op iN %addr, iN %incr ordering*
1539	//
1540	// The standard expansion we produce is:
1541	// [...]
1542	// %init_loaded = load atomic iN %addr*
1543	// br label %loop
1544	// loop:
1545	// %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
1546	// %new = some_op iN %loaded, %incr
1547	// %pair = cmpxchg iN %addr, iN %loaded, iN %new*
1548	// %new_loaded = extractvalue { iN, i1 } %pair, 0
1549	// %success = extractvalue { iN, i1 } %pair, 1
1550	// br i1 %success, label %atomicrmw.end, label %loop
1551	// atomicrmw.end:
1552	// [...]
1553	BasicBlock *ExitBB =
1554	BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
1555	BasicBlock *LoopBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.start", Parent: F, InsertBefore: ExitBB);
1556
1557	// The split call above "helpfully" added a branch at the end of BB (to the
1558	// wrong place), but we want a load. It's easiest to just remove
1559	// the branch entirely.
1560	std::prev(x: BB->end())->eraseFromParent();
1561	Builder.SetInsertPoint(BB);
1562	LoadInst *InitLoaded = Builder.CreateAlignedLoad(Ty: ResultTy, Ptr: Addr, Align: AddrAlign);
1563	Builder.CreateBr(Dest: LoopBB);
1564
1565	// Start the main loop block now that we've taken care of the preliminaries.
1566	Builder.SetInsertPoint(LoopBB);
1567	PHINode *Loaded = Builder.CreatePHI(Ty: ResultTy, NumReservedValues: `2`, Name: "loaded");
1568	Loaded->addIncoming(V: InitLoaded, BB);
1569
1570	Value *NewVal = PerformOp (Builder, Loaded);
1571
1572	Value NewLoaded = nullptr*;
1573	Value Success = nullptr*;
1574
1575	CreateCmpXchg (Builder, Addr, Loaded, NewVal, AddrAlign,
1576	MemOpOrder == AtomicOrdering::Unordered
1577	? AtomicOrdering::Monotonic
1578	: MemOpOrder,
1579	SSID, Success, NewLoaded);
1580	assert(Success && NewLoaded);
1581
1582	Loaded->addIncoming(V: NewLoaded, BB: LoopBB);
1583
1584	Builder.CreateCondBr(Cond: Success, True: ExitBB, False: LoopBB);
1585
1586	Builder.SetInsertPoint(TheBB: ExitBB, IP: ExitBB->begin());
1587	return NewLoaded;
1588	}
1589
1590	bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
1591	unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / `8`;
1592	unsigned ValueSize = getAtomicOpSize(CASI: CI);
1593
1594	switch (TLI->shouldExpandAtomicCmpXchgInIR(AI: CI)) {
1595	default:
1596	llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
1597	case TargetLoweringBase::AtomicExpansionKind::None:
1598	if (ValueSize < MinCASSize)
1599	return expandPartwordCmpXchg(CI);
1600	return false;
1601	case TargetLoweringBase::AtomicExpansionKind::LLSC: {
1602	return expandAtomicCmpXchg(CI);
1603	}
1604	case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
1605	expandAtomicCmpXchgToMaskedIntrinsic(CI);
1606	return true;
1607	case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
1608	return lowerAtomicCmpXchgInst(CXI: CI);
1609	}
1610	}
1611
1612	// Note: This function is exposed externally by AtomicExpandUtils.h
1613	bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
1614	CreateCmpXchgInstFun CreateCmpXchg) {
1615	ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout());
1616	Builder.setIsFPConstrained(
1617	AI->getFunction()->hasFnAttribute(Attribute::StrictFP));
1618
1619	// FIXME: If FP exceptions are observable, we should force them off for the
1620	// loop for the FP atomics.
1621	Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
1622	Builder, ResultTy: AI->getType(), Addr: AI->getPointerOperand(), AddrAlign: AI->getAlign(),
1623	MemOpOrder: AI->getOrdering(), SSID: AI->getSyncScopeID(),
1624	PerformOp: [&](IRBuilderBase &Builder, Value *Loaded) {
1625	return buildAtomicRMWValue(Op: AI->getOperation(), Builder, Loaded,
1626	Val: AI->getValOperand());
1627	},
1628	CreateCmpXchg);
1629
1630	AI->replaceAllUsesWith(V: Loaded);
1631	AI->eraseFromParent();
1632	return true;
1633	}
1634
1635	// In order to use one of the sized library calls such as
1636	// __atomic_fetch_add_4, the alignment must be sufficient, the size
1637	// must be one of the potentially-specialized sizes, and the value
1638	// type must actually exist in C on the target (otherwise, the
1639	// function wouldn't actually be defined.)
1640	static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
1641	const DataLayout &DL) {
1642	// TODO: "LargestSize" is an approximation for "largest type that
1643	// you can express in C". It seems to be the case that int128 is
1644	// supported on all 64-bit platforms, otherwise only up to 64-bit
1645	// integers are supported. If we get this wrong, then we'll try to
1646	// call a sized libcall that doesn't actually exist. There should
1647	// really be some more reliable way in LLVM of determining integer
1648	// sizes which are valid in the target's C ABI...
1649	unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= `64` ? `16` : `8`;
1650	return Alignment >= Size &&
1651	(Size == `1` \|\| Size == `2` \|\| Size == `4` \|\| Size == `8` \|\| Size == `16`) &&
1652	Size <= LargestSize;
1653	}
1654
1655	void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) {
1656	static const RTLIB::Libcall Libcalls[`6`] = {
1657	RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1658	RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1659	unsigned Size = getAtomicOpSize(LI: I);
1660
1661	bool expanded = expandAtomicOpToLibcall(
1662	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: nullptr, CASExpected: nullptr,
1663	Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1664	if (!expanded)
1665	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for Load");
1666	}
1667
1668	void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) {
1669	static const RTLIB::Libcall Libcalls[`6`] = {
1670	RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1671	RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1672	unsigned Size = getAtomicOpSize(SI: I);
1673
1674	bool expanded = expandAtomicOpToLibcall(
1675	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValueOperand(),
1676	CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1677	if (!expanded)
1678	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for Store");
1679	}
1680
1681	void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1682	static const RTLIB::Libcall Libcalls[`6`] = {
1683	RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1684	RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1685	RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1686	unsigned Size = getAtomicOpSize(CASI: I);
1687
1688	bool expanded = expandAtomicOpToLibcall(
1689	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getNewValOperand(),
1690	CASExpected: I->getCompareOperand(), Ordering: I->getSuccessOrdering(), Ordering2: I->getFailureOrdering(),
1691	Libcalls);
1692	if (!expanded)
1693	report_fatal_error(reason: "expandAtomicOpToLibcall shouldn't fail for CAS");
1694	}
1695
1696	static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1697	static const RTLIB::Libcall LibcallsXchg[`6`] = {
1698	RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
1699	RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1700	RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1701	static const RTLIB::Libcall LibcallsAdd[`6`] = {
1702	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
1703	RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1704	RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1705	static const RTLIB::Libcall LibcallsSub[`6`] = {
1706	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
1707	RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1708	RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1709	static const RTLIB::Libcall LibcallsAnd[`6`] = {
1710	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
1711	RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1712	RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1713	static const RTLIB::Libcall LibcallsOr[`6`] = {
1714	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
1715	RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1716	RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1717	static const RTLIB::Libcall LibcallsXor[`6`] = {
1718	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
1719	RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1720	RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1721	static const RTLIB::Libcall LibcallsNand[`6`] = {
1722	RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
1723	RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1724	RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1725
1726	switch (Op) {
1727	case AtomicRMWInst::BAD_BINOP:
1728	llvm_unreachable("Should not have BAD_BINOP.");
1729	case AtomicRMWInst::Xchg:
1730	return ArrayRef(LibcallsXchg);
1731	case AtomicRMWInst::Add:
1732	return ArrayRef(LibcallsAdd);
1733	case AtomicRMWInst::Sub:
1734	return ArrayRef(LibcallsSub);
1735	case AtomicRMWInst::And:
1736	return ArrayRef(LibcallsAnd);
1737	case AtomicRMWInst::Or:
1738	return ArrayRef(LibcallsOr);
1739	case AtomicRMWInst::Xor:
1740	return ArrayRef(LibcallsXor);
1741	case AtomicRMWInst::Nand:
1742	return ArrayRef(LibcallsNand);
1743	case AtomicRMWInst::Max:
1744	case AtomicRMWInst::Min:
1745	case AtomicRMWInst::UMax:
1746	case AtomicRMWInst::UMin:
1747	case AtomicRMWInst::FMax:
1748	case AtomicRMWInst::FMin:
1749	case AtomicRMWInst::FAdd:
1750	case AtomicRMWInst::FSub:
1751	case AtomicRMWInst::UIncWrap:
1752	case AtomicRMWInst::UDecWrap:
1753	// No atomic libcalls are available for max/min/umax/umin.
1754	return {};
1755	}
1756	llvm_unreachable("Unexpected AtomicRMW operation.");
1757	}
1758
1759	void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1760	ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(Op: I->getOperation());
1761
1762	unsigned Size = getAtomicOpSize(RMWI: I);
1763
1764	bool Success = false;
1765	if (!Libcalls.empty())
1766	Success = expandAtomicOpToLibcall(
1767	I, Size, Alignment: I->getAlign(), PointerOperand: I->getPointerOperand(), ValueOperand: I->getValOperand(),
1768	CASExpected: nullptr, Ordering: I->getOrdering(), Ordering2: AtomicOrdering::NotAtomic, Libcalls);
1769
1770	// The expansion failed: either there were no libcalls at all for
1771	// the operation (min/max), or there were only size-specialized
1772	// libcalls (add/sub/etc) and we needed a generic. So, expand to a
1773	// CAS libcall, via a CAS loop, instead.
1774	if (!Success) {
1775	expandAtomicRMWToCmpXchg(
1776	AI: I, CreateCmpXchg: [this](IRBuilderBase &Builder, Value Addr, Value Loaded,
1777	Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
1778	SyncScope::ID SSID, Value &Success, Value &NewLoaded) {
1779	// Create the CAS instruction normally...
1780	AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1781	Ptr: Addr, Cmp: Loaded, New: NewVal, Align: Alignment, SuccessOrdering: MemOpOrder,
1782	FailureOrdering: AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrdering: MemOpOrder), SSID);
1783	Success = Builder.CreateExtractValue(Agg: Pair, Idxs: `1`, Name: "success");
1784	NewLoaded = Builder.CreateExtractValue(Agg: Pair, Idxs: `0`, Name: "newloaded");
1785
1786	// ...and then expand the CAS into a libcall.
1787	expandAtomicCASToLibcall(I: Pair);
1788	});
1789	}
1790	}
1791
1792	// A helper routine for the above expandAtomicToLibcall functions.*
1793	//
1794	// 'Libcalls' contains an array of enum values for the particular
1795	// ATOMIC libcalls to be emitted. All of the other arguments besides
1796	// 'I' are extracted from the Instruction subclass by the
1797	// caller. Depending on the particular call, some will be null.
1798	bool AtomicExpandImpl::expandAtomicOpToLibcall(
1799	Instruction I, unsigned* Size, Align Alignment, Value *PointerOperand,
1800	Value ValueOperand, Value CASExpected, AtomicOrdering Ordering,
1801	AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
1802	assert(Libcalls.size() == `6`);
1803
1804	LLVMContext &Ctx = I->getContext();
1805	Module *M = I->getModule();
1806	const DataLayout &DL = M->getDataLayout();
1807	IRBuilder<> Builder(I);
1808	IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
1809
1810	bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
1811	Type SizedIntTy = Type::getIntNTy(C&: Ctx, N: Size `8`);
1812
1813	const Align AllocaAlignment = DL.getPrefTypeAlign(Ty: SizedIntTy);
1814
1815	// TODO: the "order" argument type is "int", not int32. So
1816	// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
1817	ConstantInt *SizeVal64 = ConstantInt::get(Ty: Type::getInt64Ty(C&: Ctx), V: Size);
1818	assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
1819	Constant *OrderingVal =
1820	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering));
1821	Constant Ordering2Val = nullptr*;
1822	if (CASExpected) {
1823	assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
1824	Ordering2Val =
1825	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: (int)toCABI(AO: Ordering2));
1826	}
1827	bool HasResult = I->getType() != Type::getVoidTy(C&: Ctx);
1828
1829	RTLIB::Libcall RTLibType;
1830	if (UseSizedLibcall) {
1831	switch (Size) {
1832	case `1`:
1833	RTLibType = Libcalls [`1`];
1834	break;
1835	case `2`:
1836	RTLibType = Libcalls [`2`];
1837	break;
1838	case `4`:
1839	RTLibType = Libcalls [`3`];
1840	break;
1841	case `8`:
1842	RTLibType = Libcalls [`4`];
1843	break;
1844	case `16`:
1845	RTLibType = Libcalls [`5`];
1846	break;
1847	}
1848	} else if (Libcalls [`0`] != RTLIB::UNKNOWN_LIBCALL) {
1849	RTLibType = Libcalls [`0`];
1850	} else {
1851	// Can't use sized function, and there's no generic for this
1852	// operation, so give up.
1853	return false;
1854	}
1855
1856	if (!TLI->getLibcallName(Call: RTLibType)) {
1857	// This target does not implement the requested atomic libcall so give up.
1858	return false;
1859	}
1860
1861	// Build up the function call. There's two kinds. First, the sized
1862	// variants. These calls are going to be one of the following (with
1863	// N=1,2,4,8,16):
1864	// iN __atomic_load_N(iN ptr, int ordering)*
1865	// void __atomic_store_N(iN ptr, iN val, int ordering)*
1866	// iN __atomic_{exchange\|fetch_}_N(iN ptr, iN val, int ordering)
1867	// bool __atomic_compare_exchange_N(iN ptr, iN expected, iN desired,
1868	// int success_order, int failure_order)
1869	//
1870	// Note that these functions can be used for non-integer atomic
1871	// operations, the values just need to be bitcast to integers on the
1872	// way in and out.
1873	//
1874	// And, then, the generic variants. They look like the following:
1875	// void __atomic_load(size_t size, void ptr, void ret, int ordering)
1876	// void __atomic_store(size_t size, void ptr, void val, int ordering)
1877	// void __atomic_exchange(size_t size, void ptr, void val, void ret,*
1878	// int ordering)
1879	// bool __atomic_compare_exchange(size_t size, void ptr, void expected,
1880	// void desired, int success_order,*
1881	// int failure_order)
1882	//
1883	// The different signatures are built up depending on the
1884	// 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
1885	// variables.
1886
1887	AllocaInst AllocaCASExpected = nullptr*;
1888	AllocaInst AllocaValue = nullptr*;
1889	AllocaInst AllocaResult = nullptr*;
1890
1891	Type *ResultTy;
1892	SmallVector<Value *, `6`> Args;
1893	AttributeList Attr;
1894
1895	// 'size' argument.
1896	if (!UseSizedLibcall) {
1897	// Note, getIntPtrType is assumed equivalent to size_t.
1898	Args.push_back(Elt: ConstantInt::get(Ty: DL.getIntPtrType(C&: Ctx), V: Size));
1899	}
1900
1901	// 'ptr' argument.
1902	// note: This assumes all address spaces share a common libfunc
1903	// implementation and that addresses are convertable. For systems without
1904	// that property, we'd need to extend this mechanism to support AS-specific
1905	// families of atomic intrinsics.
1906	Value *PtrVal = PointerOperand;
1907	PtrVal = Builder.CreateAddrSpaceCast(V: PtrVal, DestTy: PointerType::getUnqual(C&: Ctx));
1908	Args.push_back(Elt: PtrVal);
1909
1910	// 'expected' argument, if present.
1911	if (CASExpected) {
1912	AllocaCASExpected = AllocaBuilder.CreateAlloca(Ty: CASExpected->getType());
1913	AllocaCASExpected->setAlignment(AllocaAlignment);
1914	Builder.CreateLifetimeStart(Ptr: AllocaCASExpected, Size: SizeVal64);
1915	Builder.CreateAlignedStore(Val: CASExpected, Ptr: AllocaCASExpected, Align: AllocaAlignment);
1916	Args.push_back(Elt: AllocaCASExpected);
1917	}
1918
1919	// 'val' argument ('desired' for cas), if present.
1920	if (ValueOperand) {
1921	if (UseSizedLibcall) {
1922	Value *IntValue =
1923	Builder.CreateBitOrPointerCast(V: ValueOperand, DestTy: SizedIntTy);
1924	Args.push_back(Elt: IntValue);
1925	} else {
1926	AllocaValue = AllocaBuilder.CreateAlloca(Ty: ValueOperand->getType());
1927	AllocaValue->setAlignment(AllocaAlignment);
1928	Builder.CreateLifetimeStart(Ptr: AllocaValue, Size: SizeVal64);
1929	Builder.CreateAlignedStore(Val: ValueOperand, Ptr: AllocaValue, Align: AllocaAlignment);
1930	Args.push_back(Elt: AllocaValue);
1931	}
1932	}
1933
1934	// 'ret' argument.
1935	if (!CASExpected && HasResult && !UseSizedLibcall) {
1936	AllocaResult = AllocaBuilder.CreateAlloca(Ty: I->getType());
1937	AllocaResult->setAlignment(AllocaAlignment);
1938	Builder.CreateLifetimeStart(Ptr: AllocaResult, Size: SizeVal64);
1939	Args.push_back(Elt: AllocaResult);
1940	}
1941
1942	// 'ordering' ('success_order' for cas) argument.
1943	Args.push_back(Elt: OrderingVal);
1944
1945	// 'failure_order' argument, if present.
1946	if (Ordering2Val)
1947	Args.push_back(Elt: Ordering2Val);
1948
1949	// Now, the return type.
1950	if (CASExpected) {
1951	ResultTy = Type::getInt1Ty(C&: Ctx);
1952	Attr = Attr.addRetAttribute(Ctx, Attribute::ZExt);
1953	} else if (HasResult && UseSizedLibcall)
1954	ResultTy = SizedIntTy;
1955	else
1956	ResultTy = Type::getVoidTy(C&: Ctx);
1957
1958	// Done with setting up arguments and return types, create the call:
1959	SmallVector<Type *, `6`> ArgTys;
1960	for (Value *Arg : Args)
1961	ArgTys.push_back(Elt: Arg->getType());
1962	FunctionType FnType = FunctionType::get(Result: ResultTy, Params: ArgTys, isVarArg: false*);
1963	FunctionCallee LibcallFn =
1964	M->getOrInsertFunction(Name: TLI->getLibcallName(Call: RTLibType), T: FnType, AttributeList: Attr);
1965	CallInst *Call = Builder.CreateCall(Callee: LibcallFn, Args);
1966	Call->setAttributes(Attr);
1967	Value *Result = Call;
1968
1969	// And then, extract the results...
1970	if (ValueOperand && !UseSizedLibcall)
1971	Builder.CreateLifetimeEnd(Ptr: AllocaValue, Size: SizeVal64);
1972
1973	if (CASExpected) {
1974	// The final result from the CAS is {load of 'expected' alloca, bool result
1975	// from call}
1976	Type *FinalResultTy = I->getType();
1977	Value *V = PoisonValue::get(T: FinalResultTy);
1978	Value *ExpectedOut = Builder.CreateAlignedLoad(
1979	Ty: CASExpected->getType(), Ptr: AllocaCASExpected, Align: AllocaAlignment);
1980	Builder.CreateLifetimeEnd(Ptr: AllocaCASExpected, Size: SizeVal64);
1981	V = Builder.CreateInsertValue(Agg: V, Val: ExpectedOut, Idxs: `0`);
1982	V = Builder.CreateInsertValue(Agg: V, Val: Result, Idxs: `1`);
1983	I->replaceAllUsesWith(V);
1984	} else if (HasResult) {
1985	Value *V;
1986	if (UseSizedLibcall)
1987	V = Builder.CreateBitOrPointerCast(V: Result, DestTy: I->getType());
1988	else {
1989	V = Builder.CreateAlignedLoad(Ty: I->getType(), Ptr: AllocaResult,
1990	Align: AllocaAlignment);
1991	Builder.CreateLifetimeEnd(Ptr: AllocaResult, Size: SizeVal64);
1992	}
1993	I->replaceAllUsesWith(V);
1994	}
1995	I->eraseFromParent();
1996	return true;
1997	}
1998

source code of llvm/lib/CodeGen/AtomicExpandPass.cpp