SROA.cpp source code [llvm/lib/Transforms/Scalar/SROA.cpp]

1	//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	/// This transformation implements the well known scalar replacement of
10	/// aggregates transformation. It tries to identify promotable elements of an
11	/// aggregate alloca, and promote them to registers. It will also try to
12	/// convert uses of an element (or set of elements) of an alloca into a vector
13	/// or bitfield-style integer scalar if appropriate.
14	///
15	/// It works to do this with minimal slicing of the alloca so that regions
16	/// which are merely transferred in and out of external memory remain unchanged
17	/// and are not decomposed to scalar code.
18	///
19	/// Because this also performs alloca promotion, it can be thought of as also
20	/// serving the purpose of SSA formation. The algorithm iterates on the
21	/// function until all opportunities for promotion have been realized.
22	///
23	//===----------------------------------------------------------------------===//
24
25	#include "llvm/Transforms/Scalar/SROA.h"
26	#include "llvm/ADT/APInt.h"
27	#include "llvm/ADT/ArrayRef.h"
28	#include "llvm/ADT/DenseMap.h"
29	#include "llvm/ADT/MapVector.h"
30	#include "llvm/ADT/PointerIntPair.h"
31	#include "llvm/ADT/STLExtras.h"
32	#include "llvm/ADT/SetVector.h"
33	#include "llvm/ADT/SmallBitVector.h"
34	#include "llvm/ADT/SmallPtrSet.h"
35	#include "llvm/ADT/SmallVector.h"
36	#include "llvm/ADT/Statistic.h"
37	#include "llvm/ADT/StringRef.h"
38	#include "llvm/ADT/Twine.h"
39	#include "llvm/ADT/iterator.h"
40	#include "llvm/ADT/iterator_range.h"
41	#include "llvm/Analysis/AssumptionCache.h"
42	#include "llvm/Analysis/DomTreeUpdater.h"
43	#include "llvm/Analysis/GlobalsModRef.h"
44	#include "llvm/Analysis/Loads.h"
45	#include "llvm/Analysis/PtrUseVisitor.h"
46	#include "llvm/Config/llvm-config.h"
47	#include "llvm/IR/BasicBlock.h"
48	#include "llvm/IR/Constant.h"
49	#include "llvm/IR/ConstantFolder.h"
50	#include "llvm/IR/Constants.h"
51	#include "llvm/IR/DIBuilder.h"
52	#include "llvm/IR/DataLayout.h"
53	#include "llvm/IR/DebugInfo.h"
54	#include "llvm/IR/DebugInfoMetadata.h"
55	#include "llvm/IR/DerivedTypes.h"
56	#include "llvm/IR/Dominators.h"
57	#include "llvm/IR/Function.h"
58	#include "llvm/IR/GetElementPtrTypeIterator.h"
59	#include "llvm/IR/GlobalAlias.h"
60	#include "llvm/IR/IRBuilder.h"
61	#include "llvm/IR/InstVisitor.h"
62	#include "llvm/IR/Instruction.h"
63	#include "llvm/IR/Instructions.h"
64	#include "llvm/IR/IntrinsicInst.h"
65	#include "llvm/IR/LLVMContext.h"
66	#include "llvm/IR/Metadata.h"
67	#include "llvm/IR/Module.h"
68	#include "llvm/IR/Operator.h"
69	#include "llvm/IR/PassManager.h"
70	#include "llvm/IR/Type.h"
71	#include "llvm/IR/Use.h"
72	#include "llvm/IR/User.h"
73	#include "llvm/IR/Value.h"
74	#include "llvm/IR/ValueHandle.h"
75	#include "llvm/InitializePasses.h"
76	#include "llvm/Pass.h"
77	#include "llvm/Support/Casting.h"
78	#include "llvm/Support/CommandLine.h"
79	#include "llvm/Support/Compiler.h"
80	#include "llvm/Support/Debug.h"
81	#include "llvm/Support/ErrorHandling.h"
82	#include "llvm/Support/raw_ostream.h"
83	#include "llvm/Transforms/Scalar.h"
84	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
85	#include "llvm/Transforms/Utils/Local.h"
86	#include "llvm/Transforms/Utils/PromoteMemToReg.h"
87	#include <algorithm>
88	#include <cassert>
89	#include <cstddef>
90	#include <cstdint>
91	#include <cstring>
92	#include <iterator>
93	#include <string>
94	#include <tuple>
95	#include <utility>
96	#include <variant>
97	#include <vector>
98
99	using namespace llvm;
100
101	#define DEBUG_TYPE "sroa"
102
103	STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
104	STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
105	STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
106	STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
107	STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
108	STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
109	STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
110	STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
111	STATISTIC(NumLoadsPredicated,
112	"Number of loads rewritten into predicated loads to allow promotion");
113	STATISTIC(
114	NumStoresPredicated,
115	"Number of stores rewritten into predicated loads to allow promotion");
116	STATISTIC(NumDeleted, "Number of instructions deleted");
117	STATISTIC(NumVectorized, "Number of vectorized aggregates");
118
119	/// Hidden option to experiment with completely strict handling of inbounds
120	/// GEPs.
121	static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(Val: false),
122	cl::Hidden);
123	/// Disable running mem2reg during SROA in order to test or debug SROA.
124	static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
125	cl::Hidden);
126	namespace {
127
128	class AllocaSliceRewriter;
129	class AllocaSlices;
130	class Partition;
131
132	class SelectHandSpeculativity {
133	unsigned char Storage = `0`; // None are speculatable by default.
134	using TrueVal = Bitfield::Element<bool, `0`, `1`>; // Low 0'th bit.
135	using FalseVal = Bitfield::Element<bool, `1`, `1`>; // Low 1'th bit.
136	public:
137	SelectHandSpeculativity() = default;
138	SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
139	bool isSpeculatable(bool isTrueVal) const;
140	bool areAllSpeculatable() const;
141	bool areAnySpeculatable() const;
142	bool areNoneSpeculatable() const;
143	// For interop as int half of PointerIntPair.
144	explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
145	explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
146	};
147	static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
148
149	using PossiblySpeculatableLoad =
150	PointerIntPair<LoadInst *, `2`, SelectHandSpeculativity>;
151	using UnspeculatableStore = StoreInst *;
152	using RewriteableMemOp =
153	std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
154	using RewriteableMemOps = SmallVector<RewriteableMemOp, `2`>;
155
156	/// An optimization pass providing Scalar Replacement of Aggregates.
157	///
158	/// This pass takes allocations which can be completely analyzed (that is, they
159	/// don't escape) and tries to turn them into scalar SSA values. There are
160	/// a few steps to this process.
161	///
162	/// 1) It takes allocations of aggregates and analyzes the ways in which they
163	/// are used to try to split them into smaller allocations, ideally of
164	/// a single scalar data type. It will split up memcpy and memset accesses
165	/// as necessary and try to isolate individual scalar accesses.
166	/// 2) It will transform accesses into forms which are suitable for SSA value
167	/// promotion. This can be replacing a memset with a scalar store of an
168	/// integer value, or it can involve speculating operations on a PHI or
169	/// select to be a PHI or select of the results.
170	/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
171	/// onto insert and extract operations on a vector value, and convert them to
172	/// this form. By doing so, it will enable promotion of vector aggregates to
173	/// SSA vector values.
174	class SROA {
175	LLVMContext *const C;
176	DomTreeUpdater *const DTU;
177	AssumptionCache *const AC;
178	const bool PreserveCFG;
179
180	/// Worklist of alloca instructions to simplify.
181	///
182	/// Each alloca in the function is added to this. Each new alloca formed gets
183	/// added to it as well to recursively simplify unless that alloca can be
184	/// directly promoted. Finally, each time we rewrite a use of an alloca other
185	/// the one being actively rewritten, we add it back onto the list if not
186	/// already present to ensure it is re-visited.
187	SmallSetVector<AllocaInst *, `16`> Worklist;
188
189	/// A collection of instructions to delete.
190	/// We try to batch deletions to simplify code and make things a bit more
191	/// efficient. We also make sure there is no dangling pointers.
192	SmallVector<WeakVH, `8`> DeadInsts;
193
194	/// Post-promotion worklist.
195	///
196	/// Sometimes we discover an alloca which has a high probability of becoming
197	/// viable for SROA after a round of promotion takes place. In those cases,
198	/// the alloca is enqueued here for re-processing.
199	///
200	/// Note that we have to be very careful to clear allocas out of this list in
201	/// the event they are deleted.
202	SmallSetVector<AllocaInst *, `16`> PostPromotionWorklist;
203
204	/// A collection of alloca instructions we can directly promote.
205	std::vector<AllocaInst *> PromotableAllocas;
206
207	/// A worklist of PHIs to speculate prior to promoting allocas.
208	///
209	/// All of these PHIs have been checked for the safety of speculation and by
210	/// being speculated will allow promoting allocas currently in the promotable
211	/// queue.
212	SmallSetVector<PHINode *, `8`> SpeculatablePHIs;
213
214	/// A worklist of select instructions to rewrite prior to promoting
215	/// allocas.
216	SmallMapVector<SelectInst *, RewriteableMemOps, `8`> SelectsToRewrite;
217
218	/// Select instructions that use an alloca and are subsequently loaded can be
219	/// rewritten to load both input pointers and then select between the result,
220	/// allowing the load of the alloca to be promoted.
221	/// From this:
222	/// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
223	/// %V = load <type>, ptr %P2
224	/// to:
225	/// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
226	/// %V2 = load <type>, ptr %Other
227	/// %V = select i1 %cond, <type> %V1, <type> %V2
228	///
229	/// We can do this to a select if its only uses are loads
230	/// and if either the operand to the select can be loaded unconditionally,
231	/// or if we are allowed to perform CFG modifications.
232	/// If found an intervening bitcast with a single use of the load,
233	/// allow the promotion.
234	static std::optional<RewriteableMemOps>
235	isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
236
237	public:
238	SROA(LLVMContext C, DomTreeUpdater DTU, AssumptionCache *AC,
239	SROAOptions PreserveCFG_)
240	: C(C), DTU(DTU), AC(AC),
241	PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
242
243	/// Main run method used by both the SROAPass and by the legacy pass.
244	std::pair<bool /Changed/, bool /CFGChanged/> runSROA(Function &F);
245
246	private:
247	friend class AllocaSliceRewriter;
248
249	bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
250	AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
251	bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
252	std::pair<bool /Changed/, bool /CFGChanged/> runOnAlloca(AllocaInst &AI);
253	void clobberUse(Use &U);
254	bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
255	bool promoteAllocas(Function &F);
256	};
257
258	} // end anonymous namespace
259
260	/// Calculate the fragment of a variable to use when slicing a store
261	/// based on the slice dimensions, existing fragment, and base storage
262	/// fragment.
263	/// Results:
264	/// UseFrag - Use Target as the new fragment.
265	/// UseNoFrag - The new slice already covers the whole variable.
266	/// Skip - The new alloca slice doesn't include this variable.
267	/// FIXME: Can we use calculateFragmentIntersect instead?
268	namespace {
269	enum FragCalcResult { UseFrag, UseNoFrag, Skip };
270	}
271	static FragCalcResult
272	calculateFragment(DILocalVariable *Variable,
273	uint64_t NewStorageSliceOffsetInBits,
274	uint64_t NewStorageSliceSizeInBits,
275	std::optional<DIExpression::FragmentInfo> StorageFragment,
276	std::optional<DIExpression::FragmentInfo> CurrentFragment,
277	DIExpression::FragmentInfo &Target) {
278	// If the base storage describes part of the variable apply the offset and
279	// the size constraint.
280	if (StorageFragment) {
281	Target.SizeInBits =
282	std::min(a: NewStorageSliceSizeInBits, b: StorageFragment ->SizeInBits);
283	Target.OffsetInBits =
284	NewStorageSliceOffsetInBits + StorageFragment ->OffsetInBits;
285	} else {
286	Target.SizeInBits = NewStorageSliceSizeInBits;
287	Target.OffsetInBits = NewStorageSliceOffsetInBits;
288	}
289
290	// If this slice extracts the entirety of an independent variable from a
291	// larger alloca, do not produce a fragment expression, as the variable is
292	// not fragmented.
293	if (!CurrentFragment) {
294	if (auto Size = Variable->getSizeInBits()) {
295	// Treat the current fragment as covering the whole variable.
296	CurrentFragment = DIExpression::FragmentInfo (*Size, `0`);
297	if (Target == CurrentFragment)
298	return UseNoFrag;
299	}
300	}
301
302	// No additional work to do if there isn't a fragment already, or there is
303	// but it already exactly describes the new assignment.
304	if (!CurrentFragment \|\| *CurrentFragment == Target)
305	return UseFrag;
306
307	// Reject the target fragment if it doesn't fit wholly within the current
308	// fragment. TODO: We could instead chop up the target to fit in the case of
309	// a partial overlap.
310	if (Target.startInBits() < CurrentFragment ->startInBits() \|\|
311	Target.endInBits() > CurrentFragment ->endInBits())
312	return Skip;
313
314	// Target fits within the current fragment, return it.
315	return UseFrag;
316	}
317
318	static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
319	return DebugVariable (DVI->getVariable(), std::nullopt,
320	DVI->getDebugLoc().getInlinedAt());
321	}
322	static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
323	return DebugVariable (DVR->getVariable(), std::nullopt,
324	DVR->getDebugLoc().getInlinedAt());
325	}
326
327	/// Helpers for handling new and old debug info modes in migrateDebugInfo.
328	/// These overloads unwrap a DbgInstPtr {Instruction \| DbgRecord} union based
329	/// on the \p Unused parameter type.
330	DbgVariableRecord UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord Unused) {
331	(void)Unused;
332	return static_cast<DbgVariableRecord >(cast<DbgRecord >(Val&: P));
333	}
334	DbgAssignIntrinsic UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic Unused) {
335	(void)Unused;
336	return static_cast<DbgAssignIntrinsic >(cast<Instruction >(Val&: P));
337	}
338
339	/// Find linked dbg.assign and generate a new one with the correct
340	/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
341	/// value component is copied from the old dbg.assign to the new.
342	/// \param OldAlloca Alloca for the variable before splitting.
343	/// \param IsSplit True if the store (not necessarily alloca)
344	/// is being split.
345	/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
346	/// \param SliceSizeInBits New number of bits being written to.
347	/// \param OldInst Instruction that is being split.
348	/// \param Inst New instruction performing this part of the
349	/// split store.
350	/// \param Dest Store destination.
351	/// \param Value Stored value.
352	/// \param DL Datalayout.
353	static void migrateDebugInfo(AllocaInst OldAlloca, bool* IsSplit,
354	uint64_t OldAllocaOffsetInBits,
355	uint64_t SliceSizeInBits, Instruction *OldInst,
356	Instruction Inst, Value Dest, Value *Value,
357	const DataLayout &DL) {
358	auto MarkerRange = at::getAssignmentMarkers(Inst: OldInst);
359	auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
360	// Nothing to do if OldInst has no linked dbg.assign intrinsics.
361	if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
362	return;
363
364	LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
365	LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
366	LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
367	LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
368	<< "\n");
369	LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
370	LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
371	LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
372	LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
373	if (Value)
374	LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
375
376	/// Map of aggregate variables to their fragment associated with OldAlloca.
377	DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
378	BaseFragments;
379	for (auto *DAI : at::getAssignmentMarkers(Inst: OldAlloca))
380	BaseFragments [getAggregateVariable(DVI: DAI)] =
381	DAI->getExpression()->getFragmentInfo();
382	for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
383	BaseFragments [getAggregateVariable(DVR)] =
384	DVR->getExpression()->getFragmentInfo();
385
386	// The new inst needs a DIAssignID unique metadata tag (if OldInst has
387	// one). It shouldn't already have one: assert this assumption.
388	assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
389	DIAssignID NewID = nullptr*;
390	auto &Ctx = Inst->getContext();
391	DIBuilder DIB(OldInst->getModule(), /AllowUnresolved/* false);
392	assert(OldAlloca->isStaticAlloca());
393
394	auto MigrateDbgAssign = [&](auto *DbgAssign) {
395	LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
396	<< "\n");
397	auto *Expr = DbgAssign->getExpression();
398	bool SetKillLocation = false;
399
400	if (IsSplit) {
401	std::optional<DIExpression::FragmentInfo> BaseFragment;
402	{
403	auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
404	if (R == BaseFragments.end())
405	return;
406	BaseFragment = R->second;
407	}
408	std::optional<DIExpression::FragmentInfo> CurrentFragment =
409	Expr->getFragmentInfo();
410	DIExpression::FragmentInfo NewFragment;
411	FragCalcResult Result = calculateFragment(
412	DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
413	BaseFragment, CurrentFragment, NewFragment);
414
415	if (Result == Skip)
416	return;
417	if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
418	if (CurrentFragment) {
419	// Rewrite NewFragment to be relative to the existing one (this is
420	// what createFragmentExpression wants). CalculateFragment has
421	// already resolved the size for us. FIXME: Should it return the
422	// relative fragment too?
423	NewFragment.OffsetInBits -= CurrentFragment ->OffsetInBits;
424	}
425	// Add the new fragment info to the existing expression if possible.
426	if (auto E = DIExpression::createFragmentExpression(
427	Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
428	Expr = *E;
429	} else {
430	// Otherwise, add the new fragment info to an empty expression and
431	// discard the value component of this dbg.assign as the value cannot
432	// be computed with the new fragment.
433	Expr = *DIExpression::createFragmentExpression(
434	Expr: DIExpression::get(Context&: Expr->getContext(), Elements: std::nullopt),
435	OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
436	SetKillLocation = true;
437	}
438	}
439	}
440
441	// If we haven't created a DIAssignID ID do that now and attach it to Inst.
442	if (!NewID) {
443	NewID = DIAssignID::getDistinct(Context&: Ctx);
444	Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
445	}
446
447	::Value *NewValue = Value ? Value : DbgAssign->getValue();
448	auto *NewAssign = UnwrapDbgInstPtr(
449	DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
450	Addr: Dest,
451	AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: std::nullopt),
452	DL: DbgAssign->getDebugLoc()),
453	DbgAssign);
454
455	// If we've updated the value but the original dbg.assign has an arglist
456	// then kill it now - we can't use the requested new value.
457	// We can't replace the DIArgList with the new value as it'd leave
458	// the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
459	// an arglist). And we can't keep the DIArgList in case the linked store
460	// is being split - in which case the DIArgList + expression may no longer
461	// be computing the correct value.
462	// This should be a very rare situation as it requires the value being
463	// stored to differ from the dbg.assign (i.e., the value has been
464	// represented differently in the debug intrinsic for some reason).
465	SetKillLocation \|=
466	Value && (DbgAssign->hasArgList() \|\|
467	!DbgAssign->getExpression()->isSingleLocationExpression());
468	if (SetKillLocation)
469	NewAssign->setKillLocation();
470
471	// We could use more precision here at the cost of some additional (code)
472	// complexity - if the original dbg.assign was adjacent to its store, we
473	// could position this new dbg.assign adjacent to its store rather than the
474	// old dbg.assgn. That would result in interleaved dbg.assigns rather than
475	// what we get now:
476	// split store !1
477	// split store !2
478	// dbg.assign !1
479	// dbg.assign !2
480	// This (current behaviour) results results in debug assignments being
481	// noted as slightly offset (in code) from the store. In practice this
482	// should have little effect on the debugging experience due to the fact
483	// that all the split stores should get the same line number.
484	NewAssign->moveBefore(DbgAssign);
485
486	NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
487	LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
488	};
489
490	for_each(Range&: MarkerRange, F: MigrateDbgAssign);
491	for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
492	}
493
494	namespace {
495
496	/// A custom IRBuilder inserter which prefixes all names, but only in
497	/// Assert builds.
498	class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
499	std::string Prefix;
500
501	Twine getNameWithPrefix(const Twine &Name) const {
502	return Name.isTriviallyEmpty() ? Name : Prefix + Name;
503	}
504
505	public:
506	void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
507
508	void InsertHelper(Instruction I, const* Twine &Name, BasicBlock *BB,
509	BasicBlock::iterator InsertPt) const override {
510	IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name), BB,
511	InsertPt);
512	}
513	};
514
515	/// Provide a type for IRBuilder that drops names in release builds.
516	using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
517
518	/// A used slice of an alloca.
519	///
520	/// This structure represents a slice of an alloca used by some instruction. It
521	/// stores both the begin and end offsets of this use, a pointer to the use
522	/// itself, and a flag indicating whether we can classify the use as splittable
523	/// or not when forming partitions of the alloca.
524	class Slice {
525	/// The beginning offset of the range.
526	uint64_t BeginOffset = `0`;
527
528	/// The ending offset, not included in the range.
529	uint64_t EndOffset = `0`;
530
531	/// Storage for both the use of this slice and whether it can be
532	/// split.
533	PointerIntPair<Use , `1`, bool*> UseAndIsSplittable;
534
535	public:
536	Slice() = default;
537
538	Slice(uint64_t BeginOffset, uint64_t EndOffset, Use U, bool* IsSplittable)
539	: BeginOffset(BeginOffset), EndOffset(EndOffset),
540	UseAndIsSplittable (U, IsSplittable) {}
541
542	uint64_t beginOffset() const { return BeginOffset; }
543	uint64_t endOffset() const { return EndOffset; }
544
545	bool isSplittable() const { return UseAndIsSplittable.getInt(); }
546	void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
547
548	Use getUse() const* { return UseAndIsSplittable.getPointer(); }
549
550	bool isDead() const { return getUse() == nullptr; }
551	void kill() { UseAndIsSplittable.setPointer(nullptr); }
552
553	/// Support for ordering ranges.
554	///
555	/// This provides an ordering over ranges such that start offsets are
556	/// always increasing, and within equal start offsets, the end offsets are
557	/// decreasing. Thus the spanning range comes first in a cluster with the
558	/// same start position.
559	bool operator<(const Slice &RHS) const {
560	if (beginOffset() < RHS.beginOffset())
561	return true;
562	if (beginOffset() > RHS.beginOffset())
563	return false;
564	if (isSplittable() != RHS.isSplittable())
565	return !isSplittable();
566	if (endOffset() > RHS.endOffset())
567	return true;
568	return false;
569	}
570
571	/// Support comparison with a single offset to allow binary searches.
572	friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
573	uint64_t RHSOffset) {
574	return LHS.beginOffset() < RHSOffset;
575	}
576	friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
577	const Slice &RHS) {
578	return LHSOffset < RHS.beginOffset();
579	}
580
581	bool operator==(const Slice &RHS) const {
582	return isSplittable() == RHS.isSplittable() &&
583	beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
584	}
585	bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
586	};
587
588	/// Representation of the alloca slices.
589	///
590	/// This class represents the slices of an alloca which are formed by its
591	/// various uses. If a pointer escapes, we can't fully build a representation
592	/// for the slices used and we reflect that in this structure. The uses are
593	/// stored, sorted by increasing beginning offset and with unsplittable slices
594	/// starting at a particular offset before splittable slices.
595	class AllocaSlices {
596	public:
597	/// Construct the slices of a particular alloca.
598	AllocaSlices(const DataLayout &DL, AllocaInst &AI);
599
600	/// Test whether a pointer to the allocation escapes our analysis.
601	///
602	/// If this is true, the slices are never fully built and should be
603	/// ignored.
604	bool isEscaped() const { return PointerEscapingInstr; }
605
606	/// Support for iterating over the slices.
607	/// @{
608	using iterator = SmallVectorImpl<Slice>::iterator;
609	using range = iterator_range<iterator>;
610
611	iterator begin() { return Slices.begin(); }
612	iterator end() { return Slices.end(); }
613
614	using const_iterator = SmallVectorImpl<Slice>::const_iterator;
615	using const_range = iterator_range<const_iterator>;
616
617	const_iterator begin() const { return Slices.begin(); }
618	const_iterator end() const { return Slices.end(); }
619	/// @}
620
621	/// Erase a range of slices.
622	void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
623
624	/// Insert new slices for this alloca.
625	///
626	/// This moves the slices into the alloca's slices collection, and re-sorts
627	/// everything so that the usual ordering properties of the alloca's slices
628	/// hold.
629	void insert(ArrayRef<Slice> NewSlices) {
630	int OldSize = Slices.size();
631	Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
632	auto SliceI = Slices.begin() + OldSize;
633	llvm::sort(Start: SliceI, End: Slices.end());
634	std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
635	}
636
637	// Forward declare the iterator and range accessor for walking the
638	// partitions.
639	class partition_iterator;
640	iterator_range<partition_iterator> partitions();
641
642	/// Access the dead users for this alloca.
643	ArrayRef<Instruction > getDeadUsers() const* { return DeadUsers; }
644
645	/// Access Uses that should be dropped if the alloca is promotable.
646	ArrayRef<Use > getDeadUsesIfPromotable() const* {
647	return DeadUseIfPromotable;
648	}
649
650	/// Access the dead operands referring to this alloca.
651	///
652	/// These are operands which have cannot actually be used to refer to the
653	/// alloca as they are outside its range and the user doesn't correct for
654	/// that. These mostly consist of PHI node inputs and the like which we just
655	/// need to replace with undef.
656	ArrayRef<Use > getDeadOperands() const* { return DeadOperands; }
657
658	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
659	void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
660	void printSlice(raw_ostream &OS, const_iterator I,
661	StringRef Indent = " ") const;
662	void printUse(raw_ostream &OS, const_iterator I,
663	StringRef Indent = " ") const;
664	void print(raw_ostream &OS) const;
665	void dump(const_iterator I) const;
666	void dump() const;
667	#endif
668
669	private:
670	template <typename DerivedT, typename RetT = void> class BuilderBase;
671	class SliceBuilder;
672
673	friend class AllocaSlices::SliceBuilder;
674
675	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
676	/// Handle to alloca instruction to simplify method interfaces.
677	AllocaInst &AI;
678	#endif
679
680	/// The instruction responsible for this alloca not having a known set
681	/// of slices.
682	///
683	/// When an instruction (potentially) escapes the pointer to the alloca, we
684	/// store a pointer to that here and abort trying to form slices of the
685	/// alloca. This will be null if the alloca slices are analyzed successfully.
686	Instruction *PointerEscapingInstr;
687
688	/// The slices of the alloca.
689	///
690	/// We store a vector of the slices formed by uses of the alloca here. This
691	/// vector is sorted by increasing begin offset, and then the unsplittable
692	/// slices before the splittable ones. See the Slice inner class for more
693	/// details.
694	SmallVector<Slice, `8`> Slices;
695
696	/// Instructions which will become dead if we rewrite the alloca.
697	///
698	/// Note that these are not separated by slice. This is because we expect an
699	/// alloca to be completely rewritten or not rewritten at all. If rewritten,
700	/// all these instructions can simply be removed and replaced with poison as
701	/// they come from outside of the allocated space.
702	SmallVector<Instruction *, `8`> DeadUsers;
703
704	/// Uses which will become dead if can promote the alloca.
705	SmallVector<Use *, `8`> DeadUseIfPromotable;
706
707	/// Operands which will become dead if we rewrite the alloca.
708	///
709	/// These are operands that in their particular use can be replaced with
710	/// poison when we rewrite the alloca. These show up in out-of-bounds inputs
711	/// to PHI nodes and the like. They aren't entirely dead (there might be
712	/// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
713	/// want to swap this particular input for poison to simplify the use lists of
714	/// the alloca.
715	SmallVector<Use *, `8`> DeadOperands;
716	};
717
718	/// A partition of the slices.
719	///
720	/// An ephemeral representation for a range of slices which can be viewed as
721	/// a partition of the alloca. This range represents a span of the alloca's
722	/// memory which cannot be split, and provides access to all of the slices
723	/// overlapping some part of the partition.
724	///
725	/// Objects of this type are produced by traversing the alloca's slices, but
726	/// are only ephemeral and not persistent.
727	class Partition {
728	private:
729	friend class AllocaSlices;
730	friend class AllocaSlices::partition_iterator;
731
732	using iterator = AllocaSlices::iterator;
733
734	/// The beginning and ending offsets of the alloca for this
735	/// partition.
736	uint64_t BeginOffset = `0`, EndOffset = `0`;
737
738	/// The start and end iterators of this partition.
739	iterator SI, SJ;
740
741	/// A collection of split slice tails overlapping the partition.
742	SmallVector<Slice *, `4`> SplitTails;
743
744	/// Raw constructor builds an empty partition starting and ending at
745	/// the given iterator.
746	Partition(iterator SI) : SI(SI), SJ(SI) {}
747
748	public:
749	/// The start offset of this partition.
750	///
751	/// All of the contained slices start at or after this offset.
752	uint64_t beginOffset() const { return BeginOffset; }
753
754	/// The end offset of this partition.
755	///
756	/// All of the contained slices end at or before this offset.
757	uint64_t endOffset() const { return EndOffset; }
758
759	/// The size of the partition.
760	///
761	/// Note that this can never be zero.
762	uint64_t size() const {
763	assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
764	return EndOffset - BeginOffset;
765	}
766
767	/// Test whether this partition contains no slices, and merely spans
768	/// a region occupied by split slices.
769	bool empty() const { return SI == SJ; }
770
771	/// \name Iterate slices that start within the partition.
772	/// These may be splittable or unsplittable. They have a begin offset >= the
773	/// partition begin offset.
774	/// @{
775	// FIXME: We should probably define a "concat_iterator" helper and use that
776	// to stitch together pointee_iterators over the split tails and the
777	// contiguous iterators of the partition. That would give a much nicer
778	// interface here. We could then additionally expose filtered iterators for
779	// split, unsplit, and unsplittable splices based on the usage patterns.
780	iterator begin() const { return SI; }
781	iterator end() const { return SJ; }
782	/// @}
783
784	/// Get the sequence of split slice tails.
785	///
786	/// These tails are of slices which start before this partition but are
787	/// split and overlap into the partition. We accumulate these while forming
788	/// partitions.
789	ArrayRef<Slice > splitSliceTails() const* { return SplitTails; }
790	};
791
792	} // end anonymous namespace
793
794	/// An iterator over partitions of the alloca's slices.
795	///
796	/// This iterator implements the core algorithm for partitioning the alloca's
797	/// slices. It is a forward iterator as we don't support backtracking for
798	/// efficiency reasons, and re-use a single storage area to maintain the
799	/// current set of split slices.
800	///
801	/// It is templated on the slice iterator type to use so that it can operate
802	/// with either const or non-const slice iterators.
803	class AllocaSlices::partition_iterator
804	: public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
805	Partition> {
806	friend class AllocaSlices;
807
808	/// Most of the state for walking the partitions is held in a class
809	/// with a nice interface for examining them.
810	Partition P;
811
812	/// We need to keep the end of the slices to know when to stop.
813	AllocaSlices::iterator SE;
814
815	/// We also need to keep track of the maximum split end offset seen.
816	/// FIXME: Do we really?
817	uint64_t MaxSplitSliceEndOffset = `0`;
818
819	/// Sets the partition to be empty at given iterator, and sets the
820	/// end iterator.
821	partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
822	: P (SI), SE(SE) {
823	// If not already at the end, advance our state to form the initial
824	// partition.
825	if (SI != SE)
826	advance();
827	}
828
829	/// Advance the iterator to the next partition.
830	///
831	/// Requires that the iterator not be at the end of the slices.
832	void advance() {
833	assert((P.SI != SE \|\| !P.SplitTails.empty()) &&
834	"Cannot advance past the end of the slices!");
835
836	// Clear out any split uses which have ended.
837	if (!P.SplitTails.empty()) {
838	if (P.EndOffset >= MaxSplitSliceEndOffset) {
839	// If we've finished all splits, this is easy.
840	P.SplitTails.clear();
841	MaxSplitSliceEndOffset = `0`;
842	} else {
843	// Remove the uses which have ended in the prior partition. This
844	// cannot change the max split slice end because we just checked that
845	// the prior partition ended prior to that max.
846	llvm::erase_if(C&: P.SplitTails,
847	P: [&](Slice S) { return* S->endOffset() <= P.EndOffset; });
848	assert(llvm::any_of(P.SplitTails,
849	[&](Slice *S) {
850	return S->endOffset() == MaxSplitSliceEndOffset;
851	}) &&
852	"Could not find the current max split slice offset!");
853	assert(llvm::all_of(P.SplitTails,
854	[&](Slice *S) {
855	return S->endOffset() <= MaxSplitSliceEndOffset;
856	}) &&
857	"Max split slice end offset is not actually the max!");
858	}
859	}
860
861	// If P.SI is already at the end, then we've cleared the split tail and
862	// now have an end iterator.
863	if (P.SI == SE) {
864	assert(P.SplitTails.empty() && "Failed to clear the split slices!");
865	return;
866	}
867
868	// If we had a non-empty partition previously, set up the state for
869	// subsequent partitions.
870	if (P.SI != P.SJ) {
871	// Accumulate all the splittable slices which started in the old
872	// partition into the split list.
873	for (Slice &S : P)
874	if (S.isSplittable() && S.endOffset() > P.EndOffset) {
875	P.SplitTails.push_back(Elt: &S);
876	MaxSplitSliceEndOffset =
877	std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
878	}
879
880	// Start from the end of the previous partition.
881	P.SI = P.SJ;
882
883	// If P.SI is now at the end, we at most have a tail of split slices.
884	if (P.SI == SE) {
885	P.BeginOffset = P.EndOffset;
886	P.EndOffset = MaxSplitSliceEndOffset;
887	return;
888	}
889
890	// If the we have split slices and the next slice is after a gap and is
891	// not splittable immediately form an empty partition for the split
892	// slices up until the next slice begins.
893	if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
894	!P.SI->isSplittable()) {
895	P.BeginOffset = P.EndOffset;
896	P.EndOffset = P.SI->beginOffset();
897	return;
898	}
899	}
900
901	// OK, we need to consume new slices. Set the end offset based on the
902	// current slice, and step SJ past it. The beginning offset of the
903	// partition is the beginning offset of the next slice unless we have
904	// pre-existing split slices that are continuing, in which case we begin
905	// at the prior end offset.
906	P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
907	P.EndOffset = P.SI->endOffset();
908	++P.SJ;
909
910	// There are two strategies to form a partition based on whether the
911	// partition starts with an unsplittable slice or a splittable slice.
912	if (!P.SI->isSplittable()) {
913	// When we're forming an unsplittable region, it must always start at
914	// the first slice and will extend through its end.
915	assert(P.BeginOffset == P.SI->beginOffset());
916
917	// Form a partition including all of the overlapping slices with this
918	// unsplittable slice.
919	while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
920	if (!P.SJ->isSplittable())
921	P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
922	++P.SJ;
923	}
924
925	// We have a partition across a set of overlapping unsplittable
926	// partitions.
927	return;
928	}
929
930	// If we're starting with a splittable slice, then we need to form
931	// a synthetic partition spanning it and any other overlapping splittable
932	// splices.
933	assert(P.SI->isSplittable() && "Forming a splittable partition!");
934
935	// Collect all of the overlapping splittable slices.
936	while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
937	P.SJ->isSplittable()) {
938	P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
939	++P.SJ;
940	}
941
942	// Back upiP.EndOffset if we ended the span early when encountering an
943	// unsplittable slice. This synthesizes the early end offset of
944	// a partition spanning only splittable slices.
945	if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
946	assert(!P.SJ->isSplittable());
947	P.EndOffset = P.SJ->beginOffset();
948	}
949	}
950
951	public:
952	bool operator==(const partition_iterator &RHS) const {
953	assert(SE == RHS.SE &&
954	"End iterators don't match between compared partition iterators!");
955
956	// The observed positions of partitions is marked by the P.SI iterator and
957	// the emptiness of the split slices. The latter is only relevant when
958	// P.SI == SE, as the end iterator will additionally have an empty split
959	// slices list, but the prior may have the same P.SI and a tail of split
960	// slices.
961	if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
962	assert(P.SJ == RHS.P.SJ &&
963	"Same set of slices formed two different sized partitions!");
964	assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
965	"Same slice position with differently sized non-empty split "
966	"slice tails!");
967	return true;
968	}
969	return false;
970	}
971
972	partition_iterator &operator++() {
973	advance();
974	return *this;
975	}
976
977	Partition &operator() { return* P; }
978	};
979
980	/// A forward range over the partitions of the alloca's slices.
981	///
982	/// This accesses an iterator range over the partitions of the alloca's
983	/// slices. It computes these partitions on the fly based on the overlapping
984	/// offsets of the slices and the ability to split them. It will visit "empty"
985	/// partitions to cover regions of the alloca only accessed via split
986	/// slices.
987	iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
988	return make_range(x: partition_iterator (begin(), end()),
989	y: partition_iterator (end(), end()));
990	}
991
992	static Value *foldSelectInst(SelectInst &SI) {
993	// If the condition being selected on is a constant or the same value is
994	// being selected between, fold the select. Yes this does (rarely) happen
995	// early on.
996	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
997	return SI.getOperand(i_nocapture: `1` + CI->isZero());
998	if (SI.getOperand(i_nocapture: `1`) == SI.getOperand(i_nocapture: `2`))
999	return SI.getOperand(i_nocapture: `1`);
1000
1001	return nullptr;
1002	}
1003
1004	/// A helper that folds a PHI node or a select.
1005	static Value *foldPHINodeOrSelectInst(Instruction &I) {
1006	if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1007	// If PN merges together the same value, return that value.
1008	return PN->hasConstantValue();
1009	}
1010	return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1011	}
1012
1013	/// Builder for the alloca slices.
1014	///
1015	/// This class builds a set of alloca slices by recursively visiting the uses
1016	/// of an alloca and making a slice for each load and store at each offset.
1017	class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1018	friend class PtrUseVisitor<SliceBuilder>;
1019	friend class InstVisitor<SliceBuilder>;
1020
1021	using Base = PtrUseVisitor<SliceBuilder>;
1022
1023	const uint64_t AllocSize;
1024	AllocaSlices &AS;
1025
1026	SmallDenseMap<Instruction , unsigned*> MemTransferSliceMap;
1027	SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1028
1029	/// Set to de-duplicate dead instructions found in the use walk.
1030	SmallPtrSet<Instruction *, `4`> VisitedDeadInsts;
1031
1032	public:
1033	SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1034	: PtrUseVisitor<SliceBuilder>(DL),
1035	AllocSize(DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue()),
1036	AS(AS) {}
1037
1038	private:
1039	void markAsDead(Instruction &I) {
1040	if (VisitedDeadInsts.insert(Ptr: &I).second)
1041	AS.DeadUsers.push_back(Elt: &I);
1042	}
1043
1044	void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1045	bool IsSplittable = false) {
1046	// Completely skip uses which have a zero size or start either before or
1047	// past the end of the allocation.
1048	if (Size == `0` \|\| Offset.uge(RHS: AllocSize)) {
1049	LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1050	<< Offset
1051	<< " which has zero size or starts outside of the "
1052	<< AllocSize << " byte alloca:\n"
1053	<< " alloca: " << AS.AI << "\n"
1054	<< " use: " << I << "\n");
1055	return markAsDead(I);
1056	}
1057
1058	uint64_t BeginOffset = Offset.getZExtValue();
1059	uint64_t EndOffset = BeginOffset + Size;
1060
1061	// Clamp the end offset to the end of the allocation. Note that this is
1062	// formulated to handle even the case where "BeginOffset + Size" overflows.
1063	// This may appear superficially to be something we could ignore entirely,
1064	// but that is not so! There may be widened loads or PHI-node uses where
1065	// some instructions are dead but not others. We can't completely ignore
1066	// them, and so have to record at least the information here.
1067	assert(AllocSize >= BeginOffset); // Established above.
1068	if (Size > AllocSize - BeginOffset) {
1069	LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1070	<< Offset << " to remain within the " << AllocSize
1071	<< " byte alloca:\n"
1072	<< " alloca: " << AS.AI << "\n"
1073	<< " use: " << I << "\n");
1074	EndOffset = AllocSize;
1075	}
1076
1077	AS.Slices.push_back(Elt: Slice (BeginOffset, EndOffset, U, IsSplittable));
1078	}
1079
1080	void visitBitCastInst(BitCastInst &BC) {
1081	if (BC.use_empty())
1082	return markAsDead(I&: BC);
1083
1084	return Base::visitBitCastInst(BC);
1085	}
1086
1087	void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1088	if (ASC.use_empty())
1089	return markAsDead(I&: ASC);
1090
1091	return Base::visitAddrSpaceCastInst(ASC);
1092	}
1093
1094	void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1095	if (GEPI.use_empty())
1096	return markAsDead(I&: GEPI);
1097
1098	if (SROAStrictInbounds && GEPI.isInBounds()) {
1099	// FIXME: This is a manually un-factored variant of the basic code inside
1100	// of GEPs with checking of the inbounds invariant specified in the
1101	// langref in a very strict sense. If we ever want to enable
1102	// SROAStrictInbounds, this code should be factored cleanly into
1103	// PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
1104	// by writing out the code here where we have the underlying allocation
1105	// size readily available.
1106	APInt GEPOffset = Offset;
1107	const DataLayout &DL = GEPI.getModule()->getDataLayout();
1108	for (gep_type_iterator GTI = gep_type_begin(GEP: GEPI),
1109	GTE = gep_type_end(GEP: GEPI);
1110	GTI != GTE; ++GTI) {
1111	ConstantInt *OpC = dyn_cast<ConstantInt>(Val: GTI.getOperand());
1112	if (!OpC)
1113	break;
1114
1115	// Handle a struct index, which adds its field offset to the pointer.
1116	if (StructType *STy = GTI.getStructTypeOrNull()) {
1117	unsigned ElementIdx = OpC->getZExtValue();
1118	const StructLayout *SL = DL.getStructLayout(Ty: STy);
1119	GEPOffset +=
1120	APInt (Offset.getBitWidth(), SL->getElementOffset(Idx: ElementIdx));
1121	} else {
1122	// For array or vector indices, scale the index by the size of the
1123	// type.
1124	APInt Index = OpC->getValue().sextOrTrunc(width: Offset.getBitWidth());
1125	GEPOffset += Index * APInt (Offset.getBitWidth(),
1126	GTI.getSequentialElementStride(DL));
1127	}
1128
1129	// If this index has computed an intermediate pointer which is not
1130	// inbounds, then the result of the GEP is a poison value and we can
1131	// delete it and all uses.
1132	if (GEPOffset.ugt(RHS: AllocSize))
1133	return markAsDead(I&: GEPI);
1134	}
1135	}
1136
1137	return Base::visitGetElementPtrInst(GEPI);
1138	}
1139
1140	void handleLoadOrStore(Type Ty, Instruction &I, const* APInt &Offset,
1141	uint64_t Size, bool IsVolatile) {
1142	// We allow splitting of non-volatile loads and stores where the type is an
1143	// integer type. These may be used to implement 'memcpy' or other "transfer
1144	// of bits" patterns.
1145	bool IsSplittable =
1146	Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1147
1148	insertUse(I, Offset, Size, IsSplittable);
1149	}
1150
1151	void visitLoadInst(LoadInst &LI) {
1152	assert((!LI.isSimple() \|\| LI.getType()->isSingleValueType()) &&
1153	"All simple FCA loads should have been pre-split");
1154
1155	if (!IsOffsetKnown)
1156	return PI.setAborted(&LI);
1157
1158	TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1159	if (Size.isScalable())
1160	return PI.setAborted(&LI);
1161
1162	return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1163	IsVolatile: LI.isVolatile());
1164	}
1165
1166	void visitStoreInst(StoreInst &SI) {
1167	Value *ValOp = SI.getValueOperand();
1168	if (ValOp == *U)
1169	return PI.setEscapedAndAborted(&SI);
1170	if (!IsOffsetKnown)
1171	return PI.setAborted(&SI);
1172
1173	TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1174	if (StoreSize.isScalable())
1175	return PI.setAborted(&SI);
1176
1177	uint64_t Size = StoreSize.getFixedValue();
1178
1179	// If this memory access can be shown to statically* extend outside the*
1180	// bounds of the allocation, it's behavior is undefined, so simply
1181	// ignore it. Note that this is more strict than the generic clamping
1182	// behavior of insertUse. We also try to handle cases which might run the
1183	// risk of overflow.
1184	// FIXME: We should instead consider the pointer to have escaped if this
1185	// function is being instrumented for addressing bugs or race conditions.
1186	if (Size > AllocSize \|\| Offset.ugt(RHS: AllocSize - Size)) {
1187	LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1188	<< Offset << " which extends past the end of the "
1189	<< AllocSize << " byte alloca:\n"
1190	<< " alloca: " << AS.AI << "\n"
1191	<< " use: " << SI << "\n");
1192	return markAsDead(I&: SI);
1193	}
1194
1195	assert((!SI.isSimple() \|\| ValOp->getType()->isSingleValueType()) &&
1196	"All simple FCA stores should have been pre-split");
1197	handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1198	}
1199
1200	void visitMemSetInst(MemSetInst &II) {
1201	assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1202	ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1203	if ((Length && Length->getValue() == `0`) \|\|
1204	(IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1205	// Zero-length mem transfer intrinsics can be ignored entirely.
1206	return markAsDead(I&: II);
1207
1208	if (!IsOffsetKnown)
1209	return PI.setAborted(&II);
1210
1211	insertUse(I&: II, Offset,
1212	Size: Length ? Length->getLimitedValue()
1213	: AllocSize - Offset.getLimitedValue(),
1214	IsSplittable: (bool)Length);
1215	}
1216
1217	void visitMemTransferInst(MemTransferInst &II) {
1218	ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1219	if (Length && Length->getValue() == `0`)
1220	// Zero-length mem transfer intrinsics can be ignored entirely.
1221	return markAsDead(I&: II);
1222
1223	// Because we can visit these intrinsics twice, also check to see if the
1224	// first time marked this instruction as dead. If so, skip it.
1225	if (VisitedDeadInsts.count(Ptr: &II))
1226	return;
1227
1228	if (!IsOffsetKnown)
1229	return PI.setAborted(&II);
1230
1231	// This side of the transfer is completely out-of-bounds, and so we can
1232	// nuke the entire transfer. However, we also need to nuke the other side
1233	// if already added to our partitions.
1234	// FIXME: Yet another place we really should bypass this when
1235	// instrumenting for ASan.
1236	if (Offset.uge(RHS: AllocSize)) {
1237	SmallDenseMap<Instruction , unsigned*>::iterator MTPI =
1238	MemTransferSliceMap.find(Val: &II);
1239	if (MTPI != MemTransferSliceMap.end())
1240	AS.Slices [MTPI ->second].kill();
1241	return markAsDead(I&: II);
1242	}
1243
1244	uint64_t RawOffset = Offset.getLimitedValue();
1245	uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1246
1247	// Check for the special case where the same exact value is used for both
1248	// source and dest.
1249	if (U == II.getRawDest() && U == II.getRawSource()) {
1250	// For non-volatile transfers this is a no-op.
1251	if (!II.isVolatile())
1252	return markAsDead(I&: II);
1253
1254	return insertUse(I&: II, Offset, Size, /IsSplittable=/false);
1255	}
1256
1257	// If we have seen both source and destination for a mem transfer, then
1258	// they both point to the same alloca.
1259	bool Inserted;
1260	SmallDenseMap<Instruction , unsigned*>::iterator MTPI;
1261	std::tie(args&: MTPI, args&: Inserted) =
1262	MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1263	unsigned PrevIdx = MTPI ->second;
1264	if (!Inserted) {
1265	Slice &PrevP = AS.Slices [PrevIdx];
1266
1267	// Check if the begin offsets match and this is a non-volatile transfer.
1268	// In that case, we can completely elide the transfer.
1269	if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1270	PrevP.kill();
1271	return markAsDead(I&: II);
1272	}
1273
1274	// Otherwise we have an offset transfer within the same alloca. We can't
1275	// split those.
1276	PrevP.makeUnsplittable();
1277	}
1278
1279	// Insert the use now that we've fixed up the splittable nature.
1280	insertUse(I&: II, Offset, Size, /IsSplittable=/Inserted && Length);
1281
1282	// Check that we ended up with a valid index in the map.
1283	assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1284	"Map index doesn't point back to a slice with this user.");
1285	}
1286
1287	// Disable SRoA for any intrinsics except for lifetime invariants and
1288	// invariant group.
1289	// FIXME: What about debug intrinsics? This matches old behavior, but
1290	// doesn't make sense.
1291	void visitIntrinsicInst(IntrinsicInst &II) {
1292	if (II.isDroppable()) {
1293	AS.DeadUseIfPromotable.push_back(Elt: U);
1294	return;
1295	}
1296
1297	if (!IsOffsetKnown)
1298	return PI.setAborted(&II);
1299
1300	if (II.isLifetimeStartOrEnd()) {
1301	ConstantInt *Length = cast<ConstantInt>(Val: II.getArgOperand(i: `0`));
1302	uint64_t Size = std::min(a: AllocSize - Offset.getLimitedValue(),
1303	b: Length->getLimitedValue());
1304	insertUse(I&: II, Offset, Size, IsSplittable: true);
1305	return;
1306	}
1307
1308	if (II.isLaunderOrStripInvariantGroup()) {
1309	insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1310	enqueueUsers(I&: II);
1311	return;
1312	}
1313
1314	Base::visitIntrinsicInst(II);
1315	}
1316
1317	Instruction hasUnsafePHIOrSelectUse(Instruction Root, uint64_t &Size) {
1318	// We consider any PHI or select that results in a direct load or store of
1319	// the same offset to be a viable use for slicing purposes. These uses
1320	// are considered unsplittable and the size is the maximum loaded or stored
1321	// size.
1322	SmallPtrSet<Instruction *, `4`> Visited;
1323	SmallVector<std::pair<Instruction , Instruction >, `4`> Uses;
1324	Visited.insert(Ptr: Root);
1325	Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1326	const DataLayout &DL = Root->getModule()->getDataLayout();
1327	// If there are no loads or stores, the access is dead. We mark that as
1328	// a size zero access.
1329	Size = `0`;
1330	do {
1331	Instruction I, UsedI;
1332	std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1333
1334	if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1335	TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1336	if (LoadSize.isScalable()) {
1337	PI.setAborted(LI);
1338	return nullptr;
1339	}
1340	Size = std::max(a: Size, b: LoadSize.getFixedValue());
1341	continue;
1342	}
1343	if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1344	Value *Op = SI->getOperand(i_nocapture: `0`);
1345	if (Op == UsedI)
1346	return SI;
1347	TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1348	if (StoreSize.isScalable()) {
1349	PI.setAborted(SI);
1350	return nullptr;
1351	}
1352	Size = std::max(a: Size, b: StoreSize.getFixedValue());
1353	continue;
1354	}
1355
1356	if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1357	if (!GEP->hasAllZeroIndices())
1358	return GEP;
1359	} else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1360	!isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1361	return I;
1362	}
1363
1364	for (User *U : I->users())
1365	if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1366	Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1367	} while (!Uses.empty());
1368
1369	return nullptr;
1370	}
1371
1372	void visitPHINodeOrSelectInst(Instruction &I) {
1373	assert(isa<PHINode>(I) \|\| isa<SelectInst>(I));
1374	if (I.use_empty())
1375	return markAsDead(I);
1376
1377	// If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1378	// instructions in this BB, which may be required during rewriting. Bail out
1379	// on these cases.
1380	if (isa<PHINode>(Val: I) &&
1381	I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1382	return PI.setAborted(&I);
1383
1384	// TODO: We could use simplifyInstruction here to fold PHINodes and
1385	// SelectInsts. However, doing so requires to change the current
1386	// dead-operand-tracking mechanism. For instance, suppose neither loading
1387	// from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1388	// trap either. However, if we simply replace %U with undef using the
1389	// current dead-operand-tracking mechanism, "load (select undef, undef,
1390	// %other)" may trap because the select may return the first operand
1391	// "undef".
1392	if (Value *Result = foldPHINodeOrSelectInst(I)) {
1393	if (Result == *U)
1394	// If the result of the constant fold will be the pointer, recurse
1395	// through the PHI/select as if we had RAUW'ed it.
1396	enqueueUsers(I);
1397	else
1398	// Otherwise the operand to the PHI/select is dead, and we can replace
1399	// it with poison.
1400	AS.DeadOperands.push_back(Elt: U);
1401
1402	return;
1403	}
1404
1405	if (!IsOffsetKnown)
1406	return PI.setAborted(&I);
1407
1408	// See if we already have computed info on this node.
1409	uint64_t &Size = PHIOrSelectSizes [&I];
1410	if (!Size) {
1411	// This is a new PHI/Select, check for an unsafe use of it.
1412	if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1413	return PI.setAborted(UnsafeI);
1414	}
1415
1416	// For PHI and select operands outside the alloca, we can't nuke the entire
1417	// phi or select -- the other side might still be relevant, so we special
1418	// case them here and use a separate structure to track the operands
1419	// themselves which should be replaced with poison.
1420	// FIXME: This should instead be escaped in the event we're instrumenting
1421	// for address sanitization.
1422	if (Offset.uge(RHS: AllocSize)) {
1423	AS.DeadOperands.push_back(Elt: U);
1424	return;
1425	}
1426
1427	insertUse(I, Offset, Size);
1428	}
1429
1430	void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1431
1432	void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1433
1434	/// Disable SROA entirely if there are unhandled users of the alloca.
1435	void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1436	};
1437
1438	AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1439	:
1440	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1441	AI(AI),
1442	#endif
1443	PointerEscapingInstr(nullptr) {
1444	SliceBuilder PB(DL, AI, *this);
1445	SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1446	if (PtrI.isEscaped() \|\| PtrI.isAborted()) {
1447	// FIXME: We should sink the escape vs. abort info into the caller nicely,
1448	// possibly by just storing the PtrInfo in the AllocaSlices.
1449	PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1450	: PtrI.getAbortingInst();
1451	assert(PointerEscapingInstr && "Did not track a bad instruction");
1452	return;
1453	}
1454
1455	llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1456
1457	// Sort the uses. This arranges for the offsets to be in ascending order,
1458	// and the sizes to be in descending order.
1459	llvm::stable_sort(Range&: Slices);
1460	}
1461
1462	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1463
1464	void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1465	StringRef Indent) const {
1466	printSlice(OS, I, Indent);
1467	OS << "\n";
1468	printUse(OS, I, Indent);
1469	}
1470
1471	void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1472	StringRef Indent) const {
1473	OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1474	<< " slice #" << (I - begin())
1475	<< (I->isSplittable() ? " (splittable)" : "");
1476	}
1477
1478	void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1479	StringRef Indent) const {
1480	OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1481	}
1482
1483	void AllocaSlices::print(raw_ostream &OS) const {
1484	if (PointerEscapingInstr) {
1485	OS << "Can't analyze slices for alloca: " << AI << "\n"
1486	<< " A pointer to this alloca escaped by:\n"
1487	<< " " << *PointerEscapingInstr << "\n";
1488	return;
1489	}
1490
1491	OS << "Slices of alloca: " << AI << "\n";
1492	for (const_iterator I = begin(), E = end(); I != E; ++I)
1493	print(OS, I);
1494	}
1495
1496	LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1497	print(OS&: dbgs(), I);
1498	}
1499	LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(OS&: dbgs()); }
1500
1501	#endif // !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
1502
1503	/// Walk the range of a partitioning looking for a common type to cover this
1504	/// sequence of slices.
1505	static std::pair<Type , IntegerType >
1506	findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1507	uint64_t EndOffset) {
1508	Type Ty = nullptr*;
1509	bool TyIsCommon = true;
1510	IntegerType ITy = nullptr*;
1511
1512	// Note that we need to look at every* alloca slice's Use to ensure we*
1513	// always get consistent results regardless of the order of slices.
1514	for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1515	Use *U = I->getUse();
1516	if (isa<IntrinsicInst>(Val: *U->getUser()))
1517	continue;
1518	if (I->beginOffset() != B->beginOffset() \|\| I->endOffset() != EndOffset)
1519	continue;
1520
1521	Type UserTy = nullptr*;
1522	if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1523	UserTy = LI->getType();
1524	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1525	UserTy = SI->getValueOperand()->getType();
1526	}
1527
1528	if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1529	// If the type is larger than the partition, skip it. We only encounter
1530	// this for split integer operations where we want to use the type of the
1531	// entity causing the split. Also skip if the type is not a byte width
1532	// multiple.
1533	if (UserITy->getBitWidth() % `8` != `0` \|\|
1534	UserITy->getBitWidth() / `8` > (EndOffset - B->beginOffset()))
1535	continue;
1536
1537	// Track the largest bitwidth integer type used in this way in case there
1538	// is no common type.
1539	if (!ITy \|\| ITy->getBitWidth() < UserITy->getBitWidth())
1540	ITy = UserITy;
1541	}
1542
1543	// To avoid depending on the order of slices, Ty and TyIsCommon must not
1544	// depend on types skipped above.
1545	if (!UserTy \|\| (Ty && Ty != UserTy))
1546	TyIsCommon = false; // Give up on anything but an iN type.
1547	else
1548	Ty = UserTy;
1549	}
1550
1551	return {TyIsCommon ? Ty : nullptr, ITy};
1552	}
1553
1554	/// PHI instructions that use an alloca and are subsequently loaded can be
1555	/// rewritten to load both input pointers in the pred blocks and then PHI the
1556	/// results, allowing the load of the alloca to be promoted.
1557	/// From this:
1558	/// %P2 = phi [i32 %Alloca, i32* %Other]*
1559	/// %V = load i32 %P2*
1560	/// to:
1561	/// %V1 = load i32 %Alloca -> will be mem2reg'd*
1562	/// ...
1563	/// %V2 = load i32 %Other*
1564	/// ...
1565	/// %V = phi [i32 %V1, i32 %V2]
1566	///
1567	/// We can do this to a select if its only uses are loads and if the operands
1568	/// to the select can be loaded unconditionally.
1569	///
1570	/// FIXME: This should be hoisted into a generic utility, likely in
1571	/// Transforms/Util/Local.h
1572	static bool isSafePHIToSpeculate(PHINode &PN) {
1573	const DataLayout &DL = PN.getModule()->getDataLayout();
1574
1575	// For now, we can only do this promotion if the load is in the same block
1576	// as the PHI, and if there are no stores between the phi and load.
1577	// TODO: Allow recursive phi users.
1578	// TODO: Allow stores.
1579	BasicBlock *BB = PN.getParent();
1580	Align MaxAlign;
1581	uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1582	Type LoadType = nullptr*;
1583	for (User *U : PN.users()) {
1584	LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1585	if (!LI \|\| !LI->isSimple())
1586	return false;
1587
1588	// For now we only allow loads in the same block as the PHI. This is
1589	// a common case that happens when instcombine merges two loads through
1590	// a PHI.
1591	if (LI->getParent() != BB)
1592	return false;
1593
1594	if (LoadType) {
1595	if (LoadType != LI->getType())
1596	return false;
1597	} else {
1598	LoadType = LI->getType();
1599	}
1600
1601	// Ensure that there are no instructions between the PHI and the load that
1602	// could store.
1603	for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1604	if (BBI ->mayWriteToMemory())
1605	return false;
1606
1607	MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1608	}
1609
1610	if (!LoadType)
1611	return false;
1612
1613	APInt LoadSize =
1614	APInt (APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1615
1616	// We can only transform this if it is safe to push the loads into the
1617	// predecessor blocks. The only thing to watch out for is that we can't put
1618	// a possibly trapping load in the predecessor if it is a critical edge.
1619	for (unsigned Idx = `0`, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1620	Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1621	Value *InVal = PN.getIncomingValue(i: Idx);
1622
1623	// If the value is produced by the terminator of the predecessor (an
1624	// invoke) or it has side-effects, there is no valid place to put a load
1625	// in the predecessor.
1626	if (TI == InVal \|\| TI->mayHaveSideEffects())
1627	return false;
1628
1629	// If the predecessor has a single successor, then the edge isn't
1630	// critical.
1631	if (TI->getNumSuccessors() == `1`)
1632	continue;
1633
1634	// If this pointer is always safe to load, or if we can prove that there
1635	// is already a load in the block, then we can move the load to the pred
1636	// block.
1637	if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size&: LoadSize, DL, ScanFrom: TI))
1638	continue;
1639
1640	return false;
1641	}
1642
1643	return true;
1644	}
1645
1646	static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1647	LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1648
1649	LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1650	Type *LoadTy = SomeLoad->getType();
1651	IRB.SetInsertPoint(&PN);
1652	PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1653	Name: PN.getName() + ".sroa.speculated");
1654
1655	// Get the AA tags and alignment to use from one of the loads. It does not
1656	// matter which one we get and if any differ.
1657	AAMDNodes AATags = SomeLoad->getAAMetadata();
1658	Align Alignment = SomeLoad->getAlign();
1659
1660	// Rewrite all loads of the PN to use the new PHI.
1661	while (!PN.use_empty()) {
1662	LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1663	LI->replaceAllUsesWith(V: NewPN);
1664	LI->eraseFromParent();
1665	}
1666
1667	// Inject loads into all of the pred blocks.
1668	DenseMap<BasicBlock , Value > InjectedLoads;
1669	for (unsigned Idx = `0`, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1670	BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1671	Value *InVal = PN.getIncomingValue(i: Idx);
1672
1673	// A PHI node is allowed to have multiple (duplicated) entries for the same
1674	// basic block, as long as the value is the same. So if we already injected
1675	// a load in the predecessor, then we should reuse the same load for all
1676	// duplicated entries.
1677	if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1678	NewPN->addIncoming(V, BB: Pred);
1679	continue;
1680	}
1681
1682	Instruction *TI = Pred->getTerminator();
1683	IRB.SetInsertPoint(TI);
1684
1685	LoadInst *Load = IRB.CreateAlignedLoad(
1686	Ty: LoadTy, Ptr: InVal, Align: Alignment,
1687	Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1688	++NumLoadsSpeculated;
1689	if (AATags)
1690	Load->setAAMetadata(AATags);
1691	NewPN->addIncoming(V: Load, BB: Pred);
1692	InjectedLoads [Pred] = Load;
1693	}
1694
1695	LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1696	PN.eraseFromParent();
1697	}
1698
1699	SelectHandSpeculativity &
1700	SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1701	if (isTrueVal)
1702	Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1703	else
1704	Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1705	return *this;
1706	}
1707
1708	bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1709	return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1710	: Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1711	}
1712
1713	bool SelectHandSpeculativity::areAllSpeculatable() const {
1714	return isSpeculatable(/isTrueVal=/true) &&
1715	isSpeculatable(/isTrueVal=/false);
1716	}
1717
1718	bool SelectHandSpeculativity::areAnySpeculatable() const {
1719	return isSpeculatable(/isTrueVal=/true) \|\|
1720	isSpeculatable(/isTrueVal=/false);
1721	}
1722	bool SelectHandSpeculativity::areNoneSpeculatable() const {
1723	return !areAnySpeculatable();
1724	}
1725
1726	static SelectHandSpeculativity
1727	isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1728	assert(LI.isSimple() && "Only for simple loads");
1729	SelectHandSpeculativity Spec;
1730
1731	const DataLayout &DL = SI.getModule()->getDataLayout();
1732	for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1733	if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1734	ScanFrom: &LI))
1735	Spec.setAsSpeculatable(/isTrueVal=/Value == SI.getTrueValue());
1736	else if (PreserveCFG)
1737	return Spec;
1738
1739	return Spec;
1740	}
1741
1742	std::optional<RewriteableMemOps>
1743	SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1744	RewriteableMemOps Ops;
1745
1746	for (User *U : SI.users()) {
1747	if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1748	U = *BC->user_begin();
1749
1750	if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1751	// Note that atomic stores can be transformed; atomic semantics do not
1752	// have any meaning for a local alloca. Stores are not speculatable,
1753	// however, so if we can't turn it into a predicated store, we are done.
1754	if (Store->isVolatile() \|\| PreserveCFG)
1755	return {}; // Give up on this `select`.
1756	Ops.emplace_back(Args&: Store);
1757	continue;
1758	}
1759
1760	auto *LI = dyn_cast<LoadInst>(Val: U);
1761
1762	// Note that atomic loads can be transformed;
1763	// atomic semantics do not have any meaning for a local alloca.
1764	if (!LI \|\| LI->isVolatile())
1765	return {}; // Give up on this `select`.
1766
1767	PossiblySpeculatableLoad Load(LI);
1768	if (!LI->isSimple()) {
1769	// If the `load` is not simple, we can't speculatively execute it,
1770	// but we could handle this via a CFG modification. But can we?
1771	if (PreserveCFG)
1772	return {}; // Give up on this `select`.
1773	Ops.emplace_back(Args&: Load);
1774	continue;
1775	}
1776
1777	SelectHandSpeculativity Spec =
1778	isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1779	if (PreserveCFG && !Spec.areAllSpeculatable())
1780	return {}; // Give up on this `select`.
1781
1782	Load.setInt(Spec);
1783	Ops.emplace_back(Args&: Load);
1784	}
1785
1786	return Ops;
1787	}
1788
1789	static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1790	IRBuilderTy &IRB) {
1791	LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1792
1793	Value *TV = SI.getTrueValue();
1794	Value *FV = SI.getFalseValue();
1795	// Replace the given load of the select with a select of two loads.
1796
1797	assert(LI.isSimple() && "We only speculate simple loads");
1798
1799	IRB.SetInsertPoint(&LI);
1800
1801	LoadInst *TL =
1802	IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1803	Name: LI.getName() + ".sroa.speculate.load.true");
1804	LoadInst *FL =
1805	IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1806	Name: LI.getName() + ".sroa.speculate.load.false");
1807	NumLoadsSpeculated += `2`;
1808
1809	// Transfer alignment and AA info if present.
1810	TL->setAlignment(LI.getAlign());
1811	FL->setAlignment(LI.getAlign());
1812
1813	AAMDNodes Tags = LI.getAAMetadata();
1814	if (Tags) {
1815	TL->setAAMetadata(Tags);
1816	FL->setAAMetadata(Tags);
1817	}
1818
1819	Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1820	Name: LI.getName() + ".sroa.speculated");
1821
1822	LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1823	LI.replaceAllUsesWith(V);
1824	}
1825
1826	template <typename T>
1827	static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1828	SelectHandSpeculativity Spec,
1829	DomTreeUpdater &DTU) {
1830	assert((isa<LoadInst>(I) \|\| isa<StoreInst>(I)) && "Only for load and store!");
1831	LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1832	BasicBlock *Head = I.getParent();
1833	Instruction ThenTerm = nullptr*;
1834	Instruction ElseTerm = nullptr*;
1835	if (Spec.areNoneSpeculatable())
1836	SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1837	SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1838	else {
1839	SplitBlockAndInsertIfThen(SI.getCondition(), &I, /Unreachable=/false,
1840	SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1841	/LI=/nullptr, /ThenBlock=/nullptr);
1842	if (Spec.isSpeculatable(/isTrueVal=/true))
1843	cast<BranchInst>(Val: Head->getTerminator())->swapSuccessors();
1844	}
1845	auto *HeadBI = cast<BranchInst>(Val: Head->getTerminator());
1846	Spec = {}; // Do not use `Spec` beyond this point.
1847	BasicBlock *Tail = I.getParent();
1848	Tail->setName(Head->getName() + ".cont");
1849	PHINode *PN;
1850	if (isa<LoadInst>(I))
1851	PN = PHINode::Create(I.getType(), `2`, "", I.getIterator());
1852	for (BasicBlock *SuccBB : successors(BB: Head)) {
1853	bool IsThen = SuccBB == HeadBI->getSuccessor(i: `0`);
1854	int SuccIdx = IsThen ? `0` : `1`;
1855	auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1856	auto &CondMemOp = cast<T>(*I.clone());
1857	if (NewMemOpBB != Head) {
1858	NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1859	if (isa<LoadInst>(I))
1860	++NumLoadsPredicated;
1861	else
1862	++NumStoresPredicated;
1863	} else {
1864	CondMemOp.dropUBImplyingAttrsAndMetadata();
1865	++NumLoadsSpeculated;
1866	}
1867	CondMemOp.insertBefore(NewMemOpBB->getTerminator());
1868	Value *Ptr = SI.getOperand(i_nocapture: `1` + SuccIdx);
1869	CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1870	if (isa<LoadInst>(I)) {
1871	CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1872	PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1873	} else
1874	LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1875	}
1876	if (isa<LoadInst>(I)) {
1877	PN->takeName(V: &I);
1878	LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1879	I.replaceAllUsesWith(PN);
1880	}
1881	}
1882
1883	static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1884	SelectHandSpeculativity Spec,
1885	DomTreeUpdater &DTU) {
1886	if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1887	rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1888	else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1889	rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1890	else
1891	llvm_unreachable_internal(msg: "Only for load and store.");
1892	}
1893
1894	static bool rewriteSelectInstMemOps(SelectInst &SI,
1895	const RewriteableMemOps &Ops,
1896	IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1897	bool CFGChanged = false;
1898	LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1899
1900	for (const RewriteableMemOp &Op : Ops) {
1901	SelectHandSpeculativity Spec;
1902	Instruction *I;
1903	if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1904	I = *US;
1905	} else {
1906	auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1907	I = PSL.getPointer();
1908	Spec = PSL.getInt();
1909	}
1910	if (Spec.areAllSpeculatable()) {
1911	speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1912	} else {
1913	assert(DTU && "Should not get here when not allowed to modify the CFG!");
1914	rewriteMemOpOfSelect(SelInst&: SI, I&: I, Spec, DTU&: DTU);
1915	CFGChanged = true;
1916	}
1917	I->eraseFromParent();
1918	}
1919
1920	for (User *U : make_early_inc_range(Range: SI.users()))
1921	cast<BitCastInst>(Val: U)->eraseFromParent();
1922	SI.eraseFromParent();
1923	return CFGChanged;
1924	}
1925
1926	/// Compute an adjusted pointer from Ptr by Offset bytes where the
1927	/// resulting pointer has PointerTy.
1928	static Value getAdjustedPtr(IRBuilderTy &IRB, const* DataLayout &DL, Value *Ptr,
1929	APInt Offset, Type *PointerTy,
1930	const Twine &NamePrefix) {
1931	if (Offset != `0`)
1932	Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1933	Name: NamePrefix + "sroa_idx");
1934	return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1935	Name: NamePrefix + "sroa_cast");
1936	}
1937
1938	/// Compute the adjusted alignment for a load or store from an offset.
1939	static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1940	return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1941	}
1942
1943	/// Test whether we can convert a value from the old to the new type.
1944	///
1945	/// This predicate should be used to guard calls to convertValue in order to
1946	/// ensure that we only try to convert viable values. The strategy is that we
1947	/// will peel off single element struct and array wrappings to get to an
1948	/// underlying value, and convert that value.
1949	static bool canConvertValue(const DataLayout &DL, Type OldTy, Type NewTy) {
1950	if (OldTy == NewTy)
1951	return true;
1952
1953	// For integer types, we can't handle any bit-width differences. This would
1954	// break both vector conversions with extension and introduce endianness
1955	// issues when in conjunction with loads and stores.
1956	if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1957	assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1958	cast<IntegerType>(NewTy)->getBitWidth() &&
1959	"We can't have the same bitwidth for different int types");
1960	return false;
1961	}
1962
1963	if (DL.getTypeSizeInBits(Ty: NewTy).getFixedValue() !=
1964	DL.getTypeSizeInBits(Ty: OldTy).getFixedValue())
1965	return false;
1966	if (!NewTy->isSingleValueType() \|\| !OldTy->isSingleValueType())
1967	return false;
1968
1969	// We can convert pointers to integers and vice-versa. Same for vectors
1970	// of pointers and integers.
1971	OldTy = OldTy->getScalarType();
1972	NewTy = NewTy->getScalarType();
1973	if (NewTy->isPointerTy() \|\| OldTy->isPointerTy()) {
1974	if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1975	unsigned OldAS = OldTy->getPointerAddressSpace();
1976	unsigned NewAS = NewTy->getPointerAddressSpace();
1977	// Convert pointers if they are pointers from the same address space or
1978	// different integral (not non-integral) address spaces with the same
1979	// pointer size.
1980	return OldAS == NewAS \|\|
1981	(!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
1982	!DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
1983	DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
1984	}
1985
1986	// We can convert integers to integral pointers, but not to non-integral
1987	// pointers.
1988	if (OldTy->isIntegerTy())
1989	return !DL.isNonIntegralPointerType(Ty: NewTy);
1990
1991	// We can convert integral pointers to integers, but non-integral pointers
1992	// need to remain pointers.
1993	if (!DL.isNonIntegralPointerType(Ty: OldTy))
1994	return NewTy->isIntegerTy();
1995
1996	return false;
1997	}
1998
1999	if (OldTy->isTargetExtTy() \|\| NewTy->isTargetExtTy())
2000	return false;
2001
2002	return true;
2003	}
2004
2005	/// Generic routine to convert an SSA value to a value of a different
2006	/// type.
2007	///
2008	/// This will try various different casting techniques, such as bitcasts,
2009	/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2010	/// two types for viability with this routine.
2011	static Value convertValue(const* DataLayout &DL, IRBuilderTy &IRB, Value *V,
2012	Type *NewTy) {
2013	Type *OldTy = V->getType();
2014	assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
2015
2016	if (OldTy == NewTy)
2017	return V;
2018
2019	assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2020	"Integer types must be the exact same to convert.");
2021
2022	// See if we need inttoptr for this type pair. May require additional bitcast.
2023	if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2024	// Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2025	// Expand i128 to <2 x i8> --> i128 to <2 x i64> to <2 x i8>
2026	// Expand <4 x i32> to <2 x i8> --> <4 x i32> to <2 x i64> to <2 x i8>
2027	// Directly handle i64 to i8*
2028	return IRB.CreateIntToPtr(V: IRB.CreateBitCast(V, DestTy: DL.getIntPtrType(NewTy)),
2029	DestTy: NewTy);
2030	}
2031
2032	// See if we need ptrtoint for this type pair. May require additional bitcast.
2033	if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2034	// Expand <2 x i8> to i128 --> <2 x i8> to <2 x i64> to i128
2035	// Expand i8 to <2 x i32> --> i8* to i64 to <2 x i32>*
2036	// Expand <2 x i8> to <4 x i32> --> <2 x i8> to <2 x i64> to <4 x i32>
2037	// Expand i8 to i64 --> i8* to i64 to i64*
2038	return IRB.CreateBitCast(V: IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2039	DestTy: NewTy);
2040	}
2041
2042	if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2043	unsigned OldAS = OldTy->getPointerAddressSpace();
2044	unsigned NewAS = NewTy->getPointerAddressSpace();
2045	// To convert pointers with different address spaces (they are already
2046	// checked convertible, i.e. they have the same pointer size), so far we
2047	// cannot use `bitcast` (which has restrict on the same address space) or
2048	// `addrspacecast` (which is not always no-op casting). Instead, use a pair
2049	// of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2050	// size.
2051	if (OldAS != NewAS) {
2052	assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2053	return IRB.CreateIntToPtr(V: IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2054	DestTy: NewTy);
2055	}
2056	}
2057
2058	return IRB.CreateBitCast(V, DestTy: NewTy);
2059	}
2060
2061	/// Test whether the given slice use can be promoted to a vector.
2062	///
2063	/// This function is called to test each entry in a partition which is slated
2064	/// for a single slice.
2065	static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2066	VectorType *Ty,
2067	uint64_t ElementSize,
2068	const DataLayout &DL) {
2069	// First validate the slice offsets.
2070	uint64_t BeginOffset =
2071	std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2072	uint64_t BeginIndex = BeginOffset / ElementSize;
2073	if (BeginIndex * ElementSize != BeginOffset \|\|
2074	BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2075	return false;
2076	uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2077	uint64_t EndIndex = EndOffset / ElementSize;
2078	if (EndIndex * ElementSize != EndOffset \|\|
2079	EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2080	return false;
2081
2082	assert(EndIndex > BeginIndex && "Empty vector!");
2083	uint64_t NumElements = EndIndex - BeginIndex;
2084	Type *SliceTy = (NumElements == `1`)
2085	? Ty->getElementType()
2086	: FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2087
2088	Type *SplitIntTy =
2089	Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * `8`);
2090
2091	Use *U = S.getUse();
2092
2093	if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2094	if (MI->isVolatile())
2095	return false;
2096	if (!S.isSplittable())
2097	return false; // Skip any unsplittable intrinsics.
2098	} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2099	if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2100	return false;
2101	} else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2102	if (LI->isVolatile())
2103	return false;
2104	Type *LTy = LI->getType();
2105	// Disable vector promotion when there are loads or stores of an FCA.
2106	if (LTy->isStructTy())
2107	return false;
2108	if (P.beginOffset() > S.beginOffset() \|\| P.endOffset() < S.endOffset()) {
2109	assert(LTy->isIntegerTy());
2110	LTy = SplitIntTy;
2111	}
2112	if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy))
2113	return false;
2114	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2115	if (SI->isVolatile())
2116	return false;
2117	Type *STy = SI->getValueOperand()->getType();
2118	// Disable vector promotion when there are loads or stores of an FCA.
2119	if (STy->isStructTy())
2120	return false;
2121	if (P.beginOffset() > S.beginOffset() \|\| P.endOffset() < S.endOffset()) {
2122	assert(STy->isIntegerTy());
2123	STy = SplitIntTy;
2124	}
2125	if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy))
2126	return false;
2127	} else {
2128	return false;
2129	}
2130
2131	return true;
2132	}
2133
2134	/// Test whether a vector type is viable for promotion.
2135	///
2136	/// This implements the necessary checking for \c checkVectorTypesForPromotion
2137	/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2138	/// given VectorType.
2139	static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2140	const DataLayout &DL) {
2141	uint64_t ElementSize =
2142	DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2143
2144	// While the definition of LLVM vectors is bitpacked, we don't support sizes
2145	// that aren't byte sized.
2146	if (ElementSize % `8`)
2147	return false;
2148	assert((DL.getTypeSizeInBits(VTy).getFixedValue() % `8`) == `0` &&
2149	"vector size not a multiple of element size?");
2150	ElementSize /= `8`;
2151
2152	for (const Slice &S : P)
2153	if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL))
2154	return false;
2155
2156	for (const Slice *S : P.splitSliceTails())
2157	if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL))
2158	return false;
2159
2160	return true;
2161	}
2162
2163	/// Test whether any vector type in \p CandidateTys is viable for promotion.
2164	///
2165	/// This implements the necessary checking for \c isVectorPromotionViable over
2166	/// all slices of the alloca for the given VectorType.
2167	static VectorType *
2168	checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2169	SmallVectorImpl<VectorType *> &CandidateTys,
2170	bool HaveCommonEltTy, Type *CommonEltTy,
2171	bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2172	VectorType *CommonVecPtrTy) {
2173	// If we didn't find a vector type, nothing to do here.
2174	if (CandidateTys.empty())
2175	return nullptr;
2176
2177	// Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2178	// then we should choose it, not some other alternative.
2179	// But, we can't perform a no-op pointer address space change via bitcast,
2180	// so if we didn't have a common pointer element type, bail.
2181	if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2182	return nullptr;
2183
2184	// Try to pick the "best" element type out of the choices.
2185	if (!HaveCommonEltTy && HaveVecPtrTy) {
2186	// If there was a pointer element type, there's really only one choice.
2187	CandidateTys.clear();
2188	CandidateTys.push_back(Elt: CommonVecPtrTy);
2189	} else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2190	// Integer-ify vector types.
2191	for (VectorType *&VTy : CandidateTys) {
2192	if (!VTy->getElementType()->isIntegerTy())
2193	VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2194	C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2195	}
2196
2197	// Rank the remaining candidate vector types. This is easy because we know
2198	// they're all integer vectors. We sort by ascending number of elements.
2199	auto RankVectorTypesComp = [&DL](VectorType RHSTy, VectorType LHSTy) {
2200	(void)DL;
2201	assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2202	DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2203	"Cannot have vector types of different sizes!");
2204	assert(RHSTy->getElementType()->isIntegerTy() &&
2205	"All non-integer types eliminated!");
2206	assert(LHSTy->getElementType()->isIntegerTy() &&
2207	"All non-integer types eliminated!");
2208	return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2209	cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2210	};
2211	auto RankVectorTypesEq = [&DL](VectorType RHSTy, VectorType LHSTy) {
2212	(void)DL;
2213	assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2214	DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2215	"Cannot have vector types of different sizes!");
2216	assert(RHSTy->getElementType()->isIntegerTy() &&
2217	"All non-integer types eliminated!");
2218	assert(LHSTy->getElementType()->isIntegerTy() &&
2219	"All non-integer types eliminated!");
2220	return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2221	cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2222	};
2223	llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2224	CandidateTys.erase(CS: std::unique(first: CandidateTys.begin(), last: CandidateTys.end(),
2225	binary_pred: RankVectorTypesEq),
2226	CE: CandidateTys.end());
2227	} else {
2228	// The only way to have the same element type in every vector type is to
2229	// have the same vector type. Check that and remove all but one.
2230	#ifndef NDEBUG
2231	for (VectorType *VTy : CandidateTys) {
2232	assert(VTy->getElementType() == CommonEltTy &&
2233	"Unaccounted for element type!");
2234	assert(VTy == CandidateTys[`0`] &&
2235	"Different vector types with the same element type!");
2236	}
2237	#endif
2238	CandidateTys.resize(N: `1`);
2239	}
2240
2241	// FIXME: hack. Do we have a named constant for this?
2242	// SDAG SDNode can't have more than 65535 operands.
2243	llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2244	return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2245	std::numeric_limits<unsigned short>::max();
2246	});
2247
2248	for (VectorType *VTy : CandidateTys)
2249	if (checkVectorTypeForPromotion(P, VTy, DL))
2250	return VTy;
2251
2252	return nullptr;
2253	}
2254
2255	static VectorType *createAndCheckVectorTypesForPromotion(
2256	SetVector<Type > &OtherTys, ArrayRef<VectorType > CandidateTysCopy,
2257	function_ref<void(Type *)> CheckCandidateType, Partition &P,
2258	const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2259	bool &HaveCommonEltTy, Type &CommonEltTy, bool* &HaveVecPtrTy,
2260	bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) {
2261	[[maybe_unused]] VectorType *OriginalElt =
2262	CandidateTysCopy.size() ? CandidateTysCopy [`0`] : nullptr;
2263	// Consider additional vector types where the element type size is a
2264	// multiple of load/store element size.
2265	for (Type *Ty : OtherTys) {
2266	if (!VectorType::isValidElementType(ElemTy: Ty))
2267	continue;
2268	unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2269	// Make a copy of CandidateTys and iterate through it, because we
2270	// might append to CandidateTys in the loop.
2271	for (VectorType *const VTy : CandidateTysCopy) {
2272	// The elements in the copy should remain invariant throughout the loop
2273	assert(CandidateTysCopy[`0`] == OriginalElt && "Different Element");
2274	unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2275	unsigned ElementSize =
2276	DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2277	if (TypeSize != VectorSize && TypeSize != ElementSize &&
2278	VectorSize % TypeSize == `0`) {
2279	VectorType NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false*);
2280	CheckCandidateType (NewVTy);
2281	}
2282	}
2283	}
2284
2285	return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
2286	CommonEltTy, HaveVecPtrTy,
2287	HaveCommonVecPtrTy, CommonVecPtrTy);
2288	}
2289
2290	/// Test whether the given alloca partitioning and range of slices can be
2291	/// promoted to a vector.
2292	///
2293	/// This is a quick test to check whether we can rewrite a particular alloca
2294	/// partition (and its newly formed alloca) into a vector alloca with only
2295	/// whole-vector loads and stores such that it could be promoted to a vector
2296	/// SSA value. We only can ensure this for a limited set of operations, and we
2297	/// don't want to do the rewrites unless we are confident that the result will
2298	/// be promotable, so we have an early test here.
2299	static VectorType isVectorPromotionViable(Partition &P, const* DataLayout &DL) {
2300	// Collect the candidate types for vector-based promotion. Also track whether
2301	// we have different element types.
2302	SmallVector<VectorType *, `4`> CandidateTys;
2303	SetVector<Type *> LoadStoreTys;
2304	SetVector<Type *> DeferredTys;
2305	Type CommonEltTy = nullptr*;
2306	VectorType CommonVecPtrTy = nullptr*;
2307	bool HaveVecPtrTy = false;
2308	bool HaveCommonEltTy = true;
2309	bool HaveCommonVecPtrTy = true;
2310	auto CheckCandidateType = [&](Type *Ty) {
2311	if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
2312	// Return if bitcast to vectors is different for total size in bits.
2313	if (!CandidateTys.empty()) {
2314	VectorType *V = CandidateTys [`0`];
2315	if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2316	DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2317	CandidateTys.clear();
2318	return;
2319	}
2320	}
2321	CandidateTys.push_back(Elt: VTy);
2322	Type *EltTy = VTy->getElementType();
2323
2324	if (!CommonEltTy)
2325	CommonEltTy = EltTy;
2326	else if (CommonEltTy != EltTy)
2327	HaveCommonEltTy = false;
2328
2329	if (EltTy->isPointerTy()) {
2330	HaveVecPtrTy = true;
2331	if (!CommonVecPtrTy)
2332	CommonVecPtrTy = VTy;
2333	else if (CommonVecPtrTy != VTy)
2334	HaveCommonVecPtrTy = false;
2335	}
2336	}
2337	};
2338
2339	// Put load and store types into a set for de-duplication.
2340	for (const Slice &S : P) {
2341	Type *Ty;
2342	if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2343	Ty = LI->getType();
2344	else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2345	Ty = SI->getValueOperand()->getType();
2346	else
2347	continue;
2348
2349	auto CandTy = Ty->getScalarType();
2350	if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() \|\|
2351	S.endOffset() != P.endOffset())) {
2352	DeferredTys.insert(X: Ty);
2353	continue;
2354	}
2355
2356	LoadStoreTys.insert(X: Ty);
2357	// Consider any loads or stores that are the exact size of the slice.
2358	if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2359	CheckCandidateType (Ty);
2360	}
2361
2362	SmallVector<VectorType *, `4`> CandidateTysCopy = CandidateTys;
2363	if (auto *VTy = createAndCheckVectorTypesForPromotion(
2364	OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2365	CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2366	HaveCommonVecPtrTy, CommonVecPtrTy))
2367	return VTy;
2368
2369	CandidateTys.clear();
2370	return createAndCheckVectorTypesForPromotion(
2371	OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2372	HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2373	CommonVecPtrTy);
2374	}
2375
2376	/// Test whether a slice of an alloca is valid for integer widening.
2377	///
2378	/// This implements the necessary checking for the \c isIntegerWideningViable
2379	/// test below on a single slice of the alloca.
2380	static bool isIntegerWideningViableForSlice(const Slice &S,
2381	uint64_t AllocBeginOffset,
2382	Type *AllocaTy,
2383	const DataLayout &DL,
2384	bool &WholeAllocaOp) {
2385	uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2386
2387	uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2388	uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2389
2390	Use *U = S.getUse();
2391
2392	// Lifetime intrinsics operate over the whole alloca whose sizes are usually
2393	// larger than other load/store slices (RelEnd > Size). But lifetime are
2394	// always promotable and should not impact other slices' promotability of the
2395	// partition.
2396	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2397	if (II->isLifetimeStartOrEnd() \|\| II->isDroppable())
2398	return true;
2399	}
2400
2401	// We can't reasonably handle cases where the load or store extends past
2402	// the end of the alloca's type and into its padding.
2403	if (RelEnd > Size)
2404	return false;
2405
2406	if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2407	if (LI->isVolatile())
2408	return false;
2409	// We can't handle loads that extend past the allocated memory.
2410	if (DL.getTypeStoreSize(Ty: LI->getType()).getFixedValue() > Size)
2411	return false;
2412	// So far, AllocaSliceRewriter does not support widening split slice tails
2413	// in rewriteIntegerLoad.
2414	if (S.beginOffset() < AllocBeginOffset)
2415	return false;
2416	// Note that we don't count vector loads or stores as whole-alloca
2417	// operations which enable integer widening because we would prefer to use
2418	// vector widening instead.
2419	if (!isa<VectorType>(Val: LI->getType()) && RelBegin == `0` && RelEnd == Size)
2420	WholeAllocaOp = true;
2421	if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2422	if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2423	return false;
2424	} else if (RelBegin != `0` \|\| RelEnd != Size \|\|
2425	!canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2426	// Non-integer loads need to be convertible from the alloca type so that
2427	// they are promotable.
2428	return false;
2429	}
2430	} else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2431	Type *ValueTy = SI->getValueOperand()->getType();
2432	if (SI->isVolatile())
2433	return false;
2434	// We can't handle stores that extend past the allocated memory.
2435	if (DL.getTypeStoreSize(Ty: ValueTy).getFixedValue() > Size)
2436	return false;
2437	// So far, AllocaSliceRewriter does not support widening split slice tails
2438	// in rewriteIntegerStore.
2439	if (S.beginOffset() < AllocBeginOffset)
2440	return false;
2441	// Note that we don't count vector loads or stores as whole-alloca
2442	// operations which enable integer widening because we would prefer to use
2443	// vector widening instead.
2444	if (!isa<VectorType>(Val: ValueTy) && RelBegin == `0` && RelEnd == Size)
2445	WholeAllocaOp = true;
2446	if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2447	if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2448	return false;
2449	} else if (RelBegin != `0` \|\| RelEnd != Size \|\|
2450	!canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2451	// Non-integer stores need to be convertible to the alloca type so that
2452	// they are promotable.
2453	return false;
2454	}
2455	} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2456	if (MI->isVolatile() \|\| !isa<Constant>(Val: MI->getLength()))
2457	return false;
2458	if (!S.isSplittable())
2459	return false; // Skip any unsplittable intrinsics.
2460	} else {
2461	return false;
2462	}
2463
2464	return true;
2465	}
2466
2467	/// Test whether the given alloca partition's integer operations can be
2468	/// widened to promotable ones.
2469	///
2470	/// This is a quick test to check whether we can rewrite the integer loads and
2471	/// stores to a particular alloca into wider loads and stores and be able to
2472	/// promote the resulting alloca.
2473	static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2474	const DataLayout &DL) {
2475	uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2476	// Don't create integer types larger than the maximum bitwidth.
2477	if (SizeInBits > IntegerType::MAX_INT_BITS)
2478	return false;
2479
2480	// Don't try to handle allocas with bit-padding.
2481	if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2482	return false;
2483
2484	// We need to ensure that an integer type with the appropriate bitwidth can
2485	// be converted to the alloca type, whatever that is. We don't want to force
2486	// the alloca itself to have an integer type if there is a more suitable one.
2487	Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2488	if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) \|\|
2489	!canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2490	return false;
2491
2492	// While examining uses, we ensure that the alloca has a covering load or
2493	// store. We don't want to widen the integer operations only to fail to
2494	// promote due to some other unsplittable entry (which we may make splittable
2495	// later). However, if there are only splittable uses, go ahead and assume
2496	// that we cover the alloca.
2497	// FIXME: We shouldn't consider split slices that happen to start in the
2498	// partition here...
2499	bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2500
2501	for (const Slice &S : P)
2502	if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2503	WholeAllocaOp))
2504	return false;
2505
2506	for (const Slice *S : P.splitSliceTails())
2507	if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2508	WholeAllocaOp))
2509	return false;
2510
2511	return WholeAllocaOp;
2512	}
2513
2514	static Value extractInteger(const* DataLayout &DL, IRBuilderTy &IRB, Value *V,
2515	IntegerType *Ty, uint64_t Offset,
2516	const Twine &Name) {
2517	LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2518	IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2519	assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2520	DL.getTypeStoreSize(IntTy).getFixedValue() &&
2521	"Element extends past full value");
2522	uint64_t ShAmt = `8` * Offset;
2523	if (DL.isBigEndian())
2524	ShAmt = `8` * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2525	DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2526	if (ShAmt) {
2527	V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2528	LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2529	}
2530	assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2531	"Cannot extract to a larger integer!");
2532	if (Ty != IntTy) {
2533	V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2534	LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2535	}
2536	return V;
2537	}
2538
2539	static Value insertInteger(const* DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2540	Value V, uint64_t Offset, const* Twine &Name) {
2541	IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2542	IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2543	assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2544	"Cannot insert a larger integer!");
2545	LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2546	if (Ty != IntTy) {
2547	V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2548	LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2549	}
2550	assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2551	DL.getTypeStoreSize(IntTy).getFixedValue() &&
2552	"Element store outside of alloca store");
2553	uint64_t ShAmt = `8` * Offset;
2554	if (DL.isBigEndian())
2555	ShAmt = `8` * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2556	DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2557	if (ShAmt) {
2558	V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2559	LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2560	}
2561
2562	if (ShAmt \|\| Ty->getBitWidth() < IntTy->getBitWidth()) {
2563	APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2564	Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2565	LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2566	V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2567	LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2568	}
2569	return V;
2570	}
2571
2572	static Value extractVector(IRBuilderTy &IRB, Value V, unsigned BeginIndex,
2573	unsigned EndIndex, const Twine &Name) {
2574	auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2575	unsigned NumElements = EndIndex - BeginIndex;
2576	assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2577
2578	if (NumElements == VecTy->getNumElements())
2579	return V;
2580
2581	if (NumElements == `1`) {
2582	V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2583	Name: Name + ".extract");
2584	LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2585	return V;
2586	}
2587
2588	auto Mask = llvm::to_vector<`8`>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2589	V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2590	LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2591	return V;
2592	}
2593
2594	static Value insertVector(IRBuilderTy &IRB, Value Old, Value *V,
2595	unsigned BeginIndex, const Twine &Name) {
2596	VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2597	assert(VecTy && "Can only insert a vector into a vector");
2598
2599	VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2600	if (!Ty) {
2601	// Single element to insert.
2602	V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2603	Name: Name + ".insert");
2604	LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2605	return V;
2606	}
2607
2608	assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2609	cast<FixedVectorType>(VecTy)->getNumElements() &&
2610	"Too many elements!");
2611	if (cast<FixedVectorType>(Val: Ty)->getNumElements() ==
2612	cast<FixedVectorType>(Val: VecTy)->getNumElements()) {
2613	assert(V->getType() == VecTy && "Vector type mismatch");
2614	return V;
2615	}
2616	unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Val: Ty)->getNumElements();
2617
2618	// When inserting a smaller vector into the larger to store, we first
2619	// use a shuffle vector to widen it with undef elements, and then
2620	// a second shuffle vector to select between the loaded vector and the
2621	// incoming vector.
2622	SmallVector<int, `8`> Mask;
2623	Mask.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2624	for (unsigned i = `0`; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2625	if (i >= BeginIndex && i < EndIndex)
2626	Mask.push_back(Elt: i - BeginIndex);
2627	else
2628	Mask.push_back(Elt: -`1`);
2629	V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2630	LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2631
2632	SmallVector<Constant *, `8`> Mask2;
2633	Mask2.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2634	for (unsigned i = `0`; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2635	Mask2.push_back(Elt: IRB.getInt1(V: i >= BeginIndex && i < EndIndex));
2636
2637	V = IRB.CreateSelect(C: ConstantVector::get(V: Mask2), True: V, False: Old, Name: Name + "blend");
2638
2639	LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2640	return V;
2641	}
2642
2643	namespace {
2644
2645	/// Visitor to rewrite instructions using p particular slice of an alloca
2646	/// to use a new alloca.
2647	///
2648	/// Also implements the rewriting to vector-based accesses when the partition
2649	/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2650	/// lives here.
2651	class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2652	// Befriend the base class so it can delegate to private visit methods.
2653	friend class InstVisitor<AllocaSliceRewriter, bool>;
2654
2655	using Base = InstVisitor<AllocaSliceRewriter, bool>;
2656
2657	const DataLayout &DL;
2658	AllocaSlices &AS;
2659	SROA &Pass;
2660	AllocaInst &OldAI, &NewAI;
2661	const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2662	Type *NewAllocaTy;
2663
2664	// This is a convenience and flag variable that will be null unless the new
2665	// alloca's integer operations should be widened to this integer type due to
2666	// passing isIntegerWideningViable above. If it is non-null, the desired
2667	// integer type will be stored here for easy access during rewriting.
2668	IntegerType *IntTy;
2669
2670	// If we are rewriting an alloca partition which can be written as pure
2671	// vector operations, we stash extra information here. When VecTy is
2672	// non-null, we have some strict guarantees about the rewritten alloca:
2673	// - The new alloca is exactly the size of the vector type here.
2674	// - The accesses all either map to the entire vector or to a single
2675	// element.
2676	// - The set of accessing instructions is only one of those handled above
2677	// in isVectorPromotionViable. Generally these are the same access kinds
2678	// which are promotable via mem2reg.
2679	VectorType *VecTy;
2680	Type *ElementTy;
2681	uint64_t ElementSize;
2682
2683	// The original offset of the slice currently being rewritten relative to
2684	// the original alloca.
2685	uint64_t BeginOffset = `0`;
2686	uint64_t EndOffset = `0`;
2687
2688	// The new offsets of the slice currently being rewritten relative to the
2689	// original alloca.
2690	uint64_t NewBeginOffset = `0`, NewEndOffset = `0`;
2691
2692	uint64_t SliceSize = `0`;
2693	bool IsSplittable = false;
2694	bool IsSplit = false;
2695	Use OldUse = nullptr*;
2696	Instruction OldPtr = nullptr*;
2697
2698	// Track post-rewrite users which are PHI nodes and Selects.
2699	SmallSetVector<PHINode *, `8`> &PHIUsers;
2700	SmallSetVector<SelectInst *, `8`> &SelectUsers;
2701
2702	// Utility IR builder, whose name prefix is setup for each visited use, and
2703	// the insertion point is set to point to the user.
2704	IRBuilderTy IRB;
2705
2706	// Return the new alloca, addrspacecasted if required to avoid changing the
2707	// addrspace of a volatile access.
2708	Value getPtrToNewAI(unsigned* AddrSpace, bool IsVolatile) {
2709	if (!IsVolatile \|\| AddrSpace == NewAI.getType()->getPointerAddressSpace())
2710	return &NewAI;
2711
2712	Type *AccessTy = IRB.getPtrTy(AddrSpace);
2713	return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2714	}
2715
2716	public:
2717	AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2718	AllocaInst &OldAI, AllocaInst &NewAI,
2719	uint64_t NewAllocaBeginOffset,
2720	uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2721	VectorType *PromotableVecTy,
2722	SmallSetVector<PHINode *, `8`> &PHIUsers,
2723	SmallSetVector<SelectInst *, `8`> &SelectUsers)
2724	: DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2725	NewAllocaBeginOffset(NewAllocaBeginOffset),
2726	NewAllocaEndOffset(NewAllocaEndOffset),
2727	NewAllocaTy(NewAI.getAllocatedType()),
2728	IntTy(
2729	IsIntegerPromotable
2730	? Type::getIntNTy(C&: NewAI.getContext(),
2731	N: DL.getTypeSizeInBits(Ty: NewAI.getAllocatedType())
2732	.getFixedValue())
2733	: nullptr),
2734	VecTy(PromotableVecTy),
2735	ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2736	ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / `8`
2737	: `0`),
2738	PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2739	IRB (NewAI.getContext(), ConstantFolder ()) {
2740	if (VecTy) {
2741	assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % `8`) == `0` &&
2742	"Only multiple-of-8 sized vector elements are viable");
2743	++NumVectorized;
2744	}
2745	assert((!IntTy && !VecTy) \|\| (IntTy && !VecTy) \|\| (!IntTy && VecTy));
2746	}
2747
2748	bool visit(AllocaSlices::const_iterator I) {
2749	bool CanSROA = true;
2750	BeginOffset = I->beginOffset();
2751	EndOffset = I->endOffset();
2752	IsSplittable = I->isSplittable();
2753	IsSplit =
2754	BeginOffset < NewAllocaBeginOffset \|\| EndOffset > NewAllocaEndOffset;
2755	LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2756	LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2757	LLVM_DEBUG(dbgs() << "\n");
2758
2759	// Compute the intersecting offset range.
2760	assert(BeginOffset < NewAllocaEndOffset);
2761	assert(EndOffset > NewAllocaBeginOffset);
2762	NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2763	NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2764
2765	SliceSize = NewEndOffset - NewBeginOffset;
2766	LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2767	<< ") NewBegin:(" << NewBeginOffset << ", "
2768	<< NewEndOffset << ") NewAllocaBegin:("
2769	<< NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2770	<< ")\n");
2771	assert(IsSplit \|\| NewBeginOffset == BeginOffset);
2772	OldUse = I->getUse();
2773	OldPtr = cast<Instruction>(Val: OldUse->get());
2774
2775	Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2776	IRB.SetInsertPoint(OldUserI);
2777	IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2778	IRB.getInserter().SetNamePrefix(Twine (NewAI.getName()) + "." +
2779	Twine (BeginOffset) + ".");
2780
2781	CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2782	if (VecTy \|\| IntTy)
2783	assert(CanSROA);
2784	return CanSROA;
2785	}
2786
2787	private:
2788	// Make sure the other visit overloads are visible.
2789	using Base::visit;
2790
2791	// Every instruction which can end up as a user must have a rewrite rule.
2792	bool visitInstruction(Instruction &I) {
2793	LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2794	llvm_unreachable("No rewrite rule for this instruction!");
2795	}
2796
2797	Value getNewAllocaSlicePtr(IRBuilderTy &IRB, Type PointerTy) {
2798	// Note that the offset computation can use BeginOffset or NewBeginOffset
2799	// interchangeably for unsplit slices.
2800	assert(IsSplit \|\| BeginOffset == NewBeginOffset);
2801	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2802
2803	#ifndef NDEBUG
2804	StringRef OldName = OldPtr->getName();
2805	// Skip through the last '.sroa.' component of the name.
2806	size_t LastSROAPrefix = OldName.rfind(Str: ".sroa.");
2807	if (LastSROAPrefix != StringRef::npos) {
2808	OldName = OldName.substr(Start: LastSROAPrefix + strlen(s: ".sroa."));
2809	// Look for an SROA slice index.
2810	size_t IndexEnd = OldName.find_first_not_of(Chars: "0123456789");
2811	if (IndexEnd != StringRef::npos && OldName [IndexEnd] == `'.'`) {
2812	// Strip the index and look for the offset.
2813	OldName = OldName.substr(Start: IndexEnd + `1`);
2814	size_t OffsetEnd = OldName.find_first_not_of(Chars: "0123456789");
2815	if (OffsetEnd != StringRef::npos && OldName [OffsetEnd] == `'.'`)
2816	// Strip the offset.
2817	OldName = OldName.substr(Start: OffsetEnd + `1`);
2818	}
2819	}
2820	// Strip any SROA suffixes as well.
2821	OldName = OldName.substr(Start: `0`, N: OldName.find(Str: ".sroa_"));
2822	#endif
2823
2824	return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
2825	Offset: APInt (DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
2826	PointerTy,
2827	#ifndef NDEBUG
2828	NamePrefix: Twine (OldName) + "."
2829	#else
2830	Twine()
2831	#endif
2832	);
2833	}
2834
2835	/// Compute suitable alignment to access this slice of the new
2836	/// alloca.
2837	///
2838	/// You can optionally pass a type to this routine and if that type's ABI
2839	/// alignment is itself suitable, this will return zero.
2840	Align getSliceAlign() {
2841	return commonAlignment(A: NewAI.getAlign(),
2842	Offset: NewBeginOffset - NewAllocaBeginOffset);
2843	}
2844
2845	unsigned getIndex(uint64_t Offset) {
2846	assert(VecTy && "Can only call getIndex when rewriting a vector");
2847	uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2848	assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2849	uint32_t Index = RelOffset / ElementSize;
2850	assert(Index * ElementSize == RelOffset);
2851	return Index;
2852	}
2853
2854	void deleteIfTriviallyDead(Value *V) {
2855	Instruction *I = cast<Instruction>(Val: V);
2856	if (isInstructionTriviallyDead(I))
2857	Pass.DeadInsts.push_back(Elt: I);
2858	}
2859
2860	Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2861	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
2862	unsigned EndIndex = getIndex(Offset: NewEndOffset);
2863	assert(EndIndex > BeginIndex && "Empty vector!");
2864
2865	LoadInst *Load = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2866	Align: NewAI.getAlign(), Name: "load");
2867
2868	Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
2869	LLVMContext::MD_access_group});
2870	return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
2871	}
2872
2873	Value *rewriteIntegerLoad(LoadInst &LI) {
2874	assert(IntTy && "We cannot insert an integer to the alloca");
2875	assert(!LI.isVolatile());
2876	Value *V = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2877	Align: NewAI.getAlign(), Name: "load");
2878	V = convertValue(DL, IRB, V, NewTy: IntTy);
2879	assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2880	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2881	if (Offset > `0` \|\| NewEndOffset < NewAllocaEndOffset) {
2882	IntegerType ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize `8`);
2883	V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
2884	}
2885	// It is possible that the extracted type is not the load type. This
2886	// happens if there is a load past the end of the alloca, and as
2887	// a consequence the slice is narrower but still a candidate for integer
2888	// lowering. To handle this case, we just zero extend the extracted
2889	// integer.
2890	assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * `8` &&
2891	"Can only handle an extract for an overly wide load");
2892	if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * `8`)
2893	V = IRB.CreateZExt(V, DestTy: LI.getType());
2894	return V;
2895	}
2896
2897	bool visitLoadInst(LoadInst &LI) {
2898	LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2899	Value *OldOp = LI.getOperand(i_nocapture: `0`);
2900	assert(OldOp == OldPtr);
2901
2902	AAMDNodes AATags = LI.getAAMetadata();
2903
2904	unsigned AS = LI.getPointerAddressSpace();
2905
2906	Type TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize `8`)
2907	: LI.getType();
2908	const bool IsLoadPastEnd =
2909	DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize;
2910	bool IsPtrAdjusted = false;
2911	Value *V;
2912	if (VecTy) {
2913	V = rewriteVectorizedLoadInst(LI);
2914	} else if (IntTy && LI.getType()->isIntegerTy()) {
2915	V = rewriteIntegerLoad(LI);
2916	} else if (NewBeginOffset == NewAllocaBeginOffset &&
2917	NewEndOffset == NewAllocaEndOffset &&
2918	(canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) \|\|
2919	(IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
2920	TargetTy->isIntegerTy() && !LI.isVolatile()))) {
2921	Value *NewPtr =
2922	getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
2923	LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: NewPtr,
2924	Align: NewAI.getAlign(), isVolatile: LI.isVolatile(),
2925	Name: LI.getName());
2926	if (LI.isVolatile())
2927	NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
2928	if (NewLI->isAtomic())
2929	NewLI->setAlignment(LI.getAlign());
2930
2931	// Copy any metadata that is valid for the new load. This may require
2932	// conversion to a different kind of metadata, e.g. !nonnull might change
2933	// to !range or vice versa.
2934	copyMetadataForLoad(Dest&: *NewLI, Source: LI);
2935
2936	// Do this after copyMetadataForLoad() to preserve the TBAA shift.
2937	if (AATags)
2938	NewLI->setAAMetadata(AATags.adjustForAccess(
2939	Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
2940
2941	// Try to preserve nonnull metadata
2942	V = NewLI;
2943
2944	// If this is an integer load past the end of the slice (which means the
2945	// bytes outside the slice are undef or this load is dead) just forcibly
2946	// fix the integer size with correct handling of endianness.
2947	if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
2948	if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
2949	if (AITy->getBitWidth() < TITy->getBitWidth()) {
2950	V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
2951	if (DL.isBigEndian())
2952	V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
2953	Name: "endian_shift");
2954	}
2955	} else {
2956	Type *LTy = IRB.getPtrTy(AddrSpace: AS);
2957	LoadInst *NewLI =
2958	IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
2959	Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
2960
2961	if (AATags)
2962	NewLI->setAAMetadata(AATags.adjustForAccess(
2963	Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
2964
2965	if (LI.isVolatile())
2966	NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
2967	NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
2968	LLVMContext::MD_access_group});
2969
2970	V = NewLI;
2971	IsPtrAdjusted = true;
2972	}
2973	V = convertValue(DL, IRB, V, NewTy: TargetTy);
2974
2975	if (IsSplit) {
2976	assert(!LI.isVolatile());
2977	assert(LI.getType()->isIntegerTy() &&
2978	"Only integer type loads and stores are split");
2979	assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
2980	"Split load isn't smaller than original load");
2981	assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
2982	"Non-byte-multiple bit width");
2983	// Move the insertion point just past the load so that we can refer to it.
2984	BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
2985	// Ensure the insertion point comes before any debug-info immediately
2986	// after the load, so that variable values referring to the load are
2987	// dominated by it.
2988	LIIt.setHeadBit(true);
2989	IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
2990	// Create a placeholder value with the same type as LI to use as the
2991	// basis for the new value. This allows us to replace the uses of LI with
2992	// the computed value, and then replace the placeholder with LI, leaving
2993	// LI only used for this computation.
2994	Value *Placeholder =
2995	new LoadInst (LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
2996	false, Align (`1`));
2997	V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
2998	Name: "insert");
2999	LI.replaceAllUsesWith(V);
3000	Placeholder->replaceAllUsesWith(V: &LI);
3001	Placeholder->deleteValue();
3002	} else {
3003	LI.replaceAllUsesWith(V);
3004	}
3005
3006	Pass.DeadInsts.push_back(Elt: &LI);
3007	deleteIfTriviallyDead(V: OldOp);
3008	LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3009	return !LI.isVolatile() && !IsPtrAdjusted;
3010	}
3011
3012	bool rewriteVectorizedStoreInst(Value V, StoreInst &SI, Value OldOp,
3013	AAMDNodes AATags) {
3014	// Capture V for the purpose of debug-info accounting once it's converted
3015	// to a vector store.
3016	Value *OrigV = V;
3017	if (V->getType() != VecTy) {
3018	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3019	unsigned EndIndex = getIndex(Offset: NewEndOffset);
3020	assert(EndIndex > BeginIndex && "Empty vector!");
3021	unsigned NumElements = EndIndex - BeginIndex;
3022	assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3023	"Too many elements!");
3024	Type *SliceTy = (NumElements == `1`)
3025	? ElementTy
3026	: FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3027	if (V->getType() != SliceTy)
3028	V = convertValue(DL, IRB, V, NewTy: SliceTy);
3029
3030	// Mix in the existing elements.
3031	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3032	Align: NewAI.getAlign(), Name: "load");
3033	V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3034	}
3035	StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3036	Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3037	LLVMContext::MD_access_group});
3038	if (AATags)
3039	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3040	AccessTy: V->getType(), DL));
3041	Pass.DeadInsts.push_back(Elt: &SI);
3042
3043	// NOTE: Careful to use OrigV rather than V.
3044	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3045	Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3046	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3047	return true;
3048	}
3049
3050	bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3051	assert(IntTy && "We cannot extract an integer from the alloca");
3052	assert(!SI.isVolatile());
3053	if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3054	IntTy->getBitWidth()) {
3055	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3056	Align: NewAI.getAlign(), Name: "oldload");
3057	Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3058	assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3059	uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3060	V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3061	}
3062	V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3063	StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3064	Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3065	LLVMContext::MD_access_group});
3066	if (AATags)
3067	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3068	AccessTy: V->getType(), DL));
3069
3070	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3071	Inst: Store, Dest: Store->getPointerOperand(),
3072	Value: Store->getValueOperand(), DL);
3073
3074	Pass.DeadInsts.push_back(Elt: &SI);
3075	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3076	return true;
3077	}
3078
3079	bool visitStoreInst(StoreInst &SI) {
3080	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3081	Value *OldOp = SI.getOperand(i_nocapture: `1`);
3082	assert(OldOp == OldPtr);
3083
3084	AAMDNodes AATags = SI.getAAMetadata();
3085	Value *V = SI.getValueOperand();
3086
3087	// Strip all inbounds GEPs and pointer casts to try to dig out any root
3088	// alloca that should be re-examined after promoting this alloca.
3089	if (V->getType()->isPointerTy())
3090	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
3091	Pass.PostPromotionWorklist.insert(X: AI);
3092
3093	if (SliceSize < DL.getTypeStoreSize(Ty: V->getType()).getFixedValue()) {
3094	assert(!SI.isVolatile());
3095	assert(V->getType()->isIntegerTy() &&
3096	"Only integer type loads and stores are split");
3097	assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3098	"Non-byte-multiple bit width");
3099	IntegerType NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize `8`);
3100	V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
3101	Name: "extract");
3102	}
3103
3104	if (VecTy)
3105	return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3106	if (IntTy && V->getType()->isIntegerTy())
3107	return rewriteIntegerStore(V, SI, AATags);
3108
3109	StoreInst *NewSI;
3110	if (NewBeginOffset == NewAllocaBeginOffset &&
3111	NewEndOffset == NewAllocaEndOffset &&
3112	canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
3113	V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3114	Value *NewPtr =
3115	getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());
3116
3117	NewSI =
3118	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
3119	} else {
3120	unsigned AS = SI.getPointerAddressSpace();
3121	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
3122	NewSI =
3123	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
3124	}
3125	NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3126	LLVMContext::MD_access_group});
3127	if (AATags)
3128	NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3129	AccessTy: V->getType(), DL));
3130	if (SI.isVolatile())
3131	NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
3132	if (NewSI->isAtomic())
3133	NewSI->setAlignment(SI.getAlign());
3134
3135	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &SI,
3136	Inst: NewSI, Dest: NewSI->getPointerOperand(),
3137	Value: NewSI->getValueOperand(), DL);
3138
3139	Pass.DeadInsts.push_back(Elt: &SI);
3140	deleteIfTriviallyDead(V: OldOp);
3141
3142	LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3143	return NewSI->getPointerOperand() == &NewAI &&
3144	NewSI->getValueOperand()->getType() == NewAllocaTy &&
3145	!SI.isVolatile();
3146	}
3147
3148	/// Compute an integer value from splatting an i8 across the given
3149	/// number of bytes.
3150	///
3151	/// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3152	/// call this routine.
3153	/// FIXME: Heed the advice above.
3154	///
3155	/// \param V The i8 value to splat.
3156	/// \param Size The number of bytes in the output (assuming i8 is one byte)
3157	Value getIntegerSplat(Value V, unsigned Size) {
3158	assert(Size > `0` && "Expected a positive number of bytes.");
3159	IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3160	assert(VTy->getBitWidth() == `8` && "Expected an i8 value for the byte");
3161	if (Size == `1`)
3162	return V;
3163
3164	Type SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size `8`);
3165	V = IRB.CreateMul(
3166	LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3167	RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3168	RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3169	DestTy: SplatIntTy)),
3170	Name: "isplat");
3171	return V;
3172	}
3173
3174	/// Compute a vector splat for a given element value.
3175	Value getVectorSplat(Value V, unsigned NumElements) {
3176	V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3177	LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3178	return V;
3179	}
3180
3181	bool visitMemSetInst(MemSetInst &II) {
3182	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3183	assert(II.getRawDest() == OldPtr);
3184
3185	AAMDNodes AATags = II.getAAMetadata();
3186
3187	// If the memset has a variable size, it cannot be split, just adjust the
3188	// pointer to the new alloca.
3189	if (!isa<ConstantInt>(Val: II.getLength())) {
3190	assert(!IsSplit);
3191	assert(NewBeginOffset == BeginOffset);
3192	II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
3193	II.setDestAlignment(getSliceAlign());
3194	// In theory we should call migrateDebugInfo here. However, we do not
3195	// emit dbg.assign intrinsics for mem intrinsics storing through non-
3196	// constant geps, or storing a variable number of bytes.
3197	assert(at::getAssignmentMarkers(&II).empty() &&
3198	at::getDVRAssignmentMarkers(&II).empty() &&
3199	"AT: Unexpected link to non-const GEP");
3200	deleteIfTriviallyDead(V: OldPtr);
3201	return false;
3202	}
3203
3204	// Record this instruction for deletion.
3205	Pass.DeadInsts.push_back(Elt: &II);
3206
3207	Type *AllocaTy = NewAI.getAllocatedType();
3208	Type *ScalarTy = AllocaTy->getScalarType();
3209
3210	const bool CanContinue = [&]() {
3211	if (VecTy \|\| IntTy)
3212	return true;
3213	if (BeginOffset > NewAllocaBeginOffset \|\| EndOffset < NewAllocaEndOffset)
3214	return false;
3215	// Length must be in range for FixedVectorType.
3216	auto *C = cast<ConstantInt>(Val: II.getLength());
3217	const uint64_t Len = C->getLimitedValue();
3218	if (Len > std::numeric_limits<unsigned>::max())
3219	return false;
3220	auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
3221	auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
3222	return canConvertValue(DL, OldTy: SrcTy, NewTy: AllocaTy) &&
3223	DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
3224	}();
3225
3226	// If this doesn't map cleanly onto the alloca type, and that type isn't
3227	// a single value type, just emit a memset.
3228	if (!CanContinue) {
3229	Type *SizeTy = II.getLength()->getType();
3230	unsigned Sz = NewEndOffset - NewBeginOffset;
3231	Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
3232	MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
3233	Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
3234	Align: MaybeAlign (getSliceAlign()), isVolatile: II.isVolatile()));
3235	if (AATags)
3236	New->setAAMetadata(
3237	AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));
3238
3239	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3240	Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);
3241
3242	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3243	return false;
3244	}
3245
3246	// If we can represent this as a simple value, we have to build the actual
3247	// value to store, which requires expanding the byte present in memset to
3248	// a sensible representation for the alloca type. This is essentially
3249	// splatting the byte to a sufficiently wide integer, splatting it across
3250	// any desired vector width, and bitcasting to the final type.
3251	Value *V;
3252
3253	if (VecTy) {
3254	// If this is a memset of a vectorized alloca, insert it.
3255	assert(ElementTy == ScalarTy);
3256
3257	unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3258	unsigned EndIndex = getIndex(Offset: NewEndOffset);
3259	assert(EndIndex > BeginIndex && "Empty vector!");
3260	unsigned NumElements = EndIndex - BeginIndex;
3261	assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3262	"Too many elements!");
3263
3264	Value *Splat = getIntegerSplat(
3265	V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / `8`);
3266	Splat = convertValue(DL, IRB, V: Splat, NewTy: ElementTy);
3267	if (NumElements > `1`)
3268	Splat = getVectorSplat(V: Splat, NumElements);
3269
3270	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3271	Align: NewAI.getAlign(), Name: "oldload");
3272	V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
3273	} else if (IntTy) {
3274	// If this is a memset on an alloca where we can widen stores, insert the
3275	// set integer.
3276	assert(!II.isVolatile());
3277
3278	uint64_t Size = NewEndOffset - NewBeginOffset;
3279	V = getIntegerSplat(V: II.getValue(), Size);
3280
3281	if (IntTy && (BeginOffset != NewAllocaBeginOffset \|\|
3282	EndOffset != NewAllocaBeginOffset)) {
3283	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3284	Align: NewAI.getAlign(), Name: "oldload");
3285	Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3286	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3287	V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
3288	} else {
3289	assert(V->getType() == IntTy &&
3290	"Wrong type for an alloca wide integer!");
3291	}
3292	V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3293	} else {
3294	// Established these invariants above.
3295	assert(NewBeginOffset == NewAllocaBeginOffset);
3296	assert(NewEndOffset == NewAllocaEndOffset);
3297
3298	V = getIntegerSplat(V: II.getValue(),
3299	Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / `8`);
3300	if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: AllocaTy))
3301	V = getVectorSplat(
3302	V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());
3303
3304	V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3305	}
3306
3307	Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3308	StoreInst *New =
3309	IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
3310	New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3311	LLVMContext::MD_access_group});
3312	if (AATags)
3313	New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3314	AccessTy: V->getType(), DL));
3315
3316	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3317	Inst: New, Dest: New->getPointerOperand(), Value: V, DL);
3318
3319	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3320	return !II.isVolatile();
3321	}
3322
3323	bool visitMemTransferInst(MemTransferInst &II) {
3324	// Rewriting of memory transfer instructions can be a bit tricky. We break
3325	// them into two categories: split intrinsics and unsplit intrinsics.
3326
3327	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3328
3329	AAMDNodes AATags = II.getAAMetadata();
3330
3331	bool IsDest = &II.getRawDestUse() == OldUse;
3332	assert((IsDest && II.getRawDest() == OldPtr) \|\|
3333	(!IsDest && II.getRawSource() == OldPtr));
3334
3335	Align SliceAlign = getSliceAlign();
3336	// For unsplit intrinsics, we simply modify the source and destination
3337	// pointers in place. This isn't just an optimization, it is a matter of
3338	// correctness. With unsplit intrinsics we may be dealing with transfers
3339	// within a single alloca before SROA ran, or with transfers that have
3340	// a variable length. We may also be dealing with memmove instead of
3341	// memcpy, and so simply updating the pointers is the necessary for us to
3342	// update both source and dest of a single call.
3343	if (!IsSplittable) {
3344	Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3345	if (IsDest) {
3346	// Update the address component of linked dbg.assigns.
3347	auto UpdateAssignAddress = [&](auto *DbgAssign) {
3348	if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) \|\|
3349	DbgAssign->getAddress() == II.getDest())
3350	DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3351	};
3352	for_each(Range: at::getAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3353	for_each(Range: at::getDVRAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3354	II.setDest(AdjustedPtr);
3355	II.setDestAlignment(SliceAlign);
3356	} else {
3357	II.setSource(AdjustedPtr);
3358	II.setSourceAlignment(SliceAlign);
3359	}
3360
3361	LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3362	deleteIfTriviallyDead(V: OldPtr);
3363	return false;
3364	}
3365	// For split transfer intrinsics we have an incredibly useful assurance:
3366	// the source and destination do not reside within the same alloca, and at
3367	// least one of them does not escape. This means that we can replace
3368	// memmove with memcpy, and we don't need to worry about all manner of
3369	// downsides to splitting and transforming the operations.
3370
3371	// If this doesn't map cleanly onto the alloca type, and that type isn't
3372	// a single value type, just emit a memcpy.
3373	bool EmitMemCpy =
3374	!VecTy && !IntTy &&
3375	(BeginOffset > NewAllocaBeginOffset \|\| EndOffset < NewAllocaEndOffset \|\|
3376	SliceSize !=
3377	DL.getTypeStoreSize(Ty: NewAI.getAllocatedType()).getFixedValue() \|\|
3378	!DL.typeSizeEqualsStoreSize(Ty: NewAI.getAllocatedType()) \|\|
3379	!NewAI.getAllocatedType()->isSingleValueType());
3380
3381	// If we're just going to emit a memcpy, the alloca hasn't changed, and the
3382	// size hasn't been shrunk based on analysis of the viable range, this is
3383	// a no-op.
3384	if (EmitMemCpy && &OldAI == &NewAI) {
3385	// Ensure the start lines up.
3386	assert(NewBeginOffset == BeginOffset);
3387
3388	// Rewrite the size as needed.
3389	if (NewEndOffset != EndOffset)
3390	II.setLength(ConstantInt::get(Ty: II.getLength()->getType(),
3391	V: NewEndOffset - NewBeginOffset));
3392	return false;
3393	}
3394	// Record this instruction for deletion.
3395	Pass.DeadInsts.push_back(Elt: &II);
3396
3397	// Strip all inbounds GEPs and pointer casts to try to dig out any root
3398	// alloca that should be re-examined after rewriting this instruction.
3399	Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3400	if (AllocaInst *AI =
3401	dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3402	assert(AI != &OldAI && AI != &NewAI &&
3403	"Splittable transfers cannot reach the same alloca on both ends.");
3404	Pass.Worklist.insert(X: AI);
3405	}
3406
3407	Type *OtherPtrTy = OtherPtr->getType();
3408	unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3409
3410	// Compute the relative offset for the other pointer within the transfer.
3411	unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3412	APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3413	Align OtherAlign =
3414	(IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3415	OtherAlign =
3416	commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: `64`).getZExtValue());
3417
3418	if (EmitMemCpy) {
3419	// Compute the other pointer, folding as much as possible to produce
3420	// a single, simple GEP in most cases.
3421	OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3422	NamePrefix: OtherPtr->getName() + ".");
3423
3424	Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3425	Type *SizeTy = II.getLength()->getType();
3426	Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3427
3428	Value DestPtr, SrcPtr;
3429	MaybeAlign DestAlign, SrcAlign;
3430	// Note: IsDest is true iff we're copying into the new alloca slice
3431	if (IsDest) {
3432	DestPtr = OurPtr;
3433	DestAlign = SliceAlign;
3434	SrcPtr = OtherPtr;
3435	SrcAlign = OtherAlign;
3436	} else {
3437	DestPtr = OtherPtr;
3438	DestAlign = OtherAlign;
3439	SrcPtr = OurPtr;
3440	SrcAlign = SliceAlign;
3441	}
3442	CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3443	Size, isVolatile: II.isVolatile());
3444	if (AATags)
3445	New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3446
3447	APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), `0`);
3448	if (IsDest) {
3449	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`,
3450	OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3451	} else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3452	Val: DestPtr->stripAndAccumulateConstantOffsets(
3453	DL, Offset, /AllowNonInbounds/ true))) {
3454	migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`,
3455	SliceSizeInBits: SliceSize * `8`, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3456	}
3457	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3458	return false;
3459	}
3460
3461	bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3462	NewEndOffset == NewAllocaEndOffset;
3463	uint64_t Size = NewEndOffset - NewBeginOffset;
3464	unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : `0`;
3465	unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : `0`;
3466	unsigned NumElements = EndIndex - BeginIndex;
3467	IntegerType *SubIntTy =
3468	IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * `8`) : nullptr;
3469
3470	// Reset the other pointer type to match the register type we're going to
3471	// use, but using the address space of the original other pointer.
3472	Type *OtherTy;
3473	if (VecTy && !IsWholeAlloca) {
3474	if (NumElements == `1`)
3475	OtherTy = VecTy->getElementType();
3476	else
3477	OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3478	} else if (IntTy && !IsWholeAlloca) {
3479	OtherTy = SubIntTy;
3480	} else {
3481	OtherTy = NewAllocaTy;
3482	}
3483
3484	Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3485	NamePrefix: OtherPtr->getName() + ".");
3486	MaybeAlign SrcAlign = OtherAlign;
3487	MaybeAlign DstAlign = SliceAlign;
3488	if (!IsDest)
3489	std::swap(a&: SrcAlign, b&: DstAlign);
3490
3491	Value *SrcPtr;
3492	Value *DstPtr;
3493
3494	if (IsDest) {
3495	DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3496	SrcPtr = AdjPtr;
3497	} else {
3498	DstPtr = AdjPtr;
3499	SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3500	}
3501
3502	Value *Src;
3503	if (VecTy && !IsWholeAlloca && !IsDest) {
3504	Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3505	Align: NewAI.getAlign(), Name: "load");
3506	Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3507	} else if (IntTy && !IsWholeAlloca && !IsDest) {
3508	Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3509	Align: NewAI.getAlign(), Name: "load");
3510	Src = convertValue(DL, IRB, V: Src, NewTy: IntTy);
3511	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3512	Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3513	} else {
3514	LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3515	isVolatile: II.isVolatile(), Name: "copyload");
3516	Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3517	LLVMContext::MD_access_group});
3518	if (AATags)
3519	Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3520	AccessTy: Load->getType(), DL));
3521	Src = Load;
3522	}
3523
3524	if (VecTy && !IsWholeAlloca && IsDest) {
3525	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3526	Align: NewAI.getAlign(), Name: "oldload");
3527	Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3528	} else if (IntTy && !IsWholeAlloca && IsDest) {
3529	Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3530	Align: NewAI.getAlign(), Name: "oldload");
3531	Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3532	uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3533	Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3534	Src = convertValue(DL, IRB, V: Src, NewTy: NewAllocaTy);
3535	}
3536
3537	StoreInst *Store = cast<StoreInst>(
3538	Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3539	Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3540	LLVMContext::MD_access_group});
3541	if (AATags)
3542	Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3543	AccessTy: Src->getType(), DL));
3544
3545	APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), `0`);
3546	if (IsDest) {
3547
3548	migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * `8`, SliceSizeInBits: SliceSize * `8`, OldInst: &II,
3549	Inst: Store, Dest: DstPtr, Value: Src, DL);
3550	} else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3551	Val: DstPtr->stripAndAccumulateConstantOffsets(
3552	DL, Offset, /AllowNonInbounds/ true))) {
3553	migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`, SliceSizeInBits: SliceSize * `8`,
3554	OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
3555	}
3556
3557	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3558	return !II.isVolatile();
3559	}
3560
3561	bool visitIntrinsicInst(IntrinsicInst &II) {
3562	assert((II.isLifetimeStartOrEnd() \|\| II.isLaunderOrStripInvariantGroup() \|\|
3563	II.isDroppable()) &&
3564	"Unexpected intrinsic!");
3565	LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3566
3567	// Record this instruction for deletion.
3568	Pass.DeadInsts.push_back(Elt: &II);
3569
3570	if (II.isDroppable()) {
3571	assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3572	// TODO For now we forget assumed information, this can be improved.
3573	OldPtr->dropDroppableUsesIn(Usr&: II);
3574	return true;
3575	}
3576
3577	if (II.isLaunderOrStripInvariantGroup())
3578	return true;
3579
3580	assert(II.getArgOperand(`1`) == OldPtr);
3581	// Lifetime intrinsics are only promotable if they cover the whole alloca.
3582	// Therefore, we drop lifetime intrinsics which don't cover the whole
3583	// alloca.
3584	// (In theory, intrinsics which partially cover an alloca could be
3585	// promoted, but PromoteMemToReg doesn't handle that case.)
3586	// FIXME: Check whether the alloca is promotable before dropping the
3587	// lifetime intrinsics?
3588	if (NewBeginOffset != NewAllocaBeginOffset \|\|
3589	NewEndOffset != NewAllocaEndOffset)
3590	return true;
3591
3592	ConstantInt *Size =
3593	ConstantInt::get(Ty: cast<IntegerType>(Val: II.getArgOperand(i: `0`)->getType()),
3594	V: NewEndOffset - NewBeginOffset);
3595	// Lifetime intrinsics always expect an i8 so directly get such a pointer*
3596	// for the new alloca slice.
3597	Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
3598	Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3599	Value *New;
3600	if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3601	New = IRB.CreateLifetimeStart(Ptr, Size);
3602	else
3603	New = IRB.CreateLifetimeEnd(Ptr, Size);
3604
3605	(void)New;
3606	LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3607
3608	return true;
3609	}
3610
3611	void fixLoadStoreAlign(Instruction &Root) {
3612	// This algorithm implements the same visitor loop as
3613	// hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3614	// or store found.
3615	SmallPtrSet<Instruction *, `4`> Visited;
3616	SmallVector<Instruction *, `4`> Uses;
3617	Visited.insert(Ptr: &Root);
3618	Uses.push_back(Elt: &Root);
3619	do {
3620	Instruction *I = Uses.pop_back_val();
3621
3622	if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
3623	LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
3624	continue;
3625	}
3626	if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
3627	SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
3628	continue;
3629	}
3630
3631	assert(isa<BitCastInst>(I) \|\| isa<AddrSpaceCastInst>(I) \|\|
3632	isa<PHINode>(I) \|\| isa<SelectInst>(I) \|\|
3633	isa<GetElementPtrInst>(I));
3634	for (User *U : I->users())
3635	if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
3636	Uses.push_back(Elt: cast<Instruction>(Val: U));
3637	} while (!Uses.empty());
3638	}
3639
3640	bool visitPHINode(PHINode &PN) {
3641	LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3642	assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3643	assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3644
3645	// We would like to compute a new pointer in only one place, but have it be
3646	// as local as possible to the PHI. To do that, we re-use the location of
3647	// the old pointer, which necessarily must be in the right position to
3648	// dominate the PHI.
3649	IRBuilderBase::InsertPointGuard Guard(IRB);
3650	if (isa<PHINode>(Val: OldPtr))
3651	IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
3652	IP: OldPtr->getParent()->getFirstInsertionPt());
3653	else
3654	IRB.SetInsertPoint(OldPtr);
3655	IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3656
3657	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3658	// Replace the operands which were using the old pointer.
3659	std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
3660
3661	LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3662	deleteIfTriviallyDead(V: OldPtr);
3663
3664	// Fix the alignment of any loads or stores using this PHI node.
3665	fixLoadStoreAlign(Root&: PN);
3666
3667	// PHIs can't be promoted on their own, but often can be speculated. We
3668	// check the speculation outside of the rewriter so that we see the
3669	// fully-rewritten alloca.
3670	PHIUsers.insert(X: &PN);
3671	return true;
3672	}
3673
3674	bool visitSelectInst(SelectInst &SI) {
3675	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3676	assert((SI.getTrueValue() == OldPtr \|\| SI.getFalseValue() == OldPtr) &&
3677	"Pointer isn't an operand!");
3678	assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3679	assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3680
3681	Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3682	// Replace the operands which were using the old pointer.
3683	if (SI.getOperand(i_nocapture: `1`) == OldPtr)
3684	SI.setOperand(i_nocapture: `1`, Val_nocapture: NewPtr);
3685	if (SI.getOperand(i_nocapture: `2`) == OldPtr)
3686	SI.setOperand(i_nocapture: `2`, Val_nocapture: NewPtr);
3687
3688	LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3689	deleteIfTriviallyDead(V: OldPtr);
3690
3691	// Fix the alignment of any loads or stores using this select.
3692	fixLoadStoreAlign(Root&: SI);
3693
3694	// Selects can't be promoted on their own, but often can be speculated. We
3695	// check the speculation outside of the rewriter so that we see the
3696	// fully-rewritten alloca.
3697	SelectUsers.insert(X: &SI);
3698	return true;
3699	}
3700	};
3701
3702	/// Visitor to rewrite aggregate loads and stores as scalar.
3703	///
3704	/// This pass aggressively rewrites all aggregate loads and stores on
3705	/// a particular pointer (or any pointer derived from it which we can identify)
3706	/// with scalar loads and stores.
3707	class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3708	// Befriend the base class so it can delegate to private visit methods.
3709	friend class InstVisitor<AggLoadStoreRewriter, bool>;
3710
3711	/// Queue of pointer uses to analyze and potentially rewrite.
3712	SmallVector<Use *, `8`> Queue;
3713
3714	/// Set to prevent us from cycling with phi nodes and loops.
3715	SmallPtrSet<User *, `8`> Visited;
3716
3717	/// The current pointer use being rewritten. This is used to dig up the used
3718	/// value (as opposed to the user).
3719	Use U = nullptr*;
3720
3721	/// Used to calculate offsets, and hence alignment, of subobjects.
3722	const DataLayout &DL;
3723
3724	IRBuilderTy &IRB;
3725
3726	public:
3727	AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3728	: DL(DL), IRB(IRB) {}
3729
3730	/// Rewrite loads and stores through a pointer and all pointers derived from
3731	/// it.
3732	bool rewrite(Instruction &I) {
3733	LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3734	enqueueUsers(I);
3735	bool Changed = false;
3736	while (!Queue.empty()) {
3737	U = Queue.pop_back_val();
3738	Changed \|= visit(I: cast<Instruction>(Val: U->getUser()));
3739	}
3740	return Changed;
3741	}
3742
3743	private:
3744	/// Enqueue all the users of the given instruction for further processing.
3745	/// This uses a set to de-duplicate users.
3746	void enqueueUsers(Instruction &I) {
3747	for (Use &U : I.uses())
3748	if (Visited.insert(Ptr: U.getUser()).second)
3749	Queue.push_back(Elt: &U);
3750	}
3751
3752	// Conservative default is to not rewrite anything.
3753	bool visitInstruction(Instruction &I) { return false; }
3754
3755	/// Generic recursive split emission class.
3756	template <typename Derived> class OpSplitter {
3757	protected:
3758	/// The builder used to form new instructions.
3759	IRBuilderTy &IRB;
3760
3761	/// The indices which to be used with insert- or extractvalue to select the
3762	/// appropriate value within the aggregate.
3763	SmallVector<unsigned, `4`> Indices;
3764
3765	/// The indices to a GEP instruction which will move Ptr to the correct slot
3766	/// within the aggregate.
3767	SmallVector<Value *, `4`> GEPIndices;
3768
3769	/// The base pointer of the original op, used as a base for GEPing the
3770	/// split operations.
3771	Value *Ptr;
3772
3773	/// The base pointee type being GEPed into.
3774	Type *BaseTy;
3775
3776	/// Known alignment of the base pointer.
3777	Align BaseAlign;
3778
3779	/// To calculate offset of each component so we can correctly deduce
3780	/// alignments.
3781	const DataLayout &DL;
3782
3783	/// Initialize the splitter with an insertion point, Ptr and start with a
3784	/// single zero GEP index.
3785	OpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
3786	Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3787	: IRB(IRB), GEPIndices (`1`, IRB.getInt32(C: `0`)), Ptr(Ptr), BaseTy(BaseTy),
3788	BaseAlign (BaseAlign), DL(DL) {
3789	IRB.SetInsertPoint(InsertionPoint);
3790	}
3791
3792	public:
3793	/// Generic recursive split emission routine.
3794	///
3795	/// This method recursively splits an aggregate op (load or store) into
3796	/// scalar or vector ops. It splits recursively until it hits a single value
3797	/// and emits that single value operation via the template argument.
3798	///
3799	/// The logic of this routine relies on GEPs and insertvalue and
3800	/// extractvalue all operating with the same fundamental index list, merely
3801	/// formatted differently (GEPs need actual values).
3802	///
3803	/// \param Ty The type being split recursively into smaller ops.
3804	/// \param Agg The aggregate value being built up or stored, depending on
3805	/// whether this is splitting a load or a store respectively.
3806	void emitSplitOps(Type Ty, Value &Agg, const Twine &Name) {
3807	if (Ty->isSingleValueType()) {
3808	unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
3809	return static_cast<Derived >(this*)->emitFunc(
3810	Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
3811	}
3812
3813	if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
3814	unsigned OldSize = Indices.size();
3815	(void)OldSize;
3816	for (unsigned Idx = `0`, Size = ATy->getNumElements(); Idx != Size;
3817	++Idx) {
3818	assert(Indices.size() == OldSize && "Did not return to the old size");
3819	Indices.push_back(Elt: Idx);
3820	GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3821	emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine (Idx));
3822	GEPIndices.pop_back();
3823	Indices.pop_back();
3824	}
3825	return;
3826	}
3827
3828	if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
3829	unsigned OldSize = Indices.size();
3830	(void)OldSize;
3831	for (unsigned Idx = `0`, Size = STy->getNumElements(); Idx != Size;
3832	++Idx) {
3833	assert(Indices.size() == OldSize && "Did not return to the old size");
3834	Indices.push_back(Elt: Idx);
3835	GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3836	emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine (Idx));
3837	GEPIndices.pop_back();
3838	Indices.pop_back();
3839	}
3840	return;
3841	}
3842
3843	llvm_unreachable("Only arrays and structs are aggregate loadable types");
3844	}
3845	};
3846
3847	struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3848	AAMDNodes AATags;
3849
3850	LoadOpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
3851	AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3852	IRBuilderTy &IRB)
3853	: OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3854	IRB),
3855	AATags (AATags) {}
3856
3857	/// Emit a leaf load of a single value. This is called at the leaves of the
3858	/// recursive emission to actually load values.
3859	void emitFunc(Type Ty, Value &Agg, Align Alignment, const Twine &Name) {
3860	assert(Ty->isSingleValueType());
3861	// Load the single value and insert it using the indices.
3862	Value *GEP =
3863	IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
3864	LoadInst *Load =
3865	IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
3866
3867	APInt Offset(
3868	DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), `0`);
3869	if (AATags &&
3870	GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
3871	Load->setAAMetadata(
3872	AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
3873
3874	Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
3875	LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3876	}
3877	};
3878
3879	bool visitLoadInst(LoadInst &LI) {
3880	assert(LI.getPointerOperand() == *U);
3881	if (!LI.isSimple() \|\| LI.getType()->isSingleValueType())
3882	return false;
3883
3884	// We have an aggregate being loaded, split it apart.
3885	LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3886	LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3887	getAdjustedAlignment(I: &LI, Offset: `0`), DL, IRB);
3888	Value *V = PoisonValue::get(T: LI.getType());
3889	Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
3890	Visited.erase(Ptr: &LI);
3891	LI.replaceAllUsesWith(V);
3892	LI.eraseFromParent();
3893	return true;
3894	}
3895
3896	struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3897	StoreOpSplitter(Instruction InsertionPoint, Value Ptr, Type *BaseTy,
3898	AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
3899	const DataLayout &DL, IRBuilderTy &IRB)
3900	: OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3901	DL, IRB),
3902	AATags (AATags), AggStore(AggStore) {}
3903	AAMDNodes AATags;
3904	StoreInst *AggStore;
3905	/// Emit a leaf store of a single value. This is called at the leaves of the
3906	/// recursive emission to actually produce stores.
3907	void emitFunc(Type Ty, Value &Agg, Align Alignment, const Twine &Name) {
3908	assert(Ty->isSingleValueType());
3909	// Extract the single value and store it using the indices.
3910	//
3911	// The gep and extractvalue values are factored out of the CreateStore
3912	// call to make the output independent of the argument evaluation order.
3913	Value *ExtractValue =
3914	IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
3915	Value *InBoundsGEP =
3916	IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
3917	StoreInst *Store =
3918	IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
3919
3920	APInt Offset(
3921	DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), `0`);
3922	GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
3923	if (AATags) {
3924	Store->setAAMetadata(AATags.adjustForAccess(
3925	Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
3926	}
3927
3928	// migrateDebugInfo requires the base Alloca. Walk to it from this gep.
3929	// If we cannot (because there's an intervening non-const or unbounded
3930	// gep) then we wouldn't expect to see dbg.assign intrinsics linked to
3931	// this instruction.
3932	Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
3933	if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
3934	uint64_t SizeInBits =
3935	DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
3936	migrateDebugInfo(OldAlloca: OldAI, /IsSplit/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * `8`,
3937	SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
3938	Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
3939	DL);
3940	} else {
3941	assert(at::getAssignmentMarkers(Store).empty() &&
3942	at::getDVRAssignmentMarkers(Store).empty() &&
3943	"AT: unexpected debug.assign linked to store through "
3944	"unbounded GEP");
3945	}
3946	LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3947	}
3948	};
3949
3950	bool visitStoreInst(StoreInst &SI) {
3951	if (!SI.isSimple() \|\| SI.getPointerOperand() != *U)
3952	return false;
3953	Value *V = SI.getValueOperand();
3954	if (V->getType()->isSingleValueType())
3955	return false;
3956
3957	// We have an aggregate being stored, split it apart.
3958	LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3959	StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
3960	getAdjustedAlignment(I: &SI, Offset: `0`), DL, IRB);
3961	Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
3962	Visited.erase(Ptr: &SI);
3963	// The stores replacing SI each have markers describing fragments of the
3964	// assignment so delete the assignment markers linked to SI.
3965	at::deleteAssignmentMarkers(Inst: &SI);
3966	SI.eraseFromParent();
3967	return true;
3968	}
3969
3970	bool visitBitCastInst(BitCastInst &BC) {
3971	enqueueUsers(I&: BC);
3972	return false;
3973	}
3974
3975	bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
3976	enqueueUsers(I&: ASC);
3977	return false;
3978	}
3979
3980	// Unfold gep (select cond, ptr1, ptr2), idx
3981	// => select cond, gep(ptr1, idx), gep(ptr2, idx)
3982	// and gep ptr, (select cond, idx1, idx2)
3983	// => select cond, gep(ptr, idx1), gep(ptr, idx2)
3984	bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
3985	// Check whether the GEP has exactly one select operand and all indices
3986	// will become constant after the transform.
3987	SelectInst *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
3988	for (Value *Op : GEPI.indices()) {
3989	if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
3990	if (Sel)
3991	return false;
3992
3993	Sel = SI;
3994	if (!isa<ConstantInt>(Val: Sel->getTrueValue()) \|\|
3995	!isa<ConstantInt>(Val: Sel->getFalseValue()))
3996	return false;
3997	continue;
3998	}
3999
4000	if (!isa<ConstantInt>(Val: Op))
4001	return false;
4002	}
4003
4004	if (!Sel)
4005	return false;
4006
4007	LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4008	dbgs() << " original: " << *Sel << "\n";
4009	dbgs() << " " << GEPI << "\n";);
4010
4011	auto GetNewOps = [&](Value *SelOp) {
4012	SmallVector<Value *> NewOps;
4013	for (Value *Op : GEPI.operands())
4014	if (Op == Sel)
4015	NewOps.push_back(Elt: SelOp);
4016	else
4017	NewOps.push_back(Elt: Op);
4018	return NewOps;
4019	};
4020
4021	Value *True = Sel->getTrueValue();
4022	Value *False = Sel->getFalseValue();
4023	SmallVector<Value *> TrueOps = GetNewOps(True);
4024	SmallVector<Value *> FalseOps = GetNewOps(False);
4025
4026	IRB.SetInsertPoint(&GEPI);
4027	bool IsInBounds = GEPI.isInBounds();
4028
4029	Type *Ty = GEPI.getSourceElementType();
4030	Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps [`0`], IdxList: ArrayRef(TrueOps).drop_front(),
4031	Name: True->getName() + ".sroa.gep", IsInBounds);
4032
4033	Value *NFalse =
4034	IRB.CreateGEP(Ty, Ptr: FalseOps [`0`], IdxList: ArrayRef(FalseOps).drop_front(),
4035	Name: False->getName() + ".sroa.gep", IsInBounds);
4036
4037	Value *NSel = IRB.CreateSelect(C: Sel->getCondition(), True: NTrue, False: NFalse,
4038	Name: Sel->getName() + ".sroa.sel");
4039	Visited.erase(Ptr: &GEPI);
4040	GEPI.replaceAllUsesWith(V: NSel);
4041	GEPI.eraseFromParent();
4042	Instruction *NSelI = cast<Instruction>(Val: NSel);
4043	Visited.insert(Ptr: NSelI);
4044	enqueueUsers(I&: *NSelI);
4045
4046	LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4047	dbgs() << " " << *NFalse << "\n";
4048	dbgs() << " " << *NSel << "\n";);
4049
4050	return true;
4051	}
4052
4053	// Unfold gep (phi ptr1, ptr2), idx
4054	// => phi ((gep ptr1, idx), (gep ptr2, idx))
4055	// and gep ptr, (phi idx1, idx2)
4056	// => phi ((gep ptr, idx1), (gep ptr, idx2))
4057	bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4058	// To prevent infinitely expanding recursive phis, bail if the GEP pointer
4059	// operand (looking through the phi if it is the phi we want to unfold) is
4060	// an instruction besides a static alloca.
4061	PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
4062	auto IsInvalidPointerOperand = [](Value *V) {
4063	if (!isa<Instruction>(Val: V))
4064	return false;
4065	if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4066	return !AI->isStaticAlloca();
4067	return true;
4068	};
4069	if (Phi) {
4070	if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4071	return false;
4072	} else {
4073	if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4074	return false;
4075	}
4076	// Check whether the GEP has exactly one phi operand (including the pointer
4077	// operand) and all indices will become constant after the transform.
4078	for (Value *Op : GEPI.indices()) {
4079	if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4080	if (Phi)
4081	return false;
4082
4083	Phi = SI;
4084	if (!all_of(Range: Phi->incoming_values(),
4085	P: [](Value V) { return* isa<ConstantInt>(Val: V); }))
4086	return false;
4087	continue;
4088	}
4089
4090	if (!isa<ConstantInt>(Val: Op))
4091	return false;
4092	}
4093
4094	if (!Phi)
4095	return false;
4096
4097	LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4098	dbgs() << " original: " << *Phi << "\n";
4099	dbgs() << " " << GEPI << "\n";);
4100
4101	auto GetNewOps = [&](Value *PhiOp) {
4102	SmallVector<Value *> NewOps;
4103	for (Value *Op : GEPI.operands())
4104	if (Op == Phi)
4105	NewOps.push_back(Elt: PhiOp);
4106	else
4107	NewOps.push_back(Elt: Op);
4108	return NewOps;
4109	};
4110
4111	IRB.SetInsertPoint(Phi);
4112	PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4113	Name: Phi->getName() + ".sroa.phi");
4114
4115	bool IsInBounds = GEPI.isInBounds();
4116	Type *SourceTy = GEPI.getSourceElementType();
4117	// We only handle arguments, constants, and static allocas here, so we can
4118	// insert GEPs at the end of the entry block.
4119	IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4120	for (unsigned I = `0`, E = Phi->getNumIncomingValues(); I != E; ++I) {
4121	Value *Op = Phi->getIncomingValue(i: I);
4122	BasicBlock *BB = Phi->getIncomingBlock(i: I);
4123	Value *NewGEP;
4124	if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= `0`) {
4125	NewGEP = NewPhi->getIncomingValue(i: NI);
4126	} else {
4127	SmallVector<Value *> NewOps = GetNewOps(Op);
4128	NewGEP =
4129	IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps [`0`], IdxList: ArrayRef(NewOps).drop_front(),
4130	Name: Phi->getName() + ".sroa.gep", IsInBounds);
4131	}
4132	NewPhi->addIncoming(V: NewGEP, BB);
4133	}
4134
4135	Visited.erase(Ptr: &GEPI);
4136	GEPI.replaceAllUsesWith(V: NewPhi);
4137	GEPI.eraseFromParent();
4138	Visited.insert(Ptr: NewPhi);
4139	enqueueUsers(I&: *NewPhi);
4140
4141	LLVM_DEBUG(dbgs() << " to: ";
4142	for (Value *In
4143	: NewPhi->incoming_values()) dbgs()
4144	<< "\n " << *In;
4145	dbgs() << "\n " << *NewPhi << `'\n'`);
4146
4147	return true;
4148	}
4149
4150	bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4151	if (unfoldGEPSelect(GEPI))
4152	return true;
4153
4154	if (unfoldGEPPhi(GEPI))
4155	return true;
4156
4157	enqueueUsers(I&: GEPI);
4158	return false;
4159	}
4160
4161	bool visitPHINode(PHINode &PN) {
4162	enqueueUsers(I&: PN);
4163	return false;
4164	}
4165
4166	bool visitSelectInst(SelectInst &SI) {
4167	enqueueUsers(I&: SI);
4168	return false;
4169	}
4170	};
4171
4172	} // end anonymous namespace
4173
4174	/// Strip aggregate type wrapping.
4175	///
4176	/// This removes no-op aggregate types wrapping an underlying type. It will
4177	/// strip as many layers of types as it can without changing either the type
4178	/// size or the allocated size.
4179	static Type stripAggregateTypeWrapping(const* DataLayout &DL, Type *Ty) {
4180	if (Ty->isSingleValueType())
4181	return Ty;
4182
4183	uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4184	uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4185
4186	Type *InnerTy;
4187	if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4188	InnerTy = ArrTy->getElementType();
4189	} else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4190	const StructLayout *SL = DL.getStructLayout(Ty: STy);
4191	unsigned Index = SL->getElementContainingOffset(FixedOffset: `0`);
4192	InnerTy = STy->getElementType(N: Index);
4193	} else {
4194	return Ty;
4195	}
4196
4197	if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() \|\|
4198	TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4199	return Ty;
4200
4201	return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4202	}
4203
4204	/// Try to find a partition of the aggregate type passed in for a given
4205	/// offset and size.
4206	///
4207	/// This recurses through the aggregate type and tries to compute a subtype
4208	/// based on the offset and size. When the offset and size span a sub-section
4209	/// of an array, it will even compute a new array type for that sub-section,
4210	/// and the same for structs.
4211	///
4212	/// Note that this routine is very strict and tries to find a partition of the
4213	/// type which produces the exact* right offset and size. It is not forgiving*
4214	/// when the size or offset cause either end of type-based partition to be off.
4215	/// Also, this is a best-effort routine. It is reasonable to give up and not
4216	/// return a type if necessary.
4217	static Type getTypePartition(const* DataLayout &DL, Type *Ty, uint64_t Offset,
4218	uint64_t Size) {
4219	if (Offset == `0` && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4220	return stripAggregateTypeWrapping(DL, Ty);
4221	if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() \|\|
4222	(DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4223	return nullptr;
4224
4225	if (isa<ArrayType>(Val: Ty) \|\| isa<VectorType>(Val: Ty)) {
4226	Type *ElementTy;
4227	uint64_t TyNumElements;
4228	if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4229	ElementTy = AT->getElementType();
4230	TyNumElements = AT->getNumElements();
4231	} else {
4232	// FIXME: This isn't right for vectors with non-byte-sized or
4233	// non-power-of-two sized elements.
4234	auto *VT = cast<FixedVectorType>(Val: Ty);
4235	ElementTy = VT->getElementType();
4236	TyNumElements = VT->getNumElements();
4237	}
4238	uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4239	uint64_t NumSkippedElements = Offset / ElementSize;
4240	if (NumSkippedElements >= TyNumElements)
4241	return nullptr;
4242	Offset -= NumSkippedElements * ElementSize;
4243
4244	// First check if we need to recurse.
4245	if (Offset > `0` \|\| Size < ElementSize) {
4246	// Bail if the partition ends in a different array element.
4247	if ((Offset + Size) > ElementSize)
4248	return nullptr;
4249	// Recurse through the element type trying to peel off offset bytes.
4250	return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4251	}
4252	assert(Offset == `0`);
4253
4254	if (Size == ElementSize)
4255	return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4256	assert(Size > ElementSize);
4257	uint64_t NumElements = Size / ElementSize;
4258	if (NumElements * ElementSize != Size)
4259	return nullptr;
4260	return ArrayType::get(ElementType: ElementTy, NumElements);
4261	}
4262
4263	StructType *STy = dyn_cast<StructType>(Val: Ty);
4264	if (!STy)
4265	return nullptr;
4266
4267	const StructLayout *SL = DL.getStructLayout(Ty: STy);
4268
4269	if (SL->getSizeInBits().isScalable())
4270	return nullptr;
4271
4272	if (Offset >= SL->getSizeInBytes())
4273	return nullptr;
4274	uint64_t EndOffset = Offset + Size;
4275	if (EndOffset > SL->getSizeInBytes())
4276	return nullptr;
4277
4278	unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4279	Offset -= SL->getElementOffset(Idx: Index);
4280
4281	Type *ElementTy = STy->getElementType(N: Index);
4282	uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4283	if (Offset >= ElementSize)
4284	return nullptr; // The offset points into alignment padding.
4285
4286	// See if any partition must be contained by the element.
4287	if (Offset > `0` \|\| Size < ElementSize) {
4288	if ((Offset + Size) > ElementSize)
4289	return nullptr;
4290	return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4291	}
4292	assert(Offset == `0`);
4293
4294	if (Size == ElementSize)
4295	return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4296
4297	StructType::element_iterator EI = STy->element_begin() + Index,
4298	EE = STy->element_end();
4299	if (EndOffset < SL->getSizeInBytes()) {
4300	unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4301	if (Index == EndIndex)
4302	return nullptr; // Within a single element and its padding.
4303
4304	// Don't try to form "natural" types if the elements don't line up with the
4305	// expected size.
4306	// FIXME: We could potentially recurse down through the last element in the
4307	// sub-struct to find a natural end point.
4308	if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4309	return nullptr;
4310
4311	assert(Index < EndIndex);
4312	EE = STy->element_begin() + EndIndex;
4313	}
4314
4315	// Try to build up a sub-structure.
4316	StructType *SubTy =
4317	StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4318	const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4319	if (Size != SubSL->getSizeInBytes())
4320	return nullptr; // The sub-struct doesn't have quite the size needed.
4321
4322	return SubTy;
4323	}
4324
4325	/// Pre-split loads and stores to simplify rewriting.
4326	///
4327	/// We want to break up the splittable load+store pairs as much as
4328	/// possible. This is important to do as a preprocessing step, as once we
4329	/// start rewriting the accesses to partitions of the alloca we lose the
4330	/// necessary information to correctly split apart paired loads and stores
4331	/// which both point into this alloca. The case to consider is something like
4332	/// the following:
4333	///
4334	/// %a = alloca [12 x i8]
4335	/// %gep1 = getelementptr i8, ptr %a, i32 0
4336	/// %gep2 = getelementptr i8, ptr %a, i32 4
4337	/// %gep3 = getelementptr i8, ptr %a, i32 8
4338	/// store float 0.0, ptr %gep1
4339	/// store float 1.0, ptr %gep2
4340	/// %v = load i64, ptr %gep1
4341	/// store i64 %v, ptr %gep2
4342	/// %f1 = load float, ptr %gep2
4343	/// %f2 = load float, ptr %gep3
4344	///
4345	/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4346	/// promote everything so we recover the 2 SSA values that should have been
4347	/// there all along.
4348	///
4349	/// \returns true if any changes are made.
4350	bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4351	LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4352
4353	// Track the loads and stores which are candidates for pre-splitting here, in
4354	// the order they first appear during the partition scan. These give stable
4355	// iteration order and a basis for tracking which loads and stores we
4356	// actually split.
4357	SmallVector<LoadInst *, `4`> Loads;
4358	SmallVector<StoreInst *, `4`> Stores;
4359
4360	// We need to accumulate the splits required of each load or store where we
4361	// can find them via a direct lookup. This is important to cross-check loads
4362	// and stores against each other. We also track the slice so that we can kill
4363	// all the slices that end up split.
4364	struct SplitOffsets {
4365	Slice *S;
4366	std::vector<uint64_t> Splits;
4367	};
4368	SmallDenseMap<Instruction *, SplitOffsets, `8`> SplitOffsetsMap;
4369
4370	// Track loads out of this alloca which cannot, for any reason, be pre-split.
4371	// This is important as we also cannot pre-split stores of those loads!
4372	// FIXME: This is all pretty gross. It means that we can be more aggressive
4373	// in pre-splitting when the load feeding the store happens to come from
4374	// a separate alloca. Put another way, the effectiveness of SROA would be
4375	// decreased by a frontend which just concatenated all of its local allocas
4376	// into one big flat alloca. But defeating such patterns is exactly the job
4377	// SROA is tasked with! Sadly, to not have this discrepancy we would have
4378	// change store pre-splitting to actually force pre-splitting of the load
4379	// that feeds it and all stores. That makes pre-splitting much harder, but
4380	// maybe it would make it more principled?
4381	SmallPtrSet<LoadInst *, `8`> UnsplittableLoads;
4382
4383	LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4384	for (auto &P : AS.partitions()) {
4385	for (Slice &S : P) {
4386	Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4387	if (!S.isSplittable() \|\| S.endOffset() <= P.endOffset()) {
4388	// If this is a load we have to track that it can't participate in any
4389	// pre-splitting. If this is a store of a load we have to track that
4390	// that load also can't participate in any pre-splitting.
4391	if (auto *LI = dyn_cast<LoadInst>(Val: I))
4392	UnsplittableLoads.insert(Ptr: LI);
4393	else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4394	if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4395	UnsplittableLoads.insert(Ptr: LI);
4396	continue;
4397	}
4398	assert(P.endOffset() > S.beginOffset() &&
4399	"Empty or backwards partition!");
4400
4401	// Determine if this is a pre-splittable slice.
4402	if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4403	assert(!LI->isVolatile() && "Cannot split volatile loads!");
4404
4405	// The load must be used exclusively to store into other pointers for
4406	// us to be able to arbitrarily pre-split it. The stores must also be
4407	// simple to avoid changing semantics.
4408	auto IsLoadSimplyStored = [](LoadInst *LI) {
4409	for (User *LU : LI->users()) {
4410	auto *SI = dyn_cast<StoreInst>(Val: LU);
4411	if (!SI \|\| !SI->isSimple())
4412	return false;
4413	}
4414	return true;
4415	};
4416	if (!IsLoadSimplyStored (LI)) {
4417	UnsplittableLoads.insert(Ptr: LI);
4418	continue;
4419	}
4420
4421	Loads.push_back(Elt: LI);
4422	} else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4423	if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4424	// Skip stores of* pointers. FIXME: This shouldn't even be possible!*
4425	continue;
4426	auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4427	if (!StoredLoad \|\| !StoredLoad->isSimple())
4428	continue;
4429	assert(!SI->isVolatile() && "Cannot split volatile stores!");
4430
4431	Stores.push_back(Elt: SI);
4432	} else {
4433	// Other uses cannot be pre-split.
4434	continue;
4435	}
4436
4437	// Record the initial split.
4438	LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4439	auto &Offsets = SplitOffsetsMap [I];
4440	assert(Offsets.Splits.empty() &&
4441	"Should not have splits the first time we see an instruction!");
4442	Offsets.S = &S;
4443	Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4444	}
4445
4446	// Now scan the already split slices, and add a split for any of them which
4447	// we're going to pre-split.
4448	for (Slice *S : P.splitSliceTails()) {
4449	auto SplitOffsetsMapI =
4450	SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4451	if (SplitOffsetsMapI == SplitOffsetsMap.end())
4452	continue;
4453	auto &Offsets = SplitOffsetsMapI ->second;
4454
4455	assert(Offsets.S == S && "Found a mismatched slice!");
4456	assert(!Offsets.Splits.empty() &&
4457	"Cannot have an empty set of splits on the second partition!");
4458	assert(Offsets.Splits.back() ==
4459	P.beginOffset() - Offsets.S->beginOffset() &&
4460	"Previous split does not end where this one begins!");
4461
4462	// Record each split. The last partition's end isn't needed as the size
4463	// of the slice dictates that.
4464	if (S->endOffset() > P.endOffset())
4465	Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4466	}
4467	}
4468
4469	// We may have split loads where some of their stores are split stores. For
4470	// such loads and stores, we can only pre-split them if their splits exactly
4471	// match relative to their starting offset. We have to verify this prior to
4472	// any rewriting.
4473	llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4474	// Lookup the load we are storing in our map of split
4475	// offsets.
4476	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4477	// If it was completely unsplittable, then we're done,
4478	// and this store can't be pre-split.
4479	if (UnsplittableLoads.count(Ptr: LI))
4480	return true;
4481
4482	auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4483	if (LoadOffsetsI == SplitOffsetsMap.end())
4484	return false; // Unrelated loads are definitely safe.
4485	auto &LoadOffsets = LoadOffsetsI ->second;
4486
4487	// Now lookup the store's offsets.
4488	auto &StoreOffsets = SplitOffsetsMap [SI];
4489
4490	// If the relative offsets of each split in the load and
4491	// store match exactly, then we can split them and we
4492	// don't need to remove them here.
4493	if (LoadOffsets.Splits == StoreOffsets.Splits)
4494	return false;
4495
4496	LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4497	<< " " << *LI << "\n"
4498	<< " " << *SI << "\n");
4499
4500	// We've found a store and load that we need to split
4501	// with mismatched relative splits. Just give up on them
4502	// and remove both instructions from our list of
4503	// candidates.
4504	UnsplittableLoads.insert(Ptr: LI);
4505	return true;
4506	});
4507	// Now we have to go back* through all the stores, because a later store may*
4508	// have caused an earlier store's load to become unsplittable and if it is
4509	// unsplittable for the later store, then we can't rely on it being split in
4510	// the earlier store either.
4511	llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4512	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4513	return UnsplittableLoads.count(Ptr: LI);
4514	});
4515	// Once we've established all the loads that can't be split for some reason,
4516	// filter any that made it into our list out.
4517	llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
4518	return UnsplittableLoads.count(Ptr: LI);
4519	});
4520
4521	// If no loads or stores are left, there is no pre-splitting to be done for
4522	// this alloca.
4523	if (Loads.empty() && Stores.empty())
4524	return false;
4525
4526	// From here on, we can't fail and will be building new accesses, so rig up
4527	// an IR builder.
4528	IRBuilderTy IRB(&AI);
4529
4530	// Collect the new slices which we will merge into the alloca slices.
4531	SmallVector<Slice, `4`> NewSlices;
4532
4533	// Track any allocas we end up splitting loads and stores for so we iterate
4534	// on them.
4535	SmallPtrSet<AllocaInst *, `4`> ResplitPromotableAllocas;
4536
4537	// At this point, we have collected all of the loads and stores we can
4538	// pre-split, and the specific splits needed for them. We actually do the
4539	// splitting in a specific order in order to handle when one of the loads in
4540	// the value operand to one of the stores.
4541	//
4542	// First, we rewrite all of the split loads, and just accumulate each split
4543	// load in a parallel structure. We also build the slices for them and append
4544	// them to the alloca slices.
4545	SmallDenseMap<LoadInst , std::vector<LoadInst >, `1`> SplitLoadsMap;
4546	std::vector<LoadInst *> SplitLoads;
4547	const DataLayout &DL = AI.getModule()->getDataLayout();
4548	for (LoadInst *LI : Loads) {
4549	SplitLoads.clear();
4550
4551	auto &Offsets = SplitOffsetsMap [LI];
4552	unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4553	assert(LI->getType()->getIntegerBitWidth() % `8` == `0` &&
4554	"Load must have type size equal to store size");
4555	assert(LI->getType()->getIntegerBitWidth() / `8` >= SliceSize &&
4556	"Load must be >= slice size");
4557
4558	uint64_t BaseOffset = Offsets.S->beginOffset();
4559	assert(BaseOffset + SliceSize > BaseOffset &&
4560	"Cannot represent alloca access size using 64-bit integers!");
4561
4562	Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
4563	IRB.SetInsertPoint(LI);
4564
4565	LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4566
4567	uint64_t PartOffset = `0`, PartSize = Offsets.Splits.front();
4568	int Idx = `0`, Size = Offsets.Splits.size();
4569	for (;;) {
4570	auto PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize `8`);
4571	auto AS = LI->getPointerAddressSpace();
4572	auto *PartPtrTy = LI->getPointerOperandType();
4573	LoadInst *PLoad = IRB.CreateAlignedLoad(
4574	Ty: PartTy,
4575	Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
4576	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4577	PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
4578	Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4579	/IsVolatile/ isVolatile: false, Name: LI->getName());
4580	PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4581	LLVMContext::MD_access_group});
4582
4583	// Append this load onto the list of split loads so we can find it later
4584	// to rewrite the stores.
4585	SplitLoads.push_back(x: PLoad);
4586
4587	// Now build a new slice for the alloca.
4588	NewSlices.push_back(
4589	Elt: Slice (BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4590	&PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
4591	/IsSplittable/ false));
4592	LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4593	<< ", " << NewSlices.back().endOffset()
4594	<< "): " << *PLoad << "\n");
4595
4596	// See if we've handled all the splits.
4597	if (Idx >= Size)
4598	break;
4599
4600	// Setup the next partition.
4601	PartOffset = Offsets.Splits [Idx];
4602	++Idx;
4603	PartSize = (Idx < Size ? Offsets.Splits [Idx] : SliceSize) - PartOffset;
4604	}
4605
4606	// Now that we have the split loads, do the slow walk over all uses of the
4607	// load and rewrite them as split stores, or save the split loads to use
4608	// below if the store is going to be split there anyways.
4609	bool DeferredStores = false;
4610	for (User *LU : LI->users()) {
4611	StoreInst *SI = cast<StoreInst>(Val: LU);
4612	if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
4613	DeferredStores = true;
4614	LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4615	<< "\n");
4616	continue;
4617	}
4618
4619	Value *StoreBasePtr = SI->getPointerOperand();
4620	IRB.SetInsertPoint(SI);
4621	AAMDNodes AATags = SI->getAAMetadata();
4622
4623	LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4624
4625	for (int Idx = `0`, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4626	LoadInst *PLoad = SplitLoads [Idx];
4627	uint64_t PartOffset = Idx == `0` ? `0` : Offsets.Splits [Idx - `1`];
4628	auto *PartPtrTy = SI->getPointerOperandType();
4629
4630	auto AS = SI->getPointerAddressSpace();
4631	StoreInst *PStore = IRB.CreateAlignedStore(
4632	Val: PLoad,
4633	Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4634	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4635	PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4636	Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4637	/IsVolatile/ isVolatile: false);
4638	PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4639	LLVMContext::MD_access_group,
4640	LLVMContext::MD_DIAssignID});
4641
4642	if (AATags)
4643	PStore->setAAMetadata(
4644	AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
4645	LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4646	}
4647
4648	// We want to immediately iterate on any allocas impacted by splitting
4649	// this store, and we have to track any promotable alloca (indicated by
4650	// a direct store) as needing to be resplit because it is no longer
4651	// promotable.
4652	if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
4653	ResplitPromotableAllocas.insert(Ptr: OtherAI);
4654	Worklist.insert(X: OtherAI);
4655	} else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4656	Val: StoreBasePtr->stripInBoundsOffsets())) {
4657	Worklist.insert(X: OtherAI);
4658	}
4659
4660	// Mark the original store as dead.
4661	DeadInsts.push_back(Elt: SI);
4662	}
4663
4664	// Save the split loads if there are deferred stores among the users.
4665	if (DeferredStores)
4666	SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
4667
4668	// Mark the original load as dead and kill the original slice.
4669	DeadInsts.push_back(Elt: LI);
4670	Offsets.S->kill();
4671	}
4672
4673	// Second, we rewrite all of the split stores. At this point, we know that
4674	// all loads from this alloca have been split already. For stores of such
4675	// loads, we can simply look up the pre-existing split loads. For stores of
4676	// other loads, we split those loads first and then write split stores of
4677	// them.
4678	for (StoreInst *SI : Stores) {
4679	auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4680	IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
4681	assert(Ty->getBitWidth() % `8` == `0`);
4682	uint64_t StoreSize = Ty->getBitWidth() / `8`;
4683	assert(StoreSize > `0` && "Cannot have a zero-sized integer store!");
4684
4685	auto &Offsets = SplitOffsetsMap [SI];
4686	assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4687	"Slice size should always match load size exactly!");
4688	uint64_t BaseOffset = Offsets.S->beginOffset();
4689	assert(BaseOffset + StoreSize > BaseOffset &&
4690	"Cannot represent alloca access size using 64-bit integers!");
4691
4692	Value *LoadBasePtr = LI->getPointerOperand();
4693	Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
4694
4695	LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4696
4697	// Check whether we have an already split load.
4698	auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
4699	std::vector<LoadInst > SplitLoads = nullptr;
4700	if (SplitLoadsMapI != SplitLoadsMap.end()) {
4701	SplitLoads = &SplitLoadsMapI ->second;
4702	assert(SplitLoads->size() == Offsets.Splits.size() + `1` &&
4703	"Too few split loads for the number of splits in the store!");
4704	} else {
4705	LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4706	}
4707
4708	uint64_t PartOffset = `0`, PartSize = Offsets.Splits.front();
4709	int Idx = `0`, Size = Offsets.Splits.size();
4710	for (;;) {
4711	auto PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize `8`);
4712	auto *LoadPartPtrTy = LI->getPointerOperandType();
4713	auto *StorePartPtrTy = SI->getPointerOperandType();
4714
4715	// Either lookup a split load or create one.
4716	LoadInst *PLoad;
4717	if (SplitLoads) {
4718	PLoad = (*SplitLoads)[Idx];
4719	} else {
4720	IRB.SetInsertPoint(LI);
4721	auto AS = LI->getPointerAddressSpace();
4722	PLoad = IRB.CreateAlignedLoad(
4723	Ty: PartTy,
4724	Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
4725	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4726	PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
4727	Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4728	/IsVolatile/ isVolatile: false, Name: LI->getName());
4729	PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4730	LLVMContext::MD_access_group});
4731	}
4732
4733	// And store this partition.
4734	IRB.SetInsertPoint(SI);
4735	auto AS = SI->getPointerAddressSpace();
4736	StoreInst *PStore = IRB.CreateAlignedStore(
4737	Val: PLoad,
4738	Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4739	Offset: APInt (DL.getIndexSizeInBits(AS), PartOffset),
4740	PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4741	Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4742	/IsVolatile/ isVolatile: false);
4743	PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4744	LLVMContext::MD_access_group});
4745
4746	// Now build a new slice for the alloca.
4747	NewSlices.push_back(
4748	Elt: Slice (BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4749	&PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
4750	/IsSplittable/ false));
4751	LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4752	<< ", " << NewSlices.back().endOffset()
4753	<< "): " << *PStore << "\n");
4754	if (!SplitLoads) {
4755	LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4756	}
4757
4758	// See if we've finished all the splits.
4759	if (Idx >= Size)
4760	break;
4761
4762	// Setup the next partition.
4763	PartOffset = Offsets.Splits [Idx];
4764	++Idx;
4765	PartSize = (Idx < Size ? Offsets.Splits [Idx] : StoreSize) - PartOffset;
4766	}
4767
4768	// We want to immediately iterate on any allocas impacted by splitting
4769	// this load, which is only relevant if it isn't a load of this alloca and
4770	// thus we didn't already split the loads above. We also have to keep track
4771	// of any promotable allocas we split loads on as they can no longer be
4772	// promoted.
4773	if (!SplitLoads) {
4774	if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
4775	assert(OtherAI != &AI && "We can't re-split our own alloca!");
4776	ResplitPromotableAllocas.insert(Ptr: OtherAI);
4777	Worklist.insert(X: OtherAI);
4778	} else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4779	Val: LoadBasePtr->stripInBoundsOffsets())) {
4780	assert(OtherAI != &AI && "We can't re-split our own alloca!");
4781	Worklist.insert(X: OtherAI);
4782	}
4783	}
4784
4785	// Mark the original store as dead now that we've split it up and kill its
4786	// slice. Note that we leave the original load in place unless this store
4787	// was its only use. It may in turn be split up if it is an alloca load
4788	// for some other alloca, but it may be a normal load. This may introduce
4789	// redundant loads, but where those can be merged the rest of the optimizer
4790	// should handle the merging, and this uncovers SSA splits which is more
4791	// important. In practice, the original loads will almost always be fully
4792	// split and removed eventually, and the splits will be merged by any
4793	// trivial CSE, including instcombine.
4794	if (LI->hasOneUse()) {
4795	assert(*LI->user_begin() == SI && "Single use isn't this store!");
4796	DeadInsts.push_back(Elt: LI);
4797	}
4798	DeadInsts.push_back(Elt: SI);
4799	Offsets.S->kill();
4800	}
4801
4802	// Remove the killed slices that have ben pre-split.
4803	llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
4804
4805	// Insert our new slices. This will sort and merge them into the sorted
4806	// sequence.
4807	AS.insert(NewSlices);
4808
4809	LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4810	#ifndef NDEBUG
4811	for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4812	LLVM_DEBUG(AS.print(dbgs(), I, " "));
4813	#endif
4814
4815	// Finally, don't try to promote any allocas that new require re-splitting.
4816	// They have already been added to the worklist above.
4817	llvm::erase_if(C&: PromotableAllocas, P: [&](AllocaInst *AI) {
4818	return ResplitPromotableAllocas.count(Ptr: AI);
4819	});
4820
4821	return true;
4822	}
4823
4824	/// Rewrite an alloca partition's users.
4825	///
4826	/// This routine drives both of the rewriting goals of the SROA pass. It tries
4827	/// to rewrite uses of an alloca partition to be conducive for SSA value
4828	/// promotion. If the partition needs a new, more refined alloca, this will
4829	/// build that new alloca, preserving as much type information as possible, and
4830	/// rewrite the uses of the old alloca to point at the new one and have the
4831	/// appropriate new offsets. It also evaluates how successful the rewrite was
4832	/// at enabling promotion and if it was successful queues the alloca to be
4833	/// promoted.
4834	AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4835	Partition &P) {
4836	// Try to compute a friendly type for this partition of the alloca. This
4837	// won't always succeed, in which case we fall back to a legal integer type
4838	// or an i8 array of an appropriate size.
4839	Type SliceTy = nullptr*;
4840	VectorType SliceVecTy = nullptr*;
4841	const DataLayout &DL = AI.getModule()->getDataLayout();
4842	std::pair<Type , IntegerType > CommonUseTy =
4843	findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
4844	// Do all uses operate on the same type?
4845	if (CommonUseTy.first)
4846	if (DL.getTypeAllocSize(Ty: CommonUseTy.first).getFixedValue() >= P.size()) {
4847	SliceTy = CommonUseTy.first;
4848	SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4849	}
4850	// If not, can we find an appropriate subtype in the original allocated type?
4851	if (!SliceTy)
4852	if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4853	Offset: P.beginOffset(), Size: P.size()))
4854	SliceTy = TypePartitionTy;
4855
4856	// If still not, can we use the largest bitwidth integer type used?
4857	if (!SliceTy && CommonUseTy.second)
4858	if (DL.getTypeAllocSize(Ty: CommonUseTy.second).getFixedValue() >= P.size()) {
4859	SliceTy = CommonUseTy.second;
4860	SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4861	}
4862	if ((!SliceTy \|\| (SliceTy->isArrayTy() &&
4863	SliceTy->getArrayElementType()->isIntegerTy())) &&
4864	DL.isLegalInteger(Width: P.size() * `8`)) {
4865	SliceTy = Type::getIntNTy(C&: C, N: P.size() `8`);
4866	}
4867
4868	// If the common use types are not viable for promotion then attempt to find
4869	// another type that is viable.
4870	if (SliceVecTy && !checkVectorTypeForPromotion(P, VTy: SliceVecTy, DL))
4871	if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4872	Offset: P.beginOffset(), Size: P.size())) {
4873	VectorType *TypePartitionVecTy = dyn_cast<VectorType>(Val: TypePartitionTy);
4874	if (TypePartitionVecTy &&
4875	checkVectorTypeForPromotion(P, VTy: TypePartitionVecTy, DL))
4876	SliceTy = TypePartitionTy;
4877	}
4878
4879	if (!SliceTy)
4880	SliceTy = ArrayType::get(ElementType: Type::getInt8Ty(C&: *C), NumElements: P.size());
4881	assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
4882
4883	bool IsIntegerPromotable = isIntegerWideningViable(P, AllocaTy: SliceTy, DL);
4884
4885	VectorType *VecTy =
4886	IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
4887	if (VecTy)
4888	SliceTy = VecTy;
4889
4890	// Check for the case where we're going to rewrite to a new alloca of the
4891	// exact same type as the original, and with the same access offsets. In that
4892	// case, re-use the existing alloca, but still run through the rewriter to
4893	// perform phi and select speculation.
4894	// P.beginOffset() can be non-zero even with the same type in a case with
4895	// out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4896	AllocaInst *NewAI;
4897	if (SliceTy == AI.getAllocatedType() && P.beginOffset() == `0`) {
4898	NewAI = &AI;
4899	// FIXME: We should be able to bail at this point with "nothing changed".
4900	// FIXME: We might want to defer PHI speculation until after here.
4901	// FIXME: return nullptr;
4902	} else {
4903	// Make sure the alignment is compatible with P.beginOffset().
4904	const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
4905	// If we will get at least this much alignment from the type alone, leave
4906	// the alloca's alignment unconstrained.
4907	const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: SliceTy);
4908	NewAI = new AllocaInst (
4909	SliceTy, AI.getAddressSpace(), nullptr,
4910	IsUnconstrained ? DL.getPrefTypeAlign(Ty: SliceTy) : Alignment,
4911	AI.getName() + ".sroa." + Twine (P.begin() - AS.begin()),
4912	AI.getIterator());
4913	// Copy the old AI debug location over to the new one.
4914	NewAI->setDebugLoc(AI.getDebugLoc());
4915	++NumNewAllocas;
4916	}
4917
4918	LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
4919	<< "," << P.endOffset() << ") to: " << *NewAI << "\n");
4920
4921	// Track the high watermark on the worklist as it is only relevant for
4922	// promoted allocas. We will reset it to this point if the alloca is not in
4923	// fact scheduled for promotion.
4924	unsigned PPWOldSize = PostPromotionWorklist.size();
4925	unsigned NumUses = `0`;
4926	SmallSetVector<PHINode *, `8`> PHIUsers;
4927	SmallSetVector<SelectInst *, `8`> SelectUsers;
4928
4929	AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4930	P.endOffset(), IsIntegerPromotable, VecTy,
4931	PHIUsers, SelectUsers);
4932	bool Promotable = true;
4933	for (Slice *S : P.splitSliceTails()) {
4934	Promotable &= Rewriter.visit(I: S);
4935	++NumUses;
4936	}
4937	for (Slice &S : P) {
4938	Promotable &= Rewriter.visit(I: &S);
4939	++NumUses;
4940	}
4941
4942	NumAllocaPartitionUses += NumUses;
4943	MaxUsesPerAllocaPartition.updateMax(V: NumUses);
4944
4945	// Now that we've processed all the slices in the new partition, check if any
4946	// PHIs or Selects would block promotion.
4947	for (PHINode *PHI : PHIUsers)
4948	if (!isSafePHIToSpeculate(PN&: *PHI)) {
4949	Promotable = false;
4950	PHIUsers.clear();
4951	SelectUsers.clear();
4952	break;
4953	}
4954
4955	SmallVector<std::pair<SelectInst *, RewriteableMemOps>, `2`>
4956	NewSelectsToRewrite;
4957	NewSelectsToRewrite.reserve(N: SelectUsers.size());
4958	for (SelectInst *Sel : SelectUsers) {
4959	std::optional<RewriteableMemOps> Ops =
4960	isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
4961	if (!Ops) {
4962	Promotable = false;
4963	PHIUsers.clear();
4964	SelectUsers.clear();
4965	NewSelectsToRewrite.clear();
4966	break;
4967	}
4968	NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
4969	}
4970
4971	if (Promotable) {
4972	for (Use *U : AS.getDeadUsesIfPromotable()) {
4973	auto *OldInst = dyn_cast<Instruction>(Val: U->get());
4974	Value::dropDroppableUse(U&: *U);
4975	if (OldInst)
4976	if (isInstructionTriviallyDead(I: OldInst))
4977	DeadInsts.push_back(Elt: OldInst);
4978	}
4979	if (PHIUsers.empty() && SelectUsers.empty()) {
4980	// Promote the alloca.
4981	PromotableAllocas.push_back(x: NewAI);
4982	} else {
4983	// If we have either PHIs or Selects to speculate, add them to those
4984	// worklists and re-queue the new alloca so that we promote in on the
4985	// next iteration.
4986	for (PHINode *PHIUser : PHIUsers)
4987	SpeculatablePHIs.insert(X: PHIUser);
4988	SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
4989	NewSelectsToRewrite.size());
4990	for (auto &&KV : llvm::make_range(
4991	x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
4992	y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
4993	SelectsToRewrite.insert(KV: std::move(KV));
4994	Worklist.insert(X: NewAI);
4995	}
4996	} else {
4997	// Drop any post-promotion work items if promotion didn't happen.
4998	while (PostPromotionWorklist.size() > PPWOldSize)
4999	PostPromotionWorklist.pop_back();
5000
5001	// We couldn't promote and we didn't create a new partition, nothing
5002	// happened.
5003	if (NewAI == &AI)
5004	return nullptr;
5005
5006	// If we can't promote the alloca, iterate on it to check for new
5007	// refinements exposed by splitting the current alloca. Don't iterate on an
5008	// alloca which didn't actually change and didn't get promoted.
5009	Worklist.insert(X: NewAI);
5010	}
5011
5012	return NewAI;
5013	}
5014
5015	static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig,
5016	AllocaInst NewAddr, DIExpression NewFragmentExpr,
5017	Instruction *BeforeInst) {
5018	DIB.insertDeclare(Storage: NewAddr, VarInfo: Orig->getVariable(), Expr: NewFragmentExpr,
5019	DL: Orig->getDebugLoc(), InsertBefore: BeforeInst);
5020	}
5021	static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
5022	AllocaInst NewAddr, DIExpression NewFragmentExpr,
5023	Instruction *BeforeInst) {
5024	(void)BeforeInst;
5025	if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5026	NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5027	Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5028	}
5029	Instruction *NewAssign =
5030	DIB.insertDbgAssign(LinkedInstr: NewAddr, Val: Orig->getValue(), SrcVar: Orig->getVariable(),
5031	ValExpr: NewFragmentExpr, Addr: NewAddr,
5032	AddrExpr: Orig->getAddressExpression(), DL: Orig->getDebugLoc())
5033	.get<Instruction *>();
5034	LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
5035	(void)NewAssign;
5036	}
5037	static void insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig,
5038	AllocaInst NewAddr, DIExpression NewFragmentExpr,
5039	Instruction *BeforeInst) {
5040	(void)DIB;
5041	if (Orig->isDbgDeclare()) {
5042	DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5043	Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5044	BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5045	Here: BeforeInst->getIterator());
5046	return;
5047	}
5048	if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5049	NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5050	Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5051	}
5052	DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5053	LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5054	AddressExpression: Orig->getAddressExpression(), DI: Orig->getDebugLoc());
5055	LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5056	(void)NewAssign;
5057	}
5058
5059	/// Walks the slices of an alloca and form partitions based on them,
5060	/// rewriting each of their uses.
5061	bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5062	if (AS.begin() == AS.end())
5063	return false;
5064
5065	unsigned NumPartitions = `0`;
5066	bool Changed = false;
5067	const DataLayout &DL = AI.getModule()->getDataLayout();
5068
5069	// First try to pre-split loads and stores.
5070	Changed \|= presplitLoadsAndStores(AI, AS);
5071
5072	// Now that we have identified any pre-splitting opportunities,
5073	// mark loads and stores unsplittable except for the following case.
5074	// We leave a slice splittable if all other slices are disjoint or fully
5075	// included in the slice, such as whole-alloca loads and stores.
5076	// If we fail to split these during pre-splitting, we want to force them
5077	// to be rewritten into a partition.
5078	bool IsSorted = true;
5079
5080	uint64_t AllocaSize =
5081	DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue();
5082	const uint64_t MaxBitVectorSize = `1024`;
5083	if (AllocaSize <= MaxBitVectorSize) {
5084	// If a byte boundary is included in any load or store, a slice starting or
5085	// ending at the boundary is not splittable.
5086	SmallBitVector SplittableOffset(AllocaSize + `1`, true);
5087	for (Slice &S : AS)
5088	for (unsigned O = S.beginOffset() + `1`;
5089	O < S.endOffset() && O < AllocaSize; O++)
5090	SplittableOffset.reset(Idx: O);
5091
5092	for (Slice &S : AS) {
5093	if (!S.isSplittable())
5094	continue;
5095
5096	if ((S.beginOffset() > AllocaSize \|\| SplittableOffset [S.beginOffset()]) &&
5097	(S.endOffset() > AllocaSize \|\| SplittableOffset [S.endOffset()]))
5098	continue;
5099
5100	if (isa<LoadInst>(Val: S.getUse()->getUser()) \|\|
5101	isa<StoreInst>(Val: S.getUse()->getUser())) {
5102	S.makeUnsplittable();
5103	IsSorted = false;
5104	}
5105	}
5106	} else {
5107	// We only allow whole-alloca splittable loads and stores
5108	// for a large alloca to avoid creating too large BitVector.
5109	for (Slice &S : AS) {
5110	if (!S.isSplittable())
5111	continue;
5112
5113	if (S.beginOffset() == `0` && S.endOffset() >= AllocaSize)
5114	continue;
5115
5116	if (isa<LoadInst>(Val: S.getUse()->getUser()) \|\|
5117	isa<StoreInst>(Val: S.getUse()->getUser())) {
5118	S.makeUnsplittable();
5119	IsSorted = false;
5120	}
5121	}
5122	}
5123
5124	if (!IsSorted)
5125	llvm::sort(C&: AS);
5126
5127	/// Describes the allocas introduced by rewritePartition in order to migrate
5128	/// the debug info.
5129	struct Fragment {
5130	AllocaInst *Alloca;
5131	uint64_t Offset;
5132	uint64_t Size;
5133	Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5134	: Alloca(AI), Offset(O), Size(S) {}
5135	};
5136	SmallVector<Fragment, `4`> Fragments;
5137
5138	// Rewrite each partition.
5139	for (auto &P : AS.partitions()) {
5140	if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5141	Changed = true;
5142	if (NewAI != &AI) {
5143	uint64_t SizeOfByte = `8`;
5144	uint64_t AllocaSize =
5145	DL.getTypeSizeInBits(Ty: NewAI->getAllocatedType()).getFixedValue();
5146	// Don't include any padding.
5147	uint64_t Size = std::min(a: AllocaSize, b: P.size() * SizeOfByte);
5148	Fragments.push_back(
5149	Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5150	}
5151	}
5152	++NumPartitions;
5153	}
5154
5155	NumAllocaPartitions += NumPartitions;
5156	MaxPartitionsPerAlloca.updateMax(V: NumPartitions);
5157
5158	// Migrate debug information from the old alloca to the new alloca(s)
5159	// and the individual partitions.
5160	auto MigrateOne = [&](auto *DbgVariable) {
5161	auto *Expr = DbgVariable->getExpression();
5162	DIBuilder DIB(AI.getModule(), /AllowUnresolved/* false);
5163	uint64_t AllocaSize =
5164	DL.getTypeSizeInBits(Ty: AI.getAllocatedType()).getFixedValue();
5165	for (auto Fragment : Fragments) {
5166	// Create a fragment expression describing the new partition or reuse AI's
5167	// expression if there is only one partition.
5168	auto *FragmentExpr = Expr;
5169	if (Fragment.Size < AllocaSize \|\| Expr->isFragment()) {
5170	// If this alloca is already a scalar replacement of a larger aggregate,
5171	// Fragment.Offset describes the offset inside the scalar.
5172	auto ExprFragment = Expr->getFragmentInfo();
5173	uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : `0`;
5174	uint64_t Start = Offset + Fragment.Offset;
5175	uint64_t Size = Fragment.Size;
5176	if (ExprFragment) {
5177	uint64_t AbsEnd =
5178	ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
5179	if (Start >= AbsEnd) {
5180	// No need to describe a SROAed padding.
5181	continue;
5182	}
5183	Size = std::min(a: Size, b: AbsEnd - Start);
5184	}
5185	// The new, smaller fragment is stenciled out from the old fragment.
5186	if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
5187	assert(Start >= OrigFragment->OffsetInBits &&
5188	"new fragment is outside of original fragment");
5189	Start -= OrigFragment->OffsetInBits;
5190	}
5191
5192	// The alloca may be larger than the variable.
5193	auto VarSize = DbgVariable->getVariable()->getSizeInBits();
5194	if (VarSize) {
5195	if (Size > *VarSize)
5196	Size = *VarSize;
5197	if (Size == `0` \|\| Start + Size > *VarSize)
5198	continue;
5199	}
5200
5201	// Avoid creating a fragment expression that covers the entire variable.
5202	if (!VarSize \|\| *VarSize != Size) {
5203	if (auto E =
5204	DIExpression::createFragmentExpression(Expr, OffsetInBits: Start, SizeInBits: Size))
5205	FragmentExpr = *E;
5206	else
5207	continue;
5208	}
5209	}
5210
5211	// Remove any existing intrinsics on the new alloca describing
5212	// the variable fragment.
5213	auto RemoveOne = [DbgVariable](auto *OldDII) {
5214	auto SameVariableFragment = [](const auto LHS, const* auto *RHS) {
5215	return LHS->getVariable() == RHS->getVariable() &&
5216	LHS->getDebugLoc()->getInlinedAt() ==
5217	RHS->getDebugLoc()->getInlinedAt();
5218	};
5219	if (SameVariableFragment(OldDII, DbgVariable))
5220	OldDII->eraseFromParent();
5221	};
5222	for_each(findDbgDeclares(V: Fragment.Alloca), RemoveOne);
5223	for_each(findDVRDeclares(V: Fragment.Alloca), RemoveOne);
5224
5225	insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
5226	}
5227	};
5228
5229	// Migrate debug information from the old alloca to the new alloca(s)
5230	// and the individual partitions.
5231	for_each(Range: findDbgDeclares(V: &AI), F: MigrateOne);
5232	for_each(Range: findDVRDeclares(V: &AI), F: MigrateOne);
5233	for_each(Range: at::getAssignmentMarkers(Inst: &AI), F: MigrateOne);
5234	for_each(Range: at::getDVRAssignmentMarkers(Inst: &AI), F: MigrateOne);
5235
5236	return Changed;
5237	}
5238
5239	/// Clobber a use with poison, deleting the used value if it becomes dead.
5240	void SROA::clobberUse(Use &U) {
5241	Value *OldV = U;
5242	// Replace the use with an poison value.
5243	U = PoisonValue::get(T: OldV->getType());
5244
5245	// Check for this making an instruction dead. We have to garbage collect
5246	// all the dead instructions to ensure the uses of any alloca end up being
5247	// minimal.
5248	if (Instruction *OldI = dyn_cast<Instruction>(Val: OldV))
5249	if (isInstructionTriviallyDead(I: OldI)) {
5250	DeadInsts.push_back(Elt: OldI);
5251	}
5252	}
5253
5254	/// Analyze an alloca for SROA.
5255	///
5256	/// This analyzes the alloca to ensure we can reason about it, builds
5257	/// the slices of the alloca, and then hands it off to be split and
5258	/// rewritten as needed.
5259	std::pair<bool /Changed/, bool /CFGChanged/>
5260	SROA::runOnAlloca(AllocaInst &AI) {
5261	bool Changed = false;
5262	bool CFGChanged = false;
5263
5264	LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5265	++NumAllocasAnalyzed;
5266
5267	// Special case dead allocas, as they're trivial.
5268	if (AI.use_empty()) {
5269	AI.eraseFromParent();
5270	Changed = true;
5271	return {Changed, CFGChanged};
5272	}
5273	const DataLayout &DL = AI.getModule()->getDataLayout();
5274
5275	// Skip alloca forms that this analysis can't handle.
5276	auto *AT = AI.getAllocatedType();
5277	TypeSize Size = DL.getTypeAllocSize(Ty: AT);
5278	if (AI.isArrayAllocation() \|\| !AT->isSized() \|\| Size.isScalable() \|\|
5279	Size.getFixedValue() == `0`)
5280	return {Changed, CFGChanged};
5281
5282	// First, split any FCA loads and stores touching this alloca to promote
5283	// better splitting and promotion opportunities.
5284	IRBuilderTy IRB(&AI);
5285	AggLoadStoreRewriter AggRewriter(DL, IRB);
5286	Changed \|= AggRewriter.rewrite(I&: AI);
5287
5288	// Build the slices using a recursive instruction-visiting builder.
5289	AllocaSlices AS(DL, AI);
5290	LLVM_DEBUG(AS.print(dbgs()));
5291	if (AS.isEscaped())
5292	return {Changed, CFGChanged};
5293
5294	// Delete all the dead users of this alloca before splitting and rewriting it.
5295	for (Instruction *DeadUser : AS.getDeadUsers()) {
5296	// Free up everything used by this instruction.
5297	for (Use &DeadOp : DeadUser->operands())
5298	clobberUse(U&: DeadOp);
5299
5300	// Now replace the uses of this instruction.
5301	DeadUser->replaceAllUsesWith(V: PoisonValue::get(T: DeadUser->getType()));
5302
5303	// And mark it for deletion.
5304	DeadInsts.push_back(Elt: DeadUser);
5305	Changed = true;
5306	}
5307	for (Use *DeadOp : AS.getDeadOperands()) {
5308	clobberUse(U&: *DeadOp);
5309	Changed = true;
5310	}
5311
5312	// No slices to split. Leave the dead alloca for a later pass to clean up.
5313	if (AS.begin() == AS.end())
5314	return {Changed, CFGChanged};
5315
5316	Changed \|= splitAlloca(AI, AS);
5317
5318	LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
5319	while (!SpeculatablePHIs.empty())
5320	speculatePHINodeLoads(IRB, PN&: *SpeculatablePHIs.pop_back_val());
5321
5322	LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
5323	auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5324	while (!RemainingSelectsToRewrite.empty()) {
5325	const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5326	CFGChanged \|=
5327	rewriteSelectInstMemOps(SI&: K, Ops: V, IRB, DTU: PreserveCFG ? nullptr* : DTU);
5328	}
5329
5330	return {Changed, CFGChanged};
5331	}
5332
5333	/// Delete the dead instructions accumulated in this run.
5334	///
5335	/// Recursively deletes the dead instructions we've accumulated. This is done
5336	/// at the very end to maximize locality of the recursive delete and to
5337	/// minimize the problems of invalidated instruction pointers as such pointers
5338	/// are used heavily in the intermediate stages of the algorithm.
5339	///
5340	/// We also record the alloca instructions deleted here so that they aren't
5341	/// subsequently handed to mem2reg to promote.
5342	bool SROA::deleteDeadInstructions(
5343	SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5344	bool Changed = false;
5345	while (!DeadInsts.empty()) {
5346	Instruction *I = dyn_cast_or_null<Instruction>(Val: DeadInsts.pop_back_val());
5347	if (!I)
5348	continue;
5349	LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5350
5351	// If the instruction is an alloca, find the possible dbg.declare connected
5352	// to it, and remove it too. We must do this before calling RAUW or we will
5353	// not be able to find it.
5354	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: I)) {
5355	DeletedAllocas.insert(Ptr: AI);
5356	for (DbgDeclareInst *OldDII : findDbgDeclares(V: AI))
5357	OldDII->eraseFromParent();
5358	for (DbgVariableRecord *OldDII : findDVRDeclares(V: AI))
5359	OldDII->eraseFromParent();
5360	}
5361
5362	at::deleteAssignmentMarkers(Inst: I);
5363	I->replaceAllUsesWith(V: UndefValue::get(T: I->getType()));
5364
5365	for (Use &Operand : I->operands())
5366	if (Instruction *U = dyn_cast<Instruction>(Val&: Operand)) {
5367	// Zero out the operand and see if it becomes trivially dead.
5368	Operand = nullptr;
5369	if (isInstructionTriviallyDead(I: U))
5370	DeadInsts.push_back(Elt: U);
5371	}
5372
5373	++NumDeleted;
5374	I->eraseFromParent();
5375	Changed = true;
5376	}
5377	return Changed;
5378	}
5379
5380	/// Promote the allocas, using the best available technique.
5381	///
5382	/// This attempts to promote whatever allocas have been identified as viable in
5383	/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5384	/// This function returns whether any promotion occurred.
5385	bool SROA::promoteAllocas(Function &F) {
5386	if (PromotableAllocas.empty())
5387	return false;
5388
5389	NumPromoted += PromotableAllocas.size();
5390
5391	if (SROASkipMem2Reg) {
5392	LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5393	} else {
5394	LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5395	PromoteMemToReg(Allocas: PromotableAllocas, DT&: DTU->getDomTree(), AC);
5396	}
5397
5398	PromotableAllocas.clear();
5399	return true;
5400	}
5401
5402	std::pair<bool /Changed/, bool /CFGChanged/> SROA::runSROA(Function &F) {
5403	LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5404
5405	const DataLayout &DL = F.getParent()->getDataLayout();
5406	BasicBlock &EntryBB = F.getEntryBlock();
5407	for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(x: EntryBB.end());
5408	I != E; ++I) {
5409	if (AllocaInst *AI = dyn_cast<AllocaInst>(Val&: I)) {
5410	if (DL.getTypeAllocSize(Ty: AI->getAllocatedType()).isScalable() &&
5411	isAllocaPromotable(AI))
5412	PromotableAllocas.push_back(x: AI);
5413	else
5414	Worklist.insert(X: AI);
5415	}
5416	}
5417
5418	bool Changed = false;
5419	bool CFGChanged = false;
5420	// A set of deleted alloca instruction pointers which should be removed from
5421	// the list of promotable allocas.
5422	SmallPtrSet<AllocaInst *, `4`> DeletedAllocas;
5423
5424	do {
5425	while (!Worklist.empty()) {
5426	auto [IterationChanged, IterationCFGChanged] =
5427	runOnAlloca(AI&: *Worklist.pop_back_val());
5428	Changed \|= IterationChanged;
5429	CFGChanged \|= IterationCFGChanged;
5430
5431	Changed \|= deleteDeadInstructions(DeletedAllocas);
5432
5433	// Remove the deleted allocas from various lists so that we don't try to
5434	// continue processing them.
5435	if (!DeletedAllocas.empty()) {
5436	auto IsInSet = [&](AllocaInst AI) { return* DeletedAllocas.count(Ptr: AI); };
5437	Worklist.remove_if(P: IsInSet);
5438	PostPromotionWorklist.remove_if(P: IsInSet);
5439	llvm::erase_if(C&: PromotableAllocas, P: IsInSet);
5440	DeletedAllocas.clear();
5441	}
5442	}
5443
5444	Changed \|= promoteAllocas(F);
5445
5446	Worklist = PostPromotionWorklist;
5447	PostPromotionWorklist.clear();
5448	} while (!Worklist.empty());
5449
5450	assert((!CFGChanged \|\| Changed) && "Can not only modify the CFG.");
5451	assert((!CFGChanged \|\| !PreserveCFG) &&
5452	"Should not have modified the CFG when told to preserve it.");
5453
5454	if (Changed && isAssignmentTrackingEnabled(M: *F.getParent())) {
5455	for (auto &BB : F) {
5456	RemoveRedundantDbgInstrs(BB: &BB);
5457	}
5458	}
5459
5460	return {Changed, CFGChanged};
5461	}
5462
5463	PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
5464	DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
5465	AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
5466	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5467	auto [Changed, CFGChanged] =
5468	SROA (&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5469	if (!Changed)
5470	return PreservedAnalyses::all();
5471	PreservedAnalyses PA;
5472	if (!CFGChanged)
5473	PA.preserveSet<CFGAnalyses>();
5474	PA.preserve<DominatorTreeAnalysis>();
5475	return PA;
5476	}
5477
5478	void SROAPass::printPipeline(
5479	raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
5480	static_cast<PassInfoMixin<SROAPass> >(this*)->printPipeline(
5481	OS, MapClassName2PassName);
5482	OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
5483	: "<modify-cfg>");
5484	}
5485
5486	SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
5487
5488	namespace {
5489
5490	/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
5491	class SROALegacyPass : public FunctionPass {
5492	SROAOptions PreserveCFG;
5493
5494	public:
5495	static char ID;
5496
5497	SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
5498	: FunctionPass (ID), PreserveCFG(PreserveCFG) {
5499	initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
5500	}
5501
5502	bool runOnFunction(Function &F) override {
5503	if (skipFunction(F))
5504	return false;
5505
5506	DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5507	AssumptionCache &AC =
5508	getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
5509	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5510	auto [Changed, _] =
5511	SROA (&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5512	return Changed;
5513	}
5514
5515	void getAnalysisUsage(AnalysisUsage &AU) const override {
5516	AU.addRequired<AssumptionCacheTracker>();
5517	AU.addRequired<DominatorTreeWrapperPass>();
5518	AU.addPreserved<GlobalsAAWrapperPass>();
5519	AU.addPreserved<DominatorTreeWrapperPass>();
5520	}
5521
5522	StringRef getPassName() const override { return "SROA"; }
5523	};
5524
5525	} // end anonymous namespace
5526
5527	char SROALegacyPass::ID = `0`;
5528
5529	FunctionPass llvm::createSROAPass(bool* PreserveCFG) {
5530	return new SROALegacyPass (PreserveCFG ? SROAOptions::PreserveCFG
5531	: SROAOptions::ModifyCFG);
5532	}
5533
5534	INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
5535	"Scalar Replacement Of Aggregates", false, false)
5536	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
5537	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5538	INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
5539	false, false)
5540

source code of llvm/lib/Transforms/Scalar/SROA.cpp