//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to an X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

extern cl::opt<bool> IndirectBranchTracking;

//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValues instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType = RegBase;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex = 0;

    unsigned Scale = 1;
    SDValue IndexReg;
    int32_t Disp = 0;
    SDValue Segment;
    const GlobalValue *GV = nullptr;
    const Constant *CP = nullptr;
    const BlockAddress *BlockAddr = nullptr;
    const char *ES = nullptr;
    MCSymbol *MCSym = nullptr;
    int JT = -1;
    Align Alignment;            // CP alignment.
    unsigned char SymbolFlags = X86II::MO_NO_FLAG;  // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode() = default;

    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (NegateIndex)
        dbgs() << "negate ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        dbgs() << "nul";
      dbgs() << " CP ";
      if (CP)
        CP->dump();
      else
        dbgs() << "nul";
      dbgs() << '\n'
             << "ES ";
      if (ES)
        dbgs() << ES;
      else
        dbgs() << "nul";
      dbgs() << " MCSym ";
      if (MCSym)
        dbgs() << MCSym;
      else
        dbgs() << "nul";
      dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
    }
#endif
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86-specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    static char ID;

    X86DAGToDAGISel() = delete;

    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
        : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
             "OptForMinSize implies OptForSize");

      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }

    void emitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                            bool AllowSegmentRegForX32 = false);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                       unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                          SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp, SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                          SDValue &Base, SDValue &Scale,
                          SDValue &Index, SDValue &Disp,
                          SDValue &Segment);

    bool isProfitableToFormMaskedOp(SDNode *N) const;

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      InlineAsm::ConstraintCode ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();

    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
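      // APX NDD ("new data destination") forms write to a separate
      // destination register instead of clobbering a source operand, so
      // prefer them when the subtarget supports them.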
      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
                                         : GET_ND_IF_ENABLED(X86::NEG32r);
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                             AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }

    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size. At a high level,
    // we'd like to avoid such instructions when we have similar constants
    // used within the same basic block that can be kept in a register.
    //
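    // For example (illustrative only): if the same 32-bit constant feeds two
    // compares under optsize,
    //   movl $12345678, %ecx
    //   cmpl %ecx, %eax
    //   cmpl %ecx, %edx
    // encodes the 4-byte immediate once instead of twice.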
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!CurDAG->shouldOptForSize())
        return false;

      // Walk all the users of the immediate.
      for (const SDNode *User : N->uses()) {
        if (UseCount >= 2)
          break;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above).
        // Those instructions won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // If this is a sign-extended 8-bit integer immediate used in an ALU
        // instruction, there is probably an opcode encoding to save space.
        auto *C = dyn_cast<ConstantSDNode>(N);
        if (C && isInt<8>(C->getSExtValue()))
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                   OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than one use, then recommend hoisting.
      return (UseCount > 1);
    }

    /// Return a target constant with the specified value, of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

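    /// Convert an extract/insert element index into the subvector immediate a
    /// VEXTRACT/VINSERT instruction expects; e.g. extracting elements [4,8) of
    /// a v8i32 with VecWidth == 128 gives (4 * 32) / 128 == 1, the upper half.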
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
                                               const SDLoc &DL) {
      assert(VecWidth == 128 && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
      assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
      // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
      // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
      return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
    }

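    /// Materialize the carry flag into a register: zero a GPR, import the
    /// incoming flags into EFLAGS, then SBB the register with itself so the
    /// result is all-ones when the carry is set and zero otherwise.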
    SDValue getSBBZero(SDNode *N) {
      SDLoc dl(N);
      MVT VT = N->getSimpleValueType(0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
      SDValue Zero = SDValue(
          CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
      if (VT == MVT::i64) {
        Zero = SDValue(
            CurDAG->getMachineNode(
                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      unsigned Opcode = N->getOpcode();
      assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
             "Unexpected opcode for SBB materialization");
      unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               N->getOperand(FlagOpIndex), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      return SDValue(
          CurDAG->getMachineNode(Opc, dl, VTs,
                                 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
          0);
    }

    // Helper to detect unneeded AND instructions on shift amounts. Called
    // from PatFrags in tablegen.
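    // e.g. for (srl X, (and Y, 31)) with Width == 5, the mask 0x1f already
    // covers every valid 32-bit shift amount, so the AND is unneeded.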
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = N->getConstantOperandAPInt(1);

      if (Val.countr_one() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countr_one() >= Width;
    }

    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, cast to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, cast to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Return the condition code of the given SDNode.
    X86::CondCode getCondFromNode(SDNode *N) const;

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlign().value() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }

    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTERNLOG(SDNode *N);
    bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                        SDNode *ParentC, SDValue A, SDValue B, SDValue C,
                        uint8_t Imm);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
    bool tryMatchBitSelect(SDNode *N);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InGlue);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
}

char X86DAGToDAGISel::ID = 0;

INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
      Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // zero extended.
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    // second operand.
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}

// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}

bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::ADC:
    case X86ISD::SUB:
    case X86ISD::SBB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::UADDO_CARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In the case where the increment is 1,
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign-extended 8-bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;

        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8) &&
            hasNoCarryFlagUses(SDValue(U, 1)))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl %gs:0, %eax
      // leal i@NTPOFF(%eax), %eax
      // instead of
      // movl $i@NTPOFF, %eax
      // addl %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}

// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a register-register masked move or vblendm and the
// operation will be selected separately.
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
  assert(
      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
      "Unexpected opcode!");

  // If the operation has additional users, the operation will be duplicated.
  // Check the use count to prevent that.
  // FIXME: Are there cheap opcodes we might want to duplicate?
  return N->getOperand(1).hasOneUse();
}

/// Replace the original chain operand of the call with the load's chain
/// operand and move the load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}

/// Return true if the call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After moveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}

static bool isEndbrImm64(uint64_t Imm) {
// There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
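// For 0xF3660F1EFA: the low 24 bits match 0x0F1EFA, 0x66 is in the optional
// prefix list, and the 0xF3 that follows completes a fake ENDBR64 pattern.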
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                    0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
  while (i < 64) {
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (Byte == 0xF3)
      return true;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
      return false;
    i += 8;
  }

  return false;
}

static bool needBWI(MVT VT) {
  return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
}

void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // This is for CET enhancement.
    //
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we don't want attackers to find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // a = 0xF30F1EFA
    // it could, for example, generate:
    // mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations so the value does not show up in the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getMMI().getModule()->getModuleFlag("cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDLoc dl(N);
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          --I;
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
          ++I;
          MadeChange = true;
          continue;
        }
      }
    }

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }

    // Convert vector increment or decrement to sub/add with an all-ones
    // constant:
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    //
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // intersection of:
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(1).hasOneUse();
    };
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
      APInt SplatVal;
      if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
          SplatVal.isOne()) {
        SDLoc DL(N);

        MVT VT = N->getSimpleValueType(0);
        unsigned NumElts = VT.getSizeInBits() / 32;
        SDValue AllOnes =
            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
        AllOnes = CurDAG->getBitcast(VT, AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
        SDValue Res =
            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }
    }

    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDLoc dl(N);
        SDValue NarrowBCast =
            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(N);
        SDLoc dl(N);
        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
            MemNode->getMemOperand());
        SDValue Res =
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        --I;
        SDValue To[] = {Res, NarrowBCast.getValue(1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }

      break;
    }
    case ISD::LOAD: {
      // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(N);
      MVT VT = N->getSimpleValueType(0);
      if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))
        break;

      MVT MaxVT = VT;
      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->uses()) {
        auto *UserLd = dyn_cast<LoadSDNode>(User);
        MVT UserVT = User->getSimpleValueType(0);
        if (User != N && UserLd && ISD::isNormalLoad(User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
          MaxLd = User;
          MaxVT = UserVT;
        }
      }
      if (MaxLd) {
        SDLoc dl(N);
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
        SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
                                          SDValue(MaxLd, 0),
                                          CurDAG->getIntPtrConstant(0, dl));
        SDValue Res = CurDAG->getBitcast(VT, Extract);

        --I;
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(N, To);
        ++I;
        MadeChange = true;
        continue;
      }
      break;
    }
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)
        break;

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      SDValue R;
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
                                     EleVT.getSizeInBits()) {
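        // 0xCA is the ternary truth table for a bitwise select: for each bit,
        // result = (A & B) | (~A & C), with A = cond, B = true, C = false.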
        R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1), N->getOperand(2),
                            CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
      } else {
        R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1),
                            N->getOperand(2));
      }
      --I;
      CurDAG->ReplaceAllUsesWith(N, R.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FP_ROUND:
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86-specific equivalents so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      }
      SDValue Res;
      if (N->isStrictFPOpcode())
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
                            {N->getOperand(0), N->getOperand(1)});
      else
        Res =
            CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                            N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86-specific equivalents so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extends with the zero extend equivalents so we
      // don't need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
      } else {
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     ? ISD::ZERO_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;
      }

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    case ISD::FCEIL:
    case ISD::STRICT_FCEIL:
    case ISD::FFLOOR:
    case ISD::STRICT_FFLOOR:
    case ISD::FTRUNC:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::FRINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding nodes with their X86-specific equivalents so we
      // don't need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      bool IsStrict = N->isStrictFPOpcode();
      SDValue Res;
      if (IsStrict)
        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
                              {N->getValueType(0), MVT::Other},
                              {N->getOperand(0), N->getOperand(1),
                               CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
      else
        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
                              N->getOperand(0),
                              CurDAG->getTargetConstant(Imm, dl, MVT::i32));
      --I;
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      ++I;
      MadeChange = true;
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
                                   : MVT::v8f16;

      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      MadeChange = true;
      continue;
    }
    }

    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      MadeChange = true;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store
    // and load pairs through the stack. This is a gross hack. We would like to
    // simply mark these as being illegal, but when we do that, legalize
    // produces these when it expands calls, then expands these in the same
    // legalize pass. We would like dag combine to be able to hack on these
    // between the call expansion and the node legalization. As such this pass
    // basically does "really late" legalization of these inline with the X86
    // isel pass.
    // FIXME: This should only happen when not compiled with -O0.
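    // For example (illustrative): rounding an x87 f64 value to an f32 that is
    // needed in an SSE register becomes a 4-byte truncating store to a stack
    // temporary followed by a reload of the f32 from that slot.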
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(
          CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
                                          MemTmp, MPI, MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havoc on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
    {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
      if (!SrcIsSSE) {
        SDVTList VTs = CurDAG->getVTList(MVT::Other);
        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
                                            MPI, /*Align*/ std::nullopt,
                                            MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        }
      } else {
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
                                 MPI);
      }

      if (!DstIsSSE) {
        SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(
            X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
            /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        }
      } else {
        assert(DstVT == MemVT && "Unexpected VT!");
        Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
      }

      // We're about to replace all uses of the STRICT_FP_ROUND/STRICT_FP_EXTEND
      // with the load we created. This will cause general havoc on the dag
      // because anything below the conversion could be folded into other
      // existing nodes. To avoid invalidating 'I', back it up to the convert
      // node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }

    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    MadeChange = true;
  }

  // Remove any dead nodes that may have been left behind.
  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}

// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
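// e.g. (MOVZX32rr8 (EXTRACT_SUBREG (MOVZX32rr8_NOREX X), sub_8bit)) can be
// replaced by the inner (MOVZX32rr8_NOREX X) directly.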
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bits of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}
1532
1533void X86DAGToDAGISel::PostprocessISelDAG() {
1534 // Skip peepholes at -O0.
1535 if (TM.getOptLevel() == CodeGenOptLevel::None)
1536 return;
1537
1538 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1539
1540 bool MadeChange = false;
1541 while (Position != CurDAG->allnodes_begin()) {
1542 SDNode *N = &*--Position;
1543 // Skip dead nodes and any non-machine opcodes.
1544 if (N->use_empty() || !N->isMachineOpcode())
1545 continue;
1546
1547 if (tryOptimizeRem8Extend(N)) {
1548 MadeChange = true;
1549 continue;
1550 }
1551
1552 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1553 // the same. Rewrite to remove the AND.
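// For example (sketch; registers are illustrative):
//   %a = AND32rr %x, %y
//   TEST32rr %a, %a          ; %a has no other uses
// can be rewritten as
//   TEST32rr %x, %y
// which sets EFLAGS identically while making the AND dead.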
1554 unsigned Opc = N->getMachineOpcode();
1555 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1556 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1557 N->getOperand(Num: 0) == N->getOperand(Num: 1) &&
1558 N->getOperand(Num: 0)->hasNUsesOfValue(NUses: 2, Value: N->getOperand(Num: 0).getResNo()) &&
1559 N->getOperand(Num: 0).isMachineOpcode()) {
1560 SDValue And = N->getOperand(Num: 0);
1561 unsigned N0Opc = And.getMachineOpcode();
1562 if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1563 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
1564 !And->hasAnyUseOfValue(1)) {
1565 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1566 MVT::i32,
1567 And.getOperand(0),
1568 And.getOperand(1));
1569 ReplaceUses(F: N, T: Test);
1570 MadeChange = true;
1571 continue;
1572 }
1573 if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1574 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
1575 !And->hasAnyUseOfValue(1)) {
1576 unsigned NewOpc;
1577 switch (N0Opc) {
1578 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1579 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1580 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1581 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1582 }
1583
1584 // Need to swap the memory and register operand.
1585 SDValue Ops[] = { And.getOperand(i: 1),
1586 And.getOperand(i: 2),
1587 And.getOperand(i: 3),
1588 And.getOperand(i: 4),
1589 And.getOperand(i: 5),
1590 And.getOperand(i: 0),
1591 And.getOperand(i: 6) /* Chain */ };
1592 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1593 MVT::i32, MVT::Other, Ops);
1594 CurDAG->setNodeMemRefs(
1595 N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
1596 ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
1597 ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
1598 MadeChange = true;
1599 continue;
1600 }
1601 }
1602
1603 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1604 // used. We're doing this late so we can prefer to fold the AND into masked
1605 // comparisons. Doing that can be better for the live range of the mask
1606 // register.
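// For example (sketch):
//   %k = KANDBrr %k1, %k2
//   KORTESTBrr %k, %k        ; only ZF is consumed
// can instead be selected as
//   KTESTBrr %k1, %k2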
1607 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1608 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1609 N->getOperand(0) == N->getOperand(1) &&
1610 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1611 N->getOperand(0).isMachineOpcode() &&
1612 onlyUsesZeroFlag(SDValue(N, 0))) {
1613 SDValue And = N->getOperand(Num: 0);
1614 unsigned N0Opc = And.getMachineOpcode();
1615 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1616 // KAND instructions and KTEST use the same ISA feature.
1617 if (N0Opc == X86::KANDBrr ||
1618 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1619 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1620 unsigned NewOpc;
1621 switch (Opc) {
1622 default: llvm_unreachable("Unexpected opcode!");
1623 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1624 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1625 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1626 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1627 }
1628 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1629 MVT::i32,
1630 And.getOperand(0),
1631 And.getOperand(1));
1632 ReplaceUses(F: N, T: KTest);
1633 MadeChange = true;
1634 continue;
1635 }
1636 }
1637
1638 // Attempt to remove vector moves that were inserted to zero the upper bits.
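// For example (sketch), in
//   %z = SUBREG_TO_REG 0, (VMOVAPSrr %x), sub_xmm
// the move is unnecessary if %x was produced by a VEX/EVEX/XOP-encoded
// instruction, since such instructions already zero the upper bits of the
// full-width register.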
1639 if (Opc != TargetOpcode::SUBREG_TO_REG)
1640 continue;
1641
1642 unsigned SubRegIdx = N->getConstantOperandVal(Num: 2);
1643 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1644 continue;
1645
1646 SDValue Move = N->getOperand(Num: 1);
1647 if (!Move.isMachineOpcode())
1648 continue;
1649
1650 // Make sure it's one of the move opcodes we recognize.
1651 switch (Move.getMachineOpcode()) {
1652 default:
1653 continue;
1654 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1655 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1656 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1657 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1658 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1659 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1660 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1661 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1662 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1663 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1664 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1665 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1666 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1667 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1668 break;
1669 }
1670
1671 SDValue In = Move.getOperand(i: 0);
1672 if (!In.isMachineOpcode() ||
1673 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1674 continue;
1675
1676 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1677 // the SHA instructions which use a legacy encoding.
1678 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1679 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1680 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1681 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1682 continue;
1683
1684 // The producing instruction is another vector instruction, so we can drop
1685 // the move.
1686 CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2));
1687 MadeChange = true;
1688 }
1689
1690 if (MadeChange)
1691 CurDAG->RemoveDeadNodes();
1692}
1693
1694
1695/// Emit any code that needs to be executed only in the main function.
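/// On Cygwin/MinGW this means inserting a call to __main, which the runtime
/// uses to run global constructors before user code (matching GCC's
/// behavior).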
1696void X86DAGToDAGISel::emitSpecialCodeForMain() {
1697 if (Subtarget->isTargetCygMing()) {
1698 TargetLowering::ArgListTy Args;
1699 auto &DL = CurDAG->getDataLayout();
1700
1701 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1702 CLI.setChain(CurDAG->getRoot())
1703 .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()),
1704 Target: CurDAG->getExternalSymbol(Sym: "__main", VT: TLI->getPointerTy(DL)),
1705 ArgsList: std::move(Args));
1706 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1707 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1708 CurDAG->setRoot(Result.second);
1709 }
1710}
1711
1712void X86DAGToDAGISel::emitFunctionEntryCode() {
1713 // If this is main, emit special code for main.
1714 const Function &F = MF->getFunction();
1715 if (F.hasExternalLinkage() && F.getName() == "main")
1716 emitSpecialCodeForMain();
1717}
1718
1719static bool isDispSafeForFrameIndex(int64_t Val) {
1720 // On 64-bit platforms, we can run into an issue where a frame index
1721 // includes a displacement that, when added to the explicit displacement,
1722 // will overflow the displacement field. Assuming that the frame index
1723 // displacement fits into a 31-bit integer (which is only slightly more
1724 // aggressive than the current fundamental assumption that it fits into
1725 // a 32-bit integer), a 31-bit disp should always be safe.
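 // For example (illustrative numbers): a frame object at offset 0x7fffff00
 // plus an explicit displacement of 0x200 sums to 0x80000100, which no longer
 // fits the signed 32-bit displacement field; capping the frame-index part at
 // 31 bits avoids that.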
1726 return isInt<31>(x: Val);
1727}
1728
1729bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1730 X86ISelAddressMode &AM) {
1731 // We may have already matched a displacement and the caller just added the
1732 // symbolic displacement. So we still need to do the checks even if Offset
1733 // is zero.
1734
1735 int64_t Val = AM.Disp + Offset;
1736
1737 // Cannot combine ExternalSymbol displacements with integer offsets.
1738 if (Val != 0 && (AM.ES || AM.MCSym))
1739 return true;
1740
1741 CodeModel::Model M = TM.getCodeModel();
1742 if (Subtarget->is64Bit()) {
1743 if (Val != 0 &&
1744 !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
1745 hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
1746 return true;
1747 // In addition to the checks required for a register base, check that
1748 // we do not try to use an unsafe Disp with a frame index.
1749 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1750 !isDispSafeForFrameIndex(Val))
1751 return true;
1752 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1753 // 64 bits. Instructions with 32-bit register addresses perform this zero
1754 // extension for us and we can safely ignore the high bits of Offset.
1755 // Instructions with only a 32-bit immediate address do not, though: they
1756 // sign extend instead. This means only the low 2GB of the address space
1757 // is directly addressable; we need indirect addressing for the high 2GB of
1758 // address space.
1759 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1760 // implicit zero extension of instructions would cover up any problem.
1761 // However, we have asserts elsewhere that get triggered if we do, so keep
1762 // the checks for now.
1763 // TODO: We would actually be able to accept these, as well as the same
1764 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1765 // to get an address size override to be emitted. However, this
1766 // pseudo-register is not part of any register class and therefore causes
1767 // MIR verification to fail.
1768 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(x: Val) &&
1769 !AM.hasBaseOrIndexReg())
1770 return true;
1771 }
1772 AM.Disp = Val;
1773 return false;
1774}
1775
1776bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1777 bool AllowSegmentRegForX32) {
1778 SDValue Address = N->getOperand(Num: 1);
1779
1780 // load gs:0 -> GS segment register.
1781 // load fs:0 -> FS segment register.
1782 //
1783 // This optimization is generally valid because the GNU TLS model defines that
1784 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1785 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1786 // zero-extended to 64 bits and then added to the base address, which gives
1787 // unwanted results when the register holds a negative value.
1788 // For more information see http://people.redhat.com/drepper/tls.pdf
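 // For example (sketch, x86-64 Linux): IR such as
 //   %self = load i64, ptr addrspace(257) null
 // (address space 257 being FS) can select to "movq %fs:0, %rax" instead of
 // materializing the thread pointer separately.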
1789 if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
1790 !IndirectTlsSegRefs &&
1791 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1792 Subtarget->isTargetFuchsia())) {
1793 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1794 return true;
1795 switch (N->getPointerInfo().getAddrSpace()) {
1796 case X86AS::GS:
1797 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1798 return false;
1799 case X86AS::FS:
1800 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1801 return false;
1802 // Address space X86AS::SS is not handled here, because it is not used to
1803 // address TLS areas.
1804 }
1805 }
1806
1807 return true;
1808}
1809
1810/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1811/// mode. These wrap things that will resolve down into a symbol reference.
1812/// If no match is possible, this returns true, otherwise it returns false.
1813bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1814 // If the addressing mode already has a symbol as the displacement, we can
1815 // never match another symbol.
1816 if (AM.hasSymbolicDisplacement())
1817 return true;
1818
1819 bool IsRIPRelTLS = false;
1820 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1821 if (IsRIPRel) {
1822 SDValue Val = N.getOperand(i: 0);
1823 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1824 IsRIPRelTLS = true;
1825 }
1826
1827 // We can't use an addressing mode in the 64-bit large code model.
1828 // Global TLS addressing is an exception. In the medium code model,
1829 // we can use such a mode when RIP wrappers are present.
1830 // That signifies access to globals that are known to be "near",
1831 // such as the GOT itself.
1832 CodeModel::Model M = TM.getCodeModel();
1833 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1834 return true;
1835
1836 // Base and index reg must be 0 in order to use %rip as base.
1837 if (IsRIPRel && AM.hasBaseOrIndexReg())
1838 return true;
1839
1840 // Make a local copy in case we can't do this fold.
1841 X86ISelAddressMode Backup = AM;
1842
1843 int64_t Offset = 0;
1844 SDValue N0 = N.getOperand(i: 0);
1845 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
1846 AM.GV = G->getGlobal();
1847 AM.SymbolFlags = G->getTargetFlags();
1848 Offset = G->getOffset();
1849 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
1850 AM.CP = CP->getConstVal();
1851 AM.Alignment = CP->getAlign();
1852 AM.SymbolFlags = CP->getTargetFlags();
1853 Offset = CP->getOffset();
1854 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
1855 AM.ES = S->getSymbol();
1856 AM.SymbolFlags = S->getTargetFlags();
1857 } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
1858 AM.MCSym = S->getMCSymbol();
1859 } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
1860 AM.JT = J->getIndex();
1861 AM.SymbolFlags = J->getTargetFlags();
1862 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
1863 AM.BlockAddr = BA->getBlockAddress();
1864 AM.SymbolFlags = BA->getTargetFlags();
1865 Offset = BA->getOffset();
1866 } else
1867 llvm_unreachable("Unhandled symbol reference node.");
1868
1869 // Can't use an addressing mode with large globals.
1870 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1871 TM.isLargeGlobalValue(GV: AM.GV)) {
1872 AM = Backup;
1873 return true;
1874 }
1875
1876 if (foldOffsetIntoAddress(Offset, AM)) {
1877 AM = Backup;
1878 return true;
1879 }
1880
1881 if (IsRIPRel)
1882 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1883
1884 // Commit the changes now that we know this fold is safe.
1885 return false;
1886}
1887
1888/// Add the specified node to the specified addressing mode, returning true if
1889/// it cannot be done. This just pattern matches for the addressing mode.
1890bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1891 if (matchAddressRecursively(N, AM, Depth: 0))
1892 return true;
1893
1894 // Post-processing: Make a second attempt to fold a load, if we now know
1895 // that there will not be any other register. This is only performed for
1896 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1897 // any foldable load the first time.
1898 if (Subtarget->isTarget64BitILP32() &&
1899 AM.BaseType == X86ISelAddressMode::RegBase &&
1900 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1901 SDValue Save_Base_Reg = AM.Base_Reg;
1902 if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
1903 AM.Base_Reg = SDValue();
1904 if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
1905 AM.Base_Reg = Save_Base_Reg;
1906 }
1907 }
1908
1909 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1910 // a smaller encoding and avoids a scaled index.
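 // e.g. "leal (,%ecx,2), %eax" becomes "leal (%ecx,%ecx), %eax", which needs
 // no 32-bit displacement in its encoding.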
1911 if (AM.Scale == 2 &&
1912 AM.BaseType == X86ISelAddressMode::RegBase &&
1913 AM.Base_Reg.getNode() == nullptr) {
1914 AM.Base_Reg = AM.IndexReg;
1915 AM.Scale = 1;
1916 }
1917
1918 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1919 // because it has a smaller encoding.
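 // e.g. "movl sym(%rip), %eax" rather than "movl sym, %eax"; the RIP-relative
 // form needs no SIB byte, so it is one byte shorter.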
1920 if (TM.getCodeModel() != CodeModel::Large &&
1921 (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
1922 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
1923 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
1924 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
1925 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1926 }
1927
1928 return false;
1929}
1930
1931bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1932 unsigned Depth) {
1933 // Add an artificial use to this node so that we can keep track of
1934 // it if it gets CSE'd with a different node.
1935 HandleSDNode Handle(N);
1936
1937 X86ISelAddressMode Backup = AM;
1938 if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
1939 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
1940 return false;
1941 AM = Backup;
1942
1943 // Try again after commuting the operands.
1944 if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
1945 Depth: Depth + 1) &&
1946 !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
1947 return false;
1948 AM = Backup;
1949
1950 // If we couldn't fold both operands into the address at the same time,
1951 // see if we can just put each operand into a register and fold at least
1952 // the add.
1953 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1954 !AM.Base_Reg.getNode() &&
1955 !AM.IndexReg.getNode()) {
1956 N = Handle.getValue();
1957 AM.Base_Reg = N.getOperand(i: 0);
1958 AM.IndexReg = N.getOperand(i: 1);
1959 AM.Scale = 1;
1960 return false;
1961 }
1962 N = Handle.getValue();
1963 return true;
1964}
1965
1966// Insert a node into the DAG at least before the Pos node's position. This
1967// will reposition the node as needed, and will assign it a node ID that is <=
1968// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1969// IDs! The selection DAG must no longer depend on their uniqueness when this
1970// is used.
1971static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1972 if (N->getNodeId() == -1 ||
1973 (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
1974 SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
1975 DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
1976 // Mark the node as invalid for pruning, since after this it may be a
1977 // successor to a selected node but otherwise be in the same position as
1978 // Pos. Conservatively mark it with the same -abs(Id) to ensure the node
1979 // id invariant is preserved.
1980 N->setNodeId(Pos->getNodeId());
1981 SelectionDAGISel::InvalidateNodeId(N: N.getNode());
1982 }
1983}
1984
1985// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1986 // safe. This allows us to convert the shift and AND into an h-register
1987// extract and a scaled index. Returns false if the simplification is
1988// performed.
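// For example, with C1 == 2 (constants are illustrative):
//   (x >> 6) & 0x3fc  -->  ((x >> 8) & 0xff) << 2
// where "(x >> 8) & 0xff" can become an h-register extract (e.g. %ah when x
// lives in %eax) and the "<< 2" folds into the index scale.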
1989static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1990 uint64_t Mask,
1991 SDValue Shift, SDValue X,
1992 X86ISelAddressMode &AM) {
1993 if (Shift.getOpcode() != ISD::SRL ||
1994 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
1995 !Shift.hasOneUse())
1996 return true;
1997
1998 int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
1999 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2000 Mask != (0xffu << ScaleLog))
2001 return true;
2002
2003 MVT XVT = X.getSimpleValueType();
2004 MVT VT = N.getSimpleValueType();
2005 SDLoc DL(N);
2006 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2007 SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
2008 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
2009 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
2010 SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
2011 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2012 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);
2013
2014 // Insert the new nodes into the topological ordering. We must do this in
2015 // a valid topological ordering as nothing is going to go back and re-sort
2016 // these nodes. We continually insert before 'N' in sequence as this is
2017 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2018 // hierarchy left to express.
2019 insertDAGNode(DAG, Pos: N, N: Eight);
2020 insertDAGNode(DAG, Pos: N, N: NewMask);
2021 insertDAGNode(DAG, Pos: N, N: Srl);
2022 insertDAGNode(DAG, Pos: N, N: And);
2023 insertDAGNode(DAG, Pos: N, N: Ext);
2024 insertDAGNode(DAG, Pos: N, N: ShlCount);
2025 insertDAGNode(DAG, Pos: N, N: Shl);
2026 DAG.ReplaceAllUsesWith(From: N, To: Shl);
2027 DAG.RemoveDeadNode(N: N.getNode());
2028 AM.IndexReg = Ext;
2029 AM.Scale = (1 << ScaleLog);
2030 return false;
2031}
2032
2033// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2034// allows us to fold the shift into this addressing mode. Returns false if the
2035// transform succeeded.
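// For example (illustrative constants):
//   (x << 1) & 0x3c  -->  (x & 0x1e) << 1
// after which the "<< 1" is absorbed as Scale = 2 with "x & 0x1e" as the
// index.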
2036static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2037 X86ISelAddressMode &AM) {
2038 SDValue Shift = N.getOperand(i: 0);
2039
2040 // Use a signed mask so that shifting right will insert sign bits. These
2041 // bits will be removed when we shift the result left so it doesn't matter
2042 // what we use. This might allow a smaller immediate encoding.
2043 int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();
2044
2045 // If we have an any_extend feeding the AND, look through it to see if there
2046 // is a shift behind it. But only if the AND doesn't use the extended bits.
2047 // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
2048 bool FoundAnyExtend = false;
2049 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2050 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2051 isUInt<32>(Mask)) {
2052 FoundAnyExtend = true;
2053 Shift = Shift.getOperand(i: 0);
2054 }
2055
2056 if (Shift.getOpcode() != ISD::SHL ||
2057 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2058 return true;
2059
2060 SDValue X = Shift.getOperand(i: 0);
2061
2062 // Not likely to be profitable if either the AND or SHIFT node has more
2063 // than one use (unless all uses are for address computation). Besides,
2064 // the isel mechanism requires their node ids to be reused.
2065 if (!N.hasOneUse() || !Shift.hasOneUse())
2066 return true;
2067
2068 // Verify that the shift amount is something we can fold.
2069 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2070 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2071 return true;
2072
2073 MVT VT = N.getSimpleValueType();
2074 SDLoc DL(N);
2075 if (FoundAnyExtend) {
2076 SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
2077 insertDAGNode(DAG, Pos: N, N: NewX);
2078 X = NewX;
2079 }
2080
2081 SDValue NewMask = DAG.getConstant(Val: Mask >> ShiftAmt, DL, VT);
2082 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
2083 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));
2084
2085 // Insert the new nodes into the topological ordering. We must do this in
2086 // a valid topological ordering as nothing is going to go back and re-sort
2087 // these nodes. We continually insert before 'N' in sequence as this is
2088 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2089 // hierarchy left to express.
2090 insertDAGNode(DAG, Pos: N, N: NewMask);
2091 insertDAGNode(DAG, Pos: N, N: NewAnd);
2092 insertDAGNode(DAG, Pos: N, N: NewShift);
2093 DAG.ReplaceAllUsesWith(From: N, To: NewShift);
2094 DAG.RemoveDeadNode(N: N.getNode());
2095
2096 AM.Scale = 1 << ShiftAmt;
2097 AM.IndexReg = NewAnd;
2098 return false;
2099}
2100
2101// Implement some heroics to detect shifts of masked values where the mask can
2102// be replaced by extending the shift and undoing that in the addressing mode
2103// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2104// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2105// the addressing mode. This results in code such as:
2106//
2107// int f(short *y, int *lookup_table) {
2108// ...
2109// return *y + lookup_table[*y >> 11];
2110// }
2111//
2112// Turning into:
2113// movzwl (%rdi), %eax
2114// movl %eax, %ecx
2115// shrl $11, %ecx
2116// addl (%rsi,%rcx,4), %eax
2117//
2118// Instead of:
2119// movzwl (%rdi), %eax
2120// movl %eax, %ecx
2121// shrl $9, %ecx
2122// andl $124, %ecx
2123// addl (%rsi,%rcx), %eax
2124//
2125// Note that this function assumes the mask is provided as a mask *after* the
2126// value is shifted. The input chain may or may not match that, but computing
2127// such a mask is trivial.
2128static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2129 uint64_t Mask,
2130 SDValue Shift, SDValue X,
2131 X86ISelAddressMode &AM) {
2132 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2133 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
2134 return true;
2135
2136 // We need to ensure that the mask is a contiguous run of bits.
2137 unsigned MaskIdx, MaskLen;
2138 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2139 return true;
2140 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2141
2142 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2143
2144 // The amount of shift we're trying to fit into the addressing mode is taken
2145 // from the shifted mask index (number of trailing zeros of the mask).
2146 unsigned AMShiftAmt = MaskIdx;
2147
2148 // There is nothing we can do here unless the mask is removing some bits.
2149 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2150 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2151
2152 // Scale the leading zero count down based on the actual size of the value.
2153 // Also scale it down based on the size of the shift.
2154 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2155 if (MaskLZ < ScaleDown)
2156 return true;
2157 MaskLZ -= ScaleDown;
2158
2159 // The final check is to ensure that any masked out high bits of X are
2160 // already known to be zero. Otherwise, the mask has a semantic impact
2161 // other than masking out a couple of low bits. Unfortunately, because of
2162 // the mask, zero extensions will be removed from operands in some cases.
2163 // This code works extra hard to look through extensions because we can
2164 // replace them with zero extensions cheaply if necessary.
2165 bool ReplacingAnyExtend = false;
2166 if (X.getOpcode() == ISD::ANY_EXTEND) {
2167 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2168 X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
2169 // Assume that we'll replace the any-extend with a zero-extend, and
2170 // narrow the search to the extended value.
2171 X = X.getOperand(i: 0);
2172 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2173 ReplacingAnyExtend = true;
2174 }
2175 APInt MaskedHighBits =
2176 APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
2177 if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
2178 return true;
2179
2180 // We've identified a pattern that can be transformed into a single shift
2181 // and an addressing mode. Make it so.
2182 MVT VT = N.getSimpleValueType();
2183 if (ReplacingAnyExtend) {
2184 assert(X.getValueType() != VT);
2185 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2186 SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
2187 insertDAGNode(DAG, Pos: N, N: NewX);
2188 X = NewX;
2189 }
2190
2191 MVT XVT = X.getSimpleValueType();
2192 SDLoc DL(N);
2193 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2194 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2195 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
2196 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2197 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2198
2199 // Insert the new nodes into the topological ordering. We must do this in
2200 // a valid topological ordering as nothing is going to go back and re-sort
2201 // these nodes. We continually insert before 'N' in sequence as this is
2202 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2203 // hierarchy left to express.
2204 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2205 insertDAGNode(DAG, Pos: N, N: NewSRL);
2206 insertDAGNode(DAG, Pos: N, N: NewExt);
2207 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2208 insertDAGNode(DAG, Pos: N, N: NewSHL);
2209 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2210 DAG.RemoveDeadNode(N: N.getNode());
2211
2212 AM.Scale = 1 << AMShiftAmt;
2213 AM.IndexReg = NewExt;
2214 return false;
2215}
2216
2217// Transform "(X >> SHIFT) & (MASK << C1)" to
2218// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2219// matched to a BEXTR later. Returns false if the simplification is performed.
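// For example, with SHIFT == 5 and C1 == 2 (illustrative constants):
//   (x >> 5) & (0xff << 2)  -->  ((x >> 7) & 0xff) << 2
// where "(x >> 7) & 0xff" is a BEXTR candidate and the "<< 2" becomes the
// scale.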
2220static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2221 uint64_t Mask,
2222 SDValue Shift, SDValue X,
2223 X86ISelAddressMode &AM,
2224 const X86Subtarget &Subtarget) {
2225 if (Shift.getOpcode() != ISD::SRL ||
2226 !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
2227 !Shift.hasOneUse() || !N.hasOneUse())
2228 return true;
2229
2230 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2231 if (!Subtarget.hasTBM() &&
2232 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2233 return true;
2234
2235 // We need to ensure that the mask is a contiguous run of bits.
2236 unsigned MaskIdx, MaskLen;
2237 if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
2238 return true;
2239
2240 unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
2241
2242 // The amount of shift we're trying to fit into the addressing mode is taken
2243 // from the shifted mask index (number of trailing zeros of the mask).
2244 unsigned AMShiftAmt = MaskIdx;
2245
2246 // There is nothing we can do here unless the mask is removing some bits.
2247 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2248 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2249
2250 MVT XVT = X.getSimpleValueType();
2251 MVT VT = N.getSimpleValueType();
2252 SDLoc DL(N);
2253 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2254 SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
2255 SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
2256 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
2257 SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
2258 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2259 SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);
2260
2261 // Insert the new nodes into the topological ordering. We must do this in
2262 // a valid topological ordering as nothing is going to go back and re-sort
2263 // these nodes. We continually insert before 'N' in sequence as this is
2264 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2265 // hierarchy left to express.
2266 insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
2267 insertDAGNode(DAG, Pos: N, N: NewSRL);
2268 insertDAGNode(DAG, Pos: N, N: NewMask);
2269 insertDAGNode(DAG, Pos: N, N: NewAnd);
2270 insertDAGNode(DAG, Pos: N, N: NewExt);
2271 insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
2272 insertDAGNode(DAG, Pos: N, N: NewSHL);
2273 DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
2274 DAG.RemoveDeadNode(N: N.getNode());
2275
2276 AM.Scale = 1 << AMShiftAmt;
2277 AM.IndexReg = NewExt;
2278 return false;
2279}
2280
2281// Attempt to peek further into a scaled index register, collecting additional
2282// extensions / offsets / etc. Returns \p N if we can't peek any further.
2283SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2284 X86ISelAddressMode &AM,
2285 unsigned Depth) {
2286 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2287 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2288 "Illegal index scale");
2289
2290 // Limit recursion.
2291 if (Depth >= SelectionDAG::MaxRecursionDepth)
2292 return N;
2293
2294 EVT VT = N.getValueType();
2295 unsigned Opc = N.getOpcode();
2296
2297 // index: add(x,c) -> index: x, disp + c
2298 if (CurDAG->isBaseWithConstantOffset(Op: N)) {
2299 auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
2300 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2301 if (!foldOffsetIntoAddress(Offset, AM))
2302 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2303 }
2304
2305 // index: add(x,x) -> index: x, scale * 2
2306 if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
2307 if (AM.Scale <= 4) {
2308 AM.Scale *= 2;
2309 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2310 }
2311 }
2312
2313 // index: shl(x,i) -> index: x, scale * (1 << i)
2314 if (Opc == X86ISD::VSHLI) {
2315 uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
2316 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2317 if ((AM.Scale * ScaleAmt) <= 8) {
2318 AM.Scale *= ScaleAmt;
2319 return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
2320 }
2321 }
2322
2323 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2324 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2325 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2326 SDValue Src = N.getOperand(i: 0);
2327 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2328 Src.hasOneUse()) {
2329 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2330 SDValue AddSrc = Src.getOperand(i: 0);
2331 auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
2332 uint64_t Offset = (uint64_t)AddVal->getSExtValue();
2333 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2334 SDLoc DL(N);
2335 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2336 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2337 SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
2338 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2339 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2340 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2341 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2342 CurDAG->RemoveDeadNode(N: N.getNode());
2343 return ExtSrc;
2344 }
2345 }
2346 }
2347 }
2348
2349 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2350 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2351 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2352 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2353 SDValue Src = N.getOperand(i: 0);
2354 unsigned SrcOpc = Src.getOpcode();
2355 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2356 CurDAG->isADDLike(Op: Src)) &&
2357 Src.hasOneUse()) {
2358 if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
2359 SDValue AddSrc = Src.getOperand(i: 0);
2360 uint64_t Offset = Src.getConstantOperandVal(i: 1);
2361 if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
2362 SDLoc DL(N);
2363 SDValue Res;
2364 // If we're also scaling, see if we can use that as well.
2365 if (AddSrc.getOpcode() == ISD::SHL &&
2366 isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
2367 SDValue ShVal = AddSrc.getOperand(i: 0);
2368 uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
2369 APInt HiBits =
2370 APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
2371 uint64_t ScaleAmt = 1ULL << ShAmt;
2372 if ((AM.Scale * ScaleAmt) <= 8 &&
2373 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2374 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
2375 AM.Scale *= ScaleAmt;
2376 SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
2377 SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
2378 N2: AddSrc.getOperand(i: 1));
2379 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
2380 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
2381 AddSrc = ExtShift;
2382 Res = ExtShVal;
2383 }
2384 }
2385 SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
2386 SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
2387 SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
2388 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
2389 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
2390 insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
2391 CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
2392 CurDAG->RemoveDeadNode(N: N.getNode());
2393 return Res ? Res : ExtSrc;
2394 }
2395 }
2396 }
2397 }
2398
2399 // TODO: Handle extensions, shifted masks etc.
2400 return N;
2401}
2402
2403bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2404 unsigned Depth) {
2405 SDLoc dl(N);
2406 LLVM_DEBUG({
2407 dbgs() << "MatchAddress: ";
2408 AM.dump(CurDAG);
2409 });
2410 // Limit recursion.
2411 if (Depth >= SelectionDAG::MaxRecursionDepth)
2412 return matchAddressBase(N, AM);
2413
2414 // If this is already a %rip relative address, we can only merge immediates
2415 // into it. Instead of handling this in every case, we handle it here.
2416 // RIP relative addressing: %rip + 32-bit displacement!
2417 if (AM.isRIPRelative()) {
2418 // FIXME: JumpTable and ExternalSymbol addresses currently don't like
2419 // displacements. It isn't very important, but this should be fixed for
2420 // consistency.
2421 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2422 return true;
2423
2424 if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
2425 if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
2426 return false;
2427 return true;
2428 }
2429
2430 switch (N.getOpcode()) {
2431 default: break;
2432 case ISD::LOCAL_RECOVER: {
2433 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2434 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
2435 // Use the symbol and don't prefix it.
2436 AM.MCSym = ESNode->getMCSymbol();
2437 return false;
2438 }
2439 break;
2440 }
2441 case ISD::Constant: {
2442 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2443 if (!foldOffsetIntoAddress(Offset: Val, AM))
2444 return false;
2445 break;
2446 }
2447
2448 case X86ISD::Wrapper:
2449 case X86ISD::WrapperRIP:
2450 if (!matchWrapper(N, AM))
2451 return false;
2452 break;
2453
2454 case ISD::LOAD:
2455 if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
2456 return false;
2457 break;
2458
2459 case ISD::FrameIndex:
2460 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2461 AM.Base_Reg.getNode() == nullptr &&
2462 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(Val: AM.Disp))) {
2463 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2464 AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
2465 return false;
2466 }
2467 break;
2468
2469 case ISD::SHL:
2470 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2471 break;
2472
2473 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
2474 unsigned Val = CN->getZExtValue();
2475 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2476 // that the base operand remains free for further matching. If
2477 // the base doesn't end up getting used, a post-processing step
2478 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2479 if (Val == 1 || Val == 2 || Val == 3) {
2480 SDValue ShVal = N.getOperand(i: 0);
2481 AM.Scale = 1 << Val;
2482 AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
2483 return false;
2484 }
2485 }
2486 break;
2487
2488 case ISD::SRL: {
2489 // Scale must not be used already.
2490 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2491
2492 // We only handle up to 64-bit values here as those are what matter for
2493 // addressing mode optimizations.
2494 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2495 "Unexpected value size!");
2496
2497 SDValue And = N.getOperand(i: 0);
2498 if (And.getOpcode() != ISD::AND) break;
2499 SDValue X = And.getOperand(i: 0);
2500
2501 // The mask used for the transform is expected to be post-shift, but we
2502 // found the shift first so just apply the shift to the mask before passing
2503 // it down.
2504 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
2505 !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
2506 break;
2507 uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);
2508
2509 // Try to fold the mask and shift into the scale, and return false if we
2510 // succeed.
2511 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
2512 return false;
2513 break;
2514 }
2515
2516 case ISD::SMUL_LOHI:
2517 case ISD::UMUL_LOHI:
2518 // A mul_lohi where we need the low part can be folded as a plain multiply.
2519 if (N.getResNo() != 0) break;
2520 [[fallthrough]];
2521 case ISD::MUL:
2522 case X86ISD::MUL_IMM:
2523 // X*[3,5,9] -> X+X*[2,4,8]
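    // e.g. "x * 9" can select to "leaq (%rax,%rax,8), %rcx" (sketch; assumes
    // x is in %rax).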
2524 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2525 AM.Base_Reg.getNode() == nullptr &&
2526 AM.IndexReg.getNode() == nullptr) {
2527 if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
2528 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2529 CN->getZExtValue() == 9) {
2530 AM.Scale = unsigned(CN->getZExtValue())-1;
2531
2532 SDValue MulVal = N.getOperand(i: 0);
2533 SDValue Reg;
2534
2535 // Okay, we know that we have a scale by now. However, if the scaled
2536 // value is an add of something and a constant, we can fold the
2537 // constant into the disp field here.
2538 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2539 isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
2540 Reg = MulVal.getOperand(i: 0);
2541 auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
2542 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2543 if (foldOffsetIntoAddress(Offset: Disp, AM))
2544 Reg = N.getOperand(i: 0);
2545 } else {
2546 Reg = N.getOperand(i: 0);
2547 }
2548
2549 AM.IndexReg = AM.Base_Reg = Reg;
2550 return false;
2551 }
2552 }
2553 break;
2554
2555 case ISD::SUB: {
2556 // Given A-B, if A can be completely folded into the address while
2557 // leaving the index field unused, use -B as the index.
2558 // This is a win if A has multiple parts that can be folded into
2559 // the address. Also, this saves a mov if the base register has
2560 // other uses, since it avoids a two-address sub instruction; however,
2561 // it costs an additional mov if the index register has other uses.
2562
2563 // Add an artificial use to this node so that we can keep track of
2564 // it if it gets CSE'd with a different node.
2565 HandleSDNode Handle(N);
2566
2567 // Test if the LHS of the sub can be folded.
2568 X86ISelAddressMode Backup = AM;
2569 if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
2570 N = Handle.getValue();
2571 AM = Backup;
2572 break;
2573 }
2574 N = Handle.getValue();
2575 // Test if the index field is free for use.
2576 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2577 AM = Backup;
2578 break;
2579 }
2580
2581 int Cost = 0;
2582 SDValue RHS = N.getOperand(i: 1);
2583 // If the RHS involves a register with multiple uses, this
2584 // transformation incurs an extra mov, due to the neg instruction
2585 // clobbering its operand.
2586 if (!RHS.getNode()->hasOneUse() ||
2587 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2588 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2589 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2590 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2591 RHS.getOperand(0).getValueType() == MVT::i32))
2592 ++Cost;
2593 // If the base is a register with multiple uses, this
2594 // transformation may save a mov.
2595 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2596 !AM.Base_Reg.getNode()->hasOneUse()) ||
2597 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2598 --Cost;
2599 // If the folded LHS was interesting, this transformation saves
2600 // address arithmetic.
2601 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2602 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2603 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2604 --Cost;
2605 // If it doesn't look like it may be an overall win, don't do it.
2606 if (Cost >= 0) {
2607 AM = Backup;
2608 break;
2609 }
2610
2611 // Ok, the transformation is legal and appears profitable. Go for it.
2612 // Negation will be emitted later to avoid creating dangling nodes if this
2613 // was an unprofitable LEA.
2614 AM.IndexReg = RHS;
2615 AM.NegateIndex = true;
2616 AM.Scale = 1;
2617 return false;
2618 }
2619
2620 case ISD::OR:
2621 case ISD::XOR:
2622 // See if we can treat the OR/XOR node as an ADD node.
2623 if (!CurDAG->isADDLike(Op: N))
2624 break;
2625 [[fallthrough]];
2626 case ISD::ADD:
2627 if (!matchAdd(N, AM, Depth))
2628 return false;
2629 break;
2630
2631 case ISD::AND: {
2632 // Perform some heroic transforms on an AND of a constant-count shift
2633 // with a constant to enable use of the scaled offset field.
2634
2635 // Scale must not be used already.
2636 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2637
2638 // We only handle up to 64-bit values here as those are what matter for
2639 // addressing mode optimizations.
2640 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2641 "Unexpected value size!");
2642
2643 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
2644 break;
2645
2646 if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
2647 SDValue Shift = N.getOperand(i: 0);
2648 SDValue X = Shift.getOperand(i: 0);
2649
2650 uint64_t Mask = N.getConstantOperandVal(i: 1);
2651
2652 // Try to fold the mask and shift into an extract and scale.
2653 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2654 return false;
2655
2656 // Try to fold the mask and shift directly into the scale.
2657 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
2658 return false;
2659
2660 // Try to fold the mask and shift into BEXTR and scale.
2661 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
2662 return false;
2663 }
2664
2665 // Try to swap the mask and shift to place shifts which can be done as
2666 // a scale on the outside of the mask.
2667 if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
2668 return false;
2669
2670 break;
2671 }
2672 case ISD::ZERO_EXTEND: {
2673 // Try to widen a zexted shift left to the same size as its use, so we can
2674 // match the shift as a scale factor.
2675 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2676 break;
2677
2678 SDValue Src = N.getOperand(i: 0);
2679
2680 // See if we can match a zext(addlike(x,c)).
2681 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2682 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2683 if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
2684 if (Index != N) {
2685 AM.IndexReg = Index;
2686 return false;
2687 }
2688
2689 // Peek through mask: zext(and(shl(x,c1),c2))
2690 APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
2691 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2692 if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
2693 Mask = MaskC->getAPIntValue();
2694 Src = Src.getOperand(i: 0);
2695 }
2696
2697 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
2698 // Give up if the shift is not a valid scale factor [1,2,3].
2699 SDValue ShlSrc = Src.getOperand(i: 0);
2700 SDValue ShlAmt = Src.getOperand(i: 1);
2701 auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
2702 if (!ShAmtC)
2703 break;
2704 unsigned ShAmtV = ShAmtC->getZExtValue();
2705 if (ShAmtV > 3)
2706 break;
2707
2708 // The narrow shift must only shift out zero bits (it must be 'nuw').
2709 // That makes it safe to widen to the destination type.
2710 APInt HighZeros =
2711 APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
2712 if (!Src->getFlags().hasNoUnsignedWrap() &&
2713 !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
2714 break;
2715
2716 // zext (shl nuw i8 %x, C1) to i32
2717 // --> shl (zext i8 %x to i32), (zext C1)
2718 // zext (and (shl nuw i8 %x, C1), C2) to i32
2719 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2720 MVT SrcVT = ShlSrc.getSimpleValueType();
2721 MVT VT = N.getSimpleValueType();
2722 SDLoc DL(N);
2723
2724 SDValue Res = ShlSrc;
2725 if (!Mask.isAllOnes()) {
2726 Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
2727 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2728 Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
2729 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
2730 }
2731 SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
2732 insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
2733 SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
2734 insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
2735 CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
2736 CurDAG->RemoveDeadNode(N: N.getNode());
2737
2738 // Convert the shift to scale factor.
2739 AM.Scale = 1 << ShAmtV;
2740 // If matchIndexRecursively were not called here, Zext might be
2741 // replaced by other nodes but still be used later when calling a
2742 // builder method.
2743 AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
2744 return false;
2745 }
2746
2747 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2748 // Try to fold the mask and shift into an extract and scale.
2749 if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2750 X: Src.getOperand(i: 0), AM))
2751 return false;
2752
2753 // Try to fold the mask and shift directly into the scale.
2754 if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2755 X: Src.getOperand(i: 0), AM))
2756 return false;
2757
2758 // Try to fold the mask and shift into BEXTR and scale.
2759 if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
2760 X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
2761 return false;
2762 }
2763
2764 break;
2765 }
2766 }
2767
2768 return matchAddressBase(N, AM);
2769}
2770
2771/// Helper for MatchAddress. Add the specified node to the
2772/// specified addressing mode without any further recursion.
2773bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2774 // Is the base register already occupied?
2775 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2776 // If so, check to see if the scale index register is set.
2777 if (!AM.IndexReg.getNode()) {
2778 AM.IndexReg = N;
2779 AM.Scale = 1;
2780 return false;
2781 }
2782
2783 // Otherwise, we cannot select it.
2784 return true;
2785 }
2786
2787 // Default, generate it as a register.
2788 AM.BaseType = X86ISelAddressMode::RegBase;
2789 AM.Base_Reg = N;
2790 return false;
2791}
2792
2793bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2794 X86ISelAddressMode &AM,
2795 unsigned Depth) {
2796 SDLoc dl(N);
2797 LLVM_DEBUG({
2798 dbgs() << "MatchVectorAddress: ";
2799 AM.dump(CurDAG);
2800 });
2801 // Limit recursion.
2802 if (Depth >= SelectionDAG::MaxRecursionDepth)
2803 return matchAddressBase(N, AM);
2804
2805 // TODO: Support other operations.
2806 switch (N.getOpcode()) {
2807 case ISD::Constant: {
2808 uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
2809 if (!foldOffsetIntoAddress(Offset: Val, AM))
2810 return false;
2811 break;
2812 }
2813 case X86ISD::Wrapper:
2814 if (!matchWrapper(N, AM))
2815 return false;
2816 break;
2817 case ISD::ADD: {
2818 // Add an artificial use to this node so that we can keep track of
2819 // it if it gets CSE'd with a different node.
2820 HandleSDNode Handle(N);
2821
2822 X86ISelAddressMode Backup = AM;
2823 if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
2824 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2825 Depth: Depth + 1))
2826 return false;
2827 AM = Backup;
2828
2829 // Try again after commuting the operands.
2830 if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
2831 Depth: Depth + 1) &&
2832 !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
2833 Depth: Depth + 1))
2834 return false;
2835 AM = Backup;
2836
2837 N = Handle.getValue();
2838 break;
2839 }
2840 }
2841
2842 return matchAddressBase(N, AM);
2843}
2844
2845/// Helper for selectVectorAddr. Handles things that can be folded into a
2846/// gather/scatter address. The index register and scale should have already
2847/// been handled.
2848bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2849 return matchVectorAddressRecursively(N, AM, Depth: 0);
2850}
2851
2852bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2853 SDValue IndexOp, SDValue ScaleOp,
2854 SDValue &Base, SDValue &Scale,
2855 SDValue &Index, SDValue &Disp,
2856 SDValue &Segment) {
2857 X86ISelAddressMode AM;
2858 AM.Scale = ScaleOp->getAsZExtVal();
2859
2860 // Attempt to match index patterns, as long as we're not relying on implicit
2861 // sign-extension, which is performed BEFORE scale.
2862 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2863 AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
2864 else
2865 AM.IndexReg = IndexOp;
2866
2867 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2868 if (AddrSpace == X86AS::GS)
2869 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2870 if (AddrSpace == X86AS::FS)
2871 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2872 if (AddrSpace == X86AS::SS)
2873 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2874
2875 SDLoc DL(BasePtr);
2876 MVT VT = BasePtr.getSimpleValueType();
2877
2878 // Try to match into the base and displacement fields.
2879 if (matchVectorAddress(N: BasePtr, AM))
2880 return false;
2881
2882 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2883 return true;
2884}
2885
2886/// Returns true if it is able to pattern match an addressing mode.
2887/// It returns, by reference, the operands that make up the maximal
2888/// addressing mode it can match.
2889///
2890/// Parent is the parent node of the addr operand that is being matched. It
2891/// is always a load, store, atomic node, or null. It is only null when
2892/// checking memory operands for inline asm nodes.
2893bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2894 SDValue &Scale, SDValue &Index,
2895 SDValue &Disp, SDValue &Segment) {
2896 X86ISelAddressMode AM;
2897
2898 if (Parent &&
2899 // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
2900 // that are not a MemSDNode, and thus don't have proper addrspace info.
2901 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2902 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2903 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2904 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2905 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2906 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2907 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2908 unsigned AddrSpace =
2909 cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
2910 if (AddrSpace == X86AS::GS)
2911 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2912 if (AddrSpace == X86AS::FS)
2913 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2914 if (AddrSpace == X86AS::SS)
2915 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2916 }
2917
2918 // Save the DL and VT before calling matchAddress, it can invalidate N.
2919 SDLoc DL(N);
2920 MVT VT = N.getSimpleValueType();
2921
2922 if (matchAddress(N, AM))
2923 return false;
2924
2925 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2926 return true;
2927}
2928
2929bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2930 // Cannot use 32 bit constants to reference objects in kernel/large code
2931 // model.
2932 if (TM.getCodeModel() == CodeModel::Kernel ||
2933 TM.getCodeModel() == CodeModel::Large)
2934 return false;
2935
2936 // In static codegen with small code model, we can get the address of a label
2937 // into a register with 'movl'
2938 if (N->getOpcode() != X86ISD::Wrapper)
2939 return false;
2940
2941 N = N.getOperand(i: 0);
2942
2943 // At least GNU as does not accept 'movl' for TPOFF relocations.
2944 // FIXME: We could use 'movl' when we know we are targeting MC.
2945 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2946 return false;
2947
2948 Imm = N;
  // Small/medium code model can reference non-TargetGlobalAddress objects with
  // 32-bit constants.
2951 if (N->getOpcode() != ISD::TargetGlobalAddress) {
2952 return TM.getCodeModel() == CodeModel::Small ||
2953 TM.getCodeModel() == CodeModel::Medium;
2954 }
2955
2956 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal();
2957 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
2958 return CR->getUnsignedMax().ult(RHS: 1ull << 32);
2959
2960 return !TM.isLargeGlobalValue(GV);
2961}
2962
2963bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2964 SDValue &Scale, SDValue &Index,
2965 SDValue &Disp, SDValue &Segment) {
2966 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2967 SDLoc DL(N);
2968
2969 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2970 return false;
2971
2972 auto *RN = dyn_cast<RegisterSDNode>(Val&: Base);
2973 if (RN && RN->getReg() == 0)
2974 Base = CurDAG->getRegister(0, MVT::i64);
2975 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2976 // Base could already be %rip, particularly in the x32 ABI.
2977 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2978 MVT::i64), 0);
2979 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2980 Base);
2981 }
2982
2983 RN = dyn_cast<RegisterSDNode>(Val&: Index);
2984 if (RN && RN->getReg() == 0)
2985 Index = CurDAG->getRegister(0, MVT::i64);
2986 else {
2987 assert(Index.getValueType() == MVT::i32 &&
2988 "Expect to be extending 32-bit registers for use in LEA");
2989 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2990 MVT::i64), 0);
2991 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2992 Index);
2993 }
2994
2995 return true;
2996}
2997
/// Calls matchAddress and determines if the maximal addressing
/// mode it matches can be cost-effectively emitted as an LEA instruction.
3000bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3001 SDValue &Base, SDValue &Scale,
3002 SDValue &Index, SDValue &Disp,
3003 SDValue &Segment) {
3004 X86ISelAddressMode AM;
3005
  // Save the DL and VT before calling matchAddress; it can invalidate N.
3007 SDLoc DL(N);
3008 MVT VT = N.getSimpleValueType();
3009
  // Set AM.Segment to prevent matchAddress from using one. LEA doesn't support
  // segments.
3012 SDValue Copy = AM.Segment;
3013 SDValue T = CurDAG->getRegister(0, MVT::i32);
3014 AM.Segment = T;
3015 if (matchAddress(N, AM))
3016 return false;
  assert(T == AM.Segment);
3018 AM.Segment = Copy;
3019
3020 unsigned Complexity = 0;
3021 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3022 Complexity = 1;
3023 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3024 Complexity = 4;
3025
3026 if (AM.IndexReg.getNode())
3027 Complexity++;
3028
  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
  // use a simple shift.
3031 if (AM.Scale > 1)
3032 Complexity++;
3033
3034 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3035 // to a LEA. This is determined with some experimentation but is by no means
3036 // optimal (especially for code size consideration). LEA is nice because of
3037 // its three-address nature. Tweak the cost function again when we can run
3038 // convertToThreeAddress() at register allocation time.
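  // Illustrative example (not from the original source): a single
  //   leal 8(%ebx,%ecx,4), %eax
  // computes %ebx + 4*%ecx + 8 into a third register without clobbering
  // either source operand, which would otherwise take a shift plus two adds.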
3039 if (AM.hasSymbolicDisplacement()) {
3040 // For X86-64, always use LEA to materialize RIP-relative addresses.
3041 if (Subtarget->is64Bit())
3042 Complexity = 4;
3043 else
3044 Complexity += 2;
3045 }
3046
  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we are less likely to need to
  // duplicate flag-producing instructions later in the pipeline.
3050 if (N.getOpcode() == ISD::ADD) {
3051 auto isMathWithFlags = [](SDValue V) {
3052 switch (V.getOpcode()) {
3053 case X86ISD::ADD:
3054 case X86ISD::SUB:
3055 case X86ISD::ADC:
3056 case X86ISD::SBB:
3057 case X86ISD::SMUL:
3058 case X86ISD::UMUL:
3059 /* TODO: These opcodes can be added safely, but we may want to justify
3060 their inclusion for different reasons (better for reg-alloc).
3061 case X86ISD::OR:
3062 case X86ISD::XOR:
3063 case X86ISD::AND:
3064 */
3065 // Value 1 is the flag output of the node - verify it's not dead.
3066 return !SDValue(V.getNode(), 1).use_empty();
3067 default:
3068 return false;
3069 }
3070 };
3071 // TODO: We might want to factor in whether there's a load folding
3072 // opportunity for the math op that disappears with LEA.
3073 if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
3074 Complexity++;
3075 }
3076
3077 if (AM.Disp)
3078 Complexity++;
3079
3080 // If it isn't worth using an LEA, reject it.
3081 if (Complexity <= 2)
3082 return false;
3083
3084 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3085 return true;
3086}
3087
/// This is only run on TargetGlobalTLSAddress and TargetExternalSymbol nodes.
3089bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3090 SDValue &Scale, SDValue &Index,
3091 SDValue &Disp, SDValue &Segment) {
3092 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3093 N.getOpcode() == ISD::TargetExternalSymbol);
3094
3095 X86ISelAddressMode AM;
3096 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) {
3097 AM.GV = GA->getGlobal();
3098 AM.Disp += GA->getOffset();
3099 AM.SymbolFlags = GA->getTargetFlags();
3100 } else {
3101 auto *SA = cast<ExternalSymbolSDNode>(Val&: N);
3102 AM.ES = SA->getSymbol();
3103 AM.SymbolFlags = SA->getTargetFlags();
3104 }
3105
3106 if (Subtarget->is32Bit()) {
3107 AM.Scale = 1;
3108 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3109 }
3110
3111 MVT VT = N.getSimpleValueType();
3112 getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3113 return true;
3114}
3115
3116bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3117 // Keep track of the original value type and whether this value was
3118 // truncated. If we see a truncation from pointer type to VT that truncates
3119 // bits that are known to be zero, we can use a narrow reference.
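  // Illustrative example (not from the original source): a global declared
  // with !absolute_symbol !{i64 0, i64 65536} is known to live in the low
  // 64K of the address space, so a truncation of its address to i16 can be
  // selected as a narrow 16-bit reference.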
3120 EVT VT = N.getValueType();
3121 bool WasTruncated = false;
3122 if (N.getOpcode() == ISD::TRUNCATE) {
3123 WasTruncated = true;
3124 N = N.getOperand(i: 0);
3125 }
3126
3127 if (N.getOpcode() != X86ISD::Wrapper)
3128 return false;
3129
3130 // We can only use non-GlobalValues as immediates if they were not truncated,
3131 // as we do not have any range information. If we have a GlobalValue and the
3132 // address was not truncated, we can select it as an operand directly.
3133 unsigned Opc = N.getOperand(i: 0)->getOpcode();
3134 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3135 Op = N.getOperand(i: 0);
3136 // We can only select the operand directly if we didn't have to look past a
3137 // truncate.
3138 return !WasTruncated;
3139 }
3140
3141 // Check that the global's range fits into VT.
3142 auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0));
3143 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3144 if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits()))
3145 return false;
3146
3147 // Okay, we can use a narrow reference.
3148 Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT,
3149 offset: GA->getOffset(), TargetFlags: GA->getTargetFlags());
3150 return true;
3151}
3152
3153bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3154 SDValue &Base, SDValue &Scale,
3155 SDValue &Index, SDValue &Disp,
3156 SDValue &Segment) {
3157 assert(Root && P && "Unknown root/parent nodes");
3158 if (!ISD::isNON_EXTLoad(N: N.getNode()) ||
3159 !IsProfitableToFold(N, U: P, Root) ||
3160 !IsLegalToFold(N, U: P, Root, OptLevel))
3161 return false;
3162
3163 return selectAddr(Parent: N.getNode(),
3164 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3165}
3166
3167bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3168 SDValue &Base, SDValue &Scale,
3169 SDValue &Index, SDValue &Disp,
3170 SDValue &Segment) {
3171 assert(Root && P && "Unknown root/parent nodes");
3172 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3173 !IsProfitableToFold(N, U: P, Root) ||
3174 !IsLegalToFold(N, U: P, Root, OptLevel))
3175 return false;
3176
3177 return selectAddr(Parent: N.getNode(),
3178 N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment);
3179}
3180
3181/// Return an SDNode that returns the value of the global base register.
3182/// Output instructions required to initialize the global base register,
3183/// if necessary.
3184SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3185 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3186 auto &DL = MF->getDataLayout();
3187 return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode();
3188}
3189
3190bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3191 if (N->getOpcode() == ISD::TRUNCATE)
3192 N = N->getOperand(Num: 0).getNode();
3193 if (N->getOpcode() != X86ISD::Wrapper)
3194 return false;
3195
3196 auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
3197 if (!GA)
3198 return false;
3199
3200 auto *GV = GA->getGlobal();
3201 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3202 if (CR)
3203 return CR->getSignedMin().sge(RHS: -1ull << Width) &&
3204 CR->getSignedMax().slt(RHS: 1ull << Width);
3205 // In the kernel code model, globals are in the negative 2GB of the address
3206 // space, so globals can be a sign extended 32-bit immediate.
3207 // In other code models, small globals are in the low 2GB of the address
3208 // space, so sign extending them is equivalent to zero extending them.
3209 return Width == 32 && !TM.isLargeGlobalValue(GV);
3210}
3211
3212X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3213 assert(N->isMachineOpcode() && "Unexpected node");
3214 unsigned Opc = N->getMachineOpcode();
3215 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3216 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3217 if (CondNo < 0)
3218 return X86::COND_INVALID;
3219
3220 return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo));
3221}
3222
/// Return true if the given X86ISD::CMP node has no users that use a flag
/// other than ZF.
3225bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3226 // Examine each user of the node.
3227 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3228 UI != UE; ++UI) {
3229 // Only check things that use the flags.
3230 if (UI.getUse().getResNo() != Flags.getResNo())
3231 continue;
3232 // Only examine CopyToReg uses that copy to EFLAGS.
3233 if (UI->getOpcode() != ISD::CopyToReg ||
3234 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3235 return false;
3236 // Examine each user of the CopyToReg use.
3237 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3238 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3239 // Only examine the Flag result.
3240 if (FlagUI.getUse().getResNo() != 1) continue;
3241 // Anything unusual: assume conservatively.
3242 if (!FlagUI->isMachineOpcode()) return false;
3243 // Examine the condition code of the user.
3244 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3245
3246 switch (CC) {
3247 // Comparisons which only use the zero flag.
3248 case X86::COND_E: case X86::COND_NE:
3249 continue;
3250 // Anything else: assume conservatively.
3251 default:
3252 return false;
3253 }
3254 }
3255 }
3256 return true;
3257}
3258
/// Return true if the given X86ISD::CMP node has no uses which require the SF
/// flag to be accurate.
3261bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3262 // Examine each user of the node.
3263 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3264 UI != UE; ++UI) {
3265 // Only check things that use the flags.
3266 if (UI.getUse().getResNo() != Flags.getResNo())
3267 continue;
3268 // Only examine CopyToReg uses that copy to EFLAGS.
3269 if (UI->getOpcode() != ISD::CopyToReg ||
3270 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3271 return false;
3272 // Examine each user of the CopyToReg use.
3273 for (SDNode::use_iterator FlagUI = UI->use_begin(),
3274 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
3275 // Only examine the Flag result.
3276 if (FlagUI.getUse().getResNo() != 1) continue;
3277 // Anything unusual: assume conservatively.
3278 if (!FlagUI->isMachineOpcode()) return false;
3279 // Examine the condition code of the user.
3280 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3281
3282 switch (CC) {
3283 // Comparisons which don't examine the SF flag.
3284 case X86::COND_A: case X86::COND_AE:
3285 case X86::COND_B: case X86::COND_BE:
3286 case X86::COND_E: case X86::COND_NE:
3287 case X86::COND_O: case X86::COND_NO:
3288 case X86::COND_P: case X86::COND_NP:
3289 continue;
3290 // Anything else: assume conservatively.
3291 default:
3292 return false;
3293 }
3294 }
3295 }
3296 return true;
3297}
3298
3299static bool mayUseCarryFlag(X86::CondCode CC) {
3300 switch (CC) {
3301 // Comparisons which don't examine the CF flag.
3302 case X86::COND_O: case X86::COND_NO:
3303 case X86::COND_E: case X86::COND_NE:
3304 case X86::COND_S: case X86::COND_NS:
3305 case X86::COND_P: case X86::COND_NP:
3306 case X86::COND_L: case X86::COND_GE:
3307 case X86::COND_G: case X86::COND_LE:
3308 return false;
3309 // Anything else: assume conservatively.
3310 default:
3311 return true;
3312 }
3313}
3314
/// Return true if the given flag-setting node has no uses which require the
/// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3318 // Examine each user of the node.
3319 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
3320 UI != UE; ++UI) {
3321 // Only check things that use the flags.
3322 if (UI.getUse().getResNo() != Flags.getResNo())
3323 continue;
3324
3325 unsigned UIOpc = UI->getOpcode();
3326
3327 if (UIOpc == ISD::CopyToReg) {
3328 // Only examine CopyToReg uses that copy to EFLAGS.
3329 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
3330 return false;
3331 // Examine each user of the CopyToReg use.
3332 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
3333 FlagUI != FlagUE; ++FlagUI) {
3334 // Only examine the Flag result.
3335 if (FlagUI.getUse().getResNo() != 1)
3336 continue;
3337 // Anything unusual: assume conservatively.
3338 if (!FlagUI->isMachineOpcode())
3339 return false;
3340 // Examine the condition code of the user.
3341 X86::CondCode CC = getCondFromNode(N: *FlagUI);
3342
3343 if (mayUseCarryFlag(CC))
3344 return false;
3345 }
3346
3347 // This CopyToReg is ok. Move on to the next user.
3348 continue;
3349 }
3350
3351 // This might be an unselected node. So look for the pre-isel opcodes that
3352 // use flags.
3353 unsigned CCOpNo;
3354 switch (UIOpc) {
3355 default:
3356 // Something unusual. Be conservative.
3357 return false;
3358 case X86ISD::SETCC: CCOpNo = 0; break;
3359 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3360 case X86ISD::CMOV: CCOpNo = 2; break;
3361 case X86ISD::BRCOND: CCOpNo = 2; break;
3362 }
3363
3364 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(Num: CCOpNo);
3365 if (mayUseCarryFlag(CC))
3366 return false;
3367 }
3368 return true;
3369}
3370
/// Check whether the chain ending in StoreNode is suitable for the
/// {load; op; store} read-modify-write transformation.
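/// Illustrative example (not from the original source): conceptually,
///   %v = load i32, ptr %p
///   %r = add i32 %v, 5
///   store i32 %r, ptr %p
/// can become the single memory-form instruction `addl $5, (%p)`, provided
/// the load, op, and store satisfy the checks below.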
3373static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3374 SDValue StoredVal, SelectionDAG *CurDAG,
3375 unsigned LoadOpNo,
3376 LoadSDNode *&LoadNode,
3377 SDValue &InputChain) {
3378 // Is the stored value result 0 of the operation?
3379 if (StoredVal.getResNo() != 0) return false;
3380
3381 // Are there other uses of the operation other than the store?
3382 if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;
3383
3384 // Is the store non-extending and non-indexed?
3385 if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
3386 return false;
3387
3388 SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
3389 // Is the stored value a non-extending and non-indexed load?
3390 if (!ISD::isNormalLoad(N: Load.getNode())) return false;
3391
3392 // Return LoadNode by reference.
3393 LoadNode = cast<LoadSDNode>(Val&: Load);
3394
  // Is the store the only user of the loaded value?
3396 if (!Load.hasOneUse())
3397 return false;
3398
3399 // Is the address of the store the same as the load?
3400 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3401 LoadNode->getOffset() != StoreNode->getOffset())
3402 return false;
3403
3404 bool FoundLoad = false;
3405 SmallVector<SDValue, 4> ChainOps;
3406 SmallVector<const SDNode *, 4> LoopWorklist;
3407 SmallPtrSet<const SDNode *, 16> Visited;
3408 const unsigned int Max = 1024;
3409
3410 // Visualization of Load-Op-Store fusion:
3411 // -------------------------
3412 // Legend:
3413 // *-lines = Chain operand dependencies.
3414 // |-lines = Normal operand dependencies.
3415 // Dependencies flow down and right. n-suffix references multiple nodes.
3416 //
3417 // C Xn C
3418 // * * *
3419 // * * *
3420 // Xn A-LD Yn TF Yn
3421 // * * \ | * |
3422 // * * \ | * |
3423 // * * \ | => A--LD_OP_ST
3424 // * * \| \
3425 // TF OP \
3426 // * | \ Zn
3427 // * | \
3428 // A-ST Zn
3429 //
3430
  // This merge induces dependences: #1: Xn -> LD, OP, Zn
  //                                 #2: Yn -> LD
  //                                 #3: ST -> Zn
3434
3435 // Ensure the transform is safe by checking for the dual
3436 // dependencies to make sure we do not induce a loop.
3437
3438 // As LD is a predecessor to both OP and ST we can do this by checking:
3439 // a). if LD is a predecessor to a member of Xn or Yn.
3440 // b). if a Zn is a predecessor to ST.
3441
3442 // However, (b) can only occur through being a chain predecessor to
3443 // ST, which is the same as Zn being a member or predecessor of Xn,
3444 // which is a subset of LD being a predecessor of Xn. So it's
3445 // subsumed by check (a).
3446
3447 SDValue Chain = StoreNode->getChain();
3448
3449 // Gather X elements in ChainOps.
3450 if (Chain == Load.getValue(R: 1)) {
3451 FoundLoad = true;
3452 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3453 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3454 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3455 SDValue Op = Chain.getOperand(i);
3456 if (Op == Load.getValue(R: 1)) {
3457 FoundLoad = true;
3458 // Drop Load, but keep its chain. No cycle check necessary.
3459 ChainOps.push_back(Elt: Load.getOperand(i: 0));
3460 continue;
3461 }
3462 LoopWorklist.push_back(Elt: Op.getNode());
3463 ChainOps.push_back(Elt: Op);
3464 }
3465 }
3466
3467 if (!FoundLoad)
3468 return false;
3469
3470 // Worklist is currently Xn. Add Yn to worklist.
3471 for (SDValue Op : StoredVal->ops())
3472 if (Op.getNode() != LoadNode)
3473 LoopWorklist.push_back(Elt: Op.getNode());
3474
3475 // Check (a) if Load is a predecessor to Xn + Yn
3476 if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
3477 TopologicalPrune: true))
3478 return false;
3479
3480 InputChain =
3481 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3482 return true;
3483}
3484
3485// Change a chain of {load; op; store} of the same value into a simple op
3486// through memory of that value, if the uses of the modified value and its
3487// address are suitable.
3488//
// The tablegen memory-operand pattern is currently not able to match the case
// where the EFLAGS on the original operation are used.
3491//
3492// To move this to tablegen, we'll need to improve tablegen to allow flags to
3493// be transferred from a node in the pattern to the result node, probably with
3494// a new keyword. For example, we have this
3495// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3496// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3497// (implicit EFLAGS)]>;
3498// but maybe need something like this
3499// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3500// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
3501// (transferrable EFLAGS)]>;
3502//
3503// Until then, we manually fold these and instruction select the operation
3504// here.
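// Illustrative sketch (not from the original source): for a DAG like
//   t0 = X86ISD::ADD (load %p), -1    // flag result used by a branch
//   store t0, %p
// the manual fold below can still emit `decq (%p)` and let the branch read
// the EFLAGS that DEC defines, which the pattern above cannot express.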
3505bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3506 auto *StoreNode = cast<StoreSDNode>(Val: Node);
3507 SDValue StoredVal = StoreNode->getOperand(Num: 1);
3508 unsigned Opc = StoredVal->getOpcode();
3509
  // Before we try to select anything, make sure this is a memory operand size
  // and an opcode we can handle. Note that this must match the code below that
3512 // actually lowers the opcodes.
3513 EVT MemVT = StoreNode->getMemoryVT();
3514 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3515 MemVT != MVT::i8)
3516 return false;
3517
3518 bool IsCommutable = false;
3519 bool IsNegate = false;
3520 switch (Opc) {
3521 default:
3522 return false;
3523 case X86ISD::SUB:
3524 IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
3525 break;
3526 case X86ISD::SBB:
3527 break;
3528 case X86ISD::ADD:
3529 case X86ISD::ADC:
3530 case X86ISD::AND:
3531 case X86ISD::OR:
3532 case X86ISD::XOR:
3533 IsCommutable = true;
3534 break;
3535 }
3536
3537 unsigned LoadOpNo = IsNegate ? 1 : 0;
3538 LoadSDNode *LoadNode = nullptr;
3539 SDValue InputChain;
3540 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3541 LoadNode, InputChain)) {
3542 if (!IsCommutable)
3543 return false;
3544
3545 // This operation is commutable, try the other operand.
3546 LoadOpNo = 1;
3547 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3548 LoadNode, InputChain))
3549 return false;
3550 }
3551
3552 SDValue Base, Scale, Index, Disp, Segment;
3553 if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3554 Segment))
3555 return false;
3556
3557 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3558 unsigned Opc8) {
3559 switch (MemVT.getSimpleVT().SimpleTy) {
3560 case MVT::i64:
3561 return Opc64;
3562 case MVT::i32:
3563 return Opc32;
3564 case MVT::i16:
3565 return Opc16;
3566 case MVT::i8:
3567 return Opc8;
3568 default:
3569 llvm_unreachable("Invalid size!");
3570 }
3571 };
3572
3573 MachineSDNode *Result;
3574 switch (Opc) {
3575 case X86ISD::SUB:
3576 // Handle negate.
3577 if (IsNegate) {
3578 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3579 X86::NEG8m);
3580 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3581 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3582 MVT::Other, Ops);
3583 break;
3584 }
3585 [[fallthrough]];
3586 case X86ISD::ADD:
3587 // Try to match inc/dec.
3588 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3589 bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
3590 bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
      // An ADD/SUB of 1/-1 whose carry flag is unused can be selected as
      // INC/DEC.
3592 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
3593 unsigned NewOpc =
3594 ((Opc == X86ISD::ADD) == IsOne)
3595 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3596 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3597 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3598 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3599 MVT::Other, Ops);
3600 break;
3601 }
3602 }
3603 [[fallthrough]];
3604 case X86ISD::ADC:
3605 case X86ISD::SBB:
3606 case X86ISD::AND:
3607 case X86ISD::OR:
3608 case X86ISD::XOR: {
3609 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3610 switch (Opc) {
3611 case X86ISD::ADD:
3612 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3613 X86::ADD8mr);
3614 case X86ISD::ADC:
3615 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3616 X86::ADC8mr);
3617 case X86ISD::SUB:
3618 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3619 X86::SUB8mr);
3620 case X86ISD::SBB:
3621 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3622 X86::SBB8mr);
3623 case X86ISD::AND:
3624 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3625 X86::AND8mr);
3626 case X86ISD::OR:
3627 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3628 case X86ISD::XOR:
3629 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3630 X86::XOR8mr);
3631 default:
3632 llvm_unreachable("Invalid opcode!");
3633 }
3634 };
3635 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3636 switch (Opc) {
3637 case X86ISD::ADD:
3638 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3639 X86::ADD8mi);
3640 case X86ISD::ADC:
3641 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3642 X86::ADC8mi);
3643 case X86ISD::SUB:
3644 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3645 X86::SUB8mi);
3646 case X86ISD::SBB:
3647 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3648 X86::SBB8mi);
3649 case X86ISD::AND:
3650 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3651 X86::AND8mi);
3652 case X86ISD::OR:
3653 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3654 X86::OR8mi);
3655 case X86ISD::XOR:
3656 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3657 X86::XOR8mi);
3658 default:
3659 llvm_unreachable("Invalid opcode!");
3660 }
3661 };
3662
3663 unsigned NewOpc = SelectRegOpcode(Opc);
3664 SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);
3665
3666 // See if the operand is a constant that we can fold into an immediate
3667 // operand.
3668 if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
3669 int64_t OperandV = OperandC->getSExtValue();
3670
3671 // Check if we can shrink the operand enough to fit in an immediate (or
3672 // fit into a smaller immediate) by negating it and switching the
3673 // operation.
3674 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3675 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3676 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3677 isInt<32>(-OperandV))) &&
3678 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3679 OperandV = -OperandV;
3680 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3681 }
3682
3683 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3684 Operand = CurDAG->getTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
3685 NewOpc = SelectImmOpcode(Opc);
3686 }
3687 }
3688
3689 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3690 SDValue CopyTo =
3691 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3692 StoredVal.getOperand(2), SDValue());
3693
3694 const SDValue Ops[] = {Base, Scale, Index, Disp,
3695 Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
3696 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3697 Ops);
3698 } else {
3699 const SDValue Ops[] = {Base, Scale, Index, Disp,
3700 Segment, Operand, InputChain};
3701 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3702 Ops);
3703 }
3704 break;
3705 }
3706 default:
3707 llvm_unreachable("Invalid opcode!");
3708 }
3709
3710 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3711 LoadNode->getMemOperand()};
3712 CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);
3713
3714 // Update Load Chain uses as well.
3715 ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
3716 ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
3717 ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
3718 CurDAG->RemoveDeadNode(N: Node);
3719 return true;
3720}
3721
3722// See if this is an X & Mask that we can match to BEXTR/BZHI.
3723// Where Mask is one of the following patterns:
3724// a) x & (1 << nbits) - 1
3725// b) x & ~(-1 << nbits)
3726// c) x & (-1 >> (32 - y))
3727// d) x << (32 - y) >> (32 - y)
3728// e) (1 << nbits) - 1
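// A minimal sketch of the target semantics (illustrative, not from the
// original source): BZHI computes x & ((1 << n) - 1), i.e. it keeps the low
// n bits of x, while BEXTR extracts a contiguous bit field described by a
// packed (shift, length) control; pattern e) feeds BZHI an all-ones source.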
3729bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3730 assert(
3731 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3732 Node->getOpcode() == ISD::SRL) &&
3733 "Should be either an and-mask, or right-shift after clearing high bits.");
3734
  // BEXTR is a BMI instruction and BZHI is a BMI2 instruction. We need at
  // least one of them.
3736 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3737 return false;
3738
3739 MVT NVT = Node->getSimpleValueType(ResNo: 0);
3740
3741 // Only supported for 32 and 64 bits.
3742 if (NVT != MVT::i32 && NVT != MVT::i64)
3743 return false;
3744
3745 SDValue NBits;
3746 bool NegateNBits;
3747
  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
3750 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3751 auto checkUses = [AllowExtraUsesByDefault](
3752 SDValue Op, unsigned NUses,
3753 std::optional<bool> AllowExtraUses) {
3754 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3755 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3756 };
3757 auto checkOneUse = [checkUses](SDValue Op,
3758 std::optional<bool> AllowExtraUses =
3759 std::nullopt) {
3760 return checkUses(Op, 1, AllowExtraUses);
3761 };
3762 auto checkTwoUse = [checkUses](SDValue Op,
3763 std::optional<bool> AllowExtraUses =
3764 std::nullopt) {
3765 return checkUses(Op, 2, AllowExtraUses);
3766 };
3767
3768 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3769 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3770 assert(V.getSimpleValueType() == MVT::i32 &&
3771 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3772 "Expected i64 -> i32 truncation");
3773 V = V.getOperand(i: 0);
3774 }
3775 return V;
3776 };
3777
3778 // a) x & ((1 << nbits) + (-1))
3779 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3780 &NegateNBits](SDValue Mask) -> bool {
3781 // Match `add`. Must only have one use!
3782 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3783 return false;
    // We should be adding an all-ones constant (i.e. subtracting one).
3785 if (!isAllOnesConstant(V: Mask->getOperand(Num: 1)))
3786 return false;
3787 // Match `1 << nbits`. Might be truncated. Must only have one use!
3788 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3789 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3790 return false;
3791 if (!isOneConstant(V: M0->getOperand(Num: 0)))
3792 return false;
3793 NBits = M0->getOperand(Num: 1);
3794 NegateNBits = false;
3795 return true;
3796 };
3797
3798 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3799 V = peekThroughOneUseTruncation(V);
3800 return CurDAG->MaskedValueIsAllOnes(
3801 Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(),
3802 loBitsSet: NVT.getSizeInBits()));
3803 };
3804
3805 // b) x & ~(-1 << nbits)
3806 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3807 &NBits, &NegateNBits](SDValue Mask) -> bool {
3808 // Match `~()`. Must only have one use!
3809 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3810 return false;
3811 // The -1 only has to be all-ones for the final Node's NVT.
3812 if (!isAllOnes(Mask->getOperand(Num: 1)))
3813 return false;
3814 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3815 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0));
3816 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3817 return false;
3818 // The -1 only has to be all-ones for the final Node's NVT.
3819 if (!isAllOnes(M0->getOperand(Num: 0)))
3820 return false;
3821 NBits = M0->getOperand(Num: 1);
3822 NegateNBits = false;
3823 return true;
3824 };
3825
3826 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3827 // or leave the shift amount as-is, but then we'll have to negate it.
3828 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3829 unsigned Bitwidth) {
3830 NBits = ShiftAmt;
3831 NegateNBits = true;
3832 // Skip over a truncate of the shift amount, if any.
3833 if (NBits.getOpcode() == ISD::TRUNCATE)
3834 NBits = NBits.getOperand(i: 0);
3835 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3836 // If it doesn't match, that's fine, we'll just negate it ourselves.
3837 if (NBits.getOpcode() != ISD::SUB)
3838 return;
3839 auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0));
3840 if (!V0 || V0->getZExtValue() != Bitwidth)
3841 return;
3842 NBits = NBits.getOperand(i: 1);
3843 NegateNBits = false;
3844 };
3845
3846 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3847 // or
3848 // c) x & (-1 >> (32 - y))
3849 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3850 canonicalizeShiftAmt](SDValue Mask) -> bool {
3851 // The mask itself may be truncated.
3852 Mask = peekThroughOneUseTruncation(Mask);
3853 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3854 // Match `l>>`. Must only have one use!
3855 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3856 return false;
3857 // We should be shifting truly all-ones constant.
3858 if (!isAllOnesConstant(V: Mask.getOperand(i: 0)))
3859 return false;
3860 SDValue M1 = Mask.getOperand(i: 1);
3861 // The shift amount should not be used externally.
3862 if (!checkOneUse(M1))
3863 return false;
3864 canonicalizeShiftAmt(M1, Bitwidth);
3865 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3866 // is no extra use of the mask. Clearly, there was one since we are here.
3867 // But at the same time, if we need to negate the shift amount,
3868 // then we don't want the mask to stick around, else it's unprofitable.
3869 return !NegateNBits;
3870 };
3871
3872 SDValue X;
3873
3874 // d) x << z >> z but then we'll have to subtract z from bitwidth
3875 // or
3876 // d) x << (32 - y) >> (32 - y)
3877 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3878 AllowExtraUsesByDefault, &NegateNBits,
3879 &X](SDNode *Node) -> bool {
3880 if (Node->getOpcode() != ISD::SRL)
3881 return false;
3882 SDValue N0 = Node->getOperand(Num: 0);
3883 if (N0->getOpcode() != ISD::SHL)
3884 return false;
3885 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3886 SDValue N1 = Node->getOperand(Num: 1);
3887 SDValue N01 = N0->getOperand(Num: 1);
3888 // Both of the shifts must be by the exact same value.
3889 if (N1 != N01)
3890 return false;
3891 canonicalizeShiftAmt(N1, Bitwidth);
3892 // There should not be any external uses of the inner shift / shift amount.
3893 // Note that while we are generally okay with external uses given BMI2,
3894 // iff we need to negate the shift amount, we are not okay with extra uses.
3895 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3896 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3897 return false;
3898 X = N0->getOperand(Num: 0);
3899 return true;
3900 };
3901
3902 auto matchLowBitMask = [matchPatternA, matchPatternB,
3903 matchPatternC](SDValue Mask) -> bool {
3904 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3905 };
3906
3907 if (Node->getOpcode() == ISD::AND) {
3908 X = Node->getOperand(Num: 0);
3909 SDValue Mask = Node->getOperand(Num: 1);
3910
3911 if (matchLowBitMask(Mask)) {
3912 // Great.
3913 } else {
3914 std::swap(a&: X, b&: Mask);
3915 if (!matchLowBitMask(Mask))
3916 return false;
3917 }
3918 } else if (matchLowBitMask(SDValue(Node, 0))) {
3919 X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT);
3920 } else if (!matchPatternD(Node))
3921 return false;
3922
3923 // If we need to negate the shift amount, require BMI2 BZHI support.
3924 // It's just too unprofitable for BMI1 BEXTR.
3925 if (NegateNBits && !Subtarget->hasBMI2())
3926 return false;
3927
3928 SDLoc DL(Node);
3929
3930 // Truncate the shift amount.
3931 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3932 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3933
  // Insert the 8-bit NBits into the lowest 8 bits of a 32-bit register.
  // All the other bits are undefined; we do not care about them.
3936 SDValue ImplDef = SDValue(
3937 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3938 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef);
3939
3940 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3941 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal);
3942 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
3943 MVT::i32, ImplDef, NBits, SRIdxVal),
3944 0);
3945 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3946
  // We might have matched the number of high bits to be cleared, but we want
  // the number of low bits to be kept, so negate it in that case.
3949 if (NegateNBits) {
3950 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
3951 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC);
3952
3953 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
3954 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3955 }
3956
3957 if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
3959 if (NVT != MVT::i32) {
3960 // But have to place the bit count into the wide-enough register first.
3961 NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits);
3962 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits);
3963 }
3964
3965 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits);
3966 ReplaceNode(F: Node, T: Extract.getNode());
3967 SelectCode(Extract.getNode());
3968 return true;
3969 }
3970
  // Else, if we do *NOT* have BMI2, find out whether 'X' is *logically*
  // shifted (potentially with a one-use trunc in between) and the truncation
  // was the only use of the shift, and if so look past the one-use truncation.
3975 {
3976 SDValue RealX = peekThroughOneUseTruncation(X);
3977 // FIXME: only if the shift is one-use?
3978 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3979 X = RealX;
3980 }
3981
3982 MVT XVT = X.getSimpleValueType();
3983
3984 // Else, emitting BEXTR requires one more step.
3985 // The 'control' of BEXTR has the pattern of:
3986 // [15...8 bit][ 7...0 bit] location
3987 // [ bit count][ shift] name
3988 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
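  // Worked example (illustrative, not from the original source): a control
  // value of 0x0C04 encodes bit count 0x0C and shift 0x04, so BEXTR computes
  // (x >> 4) & 0xFFF, extracting 12 bits starting at bit 4.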
3989
  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits zero.
3992 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3993 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8);
3994 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3995 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
3996
3997 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3998 // FIXME: only if the shift is one-use?
3999 if (X.getOpcode() == ISD::SRL) {
4000 SDValue ShiftAmt = X.getOperand(i: 1);
4001 X = X.getOperand(i: 0);
4002
4003 assert(ShiftAmt.getValueType() == MVT::i8 &&
4004 "Expected shift amount to be i8");
4005
4006 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4007 // We could zext to i16 in some form, but we intentionally don't do that.
4008 SDValue OrigShiftAmt = ShiftAmt;
4009 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4010 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt);
4011
4012 // And now 'or' these low 8 bits of shift amount into the 'control'.
4013 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4014 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4015 }
4016
4017 // But have to place the 'control' into the wide-enough register first.
4018 if (XVT != MVT::i32) {
4019 Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control);
4020 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control);
4021 }
4022
4023 // And finally, form the BEXTR itself.
4024 SDValue Extract = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control);
4025
4026 // The 'X' was originally truncated. Do that now.
4027 if (XVT != NVT) {
4028 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract);
4029 Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract);
4030 }
4031
4032 ReplaceNode(F: Node, T: Extract.getNode());
4033 SelectCode(Extract.getNode());
4034
4035 return true;
4036}
4037
4038// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
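// Illustrative example (not from the original source): (x >> 7) & 0xFF has
// Shift == 7 and MaskSize == 8, which maps to a BEXTR with control
// (7 | (8 << 8)) == 0x0807.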
4039MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4040 MVT NVT = Node->getSimpleValueType(ResNo: 0);
4041 SDLoc dl(Node);
4042
4043 SDValue N0 = Node->getOperand(Num: 0);
4044 SDValue N1 = Node->getOperand(Num: 1);
4045
4046 // If we have TBM we can use an immediate for the control. If we have BMI
4047 // we should only do this if the BEXTR instruction is implemented well.
4048 // Otherwise moving the control into a register makes this more costly.
4049 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4050 // hoisting the move immediate would make it worthwhile with a less optimal
4051 // BEXTR?
4052 bool PreferBEXTR =
4053 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4054 if (!PreferBEXTR && !Subtarget->hasBMI2())
4055 return nullptr;
4056
4057 // Must have a shift right.
4058 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4059 return nullptr;
4060
4061 // Shift can't have additional users.
4062 if (!N0->hasOneUse())
4063 return nullptr;
4064
4065 // Only supported for 32 and 64 bits.
4066 if (NVT != MVT::i32 && NVT != MVT::i64)
4067 return nullptr;
4068
4069 // Shift amount and RHS of and must be constant.
4070 auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
4071 auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
4072 if (!MaskCst || !ShiftCst)
4073 return nullptr;
4074
4075 // And RHS must be a mask.
4076 uint64_t Mask = MaskCst->getZExtValue();
4077 if (!isMask_64(Value: Mask))
4078 return nullptr;
4079
4080 uint64_t Shift = ShiftCst->getZExtValue();
4081 uint64_t MaskSize = llvm::popcount(Value: Mask);
4082
4083 // Don't interfere with something that can be handled by extracting AH.
4084 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4085 if (Shift == 8 && MaskSize == 8)
4086 return nullptr;
4087
4088 // Make sure we are only using bits that were in the original value, not
4089 // shifted in.
4090 if (Shift + MaskSize > NVT.getSizeInBits())
4091 return nullptr;
4092
4093 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4094 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4095 // does not fit into 32 bits. Load folding is not a sufficient reason.
4096 if (!PreferBEXTR && MaskSize <= 32)
4097 return nullptr;
4098
4099 SDValue Control;
4100 unsigned ROpc, MOpc;
4101
4102#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4103 if (!PreferBEXTR) {
4104 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse the shift+mask stages.
    // Let's perform the mask first and apply the shift later. Note that we
    // need to widen the mask to account for the shift we'll apply afterwards!
4108 Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
4109 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4110 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4111 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4112 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4113 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4114 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4115 } else {
4116 // The 'control' of BEXTR has the pattern of:
4117 // [15...8 bit][ 7...0 bit] location
4118 // [ bit count][ shift] name
4119 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4120 Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
4121 if (Subtarget->hasTBM()) {
4122 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4123 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4124 } else {
4125 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
4127 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4128 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4129 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4130 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4131 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4132 Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
4133 }
4134 }
4135
4136 MachineSDNode *NewNode;
4137 SDValue Input = N0->getOperand(Num: 0);
4138 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4139 if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4140 SDValue Ops[] = {
4141 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
4142 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4143 NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4144 // Update the chain.
4145 ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
4146 // Record the mem-refs
4147 CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
4148 } else {
4149 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4150 }
4151
4152 if (!PreferBEXTR) {
4153 // We still need to apply the shift.
4154 SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
4155 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4156 : GET_ND_IF_ENABLED(X86::SHR32ri);
4157 NewNode =
4158 CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
4159 }
4160
4161 return NewNode;
4162}
4163
// Emit a PCMPISTR(I/M) instruction.
4165MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4166 bool MayFoldLoad, const SDLoc &dl,
4167 MVT VT, SDNode *Node) {
4168 SDValue N0 = Node->getOperand(Num: 0);
4169 SDValue N1 = Node->getOperand(Num: 1);
4170 SDValue Imm = Node->getOperand(Num: 2);
4171 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4172 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4173
4174 // Try to fold a load. No need to check alignment.
4175 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4176 if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4177 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4178 N1.getOperand(i: 0) };
4179 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4180 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4181 // Update the chain.
4182 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2));
4183 // Record the mem-refs
4184 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
4185 return CNode;
4186 }
4187
4188 SDValue Ops[] = { N0, N1, Imm };
4189 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4190 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4191 return CNode;
4192}
4193
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
// need to emit a second instruction after this one. This is needed since we
// have two CopyToReg nodes glued before this and we need to continue that
// glue through.
4197MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4198 bool MayFoldLoad, const SDLoc &dl,
4199 MVT VT, SDNode *Node,
4200 SDValue &InGlue) {
4201 SDValue N0 = Node->getOperand(Num: 0);
4202 SDValue N2 = Node->getOperand(Num: 2);
4203 SDValue Imm = Node->getOperand(Num: 4);
4204 auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue();
4205 Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType());
4206
4207 // Try to fold a load. No need to check alignment.
4208 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4209 if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
4210 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4211 N2.getOperand(i: 0), InGlue };
4212 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4213 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
4214 InGlue = SDValue(CNode, 3);
4215 // Update the chain.
4216 ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2));
4217 // Record the mem-refs
4218 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()});
4219 return CNode;
4220 }
4221
4222 SDValue Ops[] = { N0, N2, Imm, InGlue };
4223 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4224 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops);
4225 InGlue = SDValue(CNode, 2);
4226 return CNode;
4227}
4228
4229bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4230 EVT VT = N->getValueType(ResNo: 0);
4231
4232 // Only handle scalar shifts.
4233 if (VT.isVector())
4234 return false;
4235
4236 // Narrower shifts only mask to 5 bits in hardware.
4237 unsigned Size = VT == MVT::i64 ? 64 : 32;
4238
4239 SDValue OrigShiftAmt = N->getOperand(Num: 1);
4240 SDValue ShiftAmt = OrigShiftAmt;
4241 SDLoc DL(N);
4242
4243 // Skip over a truncate of the shift amount.
4244 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4245 ShiftAmt = ShiftAmt->getOperand(Num: 0);
4246
  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up the BZHI/BEXTR pattern.
4249
4250 SDValue NewShiftAmt;
4251 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4252 ShiftAmt->getOpcode() == ISD::XOR) {
4253 SDValue Add0 = ShiftAmt->getOperand(Num: 0);
4254 SDValue Add1 = ShiftAmt->getOperand(Num: 1);
4255 auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
4256 auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
4257 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4258 // to avoid the ADD/SUB/XOR.
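    // Illustrative example (not from the original source): for a 64-bit
    // shift, (x << (y + 64)) behaves like (x << y) because the hardware
    // masks the shift amount to 6 bits, so the ADD can simply be dropped.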
4259 if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
4260 NewShiftAmt = Add0;
4261
4262 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4263 ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
4264 (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
      // If we are doing a NOT of just the lower bits via (Size*N-1) -/^ X,
      // we can replace the SUB/XOR with a NOT. In the XOR case this may save
      // some code size; in the SUB case it may also save a move.
4268 assert(Add0C == nullptr || Add1C == nullptr);
4269
4270 // We can only do N-X, not X-N
4271 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4272 return false;
4273
4274 EVT OpVT = ShiftAmt.getValueType();
4275
4276 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
4277 NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
4278 N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
4279 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
4280 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4281 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4282 // -X to generate a NEG instead of a SUB of a constant.
4283 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4284 Add0C->getZExtValue() != 0) {
4285 EVT SubVT = ShiftAmt.getValueType();
4286 SDValue X;
4287 if (Add0C->getZExtValue() % Size == 0)
4288 X = Add1;
4289 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4290 Add0C->getZExtValue() % 32 == 0) {
4291 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4292 // This is mainly beneficial if we already compute (x+n*32).
4293 if (Add1.getOpcode() == ISD::TRUNCATE) {
4294 Add1 = Add1.getOperand(i: 0);
4295 SubVT = Add1.getValueType();
4296 }
4297 if (Add0.getValueType() != SubVT) {
4298 Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
4299 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
4300 }
4301
4302 X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
4303 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
4304 } else
4305 return false;
4306 // Insert a negate op.
4307 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4308 // that uses it that's not a shift.
4309 SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
4310 SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
4311 NewShiftAmt = Neg;
4312
4313 // Insert these operands into a valid topological order so they can
4314 // get selected independently.
4315 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
4316 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
4317 } else
4318 return false;
4319 } else
4320 return false;
4321
4322 if (NewShiftAmt.getValueType() != MVT::i8) {
4323 // Need to truncate the shift amount.
4324 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4325 // Add to a correct topological ordering.
4326 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4327 }
4328
4329 // Insert a new mask to keep the shift amount legal. This should be removed
4330 // by isel patterns.
4331 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4332 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4333 // Place in a correct topological ordering.
4334 insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
4335
4336 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
4337 Op2: NewShiftAmt);
4338 if (UpdatedNode != N) {
4339 // If we found an existing node, we should replace ourselves with that node
4340 // and wait for it to be selected after its other users.
4341 ReplaceNode(F: N, T: UpdatedNode);
4342 return true;
4343 }
4344
4345 // If the original shift amount is now dead, delete it so that we don't run
4346 // it through isel.
4347 if (OrigShiftAmt.getNode()->use_empty())
4348 CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());
4349
4350 // Now that we've optimized the shift amount, defer to normal isel to get
4351 // load folding and legacy vs BMI2 selection without repeating it here.
4352 SelectCode(N);
4353 return true;
4354}
4355
4356bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4357 MVT NVT = N->getSimpleValueType(ResNo: 0);
4358 unsigned Opcode = N->getOpcode();
4359 SDLoc dl(N);
4360
4361 // For operations of the form (x << C1) op C2, check if we can use a smaller
4362 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
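  // Illustrative example (not from the original source): (x << 8) | 0x1F00
  // becomes ((x | 0x1F) << 8); 0x1F fits in a sign-extended 8-bit immediate,
  // so the OR gets a shorter encoding than with the original 0x1F00.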
4363 SDValue Shift = N->getOperand(Num: 0);
4364 SDValue N1 = N->getOperand(Num: 1);
4365
4366 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
4367 if (!Cst)
4368 return false;
4369
4370 int64_t Val = Cst->getSExtValue();
4371
4372 // If we have an any_extend feeding the AND, look through it to see if there
4373 // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
4375 bool FoundAnyExtend = false;
4376 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4377 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4378 isUInt<32>(Val)) {
4379 FoundAnyExtend = true;
4380 Shift = Shift.getOperand(i: 0);
4381 }
4382
4383 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4384 return false;
4385
4386 // i8 is unshrinkable, i16 should be promoted to i32.
4387 if (NVT != MVT::i32 && NVT != MVT::i64)
4388 return false;
4389
4390 auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
4391 if (!ShlCst)
4392 return false;
4393
4394 uint64_t ShAmt = ShlCst->getZExtValue();
4395
4396 // Make sure that we don't change the operation by removing bits.
4397 // This only matters for OR and XOR, AND is unaffected.
4398 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4399 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4400 return false;
4401
4402 // Check the minimum bitwidth for the new constant.
4403 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4404 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4405 if (Opcode == ISD::AND) {
4406 // AND32ri is the same as AND64ri32 with zext imm.
4407 // Try this before sign extended immediates below.
4408 ShiftedVal = (uint64_t)Val >> ShAmt;
4409 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4410 return true;
4411 // Also swap order when the AND can become MOVZX.
4412 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4413 return true;
4414 }
4415 ShiftedVal = Val >> ShAmt;
4416 if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
4417 (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
4418 return true;
4419 if (Opcode != ISD::AND) {
4420 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4421 ShiftedVal = (uint64_t)Val >> ShAmt;
4422 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4423 return true;
4424 }
4425 return false;
4426 };
4427
4428 int64_t ShiftedVal;
4429 if (!CanShrinkImmediate(ShiftedVal))
4430 return false;
4431
4432 // Ok, we can reorder to get a smaller immediate.
4433
4434  // But it's possible the original immediate allowed an AND to become MOVZX.
4435  // Do this check late to defer the MaskedValueIsZero call as long as
4436  // possible.
4437 if (Opcode == ISD::AND) {
4438 // Find the smallest zext this could possibly be.
4439 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4440 ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));
4441
4442 // Figure out which bits need to be zero to achieve that mask.
4443 APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
4444 loBitsSet: ZExtWidth);
4445 NeededMask &= ~Cst->getAPIntValue();
4446
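    // If those bits are already known zero, the existing AND can become a
    // MOVZX, which is better than the reordered form; keep it.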
4447 if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
4448 return false;
4449 }
4450
4451 SDValue X = Shift.getOperand(i: 0);
4452 if (FoundAnyExtend) {
4453 SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
4454 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
4455 X = NewX;
4456 }
4457
4458 SDValue NewCst = CurDAG->getConstant(Val: ShiftedVal, DL: dl, VT: NVT);
4459 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
4460 SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
4461 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
4462 SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
4463 N2: Shift.getOperand(i: 1));
4464 ReplaceNode(F: N, T: NewSHL.getNode());
4465 SelectCode(NewSHL.getNode());
4466 return true;
4467}
4468
4469bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4470 SDNode *ParentB, SDNode *ParentC,
4471 SDValue A, SDValue B, SDValue C,
4472 uint8_t Imm) {
4473 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4474 C.isOperandOf(ParentC) && "Incorrect parent node");
4475
4476 auto tryFoldLoadOrBCast =
4477 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4478 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4479 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4480 return true;
4481
4482 // Not a load, check for broadcast which may be behind a bitcast.
4483 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4484 P = L.getNode();
4485 L = L.getOperand(i: 0);
4486 }
4487
4488 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4489 return false;
4490
4491 // Only 32 and 64 bit broadcasts are supported.
4492 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4493 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4494 if (Size != 32 && Size != 64)
4495 return false;
4496
4497 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4498 };
4499
4500 bool FoldedLoad = false;
4501 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4502 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4503 FoldedLoad = true;
4504 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4505 Tmp4)) {
4506 FoldedLoad = true;
4507 std::swap(a&: A, b&: C);
4508 // Swap bits 1/4 and 3/6.
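    // Bit i of the immediate is the output for the input combination whose
    // bits are (A, B, C) = i. Swapping A and C maps index abc to cba, which
    // exchanges bits 1 (001) and 4 (100) and bits 3 (011) and 6 (110); the
    // palindromic indices 0, 2, 5 and 7 (mask 0xa5) stay in place.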
4509 uint8_t OldImm = Imm;
4510 Imm = OldImm & 0xa5;
4511 if (OldImm & 0x02) Imm |= 0x10;
4512 if (OldImm & 0x10) Imm |= 0x02;
4513 if (OldImm & 0x08) Imm |= 0x40;
4514 if (OldImm & 0x40) Imm |= 0x08;
4515 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4516 Tmp4)) {
4517 FoldedLoad = true;
4518 std::swap(a&: B, b&: C);
4519 // Swap bits 1/2 and 5/6.
4520 uint8_t OldImm = Imm;
4521 Imm = OldImm & 0x99;
4522 if (OldImm & 0x02) Imm |= 0x04;
4523 if (OldImm & 0x04) Imm |= 0x02;
4524 if (OldImm & 0x20) Imm |= 0x40;
4525 if (OldImm & 0x40) Imm |= 0x20;
4526 }
4527
4528 SDLoc DL(Root);
4529
4530 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4531
4532 MVT NVT = Root->getSimpleValueType(ResNo: 0);
4533
4534 MachineSDNode *MNode;
4535 if (FoldedLoad) {
4536 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4537
4538 unsigned Opc;
4539 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4540 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
4541 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4542 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4543
4544 bool UseD = EltSize == 32;
4545 if (NVT.is128BitVector())
4546 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4547 else if (NVT.is256BitVector())
4548 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4549 else if (NVT.is512BitVector())
4550 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4551 else
4552 llvm_unreachable("Unexpected vector size!");
4553 } else {
4554 bool UseD = NVT.getVectorElementType() == MVT::i32;
4555 if (NVT.is128BitVector())
4556 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4557 else if (NVT.is256BitVector())
4558 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4559 else if (NVT.is512BitVector())
4560 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4561 else
4562 llvm_unreachable("Unexpected vector size!");
4563 }
4564
4565 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
4566 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);
4567
4568 // Update the chain.
4569 ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
4570 // Record the mem-refs
4571 CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
4572 } else {
4573 bool UseD = NVT.getVectorElementType() == MVT::i32;
4574 unsigned Opc;
4575 if (NVT.is128BitVector())
4576 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4577 else if (NVT.is256BitVector())
4578 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4579 else if (NVT.is512BitVector())
4580 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4581 else
4582 llvm_unreachable("Unexpected vector size!");
4583
4584 MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
4585 }
4586
4587 ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
4588 CurDAG->RemoveDeadNode(N: Root);
4589 return true;
4590}
4591
4592// Try to match two logic ops to a VPTERNLOG.
4593// FIXME: Handle more complex patterns that use an operand more than once?
4594bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4595 MVT NVT = N->getSimpleValueType(ResNo: 0);
4596
4597 // Make sure we support VPTERNLOG.
4598 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4599 NVT.getVectorElementType() == MVT::i1)
4600 return false;
4601
4602 // We need VLX for 128/256-bit.
4603 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4604 return false;
4605
4606 SDValue N0 = N->getOperand(Num: 0);
4607 SDValue N1 = N->getOperand(Num: 1);
4608
4609 auto getFoldableLogicOp = [](SDValue Op) {
4610 // Peek through single use bitcast.
4611 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4612 Op = Op.getOperand(i: 0);
4613
4614 if (!Op.hasOneUse())
4615 return SDValue();
4616
4617 unsigned Opc = Op.getOpcode();
4618 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4619 Opc == X86ISD::ANDNP)
4620 return Op;
4621
4622 return SDValue();
4623 };
4624
4625 SDValue A, FoldableOp;
4626 if ((FoldableOp = getFoldableLogicOp(N1))) {
4627 A = N0;
4628 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4629 A = N1;
4630 } else
4631 return false;
4632
4633 SDValue B = FoldableOp.getOperand(i: 0);
4634 SDValue C = FoldableOp.getOperand(i: 1);
4635 SDNode *ParentA = N;
4636 SDNode *ParentB = FoldableOp.getNode();
4637 SDNode *ParentC = FoldableOp.getNode();
4638
4639 // We can build the appropriate control immediate by performing the logic
4640 // operation we're matching using these constants for A, B, and C.
4641 uint8_t TernlogMagicA = 0xf0;
4642 uint8_t TernlogMagicB = 0xcc;
4643 uint8_t TernlogMagicC = 0xaa;
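  // Bit i of a VPTERNLOG immediate is the result for the input combination
  // i = (A << 2) | (B << 1) | C, so the truth tables of A, B and C taken
  // alone are 0xf0, 0xcc and 0xaa. For example, matching (A & B) | C would
  // produce the immediate (0xf0 & 0xcc) | 0xaa = 0xea.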
4644
4645 // Some of the inputs may be inverted, peek through them and invert the
4646 // magic values accordingly.
4647 // TODO: There may be a bitcast before the xor that we should peek through.
4648 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4649 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4650 ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) {
4651 Magic = ~Magic;
4652 Parent = Op.getNode();
4653 Op = Op.getOperand(i: 0);
4654 }
4655 };
4656
4657 PeekThroughNot(A, ParentA, TernlogMagicA);
4658 PeekThroughNot(B, ParentB, TernlogMagicB);
4659 PeekThroughNot(C, ParentC, TernlogMagicC);
4660
4661 uint8_t Imm;
4662 switch (FoldableOp.getOpcode()) {
4663 default: llvm_unreachable("Unexpected opcode!");
4664 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4665 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4666 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4667 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4668 }
4669
4670 switch (N->getOpcode()) {
4671 default: llvm_unreachable("Unexpected opcode!");
4672 case X86ISD::ANDNP:
4673 if (A == N0)
4674 Imm &= ~TernlogMagicA;
4675 else
4676 Imm = ~(Imm) & TernlogMagicA;
4677 break;
4678 case ISD::AND: Imm &= TernlogMagicA; break;
4679 case ISD::OR: Imm |= TernlogMagicA; break;
4680 case ISD::XOR: Imm ^= TernlogMagicA; break;
4681 }
4682
4683 return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm);
4684}
4685
4686/// If the high bits of an 'and' operand are known zero, try setting the
4687/// high bits of an 'and' constant operand to produce a smaller encoding by
4688/// creating a small, sign-extended negative immediate rather than a large
4689/// positive one. This reverses a transform in SimplifyDemandedBits that
4690/// shrinks mask constants by clearing bits. There is also a possibility that
4691/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4692/// case, just replace the 'and'. Return 'true' if the node is replaced.
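/// For example, if the upper 4 bits of the other operand are known zero, the
/// i64 mask 0x0ffffffffffffff0 (which needs a movabsq) can be replaced by
/// -16, which fits in a sign-extended 8-bit immediate.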
4693bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4694 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4695 // have immediate operands.
4696 MVT VT = And->getSimpleValueType(ResNo: 0);
4697 if (VT != MVT::i32 && VT != MVT::i64)
4698 return false;
4699
4700 auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
4701 if (!And1C)
4702 return false;
4703
4704  // Bail out if the mask constant is already negative. It can't shrink further.
4705 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4706 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4707 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4708 // are negative too.
4709 APInt MaskVal = And1C->getAPIntValue();
4710 unsigned MaskLZ = MaskVal.countl_zero();
4711 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4712 return false;
4713
4714 // Don't extend into the upper 32 bits of a 64 bit mask.
4715 if (VT == MVT::i64 && MaskLZ >= 32) {
4716 MaskLZ -= 32;
4717 MaskVal = MaskVal.trunc(width: 32);
4718 }
4719
4720 SDValue And0 = And->getOperand(Num: 0);
4721 APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
4722 APInt NegMaskVal = MaskVal | HighZeros;
4723
4724 // If a negative constant would not allow a smaller encoding, there's no need
4725 // to continue. Only change the constant when we know it's a win.
4726 unsigned MinWidth = NegMaskVal.getSignificantBits();
4727 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4728 return false;
4729
4730 // Extend masks if we truncated above.
4731 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4732 NegMaskVal = NegMaskVal.zext(width: 64);
4733 HighZeros = HighZeros.zext(width: 64);
4734 }
4735
4736 // The variable operand must be all zeros in the top bits to allow using the
4737 // new, negative constant as the mask.
4738 if (!CurDAG->MaskedValueIsZero(Op: And0, Mask: HighZeros))
4739 return false;
4740
4741 // Check if the mask is -1. In that case, this is an unnecessary instruction
4742 // that escaped earlier analysis.
4743 if (NegMaskVal.isAllOnes()) {
4744 ReplaceNode(F: And, T: And0.getNode());
4745 return true;
4746 }
4747
4748 // A negative mask allows a smaller encoding. Create a new 'and' node.
4749 SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
4750 insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
4751 SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
4752 ReplaceNode(F: And, T: NewAnd.getNode());
4753 SelectCode(NewAnd.getNode());
4754 return true;
4755}
4756
4757static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4758 bool FoldedBCast, bool Masked) {
4759#define VPTESTM_CASE(VT, SUFFIX) \
4760case MVT::VT: \
4761 if (Masked) \
4762 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4763 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4764
4765
4766#define VPTESTM_BROADCAST_CASES(SUFFIX) \
4767default: llvm_unreachable("Unexpected VT!"); \
4768VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4769VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4770VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4771VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4772VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4773VPTESTM_CASE(v8i64, QZ##SUFFIX)
4774
4775#define VPTESTM_FULL_CASES(SUFFIX) \
4776VPTESTM_BROADCAST_CASES(SUFFIX) \
4777VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4778VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4779VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4780VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4781VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4782VPTESTM_CASE(v32i16, WZ##SUFFIX)
4783
4784 if (FoldedBCast) {
4785 switch (TestVT.SimpleTy) {
4786 VPTESTM_BROADCAST_CASES(rmb)
4787 }
4788 }
4789
4790 if (FoldedLoad) {
4791 switch (TestVT.SimpleTy) {
4792 VPTESTM_FULL_CASES(rm)
4793 }
4794 }
4795
4796 switch (TestVT.SimpleTy) {
4797 VPTESTM_FULL_CASES(rr)
4798 }
4799
4800#undef VPTESTM_FULL_CASES
4801#undef VPTESTM_BROADCAST_CASES
4802#undef VPTESTM_CASE
4803}
4804
4805// Try to create a VPTESTM instruction. If InMask is not null, it will be used
4806// to form a masked operation.
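// VPTESTM sets mask bit i to ((Src0[i] & Src1[i]) != 0) and VPTESTNM to
// ((Src0[i] & Src1[i]) == 0), so an equality/inequality compare of an AND
// (or of a single value used as both sources) against zero maps onto them.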
4807bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4808 SDValue InMask) {
4809 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4810 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4811 "Unexpected VT!");
4812
4813 // Look for equal and not equal compares.
4814 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
4815 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4816 return false;
4817
4818 SDValue SetccOp0 = Setcc.getOperand(i: 0);
4819 SDValue SetccOp1 = Setcc.getOperand(i: 1);
4820
4821 // Canonicalize the all zero vector to the RHS.
4822 if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
4823 std::swap(a&: SetccOp0, b&: SetccOp1);
4824
4825 // See if we're comparing against zero.
4826 if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
4827 return false;
4828
4829 SDValue N0 = SetccOp0;
4830
4831 MVT CmpVT = N0.getSimpleValueType();
4832 MVT CmpSVT = CmpVT.getVectorElementType();
4833
4834 // Start with both operands the same. We'll try to refine this.
4835 SDValue Src0 = N0;
4836 SDValue Src1 = N0;
4837
4838 {
4839 // Look through single use bitcasts.
4840 SDValue N0Temp = N0;
4841 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4842 N0Temp = N0.getOperand(i: 0);
4843
4844 // Look for single use AND.
4845 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4846 Src0 = N0Temp.getOperand(i: 0);
4847 Src1 = N0Temp.getOperand(i: 1);
4848 }
4849 }
4850
4851 // Without VLX we need to widen the operation.
4852 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4853
4854 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4855 SDValue &Base, SDValue &Scale, SDValue &Index,
4856 SDValue &Disp, SDValue &Segment) {
4857 // If we need to widen, we can't fold the load.
4858 if (!Widen)
4859 if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
4860 return true;
4861
4862 // If we didn't fold a load, try to match broadcast. No widening limitation
4863 // for this. But only 32 and 64 bit types are supported.
4864 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4865 return false;
4866
4867 // Look through single use bitcasts.
4868 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4869 P = L.getNode();
4870 L = L.getOperand(i: 0);
4871 }
4872
4873 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4874 return false;
4875
4876 auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
4877 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4878 return false;
4879
4880 return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
4881 };
4882
4883 // We can only fold loads if the sources are unique.
4884 bool CanFoldLoads = Src0 != Src1;
4885
4886 bool FoldedLoad = false;
4887 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4888 if (CanFoldLoads) {
4889 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4890 Tmp3, Tmp4);
4891 if (!FoldedLoad) {
4892 // And is commutative.
4893 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4894 Tmp2, Tmp3, Tmp4);
4895 if (FoldedLoad)
4896 std::swap(a&: Src0, b&: Src1);
4897 }
4898 }
4899
4900 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4901
4902 bool IsMasked = InMask.getNode() != nullptr;
4903
4904 SDLoc dl(Root);
4905
4906 MVT ResVT = Setcc.getSimpleValueType();
4907 MVT MaskVT = ResVT;
4908 if (Widen) {
4909 // Widen the inputs using insert_subreg or copy_to_regclass.
4910 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4911 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4912 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4913 CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
4914 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4915 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4916 CmpVT), 0);
4917 Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);
4918
4919 if (!FoldedBCast)
4920 Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);
4921
4922 if (IsMasked) {
4923 // Widen the mask.
4924 unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
4925 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4926 InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
4927 dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
4928 }
4929 }
4930
4931 bool IsTestN = CC == ISD::SETEQ;
4932 unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4933 Masked: IsMasked);
4934
4935 MachineSDNode *CNode;
4936 if (FoldedLoad) {
4937 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4938
4939 if (IsMasked) {
4940 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4941 Src1.getOperand(i: 0) };
4942 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
4943 } else {
4944 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4945 Src1.getOperand(i: 0) };
4946 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
4947 }
4948
4949 // Update the chain.
4950 ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
4951 // Record the mem-refs
4952 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
4953 } else {
4954 if (IsMasked)
4955 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
4956 else
4957 CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
4958 }
4959
4960 // If we widened, we need to shrink the mask VT.
4961 if (Widen) {
4962 unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
4963 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4964 CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
4965 dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
4966 }
4967
4968 ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
4969 CurDAG->RemoveDeadNode(N: Root);
4970 return true;
4971}
4972
4973// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4974// into vpternlog.
4975bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4976 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4977
4978 MVT NVT = N->getSimpleValueType(ResNo: 0);
4979
4980 // Make sure we support VPTERNLOG.
4981 if (!NVT.isVector() || !Subtarget->hasAVX512())
4982 return false;
4983
4984 // We need VLX for 128/256-bit.
4985 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4986 return false;
4987
4988 SDValue N0 = N->getOperand(Num: 0);
4989 SDValue N1 = N->getOperand(Num: 1);
4990
4991 // Canonicalize AND to LHS.
4992 if (N1.getOpcode() == ISD::AND)
4993 std::swap(a&: N0, b&: N1);
4994
4995 if (N0.getOpcode() != ISD::AND ||
4996 N1.getOpcode() != X86ISD::ANDNP ||
4997 !N0.hasOneUse() || !N1.hasOneUse())
4998 return false;
4999
5000  // ANDN is not commutable; use it to pin down A and C.
5001 SDValue A = N1.getOperand(i: 0);
5002 SDValue C = N1.getOperand(i: 1);
5003
5004 // AND is commutable, if one operand matches A, the other operand is B.
5005 // Otherwise this isn't a match.
5006 SDValue B;
5007 if (N0.getOperand(i: 0) == A)
5008 B = N0.getOperand(i: 1);
5009 else if (N0.getOperand(i: 1) == A)
5010 B = N0.getOperand(i: 0);
5011 else
5012 return false;
5013
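  // Evaluating (or (and A, B), (andn A, C)) on the ternlog magic constants
  // A=0xf0, B=0xcc, C=0xaa gives (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xca, the
  // classic bit-select immediate.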
5014 SDLoc dl(N);
5015 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5016 SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm);
5017 ReplaceNode(F: N, T: Ternlog.getNode());
5018
5019 return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(),
5020 ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA);
5021}
5022
5023void X86DAGToDAGISel::Select(SDNode *Node) {
5024 MVT NVT = Node->getSimpleValueType(ResNo: 0);
5025 unsigned Opcode = Node->getOpcode();
5026 SDLoc dl(Node);
5027
5028 if (Node->isMachineOpcode()) {
5029 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5030 Node->setNodeId(-1);
5031 return; // Already selected.
5032 }
5033
5034 switch (Opcode) {
5035 default: break;
5036 case ISD::INTRINSIC_W_CHAIN: {
5037 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5038 switch (IntNo) {
5039 default: break;
5040 case Intrinsic::x86_encodekey128:
5041 case Intrinsic::x86_encodekey256: {
5042 if (!Subtarget->hasKL())
5043 break;
5044
5045 unsigned Opcode;
5046 switch (IntNo) {
5047 default: llvm_unreachable("Impossible intrinsic");
5048 case Intrinsic::x86_encodekey128:
5049 Opcode = X86::ENCODEKEY128;
5050 break;
5051 case Intrinsic::x86_encodekey256:
5052 Opcode = X86::ENCODEKEY256;
5053 break;
5054 }
5055
5056 SDValue Chain = Node->getOperand(Num: 0);
5057 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5058 SDValue());
5059 if (Opcode == X86::ENCODEKEY256)
5060 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5061 Chain.getValue(1));
5062
5063 MachineSDNode *Res = CurDAG->getMachineNode(
5064 Opcode, dl, VTs: Node->getVTList(),
5065 Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)});
5066 ReplaceNode(F: Node, T: Res);
5067 return;
5068 }
5069 case Intrinsic::x86_tileloadd64_internal:
5070 case Intrinsic::x86_tileloaddt164_internal: {
5071 if (!Subtarget->hasAMXTILE())
5072 break;
5073 unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
5074 ? X86::PTILELOADDV
5075 : X86::PTILELOADDT1V;
5076 // _tile_loadd_internal(row, col, buf, STRIDE)
5077 SDValue Base = Node->getOperand(Num: 4);
5078 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5079 SDValue Index = Node->getOperand(Num: 5);
5080 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5081 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5082 SDValue Chain = Node->getOperand(Num: 0);
5083 MachineSDNode *CNode;
5084 SDValue Ops[] = {Node->getOperand(Num: 2),
5085 Node->getOperand(Num: 3),
5086 Base,
5087 Scale,
5088 Index,
5089 Disp,
5090 Segment,
5091 Chain};
5092 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5093 ReplaceNode(F: Node, T: CNode);
5094 return;
5095 }
5096 }
5097 break;
5098 }
5099 case ISD::INTRINSIC_VOID: {
5100 unsigned IntNo = Node->getConstantOperandVal(Num: 1);
5101 switch (IntNo) {
5102 default: break;
5103 case Intrinsic::x86_sse3_monitor:
5104 case Intrinsic::x86_monitorx:
5105 case Intrinsic::x86_clzero: {
5106 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5107
5108 unsigned Opc = 0;
5109 switch (IntNo) {
5110 default: llvm_unreachable("Unexpected intrinsic!");
5111 case Intrinsic::x86_sse3_monitor:
5112 if (!Subtarget->hasSSE3())
5113 break;
5114 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5115 break;
5116 case Intrinsic::x86_monitorx:
5117 if (!Subtarget->hasMWAITX())
5118 break;
5119 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5120 break;
5121 case Intrinsic::x86_clzero:
5122 if (!Subtarget->hasCLZERO())
5123 break;
5124 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5125 break;
5126 }
5127
5128 if (Opc) {
5129 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5130 SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg,
5131 N: Node->getOperand(Num: 2), Glue: SDValue());
5132 SDValue InGlue = Chain.getValue(R: 1);
5133
5134 if (IntNo == Intrinsic::x86_sse3_monitor ||
5135 IntNo == Intrinsic::x86_monitorx) {
5136 // Copy the other two operands to ECX and EDX.
5137 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5138 InGlue);
5139 InGlue = Chain.getValue(R: 1);
5140 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5141 InGlue);
5142 InGlue = Chain.getValue(R: 1);
5143 }
5144
5145 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5146 { Chain, InGlue});
5147 ReplaceNode(F: Node, T: CNode);
5148 return;
5149 }
5150
5151 break;
5152 }
5153 case Intrinsic::x86_tilestored64_internal: {
5154 unsigned Opc = X86::PTILESTOREDV;
5155 // _tile_stored_internal(row, col, buf, STRIDE, c)
5156 SDValue Base = Node->getOperand(Num: 4);
5157 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5158 SDValue Index = Node->getOperand(Num: 5);
5159 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5160 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5161 SDValue Chain = Node->getOperand(Num: 0);
5162 MachineSDNode *CNode;
5163 SDValue Ops[] = {Node->getOperand(Num: 2),
5164 Node->getOperand(Num: 3),
5165 Base,
5166 Scale,
5167 Index,
5168 Disp,
5169 Segment,
5170 Node->getOperand(Num: 6),
5171 Chain};
5172 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5173 ReplaceNode(F: Node, T: CNode);
5174 return;
5175 }
5176 case Intrinsic::x86_tileloadd64:
5177 case Intrinsic::x86_tileloaddt164:
5178 case Intrinsic::x86_tilestored64: {
5179 if (!Subtarget->hasAMXTILE())
5180 break;
5181 unsigned Opc;
5182 switch (IntNo) {
5183 default: llvm_unreachable("Unexpected intrinsic!");
5184 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5185 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5186 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5187 }
5188 // FIXME: Match displacement and scale.
5189 unsigned TIndex = Node->getConstantOperandVal(Num: 2);
5190 SDValue TReg = getI8Imm(Imm: TIndex, DL: dl);
5191 SDValue Base = Node->getOperand(Num: 3);
5192 SDValue Scale = getI8Imm(Imm: 1, DL: dl);
5193 SDValue Index = Node->getOperand(Num: 4);
5194 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5195 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5196 SDValue Chain = Node->getOperand(Num: 0);
5197 MachineSDNode *CNode;
5198 if (Opc == X86::PTILESTORED) {
5199 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5200 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5201 } else {
5202 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5203 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5204 }
5205 ReplaceNode(F: Node, T: CNode);
5206 return;
5207 }
5208 }
5209 break;
5210 }
5211 case ISD::BRIND:
5212 case X86ISD::NT_BRIND: {
5213 if (Subtarget->isTargetNaCl())
5214    // NaCl has its own pass where jmp %r32 instructions are converted to
5215    // jmp %r64. We leave the instruction alone.
5216 break;
5217 if (Subtarget->isTarget64BitILP32()) {
5218 // Converts a 32-bit register to a 64-bit, zero-extended version of
5219 // it. This is needed because x86-64 can do many things, but jmp %r32
5220 // ain't one of them.
5221 SDValue Target = Node->getOperand(Num: 1);
5222 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5223 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5224 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5225 Node->getOperand(0), ZextTarget);
5226 ReplaceNode(F: Node, T: Brind.getNode());
5227 SelectCode(ZextTarget.getNode());
5228 SelectCode(Brind.getNode());
5229 return;
5230 }
5231 break;
5232 }
5233 case X86ISD::GlobalBaseReg:
5234 ReplaceNode(F: Node, T: getGlobalBaseReg());
5235 return;
5236
5237 case ISD::BITCAST:
5238 // Just drop all 128/256/512-bit bitcasts.
5239 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5240 NVT == MVT::f128) {
5241 ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0));
5242 CurDAG->RemoveDeadNode(N: Node);
5243 return;
5244 }
5245 break;
5246
5247 case ISD::SRL:
5248 if (matchBitExtract(Node))
5249 return;
5250 [[fallthrough]];
5251 case ISD::SRA:
5252 case ISD::SHL:
5253 if (tryShiftAmountMod(N: Node))
5254 return;
5255 break;
5256
5257 case X86ISD::VPTERNLOG: {
5258 uint8_t Imm = Node->getConstantOperandVal(Num: 3);
5259 if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0),
5260 B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm))
5261 return;
5262 break;
5263 }
5264
5265 case X86ISD::ANDNP:
5266 if (tryVPTERNLOG(N: Node))
5267 return;
5268 break;
5269
5270 case ISD::AND:
5271 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5272 // Try to form a masked VPTESTM. Operands can be in either order.
5273 SDValue N0 = Node->getOperand(Num: 0);
5274 SDValue N1 = Node->getOperand(Num: 1);
5275 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5276 tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
5277 return;
5278 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5279 tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
5280 return;
5281 }
5282
5283 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5284 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5285 CurDAG->RemoveDeadNode(N: Node);
5286 return;
5287 }
5288 if (matchBitExtract(Node))
5289 return;
5290 if (AndImmShrink && shrinkAndImmediate(And: Node))
5291 return;
5292
5293 [[fallthrough]];
5294 case ISD::OR:
5295 case ISD::XOR:
5296 if (tryShrinkShlLogicImm(N: Node))
5297 return;
5298 if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
5299 return;
5300 if (tryVPTERNLOG(N: Node))
5301 return;
5302
5303 [[fallthrough]];
5304 case ISD::ADD:
5305 if (Opcode == ISD::ADD && matchBitExtract(Node))
5306 return;
5307 [[fallthrough]];
5308 case ISD::SUB: {
5309 // Try to avoid folding immediates with multiple uses for optsize.
5310    // This code tries to select the register form directly to avoid going
5311    // through the isel table, which might fold the immediate. We can't change
5312    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
5313    // check the immediate use count without making the patterns unavailable to
5314    // the fast-isel table.
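    // For example, at minsize, two adds of the same 32-bit immediate can be
    // smaller as one "mov $imm32, %reg" plus two register-register adds than
    // as two add instructions that each encode the imm32.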
5315 if (!CurDAG->shouldOptForSize())
5316 break;
5317
5318 // Only handle i8/i16/i32/i64.
5319 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5320 break;
5321
5322 SDValue N0 = Node->getOperand(Num: 0);
5323 SDValue N1 = Node->getOperand(Num: 1);
5324
5325 auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
5326 if (!Cst)
5327 break;
5328
5329 int64_t Val = Cst->getSExtValue();
5330
5331    // Make sure it's an immediate that is considered foldable.
5332 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5333 if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
5334 break;
5335
5336 // If this can match to INC/DEC, let it go.
5337 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5338 break;
5339
5340 // Check if we should avoid folding this immediate.
5341 if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
5342 break;
5343
5344 // We should not fold the immediate. So we need a register form instead.
5345 unsigned ROpc, MOpc;
5346 switch (NVT.SimpleTy) {
5347 default: llvm_unreachable("Unexpected VT!");
5348 case MVT::i8:
5349 switch (Opcode) {
5350 default: llvm_unreachable("Unexpected opcode!");
5351 case ISD::ADD:
5352 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5353 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5354 break;
5355 case ISD::SUB:
5356 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5357 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5358 break;
5359 case ISD::AND:
5360 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5361 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5362 break;
5363 case ISD::OR:
5364 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5365 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5366 break;
5367 case ISD::XOR:
5368 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5369 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5370 break;
5371 }
5372 break;
5373 case MVT::i16:
5374 switch (Opcode) {
5375 default: llvm_unreachable("Unexpected opcode!");
5376 case ISD::ADD:
5377 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5378 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5379 break;
5380 case ISD::SUB:
5381 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5382 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5383 break;
5384 case ISD::AND:
5385 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5386 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5387 break;
5388 case ISD::OR:
5389 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5390 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5391 break;
5392 case ISD::XOR:
5393 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5394 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5395 break;
5396 }
5397 break;
5398 case MVT::i32:
5399 switch (Opcode) {
5400 default: llvm_unreachable("Unexpected opcode!");
5401 case ISD::ADD:
5402 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5403 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5404 break;
5405 case ISD::SUB:
5406 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5407 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5408 break;
5409 case ISD::AND:
5410 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5411 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5412 break;
5413 case ISD::OR:
5414 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5415 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5416 break;
5417 case ISD::XOR:
5418 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5419 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5420 break;
5421 }
5422 break;
5423 case MVT::i64:
5424 switch (Opcode) {
5425 default: llvm_unreachable("Unexpected opcode!");
5426 case ISD::ADD:
5427 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5428 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5429 break;
5430 case ISD::SUB:
5431 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5432 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5433 break;
5434 case ISD::AND:
5435 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5436 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5437 break;
5438 case ISD::OR:
5439 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5440 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5441 break;
5442 case ISD::XOR:
5443 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5444 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5445 break;
5446 }
5447 break;
5448 }
5449
5450    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5451
5452    // If this is not a subtract, we can still try to fold a load.
5453 if (Opcode != ISD::SUB) {
5454 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5455 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5456 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5457 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5458 MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5459 // Update the chain.
5460 ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
5461 // Record the mem-refs
5462 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5463 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5464 CurDAG->RemoveDeadNode(N: Node);
5465 return;
5466 }
5467 }
5468
5469 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5470 return;
5471 }
5472
5473 case X86ISD::SMUL:
5474 // i16/i32/i64 are handled with isel patterns.
5475 if (NVT != MVT::i8)
5476 break;
5477 [[fallthrough]];
5478 case X86ISD::UMUL: {
5479 SDValue N0 = Node->getOperand(Num: 0);
5480 SDValue N1 = Node->getOperand(Num: 1);
5481
5482 unsigned LoReg, ROpc, MOpc;
5483 switch (NVT.SimpleTy) {
5484 default: llvm_unreachable("Unsupported VT!");
5485 case MVT::i8:
5486 LoReg = X86::AL;
5487 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5488 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5489 break;
5490 case MVT::i16:
5491 LoReg = X86::AX;
5492 ROpc = X86::MUL16r;
5493 MOpc = X86::MUL16m;
5494 break;
5495 case MVT::i32:
5496 LoReg = X86::EAX;
5497 ROpc = X86::MUL32r;
5498 MOpc = X86::MUL32m;
5499 break;
5500 case MVT::i64:
5501 LoReg = X86::RAX;
5502 ROpc = X86::MUL64r;
5503 MOpc = X86::MUL64m;
5504 break;
5505 }
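    // One-operand MUL/IMUL multiplies AL/AX/EAX/RAX by the operand and
    // writes the widened product to AX / DX:AX / EDX:EAX / RDX:RAX, which is
    // why only LoReg needs to be loaded explicitly here.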
5506
5507 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5508 bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5509 // Multiply is commutative.
5510 if (!FoldedLoad) {
5511 FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5512 if (FoldedLoad)
5513 std::swap(a&: N0, b&: N1);
5514 }
5515
5516 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5517 N: N0, Glue: SDValue()).getValue(R: 1);
5518
5519 MachineSDNode *CNode;
5520 if (FoldedLoad) {
5521 // i16/i32/i64 use an instruction that produces a low and high result even
5522 // though only the low result is used.
5523 SDVTList VTs;
5524 if (NVT == MVT::i8)
5525 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5526 else
5527 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5528
5529 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5530 InGlue };
5531 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5532
5533 // Update the chain.
5534 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5535 // Record the mem-refs
5536 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5537 } else {
5538 // i16/i32/i64 use an instruction that produces a low and high result even
5539 // though only the low result is used.
5540 SDVTList VTs;
5541 if (NVT == MVT::i8)
5542 VTs = CurDAG->getVTList(NVT, MVT::i32);
5543 else
5544 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5545
5546 CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
5547 }
5548
5549 ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
5550 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5551 CurDAG->RemoveDeadNode(N: Node);
5552 return;
5553 }
5554
5555 case ISD::SMUL_LOHI:
5556 case ISD::UMUL_LOHI: {
5557 SDValue N0 = Node->getOperand(Num: 0);
5558 SDValue N1 = Node->getOperand(Num: 1);
5559
5560 unsigned Opc, MOpc;
5561 unsigned LoReg, HiReg;
5562 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5563 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5564 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
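    // MULX (BMI2) multiplies its source by the implicit EDX/RDX register and
    // writes the high and low halves to two explicit destinations without
    // clobbering EFLAGS, which is why LoReg is EDX/RDX in the MULX cases.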
5565 switch (NVT.SimpleTy) {
5566 default: llvm_unreachable("Unsupported VT!");
5567 case MVT::i32:
5568 Opc = UseMULXHi ? X86::MULX32Hrr
5569 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5570 : IsSigned ? X86::IMUL32r
5571 : X86::MUL32r;
5572 MOpc = UseMULXHi ? X86::MULX32Hrm
5573 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5574 : IsSigned ? X86::IMUL32m
5575 : X86::MUL32m;
5576 LoReg = UseMULX ? X86::EDX : X86::EAX;
5577 HiReg = X86::EDX;
5578 break;
5579 case MVT::i64:
5580 Opc = UseMULXHi ? X86::MULX64Hrr
5581 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5582 : IsSigned ? X86::IMUL64r
5583 : X86::MUL64r;
5584 MOpc = UseMULXHi ? X86::MULX64Hrm
5585 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5586 : IsSigned ? X86::IMUL64m
5587 : X86::MUL64m;
5588 LoReg = UseMULX ? X86::RDX : X86::RAX;
5589 HiReg = X86::RDX;
5590 break;
5591 }
5592
5593 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5594 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5595 // Multiply is commutative.
5596 if (!foldedLoad) {
5597 foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5598 if (foldedLoad)
5599 std::swap(a&: N0, b&: N1);
5600 }
5601
5602 SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5603 N: N0, Glue: SDValue()).getValue(R: 1);
5604 SDValue ResHi, ResLo;
5605 if (foldedLoad) {
5606 SDValue Chain;
5607 MachineSDNode *CNode = nullptr;
5608 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5609 InGlue };
5610 if (UseMULXHi) {
5611 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5612 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5613 ResHi = SDValue(CNode, 0);
5614 Chain = SDValue(CNode, 1);
5615 } else if (UseMULX) {
5616 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5617 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5618 ResHi = SDValue(CNode, 0);
5619 ResLo = SDValue(CNode, 1);
5620 Chain = SDValue(CNode, 2);
5621 } else {
5622 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5623 CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
5624 Chain = SDValue(CNode, 0);
5625 InGlue = SDValue(CNode, 1);
5626 }
5627
5628 // Update the chain.
5629 ReplaceUses(F: N1.getValue(R: 1), T: Chain);
5630 // Record the mem-refs
5631 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5632 } else {
5633 SDValue Ops[] = { N1, InGlue };
5634 if (UseMULXHi) {
5635 SDVTList VTs = CurDAG->getVTList(VT: NVT);
5636 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5637 ResHi = SDValue(CNode, 0);
5638 } else if (UseMULX) {
5639 SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
5640 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5641 ResHi = SDValue(CNode, 0);
5642 ResLo = SDValue(CNode, 1);
5643 } else {
5644 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5645 SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
5646 InGlue = SDValue(CNode, 0);
5647 }
5648 }
5649
5650 // Copy the low half of the result, if it is needed.
5651 if (!SDValue(Node, 0).use_empty()) {
5652 if (!ResLo) {
5653 assert(LoReg && "Register for low half is not defined!");
5654 ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
5655 VT: NVT, Glue: InGlue);
5656 InGlue = ResLo.getValue(R: 2);
5657 }
5658 ReplaceUses(F: SDValue(Node, 0), T: ResLo);
5659 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5660 dbgs() << '\n');
5661 }
5662 // Copy the high half of the result, if it is needed.
5663 if (!SDValue(Node, 1).use_empty()) {
5664 if (!ResHi) {
5665 assert(HiReg && "Register for high half is not defined!");
5666 ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
5667 VT: NVT, Glue: InGlue);
5668 InGlue = ResHi.getValue(R: 2);
5669 }
5670 ReplaceUses(F: SDValue(Node, 1), T: ResHi);
5671 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5672 dbgs() << '\n');
5673 }
5674
5675 CurDAG->RemoveDeadNode(N: Node);
5676 return;
5677 }
5678
5679 case ISD::SDIVREM:
5680 case ISD::UDIVREM: {
5681 SDValue N0 = Node->getOperand(Num: 0);
5682 SDValue N1 = Node->getOperand(Num: 1);
5683
5684 unsigned ROpc, MOpc;
5685 bool isSigned = Opcode == ISD::SDIVREM;
5686 if (!isSigned) {
5687 switch (NVT.SimpleTy) {
5688 default: llvm_unreachable("Unsupported VT!");
5689 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5690 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5691 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5692 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5693 }
5694 } else {
5695 switch (NVT.SimpleTy) {
5696 default: llvm_unreachable("Unsupported VT!");
5697 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5698 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5699 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5700 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5701 }
5702 }
5703
5704 unsigned LoReg, HiReg, ClrReg;
5705 unsigned SExtOpcode;
5706 switch (NVT.SimpleTy) {
5707 default: llvm_unreachable("Unsupported VT!");
5708 case MVT::i8:
5709 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5710 SExtOpcode = 0; // Not used.
5711 break;
5712 case MVT::i16:
5713 LoReg = X86::AX; HiReg = X86::DX;
5714 ClrReg = X86::DX;
5715 SExtOpcode = X86::CWD;
5716 break;
5717 case MVT::i32:
5718 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5719 SExtOpcode = X86::CDQ;
5720 break;
5721 case MVT::i64:
5722 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5723 SExtOpcode = X86::CQO;
5724 break;
5725 }
5726
5727 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5728 bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
5729 bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);
5730
5731 SDValue InGlue;
5732 if (NVT == MVT::i8) {
5733 // Special case for div8, just use a move with zero extension to AX to
5734 // clear the upper 8 bits (AH).
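      // E.g. "movzbw %cl, %ax" + "divb %bl" leaves the quotient in AL and
      // the remainder in AH.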
5735 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5736 MachineSDNode *Move;
5737 if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
5738 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
5739 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5740 : X86::MOVZX16rm8;
5741 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5742 Chain = SDValue(Move, 1);
5743 ReplaceUses(F: N0.getValue(R: 1), T: Chain);
5744 // Record the mem-refs
5745 CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
5746 } else {
5747 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5748 : X86::MOVZX16rr8;
5749 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5750 Chain = CurDAG->getEntryNode();
5751 }
5752 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5753 SDValue());
5754 InGlue = Chain.getValue(R: 1);
5755 } else {
5756 InGlue =
5757 CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
5758 Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
5759 if (isSigned && !signBitIsZero) {
5760 // Sign extend the low part into the high part.
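        // (CWD/CDQ/CQO sign-extends AX/EAX/RAX into DX/EDX/RDX.)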
5761 InGlue =
5762 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5763 } else {
5764 // Zero out the high part, effectively zero extending the input.
5765 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5766 SDValue ClrNode = SDValue(
5767 CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
5768 switch (NVT.SimpleTy) {
5769 case MVT::i16:
5770 ClrNode =
5771 SDValue(CurDAG->getMachineNode(
5772 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5773 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5774 MVT::i32)),
5775 0);
5776 break;
5777 case MVT::i32:
5778 break;
5779 case MVT::i64:
5780 ClrNode =
5781 SDValue(CurDAG->getMachineNode(
5782 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5783 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5784 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5785 MVT::i32)),
5786 0);
5787 break;
5788 default:
5789 llvm_unreachable("Unexpected division source");
5790 }
5791
5792 InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
5793 N: ClrNode, Glue: InGlue).getValue(R: 1);
5794 }
5795 }
5796
5797 if (foldedLoad) {
5798 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
5799 InGlue };
5800 MachineSDNode *CNode =
5801 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5802 InGlue = SDValue(CNode, 1);
5803 // Update the chain.
5804 ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
5805 // Record the mem-refs
5806 CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
5807 } else {
5808 InGlue =
5809 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
5810 }
5811
5812 // Prevent use of AH in a REX instruction by explicitly copying it to
5813 // an ABCD_L register.
5814 //
5815 // The current assumption of the register allocator is that isel
5816 // won't generate explicit references to the GR8_ABCD_H registers. If
5817 // the allocator and/or the backend get enhanced to be more robust in
5818 // that regard, this can be, and should be, removed.
5819 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
5820 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
5821 unsigned AHExtOpcode =
5822 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
5823
5824 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
5825 MVT::Glue, AHCopy, InGlue);
5826 SDValue Result(RNode, 0);
5827 InGlue = SDValue(RNode, 1);
5828
5829 Result =
5830 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
5831
5832 ReplaceUses(F: SDValue(Node, 1), T: Result);
5833 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5834 dbgs() << '\n');
5835 }
5836 // Copy the division (low) result, if it is needed.
5837 if (!SDValue(Node, 0).use_empty()) {
5838 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
5839 Reg: LoReg, VT: NVT, Glue: InGlue);
5840 InGlue = Result.getValue(R: 2);
5841 ReplaceUses(F: SDValue(Node, 0), T: Result);
5842 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5843 dbgs() << '\n');
5844 }
5845 // Copy the remainder (high) result, if it is needed.
5846 if (!SDValue(Node, 1).use_empty()) {
5847 SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
5848 Reg: HiReg, VT: NVT, Glue: InGlue);
5849 InGlue = Result.getValue(R: 2);
5850 ReplaceUses(F: SDValue(Node, 1), T: Result);
5851 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
5852 dbgs() << '\n');
5853 }
5854 CurDAG->RemoveDeadNode(N: Node);
5855 return;
5856 }
5857
5858 case X86ISD::FCMP:
5859 case X86ISD::STRICT_FCMP:
5860 case X86ISD::STRICT_FCMPS: {
5861 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
5862 Node->getOpcode() == X86ISD::STRICT_FCMPS;
5863 SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0);
5864 SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1);
5865
5866 // Save the original VT of the compare.
5867 MVT CmpVT = N0.getSimpleValueType();
5868
5869 // Floating point needs special handling if we don't have FCOMI.
5870 if (Subtarget->canUseCMOV())
5871 break;
5872
5873 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
5874
5875 unsigned Opc;
5876 switch (CmpVT.SimpleTy) {
5877 default: llvm_unreachable("Unexpected type!");
5878 case MVT::f32:
5879 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
5880 break;
5881 case MVT::f64:
5882 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
5883 break;
5884 case MVT::f80:
5885 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
5886 break;
5887 }
5888
5889 SDValue Chain =
5890 IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode();
5891 SDValue Glue;
5892 if (IsStrictCmp) {
5893 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5894 Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0);
5895 Glue = Chain.getValue(R: 1);
5896 } else {
5897 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
5898 }
5899
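    // What follows is the classic pre-FCOMI sequence: "fnstsw %ax" copies the
    // FPU status word (with the C0/C2/C3 compare bits) into AX, and "sahf"
    // then loads AH into EFLAGS.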
5900 // Move FPSW to AX.
5901 SDValue FNSTSW =
5902 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
5903
5904 // Extract upper 8-bits of AX.
5905 SDValue Extract =
5906 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
5907
5908 // Move AH into flags.
5909 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
5910 assert(Subtarget->canUseLAHFSAHF() &&
5911 "Target doesn't support SAHF or FCOMI?");
5912 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
5913 Chain = AH;
5914 SDValue SAHF = SDValue(
5915 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
5916
5917 if (IsStrictCmp)
5918 ReplaceUses(F: SDValue(Node, 1), T: Chain);
5919
5920 ReplaceUses(F: SDValue(Node, 0), T: SAHF);
5921 CurDAG->RemoveDeadNode(N: Node);
5922 return;
5923 }
5924
5925 case X86ISD::CMP: {
5926 SDValue N0 = Node->getOperand(Num: 0);
5927 SDValue N1 = Node->getOperand(Num: 1);
5928
5929 // Optimizations for TEST compares.
5930 if (!isNullConstant(V: N1))
5931 break;
5932
5933 // Save the original VT of the compare.
5934 MVT CmpVT = N0.getSimpleValueType();
5935
5936    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
5937 // by a test instruction. The test should be removed later by
5938 // analyzeCompare if we are using only the zero flag.
5939 // TODO: Should we check the users and use the BEXTR flags directly?
5940 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
5941 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
5942 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
5943 : X86::TEST32rr;
5944 SDValue BEXTR = SDValue(NewNode, 0);
5945 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
5946 ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
5947 CurDAG->RemoveDeadNode(N: Node);
5948 return;
5949 }
5950 }
5951
5952 // We can peek through truncates, but we need to be careful below.
5953 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5954 N0 = N0.getOperand(i: 0);
5955
5956 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5957 // use a smaller encoding.
5958 // Look past the truncate if CMP is the only use of it.
5959 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
5960 N0.getValueType() != MVT::i8) {
5961 auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
5962 if (!MaskC)
5963 break;
5964
5965 // We may have looked through a truncate so mask off any bits that
5966 // shouldn't be part of the compare.
5967 uint64_t Mask = MaskC->getZExtValue();
5968 Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());
5969
5970 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
5971 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
5972 // zero flag.
5973 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
5974 onlyUsesZeroFlag(SDValue(Node, 0))) {
5975 unsigned ShiftOpcode = ISD::DELETED_NODE;
5976 unsigned ShiftAmt;
5977 unsigned SubRegIdx;
5978 MVT SubRegVT;
5979 unsigned TestOpcode;
5980 unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
5981 unsigned TrailingZeros = llvm::countr_zero(Val: Mask);
5982
5983 // With leading/trailing zeros, the transform is profitable if we can
5984 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
5985 // incurring any extra register moves.
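        // For example, testing a value against the mask 0xffffffff00000000
        // (which would need a movabsq) becomes "shrq $32" of the value
        // followed by "testq" of the result against itself.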
5986 bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
5987 if (LeadingZeros == 0 && SavesBytes) {
5988 // If the mask covers the most significant bit, then we can replace
5989 // TEST+AND with a SHR and check eflags.
5990 // This emits a redundant TEST which is subsequently eliminated.
5991 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
5992 ShiftAmt = TrailingZeros;
5993 SubRegIdx = 0;
5994 TestOpcode = X86::TEST64rr;
5995 } else if (TrailingZeros == 0 && SavesBytes) {
5996 // If the mask covers the least significant bit, then we can replace
5997 // TEST+AND with a SHL and check eflags.
5998 // This emits a redundant TEST which is subsequently eliminated.
5999 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6000 ShiftAmt = LeadingZeros;
6001 SubRegIdx = 0;
6002 TestOpcode = X86::TEST64rr;
6003 } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
6004 // If the shifted mask extends into the high half and is 8/16/32 bits
6005 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6006 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
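          // PopCount is the width of the contiguous run of set bits, e.g. 16
          // for the mask 0x0000FFFF00000000.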
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag under two
      // conditions: either the sign bit in the shrunken mask is zero, or the
      // final test size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8".
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
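        // (TESTW needs the 0x66 operand-size prefix in front of a 16-bit
        // immediate, which can stall the instruction-length decoders on some
        // microarchitectures.)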
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run this transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs.
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;
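    // PCMPISTRI returns its index in ECX and PCMPISTRM returns its mask in
    // XMM0, so when both results are live we emit two instructions and can
    // fold the memory operand into at most one of them.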

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);
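    // PCMPESTR* read the explicit string lengths from EAX and EDX, so they
    // must be materialized in those registers before the instruction.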

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
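      // SETB_C32r/SETB_C64r are pseudos that expand to "sbb reg, reg",
      // materializing all-ones when the carry flag is set and zero otherwise.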
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking in here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
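    // AVX-512 gathers take an i1 mask in a k-register; the AVX2 forms below
    // take a vector mask and use each element's sign bit. In the opcode
    // names, the first letter after GATHER encodes the index width (D =
    // 32-bit, Q = 64-bit) and the rest the element type (PS/PD = float,
    // D/Q = integer).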
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
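    // (The hardware clears mask elements as their loads complete, so the
    // instruction also defines the mask; that extra result is unused here.)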
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking in here, based on what a
    // type constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

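    // The WIDEKL instructions implicitly read and write their eight data
    // blocks in XMM0-XMM7, so copy the operands into those registers, gluing
    // the copies together.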
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

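  // An X86 memory operand is always the five-tuple (base, scale, index,
  // disp, segment) produced by selectAddr above.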
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

6552/// This pass converts a legalized DAG into a X86-specific DAG,
6553/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}
