1 | //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines a DAG pattern matching instruction selector for X86, |
10 | // converting from a legalized dag to a X86 dag. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "X86.h" |
15 | #include "X86MachineFunctionInfo.h" |
16 | #include "X86RegisterInfo.h" |
17 | #include "X86Subtarget.h" |
18 | #include "X86TargetMachine.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/CodeGen/MachineModuleInfo.h" |
21 | #include "llvm/CodeGen/SelectionDAGISel.h" |
22 | #include "llvm/Config/llvm-config.h" |
23 | #include "llvm/IR/ConstantRange.h" |
24 | #include "llvm/IR/Function.h" |
25 | #include "llvm/IR/Instructions.h" |
26 | #include "llvm/IR/Intrinsics.h" |
27 | #include "llvm/IR/IntrinsicsX86.h" |
28 | #include "llvm/IR/Type.h" |
29 | #include "llvm/Support/Debug.h" |
30 | #include "llvm/Support/ErrorHandling.h" |
31 | #include "llvm/Support/KnownBits.h" |
32 | #include "llvm/Support/MathExtras.h" |
33 | #include <cstdint> |
34 | |
35 | using namespace llvm; |
36 | |
37 | #define DEBUG_TYPE "x86-isel" |
38 | #define PASS_NAME "X86 DAG->DAG Instruction Selection" |
39 | |
40 | STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor" ); |
41 | |
42 | static cl::opt<bool> AndImmShrink("x86-and-imm-shrink" , cl::init(Val: true), |
43 | cl::desc("Enable setting constant bits to reduce size of mask immediates" ), |
44 | cl::Hidden); |
45 | |
46 | static cl::opt<bool> EnablePromoteAnyextLoad( |
47 | "x86-promote-anyext-load" , cl::init(Val: true), |
48 | cl::desc("Enable promoting aligned anyext load to wider load" ), cl::Hidden); |
49 | |
50 | extern cl::opt<bool> IndirectBranchTracking; |
51 | |
52 | //===----------------------------------------------------------------------===// |
53 | // Pattern Matcher Implementation |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | namespace { |
/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
  enum {
    RegBase,
    FrameIndexBase
  } BaseType = RegBase;

  // This is really a union, discriminated by BaseType!
  SDValue Base_Reg;        // Base register; valid when BaseType == RegBase.
  int Base_FrameIndex = 0; // Frame index; valid when BaseType == FrameIndexBase.

  unsigned Scale = 1; // Scale applied to IndexReg.
  SDValue IndexReg;   // Optional scaled index register.
  int32_t Disp = 0;   // Constant displacement.
  SDValue Segment;    // Optional segment register operand.
  // At most one of the following symbolic displacement fields is expected to
  // be set at a time; see hasSymbolicDisplacement().
  const GlobalValue *GV = nullptr;
  const Constant *CP = nullptr;
  const BlockAddress *BlockAddr = nullptr;
  const char *ES = nullptr;
  MCSymbol *MCSym = nullptr;
  int JT = -1;
  Align Alignment; // CP alignment.
  unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
  bool NegateIndex = false; // If set, IndexReg must be negated when emitted.

  X86ISelAddressMode() = default;

  /// Return true if any of the symbolic displacement fields are in use.
  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
  }

  /// Return true if a frame-index base, base register, or index register has
  /// already been selected.
  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
  }

  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
          dyn_cast_or_null<RegisterSDNode>(Val: Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;
    return false;
  }

  /// Switch to a register base and record \p Reg as that base.
  void setBaseReg(SDValue Reg) {
    BaseType = RegBase;
    Base_Reg = Reg;
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Debug pretty-printer for the matched address mode.
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(G: DAG);
    else
      dbgs() << "nul\n";
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n'
           << "IndexReg ";
    if (NegateIndex)
      dbgs() << "negate ";
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(G: DAG);
    else
      dbgs() << "nul\n";
    dbgs() << " Disp " << Disp << '\n'
           << "GV ";
    if (GV)
      GV->dump();
    else
      dbgs() << "nul";
    dbgs() << " CP ";
    if (CP)
      CP->dump();
    else
      dbgs() << "nul";
    dbgs() << '\n'
           << "ES ";
    if (ES)
      dbgs() << ES;
    else
      dbgs() << "nul";
    dbgs() << " MCSym ";
    if (MCSym)
      dbgs() << MCSym;
    else
      dbgs() << "nul";
    dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
  }
#endif
};
153 | } |
154 | |
155 | namespace { |
156 | //===--------------------------------------------------------------------===// |
157 | /// ISel - X86-specific code to select X86 machine instructions for |
158 | /// SelectionDAG operations. |
159 | /// |
160 | class X86DAGToDAGISel final : public SelectionDAGISel { |
161 | /// Keep a pointer to the X86Subtarget around so that we can |
162 | /// make the right decision when generating code for different targets. |
163 | const X86Subtarget *Subtarget; |
164 | |
165 | /// If true, selector should try to optimize for minimum code size. |
166 | bool OptForMinSize; |
167 | |
168 | /// Disable direct TLS access through segment registers. |
169 | bool IndirectTlsSegRefs; |
170 | |
171 | public: |
172 | static char ID; |
173 | |
174 | X86DAGToDAGISel() = delete; |
175 | |
176 | explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel) |
177 | : SelectionDAGISel(ID, tm, OptLevel), Subtarget(nullptr), |
178 | OptForMinSize(false), IndirectTlsSegRefs(false) {} |
179 | |
180 | bool runOnMachineFunction(MachineFunction &MF) override { |
181 | // Reset the subtarget each time through. |
182 | Subtarget = &MF.getSubtarget<X86Subtarget>(); |
183 | IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( |
184 | Kind: "indirect-tls-seg-refs" ); |
185 | |
186 | // OptFor[Min]Size are used in pattern predicates that isel is matching. |
187 | OptForMinSize = MF.getFunction().hasMinSize(); |
188 | assert((!OptForMinSize || MF.getFunction().hasOptSize()) && |
189 | "OptForMinSize implies OptForSize" ); |
190 | |
191 | SelectionDAGISel::runOnMachineFunction(MF); |
192 | return true; |
193 | } |
194 | |
195 | void emitFunctionEntryCode() override; |
196 | |
197 | bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; |
198 | |
199 | void PreprocessISelDAG() override; |
200 | void PostprocessISelDAG() override; |
201 | |
202 | // Include the pieces autogenerated from the target description. |
203 | #include "X86GenDAGISel.inc" |
204 | |
205 | private: |
206 | void Select(SDNode *N) override; |
207 | |
208 | bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); |
209 | bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, |
210 | bool AllowSegmentRegForX32 = false); |
211 | bool matchWrapper(SDValue N, X86ISelAddressMode &AM); |
212 | bool matchAddress(SDValue N, X86ISelAddressMode &AM); |
213 | bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); |
214 | bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); |
215 | SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM, |
216 | unsigned Depth); |
217 | bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
218 | unsigned Depth); |
219 | bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM, |
220 | unsigned Depth); |
221 | bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); |
222 | bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, |
223 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
224 | SDValue &Segment); |
225 | bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, |
226 | SDValue ScaleOp, SDValue &Base, SDValue &Scale, |
227 | SDValue &Index, SDValue &Disp, SDValue &Segment); |
228 | bool selectMOV64Imm32(SDValue N, SDValue &Imm); |
229 | bool selectLEAAddr(SDValue N, SDValue &Base, |
230 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
231 | SDValue &Segment); |
232 | bool selectLEA64_32Addr(SDValue N, SDValue &Base, |
233 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
234 | SDValue &Segment); |
235 | bool selectTLSADDRAddr(SDValue N, SDValue &Base, |
236 | SDValue &Scale, SDValue &Index, SDValue &Disp, |
237 | SDValue &Segment); |
238 | bool selectRelocImm(SDValue N, SDValue &Op); |
239 | |
240 | bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
241 | SDValue &Base, SDValue &Scale, |
242 | SDValue &Index, SDValue &Disp, |
243 | SDValue &Segment); |
244 | |
245 | // Convenience method where P is also root. |
246 | bool tryFoldLoad(SDNode *P, SDValue N, |
247 | SDValue &Base, SDValue &Scale, |
248 | SDValue &Index, SDValue &Disp, |
249 | SDValue &Segment) { |
250 | return tryFoldLoad(Root: P, P, N, Base, Scale, Index, Disp, Segment); |
251 | } |
252 | |
253 | bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
254 | SDValue &Base, SDValue &Scale, |
255 | SDValue &Index, SDValue &Disp, |
256 | SDValue &Segment); |
257 | |
258 | bool isProfitableToFormMaskedOp(SDNode *N) const; |
259 | |
260 | /// Implement addressing mode selection for inline asm expressions. |
261 | bool SelectInlineAsmMemoryOperand(const SDValue &Op, |
262 | InlineAsm::ConstraintCode ConstraintID, |
263 | std::vector<SDValue> &OutOps) override; |
264 | |
265 | void emitSpecialCodeForMain(); |
266 | |
267 | inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, |
268 | MVT VT, SDValue &Base, SDValue &Scale, |
269 | SDValue &Index, SDValue &Disp, |
270 | SDValue &Segment) { |
271 | if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) |
272 | Base = CurDAG->getTargetFrameIndex( |
273 | FI: AM.Base_FrameIndex, VT: TLI->getPointerTy(DL: CurDAG->getDataLayout())); |
274 | else if (AM.Base_Reg.getNode()) |
275 | Base = AM.Base_Reg; |
276 | else |
277 | Base = CurDAG->getRegister(Reg: 0, VT); |
278 | |
279 | Scale = getI8Imm(Imm: AM.Scale, DL); |
280 | |
281 | #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC) |
282 | // Negate the index if needed. |
283 | if (AM.NegateIndex) { |
284 | unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r) |
285 | : GET_ND_IF_ENABLED(X86::NEG32r); |
286 | SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, |
287 | AM.IndexReg), 0); |
288 | AM.IndexReg = Neg; |
289 | } |
290 | |
291 | if (AM.IndexReg.getNode()) |
292 | Index = AM.IndexReg; |
293 | else |
294 | Index = CurDAG->getRegister(Reg: 0, VT); |
295 | |
296 | // These are 32-bit even in 64-bit mode since RIP-relative offset |
297 | // is 32-bit. |
298 | if (AM.GV) |
299 | Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), |
300 | MVT::i32, AM.Disp, |
301 | AM.SymbolFlags); |
302 | else if (AM.CP) |
303 | Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, |
304 | AM.Disp, AM.SymbolFlags); |
305 | else if (AM.ES) { |
306 | assert(!AM.Disp && "Non-zero displacement is ignored with ES." ); |
307 | Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); |
308 | } else if (AM.MCSym) { |
309 | assert(!AM.Disp && "Non-zero displacement is ignored with MCSym." ); |
310 | assert(AM.SymbolFlags == 0 && "oo" ); |
311 | Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); |
312 | } else if (AM.JT != -1) { |
313 | assert(!AM.Disp && "Non-zero displacement is ignored with JT." ); |
314 | Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); |
315 | } else if (AM.BlockAddr) |
316 | Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, |
317 | AM.SymbolFlags); |
318 | else |
319 | Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); |
320 | |
321 | if (AM.Segment.getNode()) |
322 | Segment = AM.Segment; |
323 | else |
324 | Segment = CurDAG->getRegister(0, MVT::i16); |
325 | } |
326 | |
327 | // Utility function to determine whether we should avoid selecting |
328 | // immediate forms of instructions for better code size or not. |
329 | // At a high level, we'd like to avoid such instructions when |
330 | // we have similar constants used within the same basic block |
331 | // that can be kept in a register. |
332 | // |
333 | bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { |
334 | uint32_t UseCount = 0; |
335 | |
336 | // Do not want to hoist if we're not optimizing for size. |
337 | // TODO: We'd like to remove this restriction. |
338 | // See the comment in X86InstrInfo.td for more info. |
339 | if (!CurDAG->shouldOptForSize()) |
340 | return false; |
341 | |
342 | // Walk all the users of the immediate. |
343 | for (const SDNode *User : N->uses()) { |
344 | if (UseCount >= 2) |
345 | break; |
346 | |
347 | // This user is already selected. Count it as a legitimate use and |
348 | // move on. |
349 | if (User->isMachineOpcode()) { |
350 | UseCount++; |
351 | continue; |
352 | } |
353 | |
354 | // We want to count stores of immediates as real uses. |
355 | if (User->getOpcode() == ISD::STORE && |
356 | User->getOperand(1).getNode() == N) { |
357 | UseCount++; |
358 | continue; |
359 | } |
360 | |
361 | // We don't currently match users that have > 2 operands (except |
362 | // for stores, which are handled above) |
363 | // Those instruction won't match in ISEL, for now, and would |
364 | // be counted incorrectly. |
365 | // This may change in the future as we add additional instruction |
366 | // types. |
367 | if (User->getNumOperands() != 2) |
368 | continue; |
369 | |
370 | // If this is a sign-extended 8-bit integer immediate used in an ALU |
371 | // instruction, there is probably an opcode encoding to save space. |
372 | auto *C = dyn_cast<ConstantSDNode>(N); |
373 | if (C && isInt<8>(C->getSExtValue())) |
374 | continue; |
375 | |
376 | // Immediates that are used for offsets as part of stack |
377 | // manipulation should be left alone. These are typically |
378 | // used to indicate SP offsets for argument passing and |
379 | // will get pulled into stores/pushes (implicitly). |
380 | if (User->getOpcode() == X86ISD::ADD || |
381 | User->getOpcode() == ISD::ADD || |
382 | User->getOpcode() == X86ISD::SUB || |
383 | User->getOpcode() == ISD::SUB) { |
384 | |
385 | // Find the other operand of the add/sub. |
386 | SDValue OtherOp = User->getOperand(0); |
387 | if (OtherOp.getNode() == N) |
388 | OtherOp = User->getOperand(1); |
389 | |
390 | // Don't count if the other operand is SP. |
391 | RegisterSDNode *RegNode; |
392 | if (OtherOp->getOpcode() == ISD::CopyFromReg && |
393 | (RegNode = dyn_cast_or_null<RegisterSDNode>( |
394 | OtherOp->getOperand(1).getNode()))) |
395 | if ((RegNode->getReg() == X86::ESP) || |
396 | (RegNode->getReg() == X86::RSP)) |
397 | continue; |
398 | } |
399 | |
400 | // ... otherwise, count this and move on. |
401 | UseCount++; |
402 | } |
403 | |
404 | // If we have more than 1 use, then recommend for hoisting. |
405 | return (UseCount > 1); |
406 | } |
407 | |
408 | /// Return a target constant with the specified value of type i8. |
409 | inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { |
410 | return CurDAG->getTargetConstant(Imm, DL, MVT::i8); |
411 | } |
412 | |
413 | /// Return a target constant with the specified value, of type i32. |
414 | inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { |
415 | return CurDAG->getTargetConstant(Imm, DL, MVT::i32); |
416 | } |
417 | |
418 | /// Return a target constant with the specified value, of type i64. |
419 | inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { |
420 | return CurDAG->getTargetConstant(Imm, DL, MVT::i64); |
421 | } |
422 | |
423 | SDValue (SDNode *N, unsigned VecWidth, |
424 | const SDLoc &DL) { |
425 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
426 | uint64_t Index = N->getConstantOperandVal(Num: 1); |
427 | MVT VecVT = N->getOperand(Num: 0).getSimpleValueType(); |
428 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
429 | } |
430 | |
431 | SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, |
432 | const SDLoc &DL) { |
433 | assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width" ); |
434 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
435 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
436 | return getI8Imm(Imm: (Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); |
437 | } |
438 | |
439 | SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth, |
440 | const SDLoc &DL) { |
441 | assert(VecWidth == 128 && "Unexpected vector width" ); |
442 | uint64_t Index = N->getConstantOperandVal(Num: 2); |
443 | MVT VecVT = N->getSimpleValueType(ResNo: 0); |
444 | uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth; |
445 | assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index" ); |
446 | // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub) |
447 | // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub) |
448 | return getI8Imm(Imm: InsertIdx ? 0x02 : 0x30, DL); |
449 | } |
450 | |
451 | SDValue getSBBZero(SDNode *N) { |
452 | SDLoc dl(N); |
453 | MVT VT = N->getSimpleValueType(ResNo: 0); |
454 | |
455 | // Create zero. |
456 | SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); |
457 | SDValue Zero = SDValue( |
458 | CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0); |
459 | if (VT == MVT::i64) { |
460 | Zero = SDValue( |
461 | CurDAG->getMachineNode( |
462 | TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, |
463 | CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, |
464 | CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), |
465 | 0); |
466 | } |
467 | |
468 | // Copy flags to the EFLAGS register and glue it to next node. |
469 | unsigned Opcode = N->getOpcode(); |
470 | assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && |
471 | "Unexpected opcode for SBB materialization" ); |
472 | unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; |
473 | SDValue EFLAGS = |
474 | CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, |
475 | N->getOperand(FlagOpIndex), SDValue()); |
476 | |
477 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
478 | // 32-bit version. |
479 | unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; |
480 | MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
481 | VTs = CurDAG->getVTList(SBBVT, MVT::i32); |
482 | return SDValue( |
483 | CurDAG->getMachineNode(Opc, dl, VTs, |
484 | {Zero, Zero, EFLAGS, EFLAGS.getValue(R: 1)}), |
485 | 0); |
486 | } |
487 | |
488 | // Helper to detect unneeded and instructions on shift amounts. Called |
489 | // from PatFrags in tablegen. |
490 | bool isUnneededShiftMask(SDNode *N, unsigned Width) const { |
491 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode" ); |
492 | const APInt &Val = N->getConstantOperandAPInt(Num: 1); |
493 | |
494 | if (Val.countr_one() >= Width) |
495 | return true; |
496 | |
497 | APInt Mask = Val | CurDAG->computeKnownBits(Op: N->getOperand(Num: 0)).Zero; |
498 | return Mask.countr_one() >= Width; |
499 | } |
500 | |
501 | /// Return an SDNode that returns the value of the global base register. |
502 | /// Output instructions required to initialize the global base register, |
503 | /// if necessary. |
504 | SDNode *getGlobalBaseReg(); |
505 | |
506 | /// Return a reference to the TargetMachine, casted to the target-specific |
507 | /// type. |
508 | const X86TargetMachine &getTargetMachine() const { |
509 | return static_cast<const X86TargetMachine &>(TM); |
510 | } |
511 | |
512 | /// Return a reference to the TargetInstrInfo, casted to the target-specific |
513 | /// type. |
514 | const X86InstrInfo *getInstrInfo() const { |
515 | return Subtarget->getInstrInfo(); |
516 | } |
517 | |
518 | /// Return a condition code of the given SDNode |
519 | X86::CondCode getCondFromNode(SDNode *N) const; |
520 | |
521 | /// Address-mode matching performs shift-of-and to and-of-shift |
522 | /// reassociation in order to expose more scaled addressing |
523 | /// opportunities. |
524 | bool ComplexPatternFuncMutatesDAG() const override { |
525 | return true; |
526 | } |
527 | |
528 | bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; |
529 | |
530 | // Indicates we should prefer to use a non-temporal load for this load. |
531 | bool useNonTemporalLoad(LoadSDNode *N) const { |
532 | if (!N->isNonTemporal()) |
533 | return false; |
534 | |
535 | unsigned StoreSize = N->getMemoryVT().getStoreSize(); |
536 | |
537 | if (N->getAlign().value() < StoreSize) |
538 | return false; |
539 | |
540 | switch (StoreSize) { |
541 | default: llvm_unreachable("Unsupported store size" ); |
542 | case 4: |
543 | case 8: |
544 | return false; |
545 | case 16: |
546 | return Subtarget->hasSSE41(); |
547 | case 32: |
548 | return Subtarget->hasAVX2(); |
549 | case 64: |
550 | return Subtarget->hasAVX512(); |
551 | } |
552 | } |
553 | |
554 | bool foldLoadStoreIntoMemOperand(SDNode *Node); |
555 | MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); |
556 | bool matchBitExtract(SDNode *Node); |
557 | bool shrinkAndImmediate(SDNode *N); |
558 | bool isMaskZeroExtended(SDNode *N) const; |
559 | bool tryShiftAmountMod(SDNode *N); |
560 | bool tryShrinkShlLogicImm(SDNode *N); |
561 | bool tryVPTERNLOG(SDNode *N); |
562 | bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB, |
563 | SDNode *ParentC, SDValue A, SDValue B, SDValue C, |
564 | uint8_t Imm); |
565 | bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); |
566 | bool tryMatchBitSelect(SDNode *N); |
567 | |
568 | MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
569 | const SDLoc &dl, MVT VT, SDNode *Node); |
570 | MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, |
571 | const SDLoc &dl, MVT VT, SDNode *Node, |
572 | SDValue &InGlue); |
573 | |
574 | bool tryOptimizeRem8Extend(SDNode *N); |
575 | |
576 | bool onlyUsesZeroFlag(SDValue Flags) const; |
577 | bool hasNoSignFlagUses(SDValue Flags) const; |
578 | bool hasNoCarryFlagUses(SDValue Flags) const; |
579 | }; |
580 | } |
581 | |
// Pass identification; the address of ID uniquely identifies the pass.
char X86DAGToDAGISel::ID = 0;

// Register the pass with the legacy pass manager.
INITIALIZE_PASS(X86DAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
585 | |
586 | // Returns true if this masked compare can be implemented legally with this |
587 | // type. |
588 | static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { |
589 | unsigned Opcode = N->getOpcode(); |
590 | if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || |
591 | Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || |
592 | Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { |
593 | // We can get 256-bit 8 element types here without VLX being enabled. When |
594 | // this happens we will use 512-bit operations and the mask will not be |
595 | // zero extended. |
596 | EVT OpVT = N->getOperand(Num: 0).getValueType(); |
597 | // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the |
598 | // second operand. |
599 | if (Opcode == X86ISD::STRICT_CMPM) |
600 | OpVT = N->getOperand(Num: 1).getValueType(); |
601 | if (OpVT.is256BitVector() || OpVT.is128BitVector()) |
602 | return Subtarget->hasVLX(); |
603 | |
604 | return true; |
605 | } |
606 | // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. |
607 | if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || |
608 | Opcode == X86ISD::FSETCCM_SAE) |
609 | return true; |
610 | |
611 | return false; |
612 | } |
613 | |
614 | // Returns true if we can assume the writer of the mask has zero extended it |
615 | // for us. |
616 | bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { |
617 | // If this is an AND, check if we have a compare on either side. As long as |
618 | // one side guarantees the mask is zero extended, the AND will preserve those |
619 | // zeros. |
620 | if (N->getOpcode() == ISD::AND) |
621 | return isLegalMaskCompare(N: N->getOperand(Num: 0).getNode(), Subtarget) || |
622 | isLegalMaskCompare(N: N->getOperand(Num: 1).getNode(), Subtarget); |
623 | |
624 | return isLegalMaskCompare(N, Subtarget); |
625 | } |
626 | |
627 | bool |
628 | X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { |
629 | if (OptLevel == CodeGenOptLevel::None) |
630 | return false; |
631 | |
632 | if (!N.hasOneUse()) |
633 | return false; |
634 | |
635 | if (N.getOpcode() != ISD::LOAD) |
636 | return true; |
637 | |
638 | // Don't fold non-temporal loads if we have an instruction for them. |
639 | if (useNonTemporalLoad(N: cast<LoadSDNode>(Val&: N))) |
640 | return false; |
641 | |
642 | // If N is a load, do additional profitability checks. |
643 | if (U == Root) { |
644 | switch (U->getOpcode()) { |
645 | default: break; |
646 | case X86ISD::ADD: |
647 | case X86ISD::ADC: |
648 | case X86ISD::SUB: |
649 | case X86ISD::SBB: |
650 | case X86ISD::AND: |
651 | case X86ISD::XOR: |
652 | case X86ISD::OR: |
653 | case ISD::ADD: |
654 | case ISD::UADDO_CARRY: |
655 | case ISD::AND: |
656 | case ISD::OR: |
657 | case ISD::XOR: { |
658 | SDValue Op1 = U->getOperand(Num: 1); |
659 | |
660 | // If the other operand is a 8-bit immediate we should fold the immediate |
661 | // instead. This reduces code size. |
662 | // e.g. |
663 | // movl 4(%esp), %eax |
664 | // addl $4, %eax |
665 | // vs. |
666 | // movl $4, %eax |
667 | // addl 4(%esp), %eax |
668 | // The former is 2 bytes shorter. In case where the increment is 1, then |
669 | // the saving can be 4 bytes (by using incl %eax). |
670 | if (auto *Imm = dyn_cast<ConstantSDNode>(Val&: Op1)) { |
671 | if (Imm->getAPIntValue().isSignedIntN(N: 8)) |
672 | return false; |
673 | |
674 | // If this is a 64-bit AND with an immediate that fits in 32-bits, |
675 | // prefer using the smaller and over folding the load. This is needed to |
676 | // make sure immediates created by shrinkAndImmediate are always folded. |
677 | // Ideally we would narrow the load during DAG combine and get the |
678 | // best of both worlds. |
679 | if (U->getOpcode() == ISD::AND && |
680 | Imm->getAPIntValue().getBitWidth() == 64 && |
681 | Imm->getAPIntValue().isIntN(N: 32)) |
682 | return false; |
683 | |
684 | // If this really a zext_inreg that can be represented with a movzx |
685 | // instruction, prefer that. |
686 | // TODO: We could shrink the load and fold if it is non-volatile. |
687 | if (U->getOpcode() == ISD::AND && |
688 | (Imm->getAPIntValue() == UINT8_MAX || |
689 | Imm->getAPIntValue() == UINT16_MAX || |
690 | Imm->getAPIntValue() == UINT32_MAX)) |
691 | return false; |
692 | |
693 | // ADD/SUB with can negate the immediate and use the opposite operation |
694 | // to fit 128 into a sign extended 8 bit immediate. |
695 | if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && |
696 | (-Imm->getAPIntValue()).isSignedIntN(N: 8)) |
697 | return false; |
698 | |
699 | if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && |
700 | (-Imm->getAPIntValue()).isSignedIntN(N: 8) && |
701 | hasNoCarryFlagUses(Flags: SDValue(U, 1))) |
702 | return false; |
703 | } |
704 | |
705 | // If the other operand is a TLS address, we should fold it instead. |
706 | // This produces |
707 | // movl %gs:0, %eax |
708 | // leal i@NTPOFF(%eax), %eax |
709 | // instead of |
710 | // movl $i@NTPOFF, %eax |
711 | // addl %gs:0, %eax |
712 | // if the block also has an access to a second TLS address this will save |
713 | // a load. |
714 | // FIXME: This is probably also true for non-TLS addresses. |
715 | if (Op1.getOpcode() == X86ISD::Wrapper) { |
716 | SDValue Val = Op1.getOperand(i: 0); |
717 | if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) |
718 | return false; |
719 | } |
720 | |
721 | // Don't fold load if this matches the BTS/BTR/BTC patterns. |
722 | // BTS: (or X, (shl 1, n)) |
723 | // BTR: (and X, (rotl -2, n)) |
724 | // BTC: (xor X, (shl 1, n)) |
725 | if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { |
726 | if (U->getOperand(Num: 0).getOpcode() == ISD::SHL && |
727 | isOneConstant(V: U->getOperand(Num: 0).getOperand(i: 0))) |
728 | return false; |
729 | |
730 | if (U->getOperand(Num: 1).getOpcode() == ISD::SHL && |
731 | isOneConstant(V: U->getOperand(Num: 1).getOperand(i: 0))) |
732 | return false; |
733 | } |
734 | if (U->getOpcode() == ISD::AND) { |
735 | SDValue U0 = U->getOperand(Num: 0); |
736 | SDValue U1 = U->getOperand(Num: 1); |
737 | if (U0.getOpcode() == ISD::ROTL) { |
738 | auto *C = dyn_cast<ConstantSDNode>(Val: U0.getOperand(i: 0)); |
739 | if (C && C->getSExtValue() == -2) |
740 | return false; |
741 | } |
742 | |
743 | if (U1.getOpcode() == ISD::ROTL) { |
744 | auto *C = dyn_cast<ConstantSDNode>(Val: U1.getOperand(i: 0)); |
745 | if (C && C->getSExtValue() == -2) |
746 | return false; |
747 | } |
748 | } |
749 | |
750 | break; |
751 | } |
752 | case ISD::SHL: |
753 | case ISD::SRA: |
754 | case ISD::SRL: |
755 | // Don't fold a load into a shift by immediate. The BMI2 instructions |
756 | // support folding a load, but not an immediate. The legacy instructions |
757 | // support folding an immediate, but can't fold a load. Folding an |
758 | // immediate is preferable to folding a load. |
759 | if (isa<ConstantSDNode>(Val: U->getOperand(Num: 1))) |
760 | return false; |
761 | |
762 | break; |
763 | } |
764 | } |
765 | |
766 | // Prevent folding a load if this can implemented with an insert_subreg or |
767 | // a move that implicitly zeroes. |
768 | if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && |
769 | isNullConstant(V: Root->getOperand(Num: 2)) && |
770 | (Root->getOperand(Num: 0).isUndef() || |
771 | ISD::isBuildVectorAllZeros(N: Root->getOperand(Num: 0).getNode()))) |
772 | return false; |
773 | |
774 | return true; |
775 | } |
776 | |
777 | // Indicates it is profitable to form an AVX512 masked operation. Returning |
778 | // false will favor a masked register-register masked move or vblendm and the |
779 | // operation will be selected separately. |
780 | bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { |
781 | assert( |
782 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && |
783 | "Unexpected opcode!" ); |
784 | |
785 | // If the operation has additional users, the operation will be duplicated. |
786 | // Check the use count to prevent that. |
787 | // FIXME: Are there cheap opcodes we might want to duplicate? |
788 | return N->getOperand(Num: 1).hasOneUse(); |
789 | } |
790 | |
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  // Step 1: detach the load from the chain feeding OrigChain. If the load is
  // the direct chain predecessor, substitute the load's own input chain;
  // otherwise the predecessor must be a TokenFactor and we swap the load for
  // its input chain inside a rebuilt TokenFactor.
  SDValue Chain = OrigChain.getOperand(i: 0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Elt: Load.getOperand(i: 0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand" );
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Elt: Load.getOperand(i: 0));
      else
        Ops.push_back(Elt: Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(Elt: NewChain);
  }
  // Keep OrigChain's non-chain operands (op 1 onward) unchanged.
  Ops.append(in_start: OrigChain->op_begin() + 1, in_end: OrigChain->op_end());
  CurDAG->UpdateNodeOperands(N: OrigChain.getNode(), Ops);
  // Step 2: rechain the load onto the call's incoming chain so it now sits
  // immediately above the call (base ptr and offset operands are preserved).
  CurDAG->UpdateNodeOperands(N: Load.getNode(), Op1: Call.getOperand(i: 0),
                             Op2: Load.getOperand(i: 1), Op3: Load.getOperand(i: 2));

  // Step 3: make the load's output chain the call's chain operand, keeping
  // the rest of the call's operands as they were.
  Ops.clear();
  Ops.push_back(Elt: SDValue(Load.getNode(), 1));
  Ops.append(in_start: Call->op_begin() + 1, in_end: Call->op_end());
  CurDAG->UpdateNodeOperands(N: Call.getNode(), Ops);
}
822 | |
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  // Only a simple (non-volatile, non-atomic), unindexed, non-extending load
  // can be folded into the call.
  auto *LD = dyn_cast<LoadSDNode>(Val: Callee.getNode());
  if (!LD ||
      !LD->isSimple() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  // Walk up the chain; every intermediate node must have a single use or the
  // move would leave other users with a broken chain.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(i: 0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Val: Chain.getNode()) &&
      cast<MemSDNode>(Val: Chain.getNode())->writeMem())
    return false;
  // Accept if the load is the direct chain input of the callseq_start...
  if (Chain.getOperand(i: 0).getNode() == Callee.getNode())
    return true;
  // ...or if it reaches it through a TokenFactor that is the load's only
  // chain user.
  if (Chain.getOperand(i: 0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(R: 1).isOperandOf(N: Chain.getOperand(i: 0).getNode()) &&
      Callee.getValue(R: 1).hasOneUse())
    return true;
  return false;
}
864 | |
// Return true if the 64-bit immediate encodes an ENDBR64 instruction
// (F3 0F 1E FA), possibly with extra legal prefix bytes between the 0xF3
// and the 0x0F1EFA tail.
// e.g.: 0xF3660F1EFA, 0xF3670F1EFA
static bool isEndbrImm64(uint64_t Imm) {
  // The low three bytes must be the 0x0F1EFA tail of ENDBR64.
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
    return false;

  uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                   0x65, 0x66, 0x67, 0xf0, 0xf2};
  // Scan the remaining bytes from low to high; the bottom 24 bits already
  // matched. A 0xF3 completes the pattern; any other byte must be one of the
  // optional instruction prefixes or the immediate cannot decode as ENDBR64.
  for (int Shift = 24; Shift < 64; Shift += 8) {
    uint8_t Byte = (Imm >> Shift) & 0xFF;
    if (Byte == 0xF3)
      return true;
    bool IsOptionalPrefix = false;
    for (uint8_t Prefix : OptionalPrefixBytes) {
      if (Prefix == Byte) {
        IsOptionalPrefix = true;
        break;
      }
    }
    if (!IsOptionalPrefix)
      return false;
  }

  return false;
}
885 | |
886 | static bool needBWI(MVT VT) { |
887 | return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8); |
888 | } |
889 | |
890 | void X86DAGToDAGISel::PreprocessISelDAG() { |
891 | bool MadeChange = false; |
892 | for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), |
893 | E = CurDAG->allnodes_end(); I != E; ) { |
894 | SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. |
895 | |
896 | // This is for CET enhancement. |
897 | // |
898 | // ENDBR32 and ENDBR64 have specific opcodes: |
899 | // ENDBR32: F3 0F 1E FB |
900 | // ENDBR64: F3 0F 1E FA |
901 | // And we want that attackers won’t find unintended ENDBR32/64 |
902 | // opcode matches in the binary |
903 | // Here’s an example: |
904 | // If the compiler had to generate asm for the following code: |
905 | // a = 0xF30F1EFA |
906 | // it could, for example, generate: |
907 | // mov 0xF30F1EFA, dword ptr[a] |
908 | // In such a case, the binary would include a gadget that starts |
909 | // with a fake ENDBR64 opcode. Therefore, we split such generation |
910 | // into multiple operations, let it not shows in the binary |
911 | if (N->getOpcode() == ISD::Constant) { |
912 | MVT VT = N->getSimpleValueType(ResNo: 0); |
913 | int64_t Imm = cast<ConstantSDNode>(Val: N)->getSExtValue(); |
914 | int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; |
915 | if (Imm == EndbrImm || isEndbrImm64(Imm)) { |
916 | // Check that the cf-protection-branch is enabled. |
917 | Metadata *CFProtectionBranch = |
918 | MF->getMMI().getModule()->getModuleFlag(Key: "cf-protection-branch" ); |
919 | if (CFProtectionBranch || IndirectBranchTracking) { |
920 | SDLoc dl(N); |
921 | SDValue Complement = CurDAG->getConstant(Val: ~Imm, DL: dl, VT, isTarget: false, isOpaque: true); |
922 | Complement = CurDAG->getNOT(DL: dl, Val: Complement, VT); |
923 | --I; |
924 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Complement); |
925 | ++I; |
926 | MadeChange = true; |
927 | continue; |
928 | } |
929 | } |
930 | } |
931 | |
932 | // If this is a target specific AND node with no flag usages, turn it back |
933 | // into ISD::AND to enable test instruction matching. |
934 | if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(Value: 1)) { |
935 | SDValue Res = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
936 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
937 | --I; |
938 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
939 | ++I; |
940 | MadeChange = true; |
941 | continue; |
942 | } |
943 | |
944 | // Convert vector increment or decrement to sub/add with an all-ones |
945 | // constant: |
946 | // add X, <1, 1...> --> sub X, <-1, -1...> |
947 | // sub X, <1, 1...> --> add X, <-1, -1...> |
948 | // The all-ones vector constant can be materialized using a pcmpeq |
949 | // instruction that is commonly recognized as an idiom (has no register |
950 | // dependency), so that's better/smaller than loading a splat 1 constant. |
951 | // |
952 | // But don't do this if it would inhibit a potentially profitable load |
953 | // folding opportunity for the other operand. That only occurs with the |
954 | // intersection of: |
955 | // (1) The other operand (op0) is load foldable. |
956 | // (2) The op is an add (otherwise, we are *creating* an add and can still |
957 | // load fold the other op). |
958 | // (3) The target has AVX (otherwise, we have a destructive add and can't |
959 | // load fold the other op without killing the constant op). |
960 | // (4) The constant 1 vector has multiple uses (so it is profitable to load |
961 | // into a register anyway). |
962 | auto mayPreventLoadFold = [&]() { |
963 | return X86::mayFoldLoad(Op: N->getOperand(Num: 0), Subtarget: *Subtarget) && |
964 | N->getOpcode() == ISD::ADD && Subtarget->hasAVX() && |
965 | !N->getOperand(Num: 1).hasOneUse(); |
966 | }; |
967 | if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && |
968 | N->getSimpleValueType(ResNo: 0).isVector() && !mayPreventLoadFold()) { |
969 | APInt SplatVal; |
970 | if (X86::isConstantSplat(Op: N->getOperand(Num: 1), SplatVal) && |
971 | SplatVal.isOne()) { |
972 | SDLoc DL(N); |
973 | |
974 | MVT VT = N->getSimpleValueType(ResNo: 0); |
975 | unsigned NumElts = VT.getSizeInBits() / 32; |
976 | SDValue AllOnes = |
977 | CurDAG->getAllOnesConstant(DL, VT: MVT::getVectorVT(MVT::i32, NumElts)); |
978 | AllOnes = CurDAG->getBitcast(VT, V: AllOnes); |
979 | |
980 | unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; |
981 | SDValue Res = |
982 | CurDAG->getNode(Opcode: NewOpcode, DL, VT, N1: N->getOperand(Num: 0), N2: AllOnes); |
983 | --I; |
984 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
985 | ++I; |
986 | MadeChange = true; |
987 | continue; |
988 | } |
989 | } |
990 | |
991 | switch (N->getOpcode()) { |
992 | case X86ISD::VBROADCAST: { |
993 | MVT VT = N->getSimpleValueType(ResNo: 0); |
994 | // Emulate v32i16/v64i8 broadcast without BWI. |
995 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
996 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
997 | SDLoc dl(N); |
998 | SDValue NarrowBCast = |
999 | CurDAG->getNode(Opcode: X86ISD::VBROADCAST, DL: dl, VT: NarrowVT, Operand: N->getOperand(Num: 0)); |
1000 | SDValue Res = |
1001 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1002 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1003 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1004 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1005 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1006 | |
1007 | --I; |
1008 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1009 | ++I; |
1010 | MadeChange = true; |
1011 | continue; |
1012 | } |
1013 | |
1014 | break; |
1015 | } |
1016 | case X86ISD::VBROADCAST_LOAD: { |
1017 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1018 | // Emulate v32i16/v64i8 broadcast without BWI. |
1019 | if (!Subtarget->hasBWI() && needBWI(VT)) { |
1020 | MVT NarrowVT = VT.getHalfNumVectorElementsVT(); |
1021 | auto *MemNode = cast<MemSDNode>(Val: N); |
1022 | SDLoc dl(N); |
1023 | SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); |
1024 | SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; |
1025 | SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( |
1026 | Opcode: X86ISD::VBROADCAST_LOAD, dl, VTList: VTs, Ops, MemVT: MemNode->getMemoryVT(), |
1027 | MMO: MemNode->getMemOperand()); |
1028 | SDValue Res = |
1029 | CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: CurDAG->getUNDEF(VT), |
1030 | N2: NarrowBCast, N3: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1031 | unsigned Index = NarrowVT.getVectorMinNumElements(); |
1032 | Res = CurDAG->getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: dl, VT, N1: Res, N2: NarrowBCast, |
1033 | N3: CurDAG->getIntPtrConstant(Val: Index, DL: dl)); |
1034 | |
1035 | --I; |
1036 | SDValue To[] = {Res, NarrowBCast.getValue(R: 1)}; |
1037 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1038 | ++I; |
1039 | MadeChange = true; |
1040 | continue; |
1041 | } |
1042 | |
1043 | break; |
1044 | } |
1045 | case ISD::LOAD: { |
1046 | // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM |
1047 | // load, then just extract the lower subvector and avoid the second load. |
1048 | auto *Ld = cast<LoadSDNode>(Val: N); |
1049 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1050 | if (!ISD::isNormalLoad(N: Ld) || !Ld->isSimple() || |
1051 | !(VT.is128BitVector() || VT.is256BitVector())) |
1052 | break; |
1053 | |
1054 | MVT MaxVT = VT; |
1055 | SDNode *MaxLd = nullptr; |
1056 | SDValue Ptr = Ld->getBasePtr(); |
1057 | SDValue Chain = Ld->getChain(); |
1058 | for (SDNode *User : Ptr->uses()) { |
1059 | auto *UserLd = dyn_cast<LoadSDNode>(Val: User); |
1060 | MVT UserVT = User->getSimpleValueType(ResNo: 0); |
1061 | if (User != N && UserLd && ISD::isNormalLoad(N: User) && |
1062 | UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain && |
1063 | !User->hasAnyUseOfValue(Value: 1) && |
1064 | (UserVT.is256BitVector() || UserVT.is512BitVector()) && |
1065 | UserVT.getSizeInBits() > VT.getSizeInBits() && |
1066 | (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) { |
1067 | MaxLd = User; |
1068 | MaxVT = UserVT; |
1069 | } |
1070 | } |
1071 | if (MaxLd) { |
1072 | SDLoc dl(N); |
1073 | unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits(); |
1074 | MVT SubVT = MVT::getVectorVT(VT: MaxVT.getScalarType(), NumElements: NumSubElts); |
1075 | SDValue = CurDAG->getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: SubVT, |
1076 | N1: SDValue(MaxLd, 0), |
1077 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1078 | SDValue Res = CurDAG->getBitcast(VT, V: Extract); |
1079 | |
1080 | --I; |
1081 | SDValue To[] = {Res, SDValue(MaxLd, 1)}; |
1082 | CurDAG->ReplaceAllUsesWith(From: N, To); |
1083 | ++I; |
1084 | MadeChange = true; |
1085 | continue; |
1086 | } |
1087 | break; |
1088 | } |
1089 | case ISD::VSELECT: { |
1090 | // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG. |
1091 | EVT EleVT = N->getOperand(Num: 0).getValueType().getVectorElementType(); |
1092 | if (EleVT == MVT::i1) |
1093 | break; |
1094 | |
1095 | assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!" ); |
1096 | assert(N->getValueType(0).getVectorElementType() != MVT::i16 && |
1097 | "We can't replace VSELECT with BLENDV in vXi16!" ); |
1098 | SDValue R; |
1099 | if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(Op: N->getOperand(Num: 0)) == |
1100 | EleVT.getSizeInBits()) { |
1101 | R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(ResNo: 0), |
1102 | N->getOperand(Num: 0), N->getOperand(Num: 1), N->getOperand(Num: 2), |
1103 | CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8)); |
1104 | } else { |
1105 | R = CurDAG->getNode(Opcode: X86ISD::BLENDV, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1106 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), |
1107 | N3: N->getOperand(Num: 2)); |
1108 | } |
1109 | --I; |
1110 | CurDAG->ReplaceAllUsesWith(From: N, To: R.getNode()); |
1111 | ++I; |
1112 | MadeChange = true; |
1113 | continue; |
1114 | } |
1115 | case ISD::FP_ROUND: |
1116 | case ISD::STRICT_FP_ROUND: |
1117 | case ISD::FP_TO_SINT: |
1118 | case ISD::FP_TO_UINT: |
1119 | case ISD::STRICT_FP_TO_SINT: |
1120 | case ISD::STRICT_FP_TO_UINT: { |
1121 | // Replace vector fp_to_s/uint with their X86 specific equivalent so we |
1122 | // don't need 2 sets of patterns. |
1123 | if (!N->getSimpleValueType(ResNo: 0).isVector()) |
1124 | break; |
1125 | |
1126 | unsigned NewOpc; |
1127 | switch (N->getOpcode()) { |
1128 | default: llvm_unreachable("Unexpected opcode!" ); |
1129 | case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; |
1130 | case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; |
1131 | case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; |
1132 | case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; |
1133 | case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; |
1134 | case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; |
1135 | } |
1136 | SDValue Res; |
1137 | if (N->isStrictFPOpcode()) |
1138 | Res = |
1139 | CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, |
1140 | {N->getOperand(0), N->getOperand(1)}); |
1141 | else |
1142 | Res = |
1143 | CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1144 | Operand: N->getOperand(Num: 0)); |
1145 | --I; |
1146 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1147 | ++I; |
1148 | MadeChange = true; |
1149 | continue; |
1150 | } |
1151 | case ISD::SHL: |
1152 | case ISD::SRA: |
1153 | case ISD::SRL: { |
1154 | // Replace vector shifts with their X86 specific equivalent so we don't |
1155 | // need 2 sets of patterns. |
1156 | if (!N->getValueType(ResNo: 0).isVector()) |
1157 | break; |
1158 | |
1159 | unsigned NewOpc; |
1160 | switch (N->getOpcode()) { |
1161 | default: llvm_unreachable("Unexpected opcode!" ); |
1162 | case ISD::SHL: NewOpc = X86ISD::VSHLV; break; |
1163 | case ISD::SRA: NewOpc = X86ISD::VSRAV; break; |
1164 | case ISD::SRL: NewOpc = X86ISD::VSRLV; break; |
1165 | } |
1166 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1167 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1)); |
1168 | --I; |
1169 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1170 | ++I; |
1171 | MadeChange = true; |
1172 | continue; |
1173 | } |
1174 | case ISD::ANY_EXTEND: |
1175 | case ISD::ANY_EXTEND_VECTOR_INREG: { |
1176 | // Replace vector any extend with the zero extend equivalents so we don't |
1177 | // need 2 sets of patterns. Ignore vXi1 extensions. |
1178 | if (!N->getValueType(ResNo: 0).isVector()) |
1179 | break; |
1180 | |
1181 | unsigned NewOpc; |
1182 | if (N->getOperand(Num: 0).getScalarValueSizeInBits() == 1) { |
1183 | assert(N->getOpcode() == ISD::ANY_EXTEND && |
1184 | "Unexpected opcode for mask vector!" ); |
1185 | NewOpc = ISD::SIGN_EXTEND; |
1186 | } else { |
1187 | NewOpc = N->getOpcode() == ISD::ANY_EXTEND |
1188 | ? ISD::ZERO_EXTEND |
1189 | : ISD::ZERO_EXTEND_VECTOR_INREG; |
1190 | } |
1191 | |
1192 | SDValue Res = CurDAG->getNode(Opcode: NewOpc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
1193 | Operand: N->getOperand(Num: 0)); |
1194 | --I; |
1195 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1196 | ++I; |
1197 | MadeChange = true; |
1198 | continue; |
1199 | } |
1200 | case ISD::FCEIL: |
1201 | case ISD::STRICT_FCEIL: |
1202 | case ISD::FFLOOR: |
1203 | case ISD::STRICT_FFLOOR: |
1204 | case ISD::FTRUNC: |
1205 | case ISD::STRICT_FTRUNC: |
1206 | case ISD::FROUNDEVEN: |
1207 | case ISD::STRICT_FROUNDEVEN: |
1208 | case ISD::FNEARBYINT: |
1209 | case ISD::STRICT_FNEARBYINT: |
1210 | case ISD::FRINT: |
1211 | case ISD::STRICT_FRINT: { |
1212 | // Replace fp rounding with their X86 specific equivalent so we don't |
1213 | // need 2 sets of patterns. |
1214 | unsigned Imm; |
1215 | switch (N->getOpcode()) { |
1216 | default: llvm_unreachable("Unexpected opcode!" ); |
1217 | case ISD::STRICT_FCEIL: |
1218 | case ISD::FCEIL: Imm = 0xA; break; |
1219 | case ISD::STRICT_FFLOOR: |
1220 | case ISD::FFLOOR: Imm = 0x9; break; |
1221 | case ISD::STRICT_FTRUNC: |
1222 | case ISD::FTRUNC: Imm = 0xB; break; |
1223 | case ISD::STRICT_FROUNDEVEN: |
1224 | case ISD::FROUNDEVEN: Imm = 0x8; break; |
1225 | case ISD::STRICT_FNEARBYINT: |
1226 | case ISD::FNEARBYINT: Imm = 0xC; break; |
1227 | case ISD::STRICT_FRINT: |
1228 | case ISD::FRINT: Imm = 0x4; break; |
1229 | } |
1230 | SDLoc dl(N); |
1231 | bool IsStrict = N->isStrictFPOpcode(); |
1232 | SDValue Res; |
1233 | if (IsStrict) |
1234 | Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, |
1235 | {N->getValueType(0), MVT::Other}, |
1236 | {N->getOperand(0), N->getOperand(1), |
1237 | CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); |
1238 | else |
1239 | Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(ResNo: 0), |
1240 | N->getOperand(Num: 0), |
1241 | CurDAG->getTargetConstant(Imm, dl, MVT::i32)); |
1242 | --I; |
1243 | CurDAG->ReplaceAllUsesWith(From: N, To: Res.getNode()); |
1244 | ++I; |
1245 | MadeChange = true; |
1246 | continue; |
1247 | } |
1248 | case X86ISD::FANDN: |
1249 | case X86ISD::FAND: |
1250 | case X86ISD::FOR: |
1251 | case X86ISD::FXOR: { |
1252 | // Widen scalar fp logic ops to vector to reduce isel patterns. |
1253 | // FIXME: Can we do this during lowering/combine. |
1254 | MVT VT = N->getSimpleValueType(ResNo: 0); |
1255 | if (VT.isVector() || VT == MVT::f128) |
1256 | break; |
1257 | |
1258 | MVT VecVT = VT == MVT::f64 ? MVT::v2f64 |
1259 | : VT == MVT::f32 ? MVT::v4f32 |
1260 | : MVT::v8f16; |
1261 | |
1262 | SDLoc dl(N); |
1263 | SDValue Op0 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1264 | Operand: N->getOperand(Num: 0)); |
1265 | SDValue Op1 = CurDAG->getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: VecVT, |
1266 | Operand: N->getOperand(Num: 1)); |
1267 | |
1268 | SDValue Res; |
1269 | if (Subtarget->hasSSE2()) { |
1270 | EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); |
1271 | Op0 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op0); |
1272 | Op1 = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: IntVT, Operand: Op1); |
1273 | unsigned Opc; |
1274 | switch (N->getOpcode()) { |
1275 | default: llvm_unreachable("Unexpected opcode!" ); |
1276 | case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; |
1277 | case X86ISD::FAND: Opc = ISD::AND; break; |
1278 | case X86ISD::FOR: Opc = ISD::OR; break; |
1279 | case X86ISD::FXOR: Opc = ISD::XOR; break; |
1280 | } |
1281 | Res = CurDAG->getNode(Opcode: Opc, DL: dl, VT: IntVT, N1: Op0, N2: Op1); |
1282 | Res = CurDAG->getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: Res); |
1283 | } else { |
1284 | Res = CurDAG->getNode(Opcode: N->getOpcode(), DL: dl, VT: VecVT, N1: Op0, N2: Op1); |
1285 | } |
1286 | Res = CurDAG->getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT, N1: Res, |
1287 | N2: CurDAG->getIntPtrConstant(Val: 0, DL: dl)); |
1288 | --I; |
1289 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Res); |
1290 | ++I; |
1291 | MadeChange = true; |
1292 | continue; |
1293 | } |
1294 | } |
1295 | |
1296 | if (OptLevel != CodeGenOptLevel::None && |
1297 | // Only do this when the target can fold the load into the call or |
1298 | // jmp. |
1299 | !Subtarget->useIndirectThunkCalls() && |
1300 | ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || |
1301 | (N->getOpcode() == X86ISD::TC_RETURN && |
1302 | (Subtarget->is64Bit() || |
1303 | !getTargetMachine().isPositionIndependent())))) { |
1304 | /// Also try moving call address load from outside callseq_start to just |
1305 | /// before the call to allow it to be folded. |
1306 | /// |
1307 | /// [Load chain] |
1308 | /// ^ |
1309 | /// | |
1310 | /// [Load] |
1311 | /// ^ ^ |
1312 | /// | | |
1313 | /// / \-- |
1314 | /// / | |
1315 | ///[CALLSEQ_START] | |
1316 | /// ^ | |
1317 | /// | | |
1318 | /// [LOAD/C2Reg] | |
1319 | /// | | |
1320 | /// \ / |
1321 | /// \ / |
1322 | /// [CALL] |
1323 | bool HasCallSeq = N->getOpcode() == X86ISD::CALL; |
1324 | SDValue Chain = N->getOperand(Num: 0); |
1325 | SDValue Load = N->getOperand(Num: 1); |
1326 | if (!isCalleeLoad(Callee: Load, Chain, HasCallSeq)) |
1327 | continue; |
1328 | moveBelowOrigChain(CurDAG, Load, Call: SDValue(N, 0), OrigChain: Chain); |
1329 | ++NumLoadMoved; |
1330 | MadeChange = true; |
1331 | continue; |
1332 | } |
1333 | |
1334 | // Lower fpround and fpextend nodes that target the FP stack to be store and |
1335 | // load to the stack. This is a gross hack. We would like to simply mark |
1336 | // these as being illegal, but when we do that, legalize produces these when |
1337 | // it expands calls, then expands these in the same legalize pass. We would |
1338 | // like dag combine to be able to hack on these between the call expansion |
1339 | // and the node legalization. As such this pass basically does "really |
1340 | // late" legalization of these inline with the X86 isel pass. |
1341 | // FIXME: This should only happen when not compiled with -O0. |
1342 | switch (N->getOpcode()) { |
1343 | default: continue; |
1344 | case ISD::FP_ROUND: |
1345 | case ISD::FP_EXTEND: |
1346 | { |
1347 | MVT SrcVT = N->getOperand(Num: 0).getSimpleValueType(); |
1348 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1349 | |
1350 | // If any of the sources are vectors, no fp stack involved. |
1351 | if (SrcVT.isVector() || DstVT.isVector()) |
1352 | continue; |
1353 | |
1354 | // If the source and destination are SSE registers, then this is a legal |
1355 | // conversion that should not be lowered. |
1356 | const X86TargetLowering *X86Lowering = |
1357 | static_cast<const X86TargetLowering *>(TLI); |
1358 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1359 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1360 | if (SrcIsSSE && DstIsSSE) |
1361 | continue; |
1362 | |
1363 | if (!SrcIsSSE && !DstIsSSE) { |
1364 | // If this is an FPStack extension, it is a noop. |
1365 | if (N->getOpcode() == ISD::FP_EXTEND) |
1366 | continue; |
1367 | // If this is a value-preserving FPStack truncation, it is a noop. |
1368 | if (N->getConstantOperandVal(Num: 1)) |
1369 | continue; |
1370 | } |
1371 | |
1372 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1373 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1374 | // operations. Based on this, decide what we want to do. |
1375 | MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; |
1376 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1377 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1378 | MachinePointerInfo MPI = |
1379 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1380 | SDLoc dl(N); |
1381 | |
1382 | // FIXME: optimize the case where the src/dest is a load or store? |
1383 | |
1384 | SDValue Store = CurDAG->getTruncStore( |
1385 | Chain: CurDAG->getEntryNode(), dl, Val: N->getOperand(Num: 0), Ptr: MemTmp, PtrInfo: MPI, SVT: MemVT); |
1386 | SDValue Result = CurDAG->getExtLoad(ExtType: ISD::EXTLOAD, dl, VT: DstVT, Chain: Store, |
1387 | Ptr: MemTmp, PtrInfo: MPI, MemVT); |
1388 | |
1389 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
1390 | // extload we created. This will cause general havok on the dag because |
1391 | // anything below the conversion could be folded into other existing nodes. |
1392 | // To avoid invalidating 'I', back it up to the convert node. |
1393 | --I; |
1394 | CurDAG->ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Result); |
1395 | break; |
1396 | } |
1397 | |
1398 | //The sequence of events for lowering STRICT_FP versions of these nodes requires |
1399 | //dealing with the chain differently, as there is already a preexisting chain. |
1400 | case ISD::STRICT_FP_ROUND: |
1401 | case ISD::STRICT_FP_EXTEND: |
1402 | { |
1403 | MVT SrcVT = N->getOperand(Num: 1).getSimpleValueType(); |
1404 | MVT DstVT = N->getSimpleValueType(ResNo: 0); |
1405 | |
1406 | // If any of the sources are vectors, no fp stack involved. |
1407 | if (SrcVT.isVector() || DstVT.isVector()) |
1408 | continue; |
1409 | |
1410 | // If the source and destination are SSE registers, then this is a legal |
1411 | // conversion that should not be lowered. |
1412 | const X86TargetLowering *X86Lowering = |
1413 | static_cast<const X86TargetLowering *>(TLI); |
1414 | bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: SrcVT); |
1415 | bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(VT: DstVT); |
1416 | if (SrcIsSSE && DstIsSSE) |
1417 | continue; |
1418 | |
1419 | if (!SrcIsSSE && !DstIsSSE) { |
1420 | // If this is an FPStack extension, it is a noop. |
1421 | if (N->getOpcode() == ISD::STRICT_FP_EXTEND) |
1422 | continue; |
1423 | // If this is a value-preserving FPStack truncation, it is a noop. |
1424 | if (N->getConstantOperandVal(Num: 2)) |
1425 | continue; |
1426 | } |
1427 | |
1428 | // Here we could have an FP stack truncation or an FPStack <-> SSE convert. |
1429 | // FPStack has extload and truncstore. SSE can fold direct loads into other |
1430 | // operations. Based on this, decide what we want to do. |
1431 | MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; |
1432 | SDValue MemTmp = CurDAG->CreateStackTemporary(VT: MemVT); |
1433 | int SPFI = cast<FrameIndexSDNode>(Val&: MemTmp)->getIndex(); |
1434 | MachinePointerInfo MPI = |
1435 | MachinePointerInfo::getFixedStack(MF&: CurDAG->getMachineFunction(), FI: SPFI); |
1436 | SDLoc dl(N); |
1437 | |
1438 | // FIXME: optimize the case where the src/dest is a load or store? |
1439 | |
1440 | //Since the operation is StrictFP, use the preexisting chain. |
1441 | SDValue Store, Result; |
1442 | if (!SrcIsSSE) { |
1443 | SDVTList VTs = CurDAG->getVTList(MVT::Other); |
1444 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), MemTmp}; |
1445 | Store = CurDAG->getMemIntrinsicNode(Opcode: X86ISD::FST, dl, VTList: VTs, Ops, MemVT, |
1446 | PtrInfo: MPI, /*Align*/ Alignment: std::nullopt, |
1447 | Flags: MachineMemOperand::MOStore); |
1448 | if (N->getFlags().hasNoFPExcept()) { |
1449 | SDNodeFlags Flags = Store->getFlags(); |
1450 | Flags.setNoFPExcept(true); |
1451 | Store->setFlags(Flags); |
1452 | } |
1453 | } else { |
1454 | assert(SrcVT == MemVT && "Unexpected VT!" ); |
1455 | Store = CurDAG->getStore(Chain: N->getOperand(Num: 0), dl, Val: N->getOperand(Num: 1), Ptr: MemTmp, |
1456 | PtrInfo: MPI); |
1457 | } |
1458 | |
1459 | if (!DstIsSSE) { |
1460 | SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); |
1461 | SDValue Ops[] = {Store, MemTmp}; |
1462 | Result = CurDAG->getMemIntrinsicNode( |
1463 | Opcode: X86ISD::FLD, dl, VTList: VTs, Ops, MemVT, PtrInfo: MPI, |
1464 | /*Align*/ Alignment: std::nullopt, Flags: MachineMemOperand::MOLoad); |
1465 | if (N->getFlags().hasNoFPExcept()) { |
1466 | SDNodeFlags Flags = Result->getFlags(); |
1467 | Flags.setNoFPExcept(true); |
1468 | Result->setFlags(Flags); |
1469 | } |
1470 | } else { |
1471 | assert(DstVT == MemVT && "Unexpected VT!" ); |
1472 | Result = CurDAG->getLoad(VT: DstVT, dl, Chain: Store, Ptr: MemTmp, PtrInfo: MPI); |
1473 | } |
1474 | |
1475 | // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the |
1476 | // extload we created. This will cause general havok on the dag because |
1477 | // anything below the conversion could be folded into other existing nodes. |
1478 | // To avoid invalidating 'I', back it up to the convert node. |
1479 | --I; |
1480 | CurDAG->ReplaceAllUsesWith(From: N, To: Result.getNode()); |
1481 | break; |
1482 | } |
1483 | } |
1484 | |
1485 | |
1486 | // Now that we did that, the node is dead. Increment the iterator to the |
1487 | // next node to process, then delete N. |
1488 | ++I; |
1489 | MadeChange = true; |
1490 | } |
1491 | |
1492 | // Remove any dead nodes that may have been left behind. |
1493 | if (MadeChange) |
1494 | CurDAG->RemoveDeadNodes(); |
1495 | } |
1496 | |
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
// The pattern is:
//   MOVZX32rr8/MOVSX32rr8/MOVSX64rr8 (EXTRACT_SUBREG (MOV*X32rr8_NOREX ...))
// i.e. an extend of the low 8 bits of a value that was itself produced by a
// matching 8->32 extend, so the outer extend is redundant. Returns true if a
// replacement was made.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(Num: 0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(i: 1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  // (MOVSX64rr8 extends with the same signedness as MOVSX32rr8, so it also
  // matches the _NOREX sign extend.)
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(i: 0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(F: N, T: Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(F: N, T: N00.getNode());
  }

  return true;
}
1532 | |
// Run post-selection peephole optimizations over the DAG of MachineSDNodes.
// These patterns are easier to spot after selection than before:
//   * redundant 8-bit-subreg extend chains (tryOptimizeRem8Extend),
//   * TESTrr whose operands are both the same AND -> drop the AND,
//   * KORTEST of a KAND with itself -> KTEST (zero flag only),
//   * vector moves inserted only to zero upper bits after a VEX/EVEX/XOP
//     producer, which already zeroes those bits implicitly.
void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  // Walk the node list backwards; replacement nodes are created before the
  // current Position, so they get revisited naturally by this loop.
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
    unsigned Opc = N->getMachineOpcode();
    // The AND result must be used exactly twice (the two identical TEST
    // operands) so that it becomes dead once the TEST is rewritten.
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(Num: 0) == N->getOperand(Num: 1) &&
        N->getOperand(Num: 0)->hasNUsesOfValue(NUses: 2, Value: N->getOperand(Num: 0).getResNo()) &&
        N->getOperand(Num: 0).isMachineOpcode()) {
      SDValue And = N->getOperand(Num: 0);
      unsigned N0Opc = And.getMachineOpcode();
      // The AND's own EFLAGS result (value 1) must be unused: the rewritten
      // TEST will produce the flags instead.
      if ((N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
           N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) &&
          !And->hasAnyUseOfValue(1)) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                     MVT::i32,
                                                     And.getOperand(0),
                                                     And.getOperand(1));
        ReplaceUses(F: N, T: Test);
        MadeChange = true;
        continue;
      }
      // Memory-operand form: fold the AND's load into a TESTmr instead.
      if ((N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
           N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) &&
          !And->hasAnyUseOfValue(1)) {
        unsigned NewOpc;
        switch (N0Opc) {
        case X86::AND8rm: NewOpc = X86::TEST8mr; break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
        }

        // Need to swap the memory and register operand.
        // AND?rm operand layout: 0 = register, 1-5 = memory (base, scale,
        // index, disp, segment), 6 = chain. TEST?mr wants memory first.
        SDValue Ops[] = { And.getOperand(i: 1),
                          And.getOperand(i: 2),
                          And.getOperand(i: 3),
                          And.getOperand(i: 4),
                          And.getOperand(i: 5),
                          And.getOperand(i: 0),
                          And.getOperand(i: 6)  /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        // Carry over the original load's memory operands so alias analysis
        // and scheduling still see the access.
        CurDAG->setNodeMemRefs(
            N: Test, NewMemRefs: cast<MachineSDNode>(Val: And.getNode())->memoperands());
        // Value 2 of AND?rm is the load's chain; reroute it to the new
        // TEST?mr chain (value 1).
        ReplaceUses(F: And.getValue(R: 2), T: SDValue(Test, 1));
        ReplaceUses(F: SDValue(N, 0), T: SDValue(Test, 0));
        MadeChange = true;
        continue;
      }
    }

    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(Num: 0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        unsigned NewOpc;
        switch (Opc) {
        default: llvm_unreachable("Unexpected opcode!" );
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
        }
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                      MVT::i32,
                                                      And.getOperand(0),
                                                      And.getOperand(1));
        ReplaceUses(F: N, T: KTest);
        MadeChange = true;
        continue;
      }
    }

    // Attempt to remove vectors moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(Num: 2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(Num: 1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure its one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    // The move's input must itself be a selected (target) instruction, not a
    // generic node, so we can inspect its encoding below.
    SDValue In = Move.getOperand(i: 0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // Producing instruction is another vector instruction. We can drop the
    // move.
    CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: In, Op3: N->getOperand(Num: 2));
    MadeChange = true;
  }

  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
1693 | |
1694 | |
1695 | /// Emit any code that needs to be executed only in the main function. |
1696 | void X86DAGToDAGISel::emitSpecialCodeForMain() { |
1697 | if (Subtarget->isTargetCygMing()) { |
1698 | TargetLowering::ArgListTy Args; |
1699 | auto &DL = CurDAG->getDataLayout(); |
1700 | |
1701 | TargetLowering::CallLoweringInfo CLI(*CurDAG); |
1702 | CLI.setChain(CurDAG->getRoot()) |
1703 | .setCallee(CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *CurDAG->getContext()), |
1704 | Target: CurDAG->getExternalSymbol(Sym: "__main" , VT: TLI->getPointerTy(DL)), |
1705 | ArgsList: std::move(Args)); |
1706 | const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); |
1707 | std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI); |
1708 | CurDAG->setRoot(Result.second); |
1709 | } |
1710 | } |
1711 | |
1712 | void X86DAGToDAGISel::emitFunctionEntryCode() { |
1713 | // If this is main, emit special code for main. |
1714 | const Function &F = MF->getFunction(); |
1715 | if (F.hasExternalLinkage() && F.getName() == "main" ) |
1716 | emitSpecialCodeForMain(); |
1717 | } |
1718 | |
/// Return true if \p Val is a displacement that may safely be combined with a
/// frame index without risking overflow of the 32-bit displacement field.
static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  //
  // Equivalent to isInt<31>(Val): accept the signed 31-bit range
  // [-2^30, 2^30).
  constexpr int64_t Lo = -(INT64_C(1) << 30);
  constexpr int64_t Hi = (INT64_C(1) << 30) - 1;
  return Val >= Lo && Val <= Hi;
}
1728 | |
/// Try to fold the constant \p Offset into the displacement of \p AM.
/// Follows the address-matching convention: returns true on failure (the
/// combined displacement is not representable for the current code model or
/// addressing-mode state), false on success. AM.Disp is only written on
/// success.
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    // A nonzero displacement must be reachable by a sign-extended 32-bit
    // immediate under the active code model.
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Offset: Val, M,
                                           hasSymbolicDisplacement: AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only address the low 2GB of address space
    // is directly addressable, we need indirect addressing for the high 2GB of
    // address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() && !isUInt<31>(x: Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  }
  // Commit the combined displacement.
  AM.Disp = Val;
  return false;
}
1775 | |
/// Try to fold a load of address 0 in the FS or GS address space into the
/// segment register of \p AM (the TLS "self-pointer" idiom). Returns false on
/// success, true if the fold cannot be done.
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(Num: 1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added it to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(V: Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetFuchsia())) {
    // In x32 mode the fold is only safe when explicitly requested (see the
    // negative-value caveat above).
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      return false;
    // Address space X86AS::SS is not handled here, because it is not used to
    // address TLS areas.
    }
  }

  // Not a recognized segment-relative TLS load.
  return true;
}
1809 | |
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(i: 0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we use can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  // Record the symbol (and its flags/offset) in AM based on the wrapped
  // operand's node kind.
  int64_t Offset = 0;
  SDValue N0 = N.getOperand(i: 0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(Val&: N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(Val&: N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(Val&: N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(Val&: N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node." );

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(GV: AM.GV)) {
    AM = Backup;
    return true;
  }

  // Fold the symbol's constant offset into the displacement; roll back the
  // whole match if the combined displacement is not representable.
  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
1887 | |
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, Depth: 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Val&: Save_Base_Reg)) {
      // Clear the base first so matchLoadInAddress sees a free slot; restore
      // it if the segment fold fails.
      AM.Base_Reg = SDValue();
      if (matchLoadInAddress(N: LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(GV: AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
  }

  return false;
}
1930 | |
/// Try to fold both operands of an ADD node \p N into \p AM. Returns false on
/// success. \p N may be rewritten (via the HandleSDNode) if the node gets
/// CSE'd while matching its operands.
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  // First attempt: fold operand 0 then operand 1 into the address.
  if (!matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM, Depth: Depth+1))
    return false;
  AM = Backup;

  // Try again after commutating the operands.
  if (!matchAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                               Depth: Depth + 1) &&
      !matchAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM, Depth: Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(i: 0);
    AM.IndexReg = N.getOperand(i: 1);
    AM.Scale = 1;
    return false;
  }
  // Refresh N from the handle in case it was CSE'd, then report failure.
  N = Handle.getValue();
  return true;
}
1965 | |
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  // Only move N if it is brand new (id == -1) or currently ordered after Pos.
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N: N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(N: Pos.getNode()))) {
    DAG.RepositionNode(Position: Pos->getIterator(), N: N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N: N.getNode());
  }
}
1984 | |
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  // Only handle a single-use logical right shift by a constant.
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse())
    return true;

  // ScaleLog is C1 above; it must be 1..3 so the final SHL can become an
  // addressing-mode scale of 2, 4, or 8, and the mask must be exactly
  // 0xff << C1.
  int ScaleLog = 8 - Shift.getConstantOperandVal(i: 1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  // Build ((X >> 8) & 0xff) << ScaleLog; the (srl 8, and 0xff) part is an
  // h-register extract and the shl folds into the scale.
  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(Val: 0xff, DL, VT: XVT);
  SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: Eight);
  SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: Srl, N2: NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(Op: And, DL, VT);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: Ext, N2: ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: Eight);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: Srl);
  insertDAGNode(DAG, Pos: N, N: And);
  insertDAGNode(DAG, Pos: N, N: Ext);
  insertDAGNode(DAG, Pos: N, N: ShlCount);
  insertDAGNode(DAG, Pos: N, N: Shl);
  DAG.ReplaceAllUsesWith(From: N, To: Shl);
  DAG.RemoveDeadNode(N: N.getNode());
  // The extracted byte becomes the index, scaled by 1 << ScaleLog.
  AM.IndexReg = Ext;
  AM.Scale = (1 << ScaleLog);
  return false;
}
2032 | |
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(i: 0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(Val: N->getOperand(Num: 1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  SDValue X = Shift.getOperand(i: 0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  // Only 1, 2, or 3 map to the addressing-mode scales 2, 4, 8.
  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    // Re-insert the any_extend below the new AND since we peeked through it.
    SDValue NewX = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getConstant(Val: Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: X, N2: NewMask);
  SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewAnd, N2: Shift.getOperand(i: 1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewShift);
  DAG.ReplaceAllUsesWith(From: N, To: NewShift);
  DAG.RemoveDeadNode(N: N.getNode());

  // The shl is absorbed into the scale; the masked value becomes the index.
  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
2100 | |
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $11, %ecx
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   movl %eax, %ecx
//   shrl $9, %ecx
//   andl $124, %rcx
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
//
// Returns false if the transform was performed (address-matching convention).
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(i: 0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(i: 0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
      APInt::getHighBitsSet(numBits: X.getSimpleValueType().getSizeInBits(), hiBitsSet: MaskLZ);
  if (!DAG.MaskedValueIsZero(Op: X, Mask: MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(X), VT, Operand: X);
    insertDAGNode(DAG, Pos: N, N: NewX);
    X = NewX;
  }

  // Build (x >> (SHIFT + AMShiftAmt)) << AMShiftAmt; the final shl is then
  // absorbed as the addressing-mode scale below.
  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2216 | |
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  // Only handle a single-use constant SRL feeding a single-use AND.
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Val: Shift.getOperand(i: 1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Value: Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(i: 1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Build ((X >> (SHIFT + C1)) & MASK) << C1; the leading srl+and is BEXTR
  // material, and the final shl becomes the addressing-mode scale.
  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(Opcode: ISD::SRL, DL, VT: XVT, N1: X, N2: NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Val: Mask >> AMShiftAmt, DL, VT: XVT);
  SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: XVT, N1: NewSRL, N2: NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(Op: NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: NewExt, N2: NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, Pos: N, N: NewSRLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSRL);
  insertDAGNode(DAG, Pos: N, N: NewMask);
  insertDAGNode(DAG, Pos: N, N: NewAnd);
  insertDAGNode(DAG, Pos: N, N: NewExt);
  insertDAGNode(DAG, Pos: N, N: NewSHLAmt);
  insertDAGNode(DAG, Pos: N, N: NewSHL);
  DAG.ReplaceAllUsesWith(From: N, To: NewSHL);
  DAG.RemoveDeadNode(N: N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
2280 | |
// Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
//
// On a successful peek, AM.Scale / AM.Disp are updated and the returned value
// is the (possibly rewritten) node to use as the index register.
SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
                                               X86ISelAddressMode &AM,
                                               unsigned Depth) {
  assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
  assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
         "Illegal index scale");

  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return N;

  EVT VT = N.getValueType();
  unsigned Opc = N.getOpcode();

  // index: add(x,c) -> index: x, disp + c
  if (CurDAG->isBaseWithConstantOffset(Op: N)) {
    auto *AddVal = cast<ConstantSDNode>(Val: N.getOperand(i: 1));
    // The displacement must be pre-multiplied by the scale, since the index
    // register is scaled by the hardware.
    uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
    if (!foldOffsetIntoAddress(Offset, AM))
      return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
  }

  // index: add(x,x) -> index: x, scale * 2
  if (Opc == ISD::ADD && N.getOperand(i: 0) == N.getOperand(i: 1)) {
    // Scale caps at 8, so only fold if the current scale is at most 4.
    if (AM.Scale <= 4) {
      AM.Scale *= 2;
      return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
    }
  }

  // index: shl(x,i) -> index: x, scale * (1 << i)
  if (Opc == X86ISD::VSHLI) {
    uint64_t ShiftAmt = N.getConstantOperandVal(i: 1);
    uint64_t ScaleAmt = 1ULL << ShiftAmt;
    // Combined scale must still be encodable (1/2/4/8).
    if ((AM.Scale * ScaleAmt) <= 8) {
      AM.Scale *= ScaleAmt;
      return matchIndexRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1);
    }
  }

  // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(i: 0);
    // 'nsw' is required so that sign-extending after dropping the add is
    // equivalent to sign-extending the whole sum.
    if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
        SDValue AddSrc = Src.getOperand(i: 0);
        auto *AddVal = cast<ConstantSDNode>(Val: Src.getOperand(i: 1));
        uint64_t Offset = (uint64_t)AddVal->getSExtValue();
        if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          // Rebuild as sext(x) + c in the wide type, and RAUW the original
          // sext so other users see an equivalent value. Insertions must keep
          // a valid topological order (each node before its first use).
          SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
          SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(Opcode: ISD::ADD, DL, VT, N1: ExtSrc, N2: ExtVal);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
          CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
          CurDAG->RemoveDeadNode(N: N.getNode());
          // The new index is the extended source; the constant went into Disp.
          return ExtSrc;
        }
      }
    }
  }

  // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
  // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(i: 0);
    unsigned SrcOpc = Src.getOpcode();
    // 'nuw' add, or an OR/XOR that behaves like an add (disjoint bits), keeps
    // zero-extension distributive over the addition.
    if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
         CurDAG->isADDLike(Op: Src)) &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Op: Src)) {
        SDValue AddSrc = Src.getOperand(i: 0);
        uint64_t Offset = Src.getConstantOperandVal(i: 1);
        if (!foldOffsetIntoAddress(Offset: Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          SDValue Res;
          // If we're also scaling, see if we can use that as well.
          if (AddSrc.getOpcode() == ISD::SHL &&
              isa<ConstantSDNode>(Val: AddSrc.getOperand(i: 1))) {
            SDValue ShVal = AddSrc.getOperand(i: 0);
            uint64_t ShAmt = AddSrc.getConstantOperandVal(i: 1);
            APInt HiBits =
                APInt::getHighBitsSet(numBits: AddSrc.getScalarValueSizeInBits(), hiBitsSet: ShAmt);
            uint64_t ScaleAmt = 1ULL << ShAmt;
            // The shift may only be hoisted past the zext if it cannot shift
            // out set bits ('nuw' or known-zero high bits).
            if ((AM.Scale * ScaleAmt) <= 8 &&
                (AddSrc->getFlags().hasNoUnsignedWrap() ||
                 CurDAG->MaskedValueIsZero(Op: ShVal, Mask: HiBits))) {
              AM.Scale *= ScaleAmt;
              SDValue ExtShVal = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: ShVal);
              SDValue ExtShift = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: ExtShVal,
                                                 N2: AddSrc.getOperand(i: 1));
              insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShVal);
              insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtShift);
              AddSrc = ExtShift;
              Res = ExtShVal;
            }
          }
          // Rebuild in the wide type, preserving the original add-like opcode.
          SDValue ExtSrc = CurDAG->getNode(Opcode: Opc, DL, VT, Operand: AddSrc);
          SDValue ExtVal = CurDAG->getConstant(Val: Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(Opcode: SrcOpc, DL, VT, N1: ExtSrc, N2: ExtVal);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtSrc);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtVal);
          insertDAGNode(DAG&: *CurDAG, Pos: N, N: ExtAdd);
          CurDAG->ReplaceAllUsesWith(From: N, To: ExtAdd);
          CurDAG->RemoveDeadNode(N: N.getNode());
          // If the shift was folded into the scale, the index is the unshifted
          // value; otherwise it is the extended add source.
          return Res ? Res : ExtSrc;
        }
      }
    }
  }

  // TODO: Handle extensions, shifted masks etc.
  return N;
}
2402 | |
/// Recursively fold as much of the expression rooted at \p N as possible into
/// the addressing mode \p AM. Follows the match* convention: returns false on
/// success (AM updated) and true if nothing could be folded, in which case AM
/// is left unchanged (cases that speculate take a Backup copy and restore it).
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  SDLoc dl(N);
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it. Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements. It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (auto *Cst = dyn_cast<ConstantSDNode>(Val&: N))
      if (!foldOffsetIntoAddress(Offset: Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    // A recovered frame-allocation label can serve as the symbolic
    // displacement, but only if no symbol/displacement is folded yet.
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(Val: N.getOperand(i: 0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    // Global addresses / jump tables / constant pools etc.
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    // Segment-relative loads of certain TLS pointers can fold entirely.
    if (!matchLoadInAddress(N: cast<LoadSDNode>(Val&: N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can be the base, provided the base slot is free and the
    // accumulated displacement stays valid for a frame-index address.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(Val: AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(Val&: N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(i: 0);
        AM.Scale = 1 << Val;
        // Peek through the shifted value for further index folds.
        AM.IndexReg = matchIndexRecursively(N: ShVal, AM, Depth: Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(i: 0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(i: 0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)) ||
        !isa<ConstantSDNode>(Val: And.getOperand(i: 1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(i: 1) >> N.getConstantOperandVal(i: 1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift: N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(i: 0);
          SDValue Reg;

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(Val: MulVal.getOperand(i: 1))) {
            Reg = MulVal.getOperand(i: 0);
            auto *AddVal = cast<ConstantSDNode>(Val: MulVal.getOperand(i: 1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            // If the displacement doesn't fit, fall back to using the whole
            // multiply operand.
            if (foldOffsetIntoAddress(Offset: Disp, AM))
              Reg = N.getOperand(i: 0);
          } else {
            Reg = N.getOperand(i: 0);
          }

          // Same register goes in both base and index: Reg + Reg*(Scale).
          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(i: 1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(Op: N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
      break;

    if (N.getOperand(i: 0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(i: 0);
      SDValue X = Shift.getOperand(i: 0);

      uint64_t Mask = N.getConstantOperandVal(i: 1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask, Shift, X, AM, Subtarget: *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(DAG&: *CurDAG, N, AM))
      return false;

    break;
  }
  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(i: 0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth: Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(numBits: Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Val: Src.getOperand(i: 1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(i: 0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(i: 0);
      SDValue ShlAmt = Src.getOperand(i: 1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(Val&: ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(numBits: ShlSrc.getValueSizeInBits(), hiBitsSet: ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(Op: ShlSrc, Mask: HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        // Pre-shift the mask so the AND applies before the (widened) shift.
        Res = CurDAG->getConstant(Val: Mask.lshr(shiftAmt: ShAmtV), DL, VT: SrcVT);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
        Res = CurDAG->getNode(Opcode: ISD::AND, DL, VT: SrcVT, N1: ShlSrc, N2: Res);
        insertDAGNode(DAG&: *CurDAG, Pos: N, N: Res);
      }
      SDValue Zext = CurDAG->getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: Res);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: Zext);
      SDValue NewShl = CurDAG->getNode(Opcode: ISD::SHL, DL, VT, N1: Zext, N2: ShlAmt);
      insertDAGNode(DAG&: *CurDAG, Pos: N, N: NewShl);
      CurDAG->ReplaceAllUsesWith(From: N, To: NewShl);
      CurDAG->RemoveDeadNode(N: N.getNode());

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      // If matchIndexRecursively is not called here,
      // Zext may be replaced by other nodes but later used to call a builder
      // method
      AM.IndexReg = matchIndexRecursively(N: Zext, AM, Depth: Depth + 1);
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                     X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                   X: Src.getOperand(i: 0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(DAG&: *CurDAG, N, Mask: Mask.getZExtValue(), Shift: Src,
                                  X: Src.getOperand(i: 0), AM, Subtarget: *Subtarget))
        return false;
    }

    break;
  }
  }

  // Nothing matched above: consume N as a plain base (or index) register.
  return matchAddressBase(N, AM);
}
2770 | |
2771 | /// Helper for MatchAddress. Add the specified node to the |
2772 | /// specified addressing mode without any further recursion. |
2773 | bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { |
2774 | // Is the base register already occupied? |
2775 | if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { |
2776 | // If so, check to see if the scale index register is set. |
2777 | if (!AM.IndexReg.getNode()) { |
2778 | AM.IndexReg = N; |
2779 | AM.Scale = 1; |
2780 | return false; |
2781 | } |
2782 | |
2783 | // Otherwise, we cannot select it. |
2784 | return true; |
2785 | } |
2786 | |
2787 | // Default, generate it as a register. |
2788 | AM.BaseType = X86ISelAddressMode::RegBase; |
2789 | AM.Base_Reg = N; |
2790 | return false; |
2791 | } |
2792 | |
/// Recursive worker for matchVectorAddress: fold constants, wrapped symbols
/// and ADDs of either into the gather/scatter addressing mode. Returns false
/// on success (match* convention).
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  SDLoc dl(N);
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(Val&: N)->getSExtValue();
    if (!foldOffsetIntoAddress(Offset: Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Speculatively match both operands; restore AM on failure so a partial
    // match of operand 0 does not leak into the commuted attempt below.
    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N: N.getOperand(i: 0), AM, Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 1), AM,
                                       Depth: Depth + 1) &&
        !matchVectorAddressRecursively(N: Handle.getValue().getOperand(i: 0), AM,
                                       Depth: Depth + 1))
      return false;
    AM = Backup;

    // Re-fetch N through the handle in case recursion CSE'd/replaced it.
    N = Handle.getValue();
    break;
  }
  }

  return matchAddressBase(N, AM);
}
2844 | |
2845 | /// Helper for selectVectorAddr. Handles things that can be folded into a |
2846 | /// gather/scatter address. The index register and scale should have already |
2847 | /// been handled. |
2848 | bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { |
2849 | return matchVectorAddressRecursively(N, AM, Depth: 0); |
2850 | } |
2851 | |
/// Select the operands of a gather/scatter address. Unlike the match* helpers
/// above, this select* entry point returns true on success. The index and
/// scale come from the intrinsic operands; only the scalar base pointer is
/// pattern-matched.
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                       SDValue IndexOp, SDValue ScaleOp,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  X86ISelAddressMode AM;
  AM.Scale = ScaleOp->getAsZExtVal();

  // Attempt to match index patterns, as long as we're not relying on implicit
  // sign-extension, which is performed BEFORE scale.
  if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
    AM.IndexReg = matchIndexRecursively(N: IndexOp, AM, Depth: 0);
  else
    AM.IndexReg = IndexOp;

  // Propagate any segment-register address space (GS/FS/SS) from the memory
  // operand into the addressing mode.
  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  // Capture DL/VT now; matchVectorAddress may replace nodes under BasePtr.
  SDLoc DL(BasePtr);
  MVT VT = BasePtr.getSimpleValueType();

  // Try to match into the base and displacement fields.
  // (matchVectorAddress returns true on FAILURE - match* convention.)
  if (matchVectorAddress(N: BasePtr, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
2885 | |
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;

  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    // Propagate a segment-register address space (GS/FS/SS), if any, into AM.
    unsigned AddrSpace =
        cast<MemSDNode>(Val: Parent)->getPointerInfo().getAddrSpace();
    if (AddrSpace == X86AS::GS)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == X86AS::FS)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == X86AS::SS)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // matchAddress returns true on FAILURE (match* convention), whereas this
  // select* entry point returns true on success - hence the inversion.
  if (matchAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
2928 | |
2929 | bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { |
2930 | // Cannot use 32 bit constants to reference objects in kernel/large code |
2931 | // model. |
2932 | if (TM.getCodeModel() == CodeModel::Kernel || |
2933 | TM.getCodeModel() == CodeModel::Large) |
2934 | return false; |
2935 | |
2936 | // In static codegen with small code model, we can get the address of a label |
2937 | // into a register with 'movl' |
2938 | if (N->getOpcode() != X86ISD::Wrapper) |
2939 | return false; |
2940 | |
2941 | N = N.getOperand(i: 0); |
2942 | |
2943 | // At least GNU as does not accept 'movl' for TPOFF relocations. |
2944 | // FIXME: We could use 'movl' when we know we are targeting MC. |
2945 | if (N->getOpcode() == ISD::TargetGlobalTLSAddress) |
2946 | return false; |
2947 | |
2948 | Imm = N; |
2949 | // Small/medium code model can reference non-TargetGlobalAddress objects with |
2950 | // 32 bit constants. |
2951 | if (N->getOpcode() != ISD::TargetGlobalAddress) { |
2952 | return TM.getCodeModel() == CodeModel::Small || |
2953 | TM.getCodeModel() == CodeModel::Medium; |
2954 | } |
2955 | |
2956 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: N)->getGlobal(); |
2957 | if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) |
2958 | return CR->getUnsignedMax().ult(RHS: 1ull << 32); |
2959 | |
2960 | return !TM.isLargeGlobalValue(GV); |
2961 | } |
2962 | |
2963 | bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, |
2964 | SDValue &Scale, SDValue &Index, |
2965 | SDValue &Disp, SDValue &Segment) { |
2966 | // Save the debug loc before calling selectLEAAddr, in case it invalidates N. |
2967 | SDLoc DL(N); |
2968 | |
2969 | if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) |
2970 | return false; |
2971 | |
2972 | auto *RN = dyn_cast<RegisterSDNode>(Val&: Base); |
2973 | if (RN && RN->getReg() == 0) |
2974 | Base = CurDAG->getRegister(0, MVT::i64); |
2975 | else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) { |
2976 | // Base could already be %rip, particularly in the x32 ABI. |
2977 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, |
2978 | MVT::i64), 0); |
2979 | Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, |
2980 | Base); |
2981 | } |
2982 | |
2983 | RN = dyn_cast<RegisterSDNode>(Val&: Index); |
2984 | if (RN && RN->getReg() == 0) |
2985 | Index = CurDAG->getRegister(0, MVT::i64); |
2986 | else { |
2987 | assert(Index.getValueType() == MVT::i32 && |
2988 | "Expect to be extending 32-bit registers for use in LEA" ); |
2989 | SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, |
2990 | MVT::i64), 0); |
2991 | Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, |
2992 | Index); |
2993 | } |
2994 | |
2995 | return true; |
2996 | } |
2997 | |
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
/// Returns false either when no addressing mode matches or when the matched
/// mode is too simple for an LEA to beat plain ADD/SHL.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  // Note: matchAddress returns true on *failure* to match.
  if (matchAddress(N, AM))
    return false;
  assert (T == AM.Segment);
  AM.Segment = Copy;

  // Score how many components of the addressing mode are actually used; an
  // LEA only pays off when enough work is folded into one instruction. The
  // threshold is checked at the bottom.
  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(i: 0)) || isMathWithFlags(N.getOperand(i: 1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
3087 | |
3088 | /// This is only run on TargetGlobalTLSAddress nodes. |
3089 | bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, |
3090 | SDValue &Scale, SDValue &Index, |
3091 | SDValue &Disp, SDValue &Segment) { |
3092 | assert(N.getOpcode() == ISD::TargetGlobalTLSAddress || |
3093 | N.getOpcode() == ISD::TargetExternalSymbol); |
3094 | |
3095 | X86ISelAddressMode AM; |
3096 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Val&: N)) { |
3097 | AM.GV = GA->getGlobal(); |
3098 | AM.Disp += GA->getOffset(); |
3099 | AM.SymbolFlags = GA->getTargetFlags(); |
3100 | } else { |
3101 | auto *SA = cast<ExternalSymbolSDNode>(Val&: N); |
3102 | AM.ES = SA->getSymbol(); |
3103 | AM.SymbolFlags = SA->getTargetFlags(); |
3104 | } |
3105 | |
3106 | if (Subtarget->is32Bit()) { |
3107 | AM.Scale = 1; |
3108 | AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); |
3109 | } |
3110 | |
3111 | MVT VT = N.getSimpleValueType(); |
3112 | getAddressOperands(AM, DL: SDLoc(N), VT, Base, Scale, Index, Disp, Segment); |
3113 | return true; |
3114 | } |
3115 | |
3116 | bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { |
3117 | // Keep track of the original value type and whether this value was |
3118 | // truncated. If we see a truncation from pointer type to VT that truncates |
3119 | // bits that are known to be zero, we can use a narrow reference. |
3120 | EVT VT = N.getValueType(); |
3121 | bool WasTruncated = false; |
3122 | if (N.getOpcode() == ISD::TRUNCATE) { |
3123 | WasTruncated = true; |
3124 | N = N.getOperand(i: 0); |
3125 | } |
3126 | |
3127 | if (N.getOpcode() != X86ISD::Wrapper) |
3128 | return false; |
3129 | |
3130 | // We can only use non-GlobalValues as immediates if they were not truncated, |
3131 | // as we do not have any range information. If we have a GlobalValue and the |
3132 | // address was not truncated, we can select it as an operand directly. |
3133 | unsigned Opc = N.getOperand(i: 0)->getOpcode(); |
3134 | if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { |
3135 | Op = N.getOperand(i: 0); |
3136 | // We can only select the operand directly if we didn't have to look past a |
3137 | // truncate. |
3138 | return !WasTruncated; |
3139 | } |
3140 | |
3141 | // Check that the global's range fits into VT. |
3142 | auto *GA = cast<GlobalAddressSDNode>(Val: N.getOperand(i: 0)); |
3143 | std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange(); |
3144 | if (!CR || CR->getUnsignedMax().uge(RHS: 1ull << VT.getSizeInBits())) |
3145 | return false; |
3146 | |
3147 | // Okay, we can use a narrow reference. |
3148 | Op = CurDAG->getTargetGlobalAddress(GV: GA->getGlobal(), DL: SDLoc(N), VT, |
3149 | offset: GA->getOffset(), TargetFlags: GA->getTargetFlags()); |
3150 | return true; |
3151 | } |
3152 | |
3153 | bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, |
3154 | SDValue &Base, SDValue &Scale, |
3155 | SDValue &Index, SDValue &Disp, |
3156 | SDValue &Segment) { |
3157 | assert(Root && P && "Unknown root/parent nodes" ); |
3158 | if (!ISD::isNON_EXTLoad(N: N.getNode()) || |
3159 | !IsProfitableToFold(N, U: P, Root) || |
3160 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3161 | return false; |
3162 | |
3163 | return selectAddr(Parent: N.getNode(), |
3164 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3165 | } |
3166 | |
3167 | bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, |
3168 | SDValue &Base, SDValue &Scale, |
3169 | SDValue &Index, SDValue &Disp, |
3170 | SDValue &Segment) { |
3171 | assert(Root && P && "Unknown root/parent nodes" ); |
3172 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || |
3173 | !IsProfitableToFold(N, U: P, Root) || |
3174 | !IsLegalToFold(N, U: P, Root, OptLevel)) |
3175 | return false; |
3176 | |
3177 | return selectAddr(Parent: N.getNode(), |
3178 | N: N.getOperand(i: 1), Base, Scale, Index, Disp, Segment); |
3179 | } |
3180 | |
3181 | /// Return an SDNode that returns the value of the global base register. |
3182 | /// Output instructions required to initialize the global base register, |
3183 | /// if necessary. |
3184 | SDNode *X86DAGToDAGISel::getGlobalBaseReg() { |
3185 | unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); |
3186 | auto &DL = MF->getDataLayout(); |
3187 | return CurDAG->getRegister(Reg: GlobalBaseReg, VT: TLI->getPointerTy(DL)).getNode(); |
3188 | } |
3189 | |
/// Return true if the wrapped global address in N (possibly looked at through
/// a TRUNCATE) is known to be representable as a sign-extended immediate of
/// the given bit Width.
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(Num: 0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(Val: N->getOperand(Num: 0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  // Absolute symbols carry an explicit value range; check it against the
  // signed interval [-(1 << Width), 1 << Width).
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    return CR->getSignedMin().sge(RHS: -1ull << Width) &&
           CR->getSignedMax().slt(RHS: 1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return Width == 32 && !TM.isLargeGlobalValue(GV);
}
3211 | |
3212 | X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const { |
3213 | assert(N->isMachineOpcode() && "Unexpected node" ); |
3214 | unsigned Opc = N->getMachineOpcode(); |
3215 | const MCInstrDesc &MCID = getInstrInfo()->get(Opc); |
3216 | int CondNo = X86::getCondSrcNoFromDesc(MCID); |
3217 | if (CondNo < 0) |
3218 | return X86::COND_INVALID; |
3219 | |
3220 | return static_cast<X86::CondCode>(N->getConstantOperandVal(Num: CondNo)); |
3221 | } |
3222 | |
3223 | /// Test whether the given X86ISD::CMP node has any users that use a flag |
3224 | /// other than ZF. |
3225 | bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { |
3226 | // Examine each user of the node. |
3227 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3228 | UI != UE; ++UI) { |
3229 | // Only check things that use the flags. |
3230 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3231 | continue; |
3232 | // Only examine CopyToReg uses that copy to EFLAGS. |
3233 | if (UI->getOpcode() != ISD::CopyToReg || |
3234 | cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) |
3235 | return false; |
3236 | // Examine each user of the CopyToReg use. |
3237 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3238 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3239 | // Only examine the Flag result. |
3240 | if (FlagUI.getUse().getResNo() != 1) continue; |
3241 | // Anything unusual: assume conservatively. |
3242 | if (!FlagUI->isMachineOpcode()) return false; |
3243 | // Examine the condition code of the user. |
3244 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3245 | |
3246 | switch (CC) { |
3247 | // Comparisons which only use the zero flag. |
3248 | case X86::COND_E: case X86::COND_NE: |
3249 | continue; |
3250 | // Anything else: assume conservatively. |
3251 | default: |
3252 | return false; |
3253 | } |
3254 | } |
3255 | } |
3256 | return true; |
3257 | } |
3258 | |
3259 | /// Test whether the given X86ISD::CMP node has any uses which require the SF |
3260 | /// flag to be accurate. |
3261 | bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { |
3262 | // Examine each user of the node. |
3263 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3264 | UI != UE; ++UI) { |
3265 | // Only check things that use the flags. |
3266 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3267 | continue; |
3268 | // Only examine CopyToReg uses that copy to EFLAGS. |
3269 | if (UI->getOpcode() != ISD::CopyToReg || |
3270 | cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) |
3271 | return false; |
3272 | // Examine each user of the CopyToReg use. |
3273 | for (SDNode::use_iterator FlagUI = UI->use_begin(), |
3274 | FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { |
3275 | // Only examine the Flag result. |
3276 | if (FlagUI.getUse().getResNo() != 1) continue; |
3277 | // Anything unusual: assume conservatively. |
3278 | if (!FlagUI->isMachineOpcode()) return false; |
3279 | // Examine the condition code of the user. |
3280 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3281 | |
3282 | switch (CC) { |
3283 | // Comparisons which don't examine the SF flag. |
3284 | case X86::COND_A: case X86::COND_AE: |
3285 | case X86::COND_B: case X86::COND_BE: |
3286 | case X86::COND_E: case X86::COND_NE: |
3287 | case X86::COND_O: case X86::COND_NO: |
3288 | case X86::COND_P: case X86::COND_NP: |
3289 | continue; |
3290 | // Anything else: assume conservatively. |
3291 | default: |
3292 | return false; |
3293 | } |
3294 | } |
3295 | } |
3296 | return true; |
3297 | } |
3298 | |
3299 | static bool mayUseCarryFlag(X86::CondCode CC) { |
3300 | switch (CC) { |
3301 | // Comparisons which don't examine the CF flag. |
3302 | case X86::COND_O: case X86::COND_NO: |
3303 | case X86::COND_E: case X86::COND_NE: |
3304 | case X86::COND_S: case X86::COND_NS: |
3305 | case X86::COND_P: case X86::COND_NP: |
3306 | case X86::COND_L: case X86::COND_GE: |
3307 | case X86::COND_G: case X86::COND_LE: |
3308 | return false; |
3309 | // Anything else: assume conservatively. |
3310 | default: |
3311 | return true; |
3312 | } |
3313 | } |
3314 | |
3315 | /// Test whether the given node which sets flags has any uses which require the |
3316 | /// CF flag to be accurate. |
3317 | bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { |
3318 | // Examine each user of the node. |
3319 | for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); |
3320 | UI != UE; ++UI) { |
3321 | // Only check things that use the flags. |
3322 | if (UI.getUse().getResNo() != Flags.getResNo()) |
3323 | continue; |
3324 | |
3325 | unsigned UIOpc = UI->getOpcode(); |
3326 | |
3327 | if (UIOpc == ISD::CopyToReg) { |
3328 | // Only examine CopyToReg uses that copy to EFLAGS. |
3329 | if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) |
3330 | return false; |
3331 | // Examine each user of the CopyToReg use. |
3332 | for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); |
3333 | FlagUI != FlagUE; ++FlagUI) { |
3334 | // Only examine the Flag result. |
3335 | if (FlagUI.getUse().getResNo() != 1) |
3336 | continue; |
3337 | // Anything unusual: assume conservatively. |
3338 | if (!FlagUI->isMachineOpcode()) |
3339 | return false; |
3340 | // Examine the condition code of the user. |
3341 | X86::CondCode CC = getCondFromNode(N: *FlagUI); |
3342 | |
3343 | if (mayUseCarryFlag(CC)) |
3344 | return false; |
3345 | } |
3346 | |
3347 | // This CopyToReg is ok. Move on to the next user. |
3348 | continue; |
3349 | } |
3350 | |
3351 | // This might be an unselected node. So look for the pre-isel opcodes that |
3352 | // use flags. |
3353 | unsigned CCOpNo; |
3354 | switch (UIOpc) { |
3355 | default: |
3356 | // Something unusual. Be conservative. |
3357 | return false; |
3358 | case X86ISD::SETCC: CCOpNo = 0; break; |
3359 | case X86ISD::SETCC_CARRY: CCOpNo = 0; break; |
3360 | case X86ISD::CMOV: CCOpNo = 2; break; |
3361 | case X86ISD::BRCOND: CCOpNo = 2; break; |
3362 | } |
3363 | |
3364 | X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(Num: CCOpNo); |
3365 | if (mayUseCarryFlag(CC)) |
3366 | return false; |
3367 | } |
3368 | return true; |
3369 | } |
3370 | |
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation. On success, LoadNode is
/// set to the load feeding operand LoadOpNo of the stored value, and
/// InputChain to a chain the fused node can safely use without creating a
/// cycle.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(N: StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(Num: LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(N: Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Val&: Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  // Cap on the number of nodes the predecessor search may visit; if it is
  // exceeded the helper answers conservatively.
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //        C                        Xn  C
  //        *                         *  *
  //        *                         *  *
  //  Xn  A-LD    Yn                TF         Yn
  //   *    * \   |                  *         |
  //    *   *  \  |                  *         |
  //     *  *   \ |             =>   A--LD_OP_ST
  //      * *    \|                 \
  //       TF    OP                  \
  //        *   | \                   Zn
  //        *   |  \
  //       A-ST    Zn
  //

  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn

  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.

  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.

  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(R: 1)) {
    FoundLoad = true;
    ChainOps.push_back(Elt: Load.getOperand(i: 0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(R: 1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Elt: Load.getOperand(i: 0));
        continue;
      }
      LoopWorklist.push_back(Elt: Op.getNode());
      ChainOps.push_back(Elt: Op);
    }
  }

  // The store's chain must reach the load (directly or through a single
  // TokenFactor), otherwise the pattern does not apply.
  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Elt: Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(N: Load.getNode(), Visited, Worklist&: LoopWorklist, MaxSteps: Max,
                                   TopologicalPrune: true))
    return false;

  // Build the chain for the fused node out of the gathered X elements.
  InputChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  return true;
}
3484 | |
3485 | // Change a chain of {load; op; store} of the same value into a simple op |
3486 | // through memory of that value, if the uses of the modified value and its |
3487 | // address are suitable. |
3488 | // |
3489 | // The tablegen pattern memory operand pattern is currently not able to match |
3490 | // the case where the EFLAGS on the original operation are used. |
3491 | // |
3492 | // To move this to tablegen, we'll need to improve tablegen to allow flags to |
3493 | // be transferred from a node in the pattern to the result node, probably with |
3494 | // a new keyword. For example, we have this |
3495 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3496 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3497 | // (implicit EFLAGS)]>; |
3498 | // but maybe need something like this |
3499 | // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", |
3500 | // [(store (add (loadi64 addr:$dst), -1), addr:$dst), |
3501 | // (transferrable EFLAGS)]>; |
3502 | // |
3503 | // Until then, we manually fold these and instruction select the operation |
3504 | // here. |
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Val: Node);
  SDValue StoredVal = StoreNode->getOperand(Num: 1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    // A SUB with a zero LHS is a negate of the loaded value.
    IsNegate = isNullConstant(V: StoredVal.getOperand(i: 0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  // For negate the load is the subtrahend (operand 1); otherwise try the
  // first operand and, for commutable operations, fall back to the second.
  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(Parent: LoadNode, N: LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  // Pick the memory-form opcode matching the access width.
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!" );
    }
  };

  // Every machine node created below produces the EFLAGS value (i32,
  // result 0) and a chain (result 1); see the ReplaceUses calls at the end.
  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                      MVT::Other, Ops);
      break;
    }
   [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(V: StoredVal.getOperand(i: 1));
      bool IsNegOne = isAllOnesConstant(V: StoredVal.getOperand(i: 1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(Flags: StoredVal.getValue(R: 1))) {
        // ADD +1 / SUB -1 -> INC; ADD -1 / SUB +1 -> DEC.
        unsigned NewOpc =
          ((Opc == X86ISD::ADD) == IsOne)
              ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
              : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                        MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    // Map the X86ISD opcode to the memory/register instruction form.
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!" );
      }
    };
    // Map the X86ISD opcode to the memory/immediate instruction form.
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!" );
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    SDValue Operand = StoredVal->getOperand(Num: 1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Val&: Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
            isInt<32>(-OperandV))) &&
          hasNoCarryFlagUses(StoredVal.getValue(1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      // 64-bit operations only take a sign-extended 32-bit immediate field.
      if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
        Operand = CurDAG->getTargetConstant(Val: OperandV, DL: SDLoc(Node), VT: MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      // ADC/SBB consume an incoming carry; thread it in by copying the flag
      // operand into EFLAGS and gluing the copy to the new node.
      SDValue CopyTo =
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
                               StoredVal.getOperand(2), SDValue());

      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(R: 1)};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!" );
  }

  // Transfer the memory operands of both the original load and store to the
  // fused node.
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(N: Result, NewMemRefs: MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(F: SDValue(LoadNode, 1), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoreNode, 0), T: SDValue(Result, 1));
  ReplaceUses(F: SDValue(StoredVal.getNode(), 1), T: SDValue(Result, 0));
  CurDAG->RemoveDeadNode(N: Node);
  return true;
}
3721 | |
3722 | // See if this is an X & Mask that we can match to BEXTR/BZHI. |
3723 | // Where Mask is one of the following patterns: |
3724 | // a) x & (1 << nbits) - 1 |
3725 | // b) x & ~(-1 << nbits) |
3726 | // c) x & (-1 >> (32 - y)) |
3727 | // d) x << (32 - y) >> (32 - y) |
3728 | // e) (1 << nbits) - 1 |
3729 | bool X86DAGToDAGISel::(SDNode *Node) { |
3730 | assert( |
3731 | (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND || |
3732 | Node->getOpcode() == ISD::SRL) && |
3733 | "Should be either an and-mask, or right-shift after clearing high bits." ); |
3734 | |
3735 | // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. |
3736 | if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) |
3737 | return false; |
3738 | |
3739 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
3740 | |
3741 | // Only supported for 32 and 64 bits. |
3742 | if (NVT != MVT::i32 && NVT != MVT::i64) |
3743 | return false; |
3744 | |
3745 | SDValue NBits; |
3746 | bool NegateNBits; |
3747 | |
3748 | // If we have BMI2's BZHI, we are ok with muti-use patterns. |
3749 | // Else, if we only have BMI1's BEXTR, we require one-use. |
3750 | const bool = Subtarget->hasBMI2(); |
3751 | auto checkUses = [AllowExtraUsesByDefault]( |
3752 | SDValue Op, unsigned NUses, |
3753 | std::optional<bool> ) { |
3754 | return AllowExtraUses.value_or(AllowExtraUsesByDefault) || |
3755 | Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); |
3756 | }; |
3757 | auto checkOneUse = [checkUses](SDValue Op, |
3758 | std::optional<bool> = |
3759 | std::nullopt) { |
3760 | return checkUses(Op, 1, AllowExtraUses); |
3761 | }; |
3762 | auto checkTwoUse = [checkUses](SDValue Op, |
3763 | std::optional<bool> = |
3764 | std::nullopt) { |
3765 | return checkUses(Op, 2, AllowExtraUses); |
3766 | }; |
3767 | |
3768 | auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { |
3769 | if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { |
3770 | assert(V.getSimpleValueType() == MVT::i32 && |
3771 | V.getOperand(0).getSimpleValueType() == MVT::i64 && |
3772 | "Expected i64 -> i32 truncation" ); |
3773 | V = V.getOperand(i: 0); |
3774 | } |
3775 | return V; |
3776 | }; |
3777 | |
3778 | // a) x & ((1 << nbits) + (-1)) |
3779 | auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits, |
3780 | &NegateNBits](SDValue Mask) -> bool { |
3781 | // Match `add`. Must only have one use! |
3782 | if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) |
3783 | return false; |
3784 | // We should be adding all-ones constant (i.e. subtracting one.) |
3785 | if (!isAllOnesConstant(V: Mask->getOperand(Num: 1))) |
3786 | return false; |
3787 | // Match `1 << nbits`. Might be truncated. Must only have one use! |
3788 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3789 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3790 | return false; |
3791 | if (!isOneConstant(V: M0->getOperand(Num: 0))) |
3792 | return false; |
3793 | NBits = M0->getOperand(Num: 1); |
3794 | NegateNBits = false; |
3795 | return true; |
3796 | }; |
3797 | |
3798 | auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { |
3799 | V = peekThroughOneUseTruncation(V); |
3800 | return CurDAG->MaskedValueIsAllOnes( |
3801 | Op: V, Mask: APInt::getLowBitsSet(numBits: V.getSimpleValueType().getSizeInBits(), |
3802 | loBitsSet: NVT.getSizeInBits())); |
3803 | }; |
3804 | |
3805 | // b) x & ~(-1 << nbits) |
3806 | auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, |
3807 | &NBits, &NegateNBits](SDValue Mask) -> bool { |
3808 | // Match `~()`. Must only have one use! |
3809 | if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) |
3810 | return false; |
3811 | // The -1 only has to be all-ones for the final Node's NVT. |
3812 | if (!isAllOnes(Mask->getOperand(Num: 1))) |
3813 | return false; |
3814 | // Match `-1 << nbits`. Might be truncated. Must only have one use! |
3815 | SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(Num: 0)); |
3816 | if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) |
3817 | return false; |
3818 | // The -1 only has to be all-ones for the final Node's NVT. |
3819 | if (!isAllOnes(M0->getOperand(Num: 0))) |
3820 | return false; |
3821 | NBits = M0->getOperand(Num: 1); |
3822 | NegateNBits = false; |
3823 | return true; |
3824 | }; |
3825 | |
3826 | // Try to match potentially-truncated shift amount as `(bitwidth - y)`, |
3827 | // or leave the shift amount as-is, but then we'll have to negate it. |
3828 | auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt, |
3829 | unsigned Bitwidth) { |
3830 | NBits = ShiftAmt; |
3831 | NegateNBits = true; |
3832 | // Skip over a truncate of the shift amount, if any. |
3833 | if (NBits.getOpcode() == ISD::TRUNCATE) |
3834 | NBits = NBits.getOperand(i: 0); |
3835 | // Try to match the shift amount as (bitwidth - y). It should go away, too. |
3836 | // If it doesn't match, that's fine, we'll just negate it ourselves. |
3837 | if (NBits.getOpcode() != ISD::SUB) |
3838 | return; |
3839 | auto *V0 = dyn_cast<ConstantSDNode>(Val: NBits.getOperand(i: 0)); |
3840 | if (!V0 || V0->getZExtValue() != Bitwidth) |
3841 | return; |
3842 | NBits = NBits.getOperand(i: 1); |
3843 | NegateNBits = false; |
3844 | }; |
3845 | |
3846 | // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth |
3847 | // or |
3848 | // c) x & (-1 >> (32 - y)) |
3849 | auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits, |
3850 | canonicalizeShiftAmt](SDValue Mask) -> bool { |
3851 | // The mask itself may be truncated. |
3852 | Mask = peekThroughOneUseTruncation(Mask); |
3853 | unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); |
3854 | // Match `l>>`. Must only have one use! |
3855 | if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) |
3856 | return false; |
3857 | // We should be shifting truly all-ones constant. |
3858 | if (!isAllOnesConstant(V: Mask.getOperand(i: 0))) |
3859 | return false; |
3860 | SDValue M1 = Mask.getOperand(i: 1); |
3861 | // The shift amount should not be used externally. |
3862 | if (!checkOneUse(M1)) |
3863 | return false; |
3864 | canonicalizeShiftAmt(M1, Bitwidth); |
3865 | // Pattern c. is non-canonical, and is expanded into pattern d. iff there |
3866 | // is no extra use of the mask. Clearly, there was one since we are here. |
3867 | // But at the same time, if we need to negate the shift amount, |
3868 | // then we don't want the mask to stick around, else it's unprofitable. |
3869 | return !NegateNBits; |
3870 | }; |
3871 | |
3872 | SDValue X; |
3873 | |
3874 | // d) x << z >> z but then we'll have to subtract z from bitwidth |
3875 | // or |
3876 | // d) x << (32 - y) >> (32 - y) |
3877 | auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt, |
3878 | AllowExtraUsesByDefault, &NegateNBits, |
3879 | &X](SDNode *Node) -> bool { |
3880 | if (Node->getOpcode() != ISD::SRL) |
3881 | return false; |
3882 | SDValue N0 = Node->getOperand(Num: 0); |
3883 | if (N0->getOpcode() != ISD::SHL) |
3884 | return false; |
3885 | unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); |
3886 | SDValue N1 = Node->getOperand(Num: 1); |
3887 | SDValue N01 = N0->getOperand(Num: 1); |
3888 | // Both of the shifts must be by the exact same value. |
3889 | if (N1 != N01) |
3890 | return false; |
3891 | canonicalizeShiftAmt(N1, Bitwidth); |
3892 | // There should not be any external uses of the inner shift / shift amount. |
3893 | // Note that while we are generally okay with external uses given BMI2, |
3894 | // iff we need to negate the shift amount, we are not okay with extra uses. |
3895 | const bool = AllowExtraUsesByDefault && !NegateNBits; |
3896 | if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses)) |
3897 | return false; |
3898 | X = N0->getOperand(Num: 0); |
3899 | return true; |
3900 | }; |
3901 | |
3902 | auto matchLowBitMask = [matchPatternA, matchPatternB, |
3903 | matchPatternC](SDValue Mask) -> bool { |
3904 | return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); |
3905 | }; |
3906 | |
3907 | if (Node->getOpcode() == ISD::AND) { |
3908 | X = Node->getOperand(Num: 0); |
3909 | SDValue Mask = Node->getOperand(Num: 1); |
3910 | |
3911 | if (matchLowBitMask(Mask)) { |
3912 | // Great. |
3913 | } else { |
3914 | std::swap(a&: X, b&: Mask); |
3915 | if (!matchLowBitMask(Mask)) |
3916 | return false; |
3917 | } |
3918 | } else if (matchLowBitMask(SDValue(Node, 0))) { |
3919 | X = CurDAG->getAllOnesConstant(DL: SDLoc(Node), VT: NVT); |
3920 | } else if (!matchPatternD(Node)) |
3921 | return false; |
3922 | |
3923 | // If we need to negate the shift amount, require BMI2 BZHI support. |
3924 | // It's just too unprofitable for BMI1 BEXTR. |
3925 | if (NegateNBits && !Subtarget->hasBMI2()) |
3926 | return false; |
3927 | |
3928 | SDLoc DL(Node); |
3929 | |
3930 | // Truncate the shift amount. |
3931 | NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); |
3932 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3933 | |
3934 | // Insert 8-bit NBits into lowest 8 bits of 32-bit register. |
3935 | // All the other bits are undefined, we do not care about them. |
3936 | SDValue ImplDef = SDValue( |
3937 | CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0); |
3938 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: ImplDef); |
3939 | |
3940 | SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32); |
3941 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: SRIdxVal); |
3942 | NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, |
3943 | MVT::i32, ImplDef, NBits, SRIdxVal), |
3944 | 0); |
3945 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3946 | |
3947 | // We might have matched the amount of high bits to be cleared, |
3948 | // but we want the amount of low bits to be kept, so negate it then. |
3949 | if (NegateNBits) { |
3950 | SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32); |
3951 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: BitWidthC); |
3952 | |
3953 | NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits); |
3954 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3955 | } |
3956 | |
3957 | if (Subtarget->hasBMI2()) { |
3958 | // Great, just emit the BZHI.. |
3959 | if (NVT != MVT::i32) { |
3960 | // But have to place the bit count into the wide-enough register first. |
3961 | NBits = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: NVT, Operand: NBits); |
3962 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: NBits); |
3963 | } |
3964 | |
3965 | SDValue = CurDAG->getNode(Opcode: X86ISD::BZHI, DL, VT: NVT, N1: X, N2: NBits); |
3966 | ReplaceNode(F: Node, T: Extract.getNode()); |
3967 | SelectCode(Extract.getNode()); |
3968 | return true; |
3969 | } |
3970 | |
3971 | // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is |
3972 | // *logically* shifted (potentially with one-use trunc inbetween), |
3973 | // and the truncation was the only use of the shift, |
3974 | // and if so look past one-use truncation. |
3975 | { |
3976 | SDValue RealX = peekThroughOneUseTruncation(X); |
3977 | // FIXME: only if the shift is one-use? |
3978 | if (RealX != X && RealX.getOpcode() == ISD::SRL) |
3979 | X = RealX; |
3980 | } |
3981 | |
3982 | MVT XVT = X.getSimpleValueType(); |
3983 | |
3984 | // Else, emitting BEXTR requires one more step. |
3985 | // The 'control' of BEXTR has the pattern of: |
3986 | // [15...8 bit][ 7...0 bit] location |
3987 | // [ bit count][ shift] name |
3988 | // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 |
3989 | |
3990 | // Shift NBits left by 8 bits, thus producing 'control'. |
3991 | // This makes the low 8 bits to be zero. |
3992 | SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); |
3993 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: C8); |
3994 | SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8); |
3995 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
3996 | |
3997 | // If the 'X' is *logically* shifted, we can fold that shift into 'control'. |
3998 | // FIXME: only if the shift is one-use? |
3999 | if (X.getOpcode() == ISD::SRL) { |
4000 | SDValue ShiftAmt = X.getOperand(i: 1); |
4001 | X = X.getOperand(i: 0); |
4002 | |
4003 | assert(ShiftAmt.getValueType() == MVT::i8 && |
4004 | "Expected shift amount to be i8" ); |
4005 | |
4006 | // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! |
4007 | // We could zext to i16 in some form, but we intentionally don't do that. |
4008 | SDValue OrigShiftAmt = ShiftAmt; |
4009 | ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt); |
4010 | insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: ShiftAmt); |
4011 | |
4012 | // And now 'or' these low 8 bits of shift amount into the 'control'. |
4013 | Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt); |
4014 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4015 | } |
4016 | |
4017 | // But have to place the 'control' into the wide-enough register first. |
4018 | if (XVT != MVT::i32) { |
4019 | Control = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL, VT: XVT, Operand: Control); |
4020 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Control); |
4021 | } |
4022 | |
4023 | // And finally, form the BEXTR itself. |
4024 | SDValue = CurDAG->getNode(Opcode: X86ISD::BEXTR, DL, VT: XVT, N1: X, N2: Control); |
4025 | |
4026 | // The 'X' was originally truncated. Do that now. |
4027 | if (XVT != NVT) { |
4028 | insertDAGNode(DAG&: *CurDAG, Pos: SDValue(Node, 0), N: Extract); |
4029 | Extract = CurDAG->getNode(Opcode: ISD::TRUNCATE, DL, VT: NVT, Operand: Extract); |
4030 | } |
4031 | |
4032 | ReplaceNode(F: Node, T: Extract.getNode()); |
4033 | SelectCode(Extract.getNode()); |
4034 | |
4035 | return true; |
4036 | } |
4037 | |
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
//
// Returns a freshly created machine node implementing the extraction
// (possibly a BZHI followed by a SHR), or nullptr when the pattern does not
// match or is judged unprofitable. The caller is responsible for wiring the
// returned node into the DAG.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(ResNo: 0);
  SDLoc dl(Node);

  // Node is the AND: operand 0 is the shifted value, operand 1 the mask.
  SDValue N0 = Node->getOperand(Num: 0);
  SDValue N1 = Node->getOperand(Num: 1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(Val&: N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask, i.e. a contiguous run of low set bits.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Value: Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Value: Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

// Selects the EVEX-encoded opcode variant when extended GPRs are available.
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then." );
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Val: Shift + MaskSize, DL: dl, VT: NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][ shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
    Control = CurDAG->getTargetConstant(Val: Shift | (MaskSize << 8), DL: dl, VT: NVT);
    if (Subtarget->hasTBM()) {
      // TBM's BEXTRI takes the control word directly as an immediate.
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then." );
      // BMI requires the immediate to placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(Num: 0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Input, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
    // The shifted value is a foldable load: emit the memory form.
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(i: 0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(F: Input.getValue(R: 1), T: SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {cast<LoadSDNode>(Val&: Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Val: Shift, DL: dl, VT: NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(Opcode: NewOpc, dl, VT: NVT, Op1: SDValue(NewNode, 0), Op2: ShAmt);
  }

  return NewNode;
}
4163 | |
4164 | // Emit a PCMISTR(I/M) instruction. |
4165 | MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, |
4166 | bool MayFoldLoad, const SDLoc &dl, |
4167 | MVT VT, SDNode *Node) { |
4168 | SDValue N0 = Node->getOperand(Num: 0); |
4169 | SDValue N1 = Node->getOperand(Num: 1); |
4170 | SDValue Imm = Node->getOperand(Num: 2); |
4171 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4172 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4173 | |
4174 | // Try to fold a load. No need to check alignment. |
4175 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4176 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4177 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4178 | N1.getOperand(i: 0) }; |
4179 | SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); |
4180 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4181 | // Update the chain. |
4182 | ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 2)); |
4183 | // Record the mem-refs |
4184 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()}); |
4185 | return CNode; |
4186 | } |
4187 | |
4188 | SDValue Ops[] = { N0, N1, Imm }; |
4189 | SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); |
4190 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4191 | return CNode; |
4192 | } |
4193 | |
4194 | // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need |
4195 | // to emit a second instruction after this one. This is needed since we have two |
4196 | // copyToReg nodes glued before this and we need to continue that glue through. |
4197 | MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, |
4198 | bool MayFoldLoad, const SDLoc &dl, |
4199 | MVT VT, SDNode *Node, |
4200 | SDValue &InGlue) { |
4201 | SDValue N0 = Node->getOperand(Num: 0); |
4202 | SDValue N2 = Node->getOperand(Num: 2); |
4203 | SDValue Imm = Node->getOperand(Num: 4); |
4204 | auto *Val = cast<ConstantSDNode>(Val&: Imm)->getConstantIntValue(); |
4205 | Imm = CurDAG->getTargetConstant(Val: *Val, DL: SDLoc(Node), VT: Imm.getValueType()); |
4206 | |
4207 | // Try to fold a load. No need to check alignment. |
4208 | SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; |
4209 | if (MayFoldLoad && tryFoldLoad(P: Node, N: N2, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) { |
4210 | SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, |
4211 | N2.getOperand(i: 0), InGlue }; |
4212 | SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); |
4213 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops); |
4214 | InGlue = SDValue(CNode, 3); |
4215 | // Update the chain. |
4216 | ReplaceUses(F: N2.getValue(R: 1), T: SDValue(CNode, 2)); |
4217 | // Record the mem-refs |
4218 | CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N2)->getMemOperand()}); |
4219 | return CNode; |
4220 | } |
4221 | |
4222 | SDValue Ops[] = { N0, N2, Imm, InGlue }; |
4223 | SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); |
4224 | MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops); |
4225 | InGlue = SDValue(CNode, 2); |
4226 | return CNode; |
4227 | } |
4228 | |
// Try to simplify the shift amount of a scalar shift node N by exploiting the
// hardware's implicit masking of the amount (mod 32 / mod 64): ADD/SUB/XOR by
// multiples of the size can be dropped, and some SUB/XOR forms become NOT or
// NEG. Returns true if N was replaced/re-selected, false to fall through to
// normal selection.
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(Num: 1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(Num: 0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(Num: 0);
    SDValue Add1 = ShiftAmt->getOperand(Num: 1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Val&: Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Val&: Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
    if (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == 0) {
      NewShiftAmt = Add0;

    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(RHS: Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(RHS: Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      // Exactly one of the operands is the constant here.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      // Build NOT as XOR with all-ones, applied to the non-constant operand.
      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, VT: OpVT);
      NewShiftAmt = CurDAG->getNode(Opcode: ISD::XOR, DL, VT: OpVT,
                                    N1: Add0C == nullptr ? Add0 : Add1, N2: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: AllOnes);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(i: 0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Op: Add0, DL, VT: SubVT);
          insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Add0);
        }

        X = CurDAG->getNode(Opcode: ISD::ADD, DL, VT: SubVT, N1: Add1, N2: Add0);
        insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: X);
      } else
        return false;
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(Val: 0, DL, VT: SubVT);
      SDValue Neg = CurDAG->getNode(Opcode: ISD::SUB, DL, VT: SubVT, N1: Zero, N2: X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Zero);
      insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(DAG&: *CurDAG, Pos: OrigShiftAmt, N: NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, Op1: N->getOperand(Num: 0),
                                                   Op2: NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(F: N, T: UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(N: OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
4355 | |
// For (x << C1) op C2 (op in {AND, OR, XOR}), try to re-associate into
// ((x op (C2 >> C1)) << C1) when the shifted constant has a smaller encoding.
// Returns true if the node was replaced and re-selected.
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(ResNo: 0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(i: 0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  // Returns true (with the shifted constant in ShiftedVal) when the
  // re-associated form has a cheaper immediate encoding than the original.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(x: Val) && isInt<8>(x: ShiftedVal)) ||
        (!isInt<32>(x: Val) && isInt<32>(x: ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // This check is done late to defer the MaskedValueIsZero call as long as
  // possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(Value: std::max(a: ZExtWidth, b: 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(numBits: NVT.getSizeInBits(),
                                            loBitsSet: ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    // If those bits are already known zero, the AND is a MOVZX; keep it.
    if (CurDAG->MaskedValueIsZero(Op: N->getOperand(Num: 0), Mask: NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(i: 0);
  if (FoundAnyExtend) {
    // Re-create the any_extend we looked through, now below the logic op.
    SDValue NewX = CurDAG->getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: NVT, Operand: X);
    insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewX);
    X = NewX;
  }

  // Build (x op ShiftedVal) << C1 and hand it back to selection.
  SDValue NewCst = CurDAG->getConstant(Val: ShiftedVal, DL: dl, VT: NVT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, DL: dl, VT: NVT, N1: X, N2: NewCst);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(N, 0), N: NewBinOp);
  SDValue NewSHL = CurDAG->getNode(Opcode: ISD::SHL, DL: dl, VT: NVT, N1: NewBinOp,
                                   N2: Shift.getOperand(i: 1));
  ReplaceNode(F: N, T: NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
4468 | |
// Emit a VPTERNLOG machine instruction for the 3-input logic expression
// rooted at Root, whose inputs are A, B and C with truth-table immediate Imm.
// ParentA/ParentB/ParentC are the nodes that directly use A/B/C (they can
// differ from Root when an input was reached through a bitcast or an
// xor-with-all-ones). If one input is a foldable load or a 32/64-bit
// broadcast-load, it is moved into operand C -- the only memory-capable
// operand -- and the immediate is permuted to compensate. This function has
// no failure path: it always emits a node, replaces Root, and returns true.
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node" );

  // Try to fold L (an operand of P) as a plain load; failing that, as a
  // 32/64-bit broadcast-load, possibly behind a single-use bitcast. On
  // success the address components are filled in.
  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(i: 0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
      };

  // Prefer folding into C directly; if A or B folds instead, swap it with C
  // and permute the immediate. The immediate is an 8-entry truth table
  // indexed by the (A,B,C) bit triple, so exchanging two inputs exchanges
  // the table entries whose index bits for those inputs differ.
  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: A, b&: C);
    // Swap bits 1/4 and 3/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(a&: B, b&: C);
    // Swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);

  SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);

  MVT NVT = Root->getSimpleValueType(ResNo: 0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);

    // Select the D/Q element width and 128/256/512-bit width; broadcast
    // folds use the rmbi forms, plain load folds the rmi forms.
    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!" );

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!" );
    } else {
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!" );
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(i: 0)};
    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(F: C.getValue(R: 1), T: SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: MNode, NewMemRefs: {cast<MemSDNode>(Val&: C)->getMemOperand()});
  } else {
    // No memory operand: use the register-only rri forms.
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!" );

    MNode = CurDAG->getMachineNode(Opcode: Opc, dl: DL, VT: NVT, Ops: {A, B, C, TImm});
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
4591 | |
4592 | // Try to match two logic ops to a VPTERNLOG. |
4593 | // FIXME: Handle more complex patterns that use an operand more than once? |
4594 | bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { |
4595 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4596 | |
4597 | // Make sure we support VPTERNLOG. |
4598 | if (!NVT.isVector() || !Subtarget->hasAVX512() || |
4599 | NVT.getVectorElementType() == MVT::i1) |
4600 | return false; |
4601 | |
4602 | // We need VLX for 128/256-bit. |
4603 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4604 | return false; |
4605 | |
4606 | SDValue N0 = N->getOperand(Num: 0); |
4607 | SDValue N1 = N->getOperand(Num: 1); |
4608 | |
4609 | auto getFoldableLogicOp = [](SDValue Op) { |
4610 | // Peek through single use bitcast. |
4611 | if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) |
4612 | Op = Op.getOperand(i: 0); |
4613 | |
4614 | if (!Op.hasOneUse()) |
4615 | return SDValue(); |
4616 | |
4617 | unsigned Opc = Op.getOpcode(); |
4618 | if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || |
4619 | Opc == X86ISD::ANDNP) |
4620 | return Op; |
4621 | |
4622 | return SDValue(); |
4623 | }; |
4624 | |
4625 | SDValue A, FoldableOp; |
4626 | if ((FoldableOp = getFoldableLogicOp(N1))) { |
4627 | A = N0; |
4628 | } else if ((FoldableOp = getFoldableLogicOp(N0))) { |
4629 | A = N1; |
4630 | } else |
4631 | return false; |
4632 | |
4633 | SDValue B = FoldableOp.getOperand(i: 0); |
4634 | SDValue C = FoldableOp.getOperand(i: 1); |
4635 | SDNode *ParentA = N; |
4636 | SDNode *ParentB = FoldableOp.getNode(); |
4637 | SDNode *ParentC = FoldableOp.getNode(); |
4638 | |
4639 | // We can build the appropriate control immediate by performing the logic |
4640 | // operation we're matching using these constants for A, B, and C. |
4641 | uint8_t TernlogMagicA = 0xf0; |
4642 | uint8_t TernlogMagicB = 0xcc; |
4643 | uint8_t TernlogMagicC = 0xaa; |
4644 | |
4645 | // Some of the inputs may be inverted, peek through them and invert the |
4646 | // magic values accordingly. |
4647 | // TODO: There may be a bitcast before the xor that we should peek through. |
4648 | auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) { |
4649 | if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() && |
4650 | ISD::isBuildVectorAllOnes(N: Op.getOperand(i: 1).getNode())) { |
4651 | Magic = ~Magic; |
4652 | Parent = Op.getNode(); |
4653 | Op = Op.getOperand(i: 0); |
4654 | } |
4655 | }; |
4656 | |
4657 | PeekThroughNot(A, ParentA, TernlogMagicA); |
4658 | PeekThroughNot(B, ParentB, TernlogMagicB); |
4659 | PeekThroughNot(C, ParentC, TernlogMagicC); |
4660 | |
4661 | uint8_t Imm; |
4662 | switch (FoldableOp.getOpcode()) { |
4663 | default: llvm_unreachable("Unexpected opcode!" ); |
4664 | case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; |
4665 | case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; |
4666 | case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; |
4667 | case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; |
4668 | } |
4669 | |
4670 | switch (N->getOpcode()) { |
4671 | default: llvm_unreachable("Unexpected opcode!" ); |
4672 | case X86ISD::ANDNP: |
4673 | if (A == N0) |
4674 | Imm &= ~TernlogMagicA; |
4675 | else |
4676 | Imm = ~(Imm) & TernlogMagicA; |
4677 | break; |
4678 | case ISD::AND: Imm &= TernlogMagicA; break; |
4679 | case ISD::OR: Imm |= TernlogMagicA; break; |
4680 | case ISD::XOR: Imm ^= TernlogMagicA; break; |
4681 | } |
4682 | |
4683 | return matchVPTERNLOG(Root: N, ParentA, ParentB, ParentC, A, B, C, Imm); |
4684 | } |
4685 | |
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(ResNo: 0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(Val: And->getOperand(Num: 1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink any
  // more. If the upper 32 bits of a 64 bit mask are all zeros, we have
  // special isel patterns to use a 32-bit and instead of a 64-bit and by
  // relying on the implicit zeroing of 32 bit ops. So we should check if the
  // lower 32 bits are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    // Work on the low 32 bits only; the zext below restores the full width.
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(width: 32);
  }

  SDValue And0 = And->getOperand(Num: 0);
  APInt HighZeros = APInt::getHighBitsSet(numBits: MaskVal.getBitWidth(), hiBitsSet: MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(width: 64);
    HighZeros = HighZeros.zext(width: 64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(Op: And0, Mask: HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(F: And, T: And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(Val: NegMaskVal, DL: SDLoc(And), VT);
  insertDAGNode(DAG&: *CurDAG, Pos: SDValue(And, 0), N: NewMask);
  SDValue NewAnd = CurDAG->getNode(Opcode: ISD::AND, DL: SDLoc(And), VT, N1: And0, N2: NewMask);
  ReplaceNode(F: And, T: NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
4756 | |
// Return the VPTESTM/VPTESTNM machine opcode for the given test VT and form:
// IsTestN selects VPTESTNM over VPTESTM, FoldedLoad/FoldedBCast select the
// memory (rm) or broadcast-memory (rmb) forms instead of register (rr), and
// Masked selects the predicated ("k") variant. Only 32/64-bit element types
// are valid when FoldedBCast is set; other VTs hit llvm_unreachable.
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


// Broadcast forms exist only for dword/qword element types.
#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

// Register and plain-load forms additionally cover byte/word element types.
#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
4804 | |
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
//
// Setcc must be a SETEQ/SETNE vector compare against an all-zeros vector
// producing a vector of i1 (asserted below). If the compared value is a
// single-use AND, its two operands become the VPTESTM sources so the AND is
// folded into the test. Without VLX, sub-512-bit operations are widened to
// 512 bits via INSERT_SUBREG on IMPLICIT_DEF and the result mask narrowed
// back. Returns true and replaces Root on success.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!" );
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!" );

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: Setcc.getOperand(i: 2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(i: 0);
  SDValue SetccOp1 = Setcc.getOperand(i: 1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(N: SetccOp0.getNode()))
    std::swap(a&: SetccOp0, b&: SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(N: SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(i: 0);

    // Look for single use AND.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(i: 0);
      Src1 = N0Temp.getOperand(i: 1);
    }
  }

  // Without VLX we need to widen the operation (to 512 bits, below).
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  // Try to fold L (an operand of P) as a plain load (only when not
  // widening), or as a broadcast-load matching the compare element size,
  // possibly behind a single-use bitcast.
  auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
                                SDValue &Base, SDValue &Scale, SDValue &Index,
                                SDValue &Disp, SDValue &Segment) {
    // If we need to widen, we can't fold the load.
    if (!Widen)
      if (tryFoldLoad(Root, P, N: L, Base, Scale, Index, Disp, Segment))
        return true;

    // If we didn't fold a load, try to match broadcast. No widening limitation
    // for this. But only 32 and 64 bit types are supported.
    if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
      return false;

    // Look through single use bitcasts.
    if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
      P = L.getNode();
      L = L.getOperand(i: 0);
    }

    if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
      return false;

    auto *MemIntr = cast<MemIntrinsicSDNode>(Val&: L);
    if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
      return false;

    return tryFoldBroadcast(Root, P, N: L, Base, Scale, Index, Disp, Segment);
  };

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (CanFoldLoads) {
    // Prefer folding Src1; the memory operand must end up as Src1.
    FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
                                    Tmp3, Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
                                      Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(a&: Src0, b&: Src1);
    }
  }

  bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    // Scale 128-bit by 4 or 256-bit by 2 so the widened type is 512 bits.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(VT: CmpSVT, NumElements: NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
                                                     CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src0);

    // A folded broadcast already produces the widened type; nothing to insert.
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SRIdx: SubReg, DL: dl, VT: CmpVT, Operand: ImplDef, Subreg: Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = TLI->getRegClassFor(VT: MaskVT)->getID();
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                              dl, VT: MaskVT, Op1: InMask, Op2: RC), 0);
    }
  }

  // SETEQ against zero is "no bits set", i.e. the negated test (VPTESTNM).
  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(TestVT: CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               Masked: IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(i: 0) };
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(F: Src1.getValue(R: 1), T: SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<MemSDNode>(Val&: Src1)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: InMask, Op2: Src0, Op3: Src1);
    else
      CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VT: MaskVT, Op1: Src0, Op2: Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = TLI->getRegClassFor(VT: ResVT)->getID();
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
    CNode = CurDAG->getMachineNode(Opcode: TargetOpcode::COPY_TO_REGCLASS,
                                   dl, VT: ResVT, Op1: SDValue(CNode, 0), Op2: RC);
  }

  ReplaceUses(F: SDValue(Root, 0), T: SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(N: Root);
  return true;
}
4972 | |
4973 | // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it |
4974 | // into vpternlog. |
4975 | bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { |
4976 | assert(N->getOpcode() == ISD::OR && "Unexpected opcode!" ); |
4977 | |
4978 | MVT NVT = N->getSimpleValueType(ResNo: 0); |
4979 | |
4980 | // Make sure we support VPTERNLOG. |
4981 | if (!NVT.isVector() || !Subtarget->hasAVX512()) |
4982 | return false; |
4983 | |
4984 | // We need VLX for 128/256-bit. |
4985 | if (!(Subtarget->hasVLX() || NVT.is512BitVector())) |
4986 | return false; |
4987 | |
4988 | SDValue N0 = N->getOperand(Num: 0); |
4989 | SDValue N1 = N->getOperand(Num: 1); |
4990 | |
4991 | // Canonicalize AND to LHS. |
4992 | if (N1.getOpcode() == ISD::AND) |
4993 | std::swap(a&: N0, b&: N1); |
4994 | |
4995 | if (N0.getOpcode() != ISD::AND || |
4996 | N1.getOpcode() != X86ISD::ANDNP || |
4997 | !N0.hasOneUse() || !N1.hasOneUse()) |
4998 | return false; |
4999 | |
5000 | // ANDN is not commutable, use it to pick down A and C. |
5001 | SDValue A = N1.getOperand(i: 0); |
5002 | SDValue C = N1.getOperand(i: 1); |
5003 | |
5004 | // AND is commutable, if one operand matches A, the other operand is B. |
5005 | // Otherwise this isn't a match. |
5006 | SDValue B; |
5007 | if (N0.getOperand(i: 0) == A) |
5008 | B = N0.getOperand(i: 1); |
5009 | else if (N0.getOperand(i: 1) == A) |
5010 | B = N0.getOperand(i: 0); |
5011 | else |
5012 | return false; |
5013 | |
5014 | SDLoc dl(N); |
5015 | SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); |
5016 | SDValue Ternlog = CurDAG->getNode(Opcode: X86ISD::VPTERNLOG, DL: dl, VT: NVT, N1: A, N2: B, N3: C, N4: Imm); |
5017 | ReplaceNode(F: N, T: Ternlog.getNode()); |
5018 | |
5019 | return matchVPTERNLOG(Root: Ternlog.getNode(), ParentA: Ternlog.getNode(), ParentB: Ternlog.getNode(), |
5020 | ParentC: Ternlog.getNode(), A, B, C, Imm: 0xCA); |
5021 | } |
5022 | |
5023 | void X86DAGToDAGISel::Select(SDNode *Node) { |
5024 | MVT NVT = Node->getSimpleValueType(ResNo: 0); |
5025 | unsigned Opcode = Node->getOpcode(); |
5026 | SDLoc dl(Node); |
5027 | |
5028 | if (Node->isMachineOpcode()) { |
5029 | LLVM_DEBUG(dbgs() << "== " ; Node->dump(CurDAG); dbgs() << '\n'); |
5030 | Node->setNodeId(-1); |
5031 | return; // Already selected. |
5032 | } |
5033 | |
5034 | switch (Opcode) { |
5035 | default: break; |
5036 | case ISD::INTRINSIC_W_CHAIN: { |
5037 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5038 | switch (IntNo) { |
5039 | default: break; |
5040 | case Intrinsic::x86_encodekey128: |
5041 | case Intrinsic::x86_encodekey256: { |
5042 | if (!Subtarget->hasKL()) |
5043 | break; |
5044 | |
5045 | unsigned Opcode; |
5046 | switch (IntNo) { |
5047 | default: llvm_unreachable("Impossible intrinsic" ); |
5048 | case Intrinsic::x86_encodekey128: |
5049 | Opcode = X86::ENCODEKEY128; |
5050 | break; |
5051 | case Intrinsic::x86_encodekey256: |
5052 | Opcode = X86::ENCODEKEY256; |
5053 | break; |
5054 | } |
5055 | |
5056 | SDValue Chain = Node->getOperand(Num: 0); |
5057 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), |
5058 | SDValue()); |
5059 | if (Opcode == X86::ENCODEKEY256) |
5060 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), |
5061 | Chain.getValue(1)); |
5062 | |
5063 | MachineSDNode *Res = CurDAG->getMachineNode( |
5064 | Opcode, dl, VTs: Node->getVTList(), |
5065 | Ops: {Node->getOperand(Num: 2), Chain, Chain.getValue(R: 1)}); |
5066 | ReplaceNode(F: Node, T: Res); |
5067 | return; |
5068 | } |
5069 | case Intrinsic::x86_tileloadd64_internal: |
5070 | case Intrinsic::x86_tileloaddt164_internal: { |
5071 | if (!Subtarget->hasAMXTILE()) |
5072 | break; |
5073 | unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal |
5074 | ? X86::PTILELOADDV |
5075 | : X86::PTILELOADDT1V; |
5076 | // _tile_loadd_internal(row, col, buf, STRIDE) |
5077 | SDValue Base = Node->getOperand(Num: 4); |
5078 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5079 | SDValue Index = Node->getOperand(Num: 5); |
5080 | SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); |
5081 | SDValue Segment = CurDAG->getRegister(0, MVT::i16); |
5082 | SDValue Chain = Node->getOperand(Num: 0); |
5083 | MachineSDNode *CNode; |
5084 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5085 | Node->getOperand(Num: 3), |
5086 | Base, |
5087 | Scale, |
5088 | Index, |
5089 | Disp, |
5090 | Segment, |
5091 | Chain}; |
5092 | CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); |
5093 | ReplaceNode(F: Node, T: CNode); |
5094 | return; |
5095 | } |
5096 | } |
5097 | break; |
5098 | } |
5099 | case ISD::INTRINSIC_VOID: { |
5100 | unsigned IntNo = Node->getConstantOperandVal(Num: 1); |
5101 | switch (IntNo) { |
5102 | default: break; |
5103 | case Intrinsic::x86_sse3_monitor: |
5104 | case Intrinsic::x86_monitorx: |
5105 | case Intrinsic::x86_clzero: { |
5106 | bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; |
5107 | |
5108 | unsigned Opc = 0; |
5109 | switch (IntNo) { |
5110 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5111 | case Intrinsic::x86_sse3_monitor: |
5112 | if (!Subtarget->hasSSE3()) |
5113 | break; |
5114 | Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; |
5115 | break; |
5116 | case Intrinsic::x86_monitorx: |
5117 | if (!Subtarget->hasMWAITX()) |
5118 | break; |
5119 | Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; |
5120 | break; |
5121 | case Intrinsic::x86_clzero: |
5122 | if (!Subtarget->hasCLZERO()) |
5123 | break; |
5124 | Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; |
5125 | break; |
5126 | } |
5127 | |
5128 | if (Opc) { |
5129 | unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX; |
5130 | SDValue Chain = CurDAG->getCopyToReg(Chain: Node->getOperand(Num: 0), dl, Reg: PtrReg, |
5131 | N: Node->getOperand(Num: 2), Glue: SDValue()); |
5132 | SDValue InGlue = Chain.getValue(R: 1); |
5133 | |
5134 | if (IntNo == Intrinsic::x86_sse3_monitor || |
5135 | IntNo == Intrinsic::x86_monitorx) { |
5136 | // Copy the other two operands to ECX and EDX. |
5137 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), |
5138 | InGlue); |
5139 | InGlue = Chain.getValue(R: 1); |
5140 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), |
5141 | InGlue); |
5142 | InGlue = Chain.getValue(R: 1); |
5143 | } |
5144 | |
5145 | MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, |
5146 | { Chain, InGlue}); |
5147 | ReplaceNode(F: Node, T: CNode); |
5148 | return; |
5149 | } |
5150 | |
5151 | break; |
5152 | } |
5153 | case Intrinsic::x86_tilestored64_internal: { |
5154 | unsigned Opc = X86::PTILESTOREDV; |
5155 | // _tile_stored_internal(row, col, buf, STRIDE, c) |
5156 | SDValue Base = Node->getOperand(Num: 4); |
5157 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5158 | SDValue Index = Node->getOperand(Num: 5); |
5159 | SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); |
5160 | SDValue Segment = CurDAG->getRegister(0, MVT::i16); |
5161 | SDValue Chain = Node->getOperand(Num: 0); |
5162 | MachineSDNode *CNode; |
5163 | SDValue Ops[] = {Node->getOperand(Num: 2), |
5164 | Node->getOperand(Num: 3), |
5165 | Base, |
5166 | Scale, |
5167 | Index, |
5168 | Disp, |
5169 | Segment, |
5170 | Node->getOperand(Num: 6), |
5171 | Chain}; |
5172 | CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); |
5173 | ReplaceNode(F: Node, T: CNode); |
5174 | return; |
5175 | } |
5176 | case Intrinsic::x86_tileloadd64: |
5177 | case Intrinsic::x86_tileloaddt164: |
5178 | case Intrinsic::x86_tilestored64: { |
5179 | if (!Subtarget->hasAMXTILE()) |
5180 | break; |
5181 | unsigned Opc; |
5182 | switch (IntNo) { |
5183 | default: llvm_unreachable("Unexpected intrinsic!" ); |
5184 | case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; |
5185 | case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; |
5186 | case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; |
5187 | } |
5188 | // FIXME: Match displacement and scale. |
5189 | unsigned TIndex = Node->getConstantOperandVal(Num: 2); |
5190 | SDValue TReg = getI8Imm(Imm: TIndex, DL: dl); |
5191 | SDValue Base = Node->getOperand(Num: 3); |
5192 | SDValue Scale = getI8Imm(Imm: 1, DL: dl); |
5193 | SDValue Index = Node->getOperand(Num: 4); |
5194 | SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); |
5195 | SDValue Segment = CurDAG->getRegister(0, MVT::i16); |
5196 | SDValue Chain = Node->getOperand(Num: 0); |
5197 | MachineSDNode *CNode; |
5198 | if (Opc == X86::PTILESTORED) { |
5199 | SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; |
5200 | CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); |
5201 | } else { |
5202 | SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; |
5203 | CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); |
5204 | } |
5205 | ReplaceNode(F: Node, T: CNode); |
5206 | return; |
5207 | } |
5208 | } |
5209 | break; |
5210 | } |
5211 | case ISD::BRIND: |
5212 | case X86ISD::NT_BRIND: { |
5213 | if (Subtarget->isTargetNaCl()) |
5214 | // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We |
5215 | // leave the instruction alone. |
5216 | break; |
5217 | if (Subtarget->isTarget64BitILP32()) { |
5218 | // Converts a 32-bit register to a 64-bit, zero-extended version of |
5219 | // it. This is needed because x86-64 can do many things, but jmp %r32 |
5220 | // ain't one of them. |
5221 | SDValue Target = Node->getOperand(Num: 1); |
5222 | assert(Target.getValueType() == MVT::i32 && "Unexpected VT!" ); |
5223 | SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); |
5224 | SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other, |
5225 | Node->getOperand(0), ZextTarget); |
5226 | ReplaceNode(F: Node, T: Brind.getNode()); |
5227 | SelectCode(ZextTarget.getNode()); |
5228 | SelectCode(Brind.getNode()); |
5229 | return; |
5230 | } |
5231 | break; |
5232 | } |
5233 | case X86ISD::GlobalBaseReg: |
5234 | ReplaceNode(F: Node, T: getGlobalBaseReg()); |
5235 | return; |
5236 | |
5237 | case ISD::BITCAST: |
5238 | // Just drop all 128/256/512-bit bitcasts. |
5239 | if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || |
5240 | NVT == MVT::f128) { |
5241 | ReplaceUses(F: SDValue(Node, 0), T: Node->getOperand(Num: 0)); |
5242 | CurDAG->RemoveDeadNode(N: Node); |
5243 | return; |
5244 | } |
5245 | break; |
5246 | |
5247 | case ISD::SRL: |
5248 | if (matchBitExtract(Node)) |
5249 | return; |
5250 | [[fallthrough]]; |
5251 | case ISD::SRA: |
5252 | case ISD::SHL: |
5253 | if (tryShiftAmountMod(N: Node)) |
5254 | return; |
5255 | break; |
5256 | |
5257 | case X86ISD::VPTERNLOG: { |
5258 | uint8_t Imm = Node->getConstantOperandVal(Num: 3); |
5259 | if (matchVPTERNLOG(Root: Node, ParentA: Node, ParentB: Node, ParentC: Node, A: Node->getOperand(Num: 0), |
5260 | B: Node->getOperand(Num: 1), C: Node->getOperand(Num: 2), Imm)) |
5261 | return; |
5262 | break; |
5263 | } |
5264 | |
5265 | case X86ISD::ANDNP: |
5266 | if (tryVPTERNLOG(N: Node)) |
5267 | return; |
5268 | break; |
5269 | |
  case ISD::AND:
    // vXi1 AND of a setcc can become a masked VPTESTM.
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(Num: 0);
      SDValue N1 = Node->getOperand(Num: 1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Root: Node, Setcc: N0, InMask: N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Root: Node, Setcc: N1, InMask: N0))
        return;
    }

    // AND with a constant mask may match BEXTR / bit-extract forms, or the
    // immediate may be shrinkable (gated on the -x86-and-imm-shrink flag).
    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(N: Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(And: Node))
      return;

    [[fallthrough]];
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(N: Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(N: Node))
      return;
    if (tryVPTERNLOG(N: Node))
      return;

    [[fallthrough]];
  case ISD::ADD:
    if (Opcode == ISD::ADD && matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the patterns on the add/sub/and/or/xor with immediate paterns in the
    // tablegen files to check immediate use count without making the patterns
    // unavailable to the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    auto *Cst = dyn_cast<ConstantSDNode>(Val&: N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure its an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(x: Val) && !isInt<32>(x: Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N: N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
    // Pick the register (rr) and memory (rm) opcodes for this width/opcode
    // pair; GET_ND_IF_ENABLED selects the non-destructive (NDD) variants
    // when available.
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!" );
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!" );
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
        break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!" );
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
        break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!" );
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
        break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!" );
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
        break;
      }
      break;
    }

    // Ok this is a AND/OR/XOR/ADD/SUB with constant.

    // If this is a not a subtract, we can still try to fold a load.
    // (SUB is excluded because only the right-hand operand of the rm form
    // is a memory operand, and here the load would be on the left.)
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(F: N0.getValue(R: 1), T: SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
        ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(N: Node);
        return;
      }
    }

    // No load folded: emit the plain register-register form.
    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }
5472 | |
  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    [[fallthrough]];
  case X86ISD::UMUL: {
    // Custom-select the one-operand MUL/IMUL forms, which implicitly read
    // the accumulator register (AL/AX/EAX/RAX) as the left operand.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!" );
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
      if (FoldedLoad)
        std::swap(a&: N0, b&: N1);
    }

    // Place N0 into the implicit accumulator; the glue ties the copy to
    // the multiply so nothing clobbers the register in between.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                          N: N0, Glue: SDValue()).getValue(R: 1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);

      // Update the chain. (The chain result index depends on the VT list
      // chosen above: 2 for i8, 3 otherwise.)
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(Opcode: ROpc, dl, VTs, Ops: {N1, InGlue});
    }

    // Result 0 is the low product; the EFLAGS result index likewise depends
    // on whether the high half exists in the VT list.
    ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
5554 | |
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    // Custom-select wide multiplies producing both halves. Unsigned
    // multiplies may use BMI2 MULX, which takes its implicit operand in
    // EDX/RDX and doesn't touch EFLAGS; MULX32Hrr/MULX64Hrr are the
    // high-half-only pseudo forms used when the low half is dead.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned Opc, MOpc;
    unsigned LoReg, HiReg;
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!" );
    case MVT::i32:
      Opc = UseMULXHi  ? X86::MULX32Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned ? X86::IMUL32r
                       : X86::MUL32r;
      MOpc = UseMULXHi  ? X86::MULX32Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned ? X86::IMUL32m
                        : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi  ? X86::MULX64Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned ? X86::IMUL64r
                       : X86::MUL64r;
      MOpc = UseMULXHi  ? X86::MULX64Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned ? X86::IMUL64m
                        : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
      if (foldedLoad)
        std::swap(a&: N0, b&: N1);
    }

    // Put N0 into the implicit source register (EAX/RAX, or EDX/RDX for
    // MULX) and glue the copy to the multiply.
    SDValue InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                          N: N0, Glue: SDValue()).getValue(R: 1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      if (UseMULXHi) {
        // High-half-only MULX: one value result plus a chain.
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        // MULX yields hi in result 0 and lo in result 1.
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        // Legacy MUL/IMUL writes fixed registers; results are retrieved
        // later via glued CopyFromReg.
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(Opcode: MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(VT: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(VT1: NVT, VT2: NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!" );
        ResLo = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: LoReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResLo.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 0), T: ResLo);
      LLVM_DEBUG(dbgs() << "=> " ; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!" );
        ResHi = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl, Reg: HiReg,
                                       VT: NVT, Glue: InGlue);
        InGlue = ResHi.getValue(R: 2);
      }
      ReplaceUses(F: SDValue(Node, 1), T: ResHi);
      LLVM_DEBUG(dbgs() << "=> " ; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
5678 | |
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    // Custom-select x86 DIV/IDIV, which divide the implicit
    // AH:AL / DX:AX / EDX:EAX / RDX:RAX pair by the operand and leave the
    // quotient in the low register and the remainder in the high one.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!" );
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!" );
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    // LoReg receives the dividend's low part (and the quotient); HiReg
    // holds the high part (and the remainder); ClrReg is what we zero to
    // zero-extend the dividend; SExtOpcode sign-extends Lo into Hi.
    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!" );
    case MVT::i8:
      LoReg = X86::AL; ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX; HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(P: Node, N: N1, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4);
    // If the sign bit is known zero we can use a zero-extend even for the
    // signed case.
    bool signBitIsZero = CurDAG->SignBitIsZero(Op: N0);

    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(P: Node, N: N0, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(i: 0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(F: N0.getValue(R: 1), T: Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: Move, NewMemRefs: {cast<LoadSDNode>(Val&: N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InGlue = Chain.getValue(R: 1);
    } else {
      InGlue =
          CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl,
                               Reg: LoReg, N: N0, Glue: SDValue()).getValue(R: 1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
            SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode = SDValue(
            CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
        // MOV32r0 produces an i32 zero; narrow or widen it to match NVT.
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source" );
        }

        InGlue = CurDAG->getCopyToReg(Chain: CurDAG->getEntryNode(), dl, Reg: ClrReg,
                                      N: ClrNode, Glue: InGlue).getValue(R: 1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(i: 0),
                        InGlue };
      MachineSDNode *CNode =
          CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(F: N1.getValue(R: 1), T: SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(N: CNode, NewMemRefs: {cast<LoadSDNode>(Val&: N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: LoReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 0), T: Result);
      LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(Chain: CurDAG->getEntryNode(), dl,
                                              Reg: HiReg, VT: NVT, Glue: InGlue);
      InGlue = Result.getValue(R: 2);
      ReplaceUses(F: SDValue(Node, 1), T: Result);
      LLVM_DEBUG(dbgs() << "=> " ; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(N: Node);
    return;
  }
5857 | |
5858 | case X86ISD::FCMP: |
5859 | case X86ISD::STRICT_FCMP: |
5860 | case X86ISD::STRICT_FCMPS: { |
5861 | bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || |
5862 | Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5863 | SDValue N0 = Node->getOperand(Num: IsStrictCmp ? 1 : 0); |
5864 | SDValue N1 = Node->getOperand(Num: IsStrictCmp ? 2 : 1); |
5865 | |
5866 | // Save the original VT of the compare. |
5867 | MVT CmpVT = N0.getSimpleValueType(); |
5868 | |
5869 | // Floating point needs special handling if we don't have FCOMI. |
5870 | if (Subtarget->canUseCMOV()) |
5871 | break; |
5872 | |
5873 | bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; |
5874 | |
5875 | unsigned Opc; |
5876 | switch (CmpVT.SimpleTy) { |
5877 | default: llvm_unreachable("Unexpected type!" ); |
5878 | case MVT::f32: |
5879 | Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; |
5880 | break; |
5881 | case MVT::f64: |
5882 | Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; |
5883 | break; |
5884 | case MVT::f80: |
5885 | Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; |
5886 | break; |
5887 | } |
5888 | |
5889 | SDValue Chain = |
5890 | IsStrictCmp ? Node->getOperand(Num: 0) : CurDAG->getEntryNode(); |
5891 | SDValue Glue; |
5892 | if (IsStrictCmp) { |
5893 | SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); |
5894 | Chain = SDValue(CurDAG->getMachineNode(Opcode: Opc, dl, VTs, Ops: {N0, N1, Chain}), 0); |
5895 | Glue = Chain.getValue(R: 1); |
5896 | } else { |
5897 | Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0); |
5898 | } |
5899 | |
5900 | // Move FPSW to AX. |
5901 | SDValue FNSTSW = |
5902 | SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0); |
5903 | |
5904 | // Extract upper 8-bits of AX. |
5905 | SDValue = |
5906 | CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); |
5907 | |
5908 | // Move AH into flags. |
5909 | // Some 64-bit targets lack SAHF support, but they do support FCOMI. |
5910 | assert(Subtarget->canUseLAHFSAHF() && |
5911 | "Target doesn't support SAHF or FCOMI?" ); |
5912 | SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); |
5913 | Chain = AH; |
5914 | SDValue SAHF = SDValue( |
5915 | CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); |
5916 | |
5917 | if (IsStrictCmp) |
5918 | ReplaceUses(F: SDValue(Node, 1), T: Chain); |
5919 | |
5920 | ReplaceUses(F: SDValue(Node, 0), T: SAHF); |
5921 | CurDAG->RemoveDeadNode(N: Node); |
5922 | return; |
5923 | } |
5924 | |
  case X86ISD::CMP: {
    // Custom-select compares against zero into narrower/cheaper TEST forms.
    SDValue N0 = Node->getOperand(Num: 0);
    SDValue N1 = Node->getOperand(Num: 1);

    // Optimizations for TEST compares.
    if (!isNullConstant(V: N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node: N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(N: Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(i: 0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(N: CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE; // sentinel: no transform yet
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Val: Mask);
        unsigned TrailingZeros = llvm::countr_zero(Val: Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(x: Mask) || N0.getOperand(i: 0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(x: Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SRIdx: SubRegIdx, DL: dl, VT: SubRegVT, Operand: Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(F: Node, T: Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwize, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Val: Mask, DL: dl, VT);
      SDValue Reg = N0.getOperand(i: 0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Root: Node, P: N0.getNode(), N: Reg, Base&: Tmp0, Scale&: Tmp1, Index&: Tmp2, Disp&: Tmp3, Segment&: Tmp4)) {
        // Don't narrow a volatile/atomic load's access width: keep the
        // memory TEST only if the load width matches the chosen TEST size.
        if (auto *LoadN = dyn_cast<LoadSDNode>(Val: N0.getOperand(i: 0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(ResNo: 0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(i: 0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(F: Reg.getValue(R: 1), T: SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(N: NewNode,
                               NewMemRefs: {cast<LoadSDNode>(Val&: Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SRIdx: SubRegOp, DL: dl, VT, Operand: Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(F: Node, T: NewNode);
      return;
    }
    break;
  }
6130 | case X86ISD::PCMPISTR: { |
6131 | if (!Subtarget->hasSSE42()) |
6132 | break; |
6133 | |
6134 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6135 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6136 | // We can't fold a load if we are going to make two instructions. |
6137 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6138 | |
6139 | MachineSDNode *CNode; |
6140 | if (NeedMask) { |
6141 | unsigned ROpc = |
6142 | Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri; |
6143 | unsigned MOpc = |
6144 | Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi; |
6145 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); |
6146 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6147 | } |
6148 | if (NeedIndex || !NeedMask) { |
6149 | unsigned ROpc = |
6150 | Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri; |
6151 | unsigned MOpc = |
6152 | Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi; |
6153 | CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); |
6154 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6155 | } |
6156 | |
6157 | // Connect the flag usage to the last instruction created. |
6158 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6159 | CurDAG->RemoveDeadNode(N: Node); |
6160 | return; |
6161 | } |
6162 | case X86ISD::PCMPESTR: { |
6163 | if (!Subtarget->hasSSE42()) |
6164 | break; |
6165 | |
6166 | // Copy the two implicit register inputs. |
6167 | SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, |
6168 | Node->getOperand(1), |
6169 | SDValue()).getValue(1); |
6170 | InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, |
6171 | Node->getOperand(3), InGlue).getValue(1); |
6172 | |
6173 | bool NeedIndex = !SDValue(Node, 0).use_empty(); |
6174 | bool NeedMask = !SDValue(Node, 1).use_empty(); |
6175 | // We can't fold a load if we are going to make two instructions. |
6176 | bool MayFoldLoad = !NeedIndex || !NeedMask; |
6177 | |
6178 | MachineSDNode *CNode; |
6179 | if (NeedMask) { |
6180 | unsigned ROpc = |
6181 | Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri; |
6182 | unsigned MOpc = |
6183 | Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi; |
6184 | CNode = |
6185 | emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue); |
6186 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(CNode, 0)); |
6187 | } |
6188 | if (NeedIndex || !NeedMask) { |
6189 | unsigned ROpc = |
6190 | Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri; |
6191 | unsigned MOpc = |
6192 | Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi; |
6193 | CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue); |
6194 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(CNode, 0)); |
6195 | } |
6196 | // Connect the flag usage to the last instruction created. |
6197 | ReplaceUses(F: SDValue(Node, 2), T: SDValue(CNode, 1)); |
6198 | CurDAG->RemoveDeadNode(N: Node); |
6199 | return; |
6200 | } |
6201 | |
6202 | case ISD::SETCC: { |
6203 | if (NVT.isVector() && tryVPTESTM(Root: Node, Setcc: SDValue(Node, 0), InMask: SDValue())) |
6204 | return; |
6205 | |
6206 | break; |
6207 | } |
6208 | |
6209 | case ISD::STORE: |
6210 | if (foldLoadStoreIntoMemOperand(Node)) |
6211 | return; |
6212 | break; |
6213 | |
6214 | case X86ISD::SETCC_CARRY: { |
6215 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6216 | SDValue Result; |
6217 | if (Subtarget->hasSBBDepBreaking()) { |
6218 | // We have to do this manually because tblgen will put the eflags copy in |
6219 | // the wrong place if we use an extract_subreg in the pattern. |
6220 | // Copy flags to the EFLAGS register and glue it to next node. |
6221 | SDValue EFLAGS = |
6222 | CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, |
6223 | Node->getOperand(1), SDValue()); |
6224 | |
6225 | // Create a 64-bit instruction if the result is 64-bits otherwise use the |
6226 | // 32-bit version. |
6227 | unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; |
6228 | MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; |
6229 | Result = SDValue( |
6230 | CurDAG->getMachineNode(Opcode: Opc, dl, VT: SetVT, Op1: EFLAGS, Op2: EFLAGS.getValue(R: 1)), |
6231 | 0); |
6232 | } else { |
6233 | // The target does not recognize sbb with the same reg operand as a |
6234 | // no-source idiom, so we explicitly zero the input values. |
6235 | Result = getSBBZero(N: Node); |
6236 | } |
6237 | |
6238 | // For less than 32-bits we need to extract from the 32-bit node. |
6239 | if (VT == MVT::i8 || VT == MVT::i16) { |
6240 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6241 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6242 | } |
6243 | |
6244 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6245 | CurDAG->RemoveDeadNode(N: Node); |
6246 | return; |
6247 | } |
6248 | case X86ISD::SBB: { |
6249 | if (isNullConstant(V: Node->getOperand(Num: 0)) && |
6250 | isNullConstant(V: Node->getOperand(Num: 1))) { |
6251 | SDValue Result = getSBBZero(N: Node); |
6252 | |
6253 | // Replace the flag use. |
6254 | ReplaceUses(F: SDValue(Node, 1), T: Result.getValue(R: 1)); |
6255 | |
6256 | // Replace the result use. |
6257 | if (!SDValue(Node, 0).use_empty()) { |
6258 | // For less than 32-bits we need to extract from the 32-bit node. |
6259 | MVT VT = Node->getSimpleValueType(ResNo: 0); |
6260 | if (VT == MVT::i8 || VT == MVT::i16) { |
6261 | int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; |
6262 | Result = CurDAG->getTargetExtractSubreg(SRIdx: SubIndex, DL: dl, VT, Operand: Result); |
6263 | } |
6264 | ReplaceUses(F: SDValue(Node, 0), T: Result); |
6265 | } |
6266 | |
6267 | CurDAG->RemoveDeadNode(N: Node); |
6268 | return; |
6269 | } |
6270 | break; |
6271 | } |
6272 | case X86ISD::MGATHER: { |
6273 | auto *Mgt = cast<X86MaskedGatherSDNode>(Val: Node); |
6274 | SDValue IndexOp = Mgt->getIndex(); |
6275 | SDValue Mask = Mgt->getMask(); |
6276 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6277 | MVT ValueVT = Node->getSimpleValueType(ResNo: 0); |
6278 | MVT MaskVT = Mask.getSimpleValueType(); |
6279 | |
6280 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
6281 | // otherwise only doing loose type checking in here based on type what |
6282 | // a type constraint would say just like table based isel. |
6283 | if (!ValueVT.isVector() || !MaskVT.isVector()) |
6284 | break; |
6285 | |
6286 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6287 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6288 | |
6289 | bool IsFP = ValueSVT.isFloatingPoint(); |
6290 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6291 | |
6292 | unsigned Opc = 0; |
6293 | bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; |
6294 | if (AVX512Gather) { |
6295 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6296 | Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; |
6297 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6298 | Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; |
6299 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6300 | Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; |
6301 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6302 | Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; |
6303 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6304 | Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; |
6305 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6306 | Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; |
6307 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6308 | Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; |
6309 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6310 | Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; |
6311 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6312 | Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; |
6313 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6314 | Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; |
6315 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6316 | Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; |
6317 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6318 | Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; |
6319 | } else { |
6320 | assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && |
6321 | "Unexpected mask VT!" ); |
6322 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6323 | Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; |
6324 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6325 | Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; |
6326 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6327 | Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; |
6328 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6329 | Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; |
6330 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6331 | Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; |
6332 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6333 | Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; |
6334 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6335 | Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; |
6336 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6337 | Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; |
6338 | } |
6339 | |
6340 | if (!Opc) |
6341 | break; |
6342 | |
6343 | SDValue Base, Scale, Index, Disp, Segment; |
6344 | if (!selectVectorAddr(Parent: Mgt, BasePtr: Mgt->getBasePtr(), IndexOp, ScaleOp: Mgt->getScale(), |
6345 | Base, Scale, Index, Disp, Segment)) |
6346 | break; |
6347 | |
6348 | SDValue PassThru = Mgt->getPassThru(); |
6349 | SDValue Chain = Mgt->getChain(); |
6350 | // Gather instructions have a mask output not in the ISD node. |
6351 | SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other); |
6352 | |
6353 | MachineSDNode *NewNode; |
6354 | if (AVX512Gather) { |
6355 | SDValue Ops[] = {PassThru, Mask, Base, Scale, |
6356 | Index, Disp, Segment, Chain}; |
6357 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6358 | } else { |
6359 | SDValue Ops[] = {PassThru, Base, Scale, Index, |
6360 | Disp, Segment, Mask, Chain}; |
6361 | NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6362 | } |
6363 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Mgt->getMemOperand()}); |
6364 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 0)); |
6365 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(NewNode, 2)); |
6366 | CurDAG->RemoveDeadNode(N: Node); |
6367 | return; |
6368 | } |
6369 | case X86ISD::MSCATTER: { |
6370 | auto *Sc = cast<X86MaskedScatterSDNode>(Val: Node); |
6371 | SDValue Value = Sc->getValue(); |
6372 | SDValue IndexOp = Sc->getIndex(); |
6373 | MVT IndexVT = IndexOp.getSimpleValueType(); |
6374 | MVT ValueVT = Value.getSimpleValueType(); |
6375 | |
6376 | // This is just to prevent crashes if the nodes are malformed somehow. We're |
6377 | // otherwise only doing loose type checking in here based on type what |
6378 | // a type constraint would say just like table based isel. |
6379 | if (!ValueVT.isVector()) |
6380 | break; |
6381 | |
6382 | unsigned NumElts = ValueVT.getVectorNumElements(); |
6383 | MVT ValueSVT = ValueVT.getVectorElementType(); |
6384 | |
6385 | bool IsFP = ValueSVT.isFloatingPoint(); |
6386 | unsigned EltSize = ValueSVT.getSizeInBits(); |
6387 | |
6388 | unsigned Opc; |
6389 | if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) |
6390 | Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; |
6391 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) |
6392 | Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; |
6393 | else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) |
6394 | Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; |
6395 | else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) |
6396 | Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; |
6397 | else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) |
6398 | Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; |
6399 | else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) |
6400 | Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; |
6401 | else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) |
6402 | Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; |
6403 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) |
6404 | Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; |
6405 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) |
6406 | Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; |
6407 | else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) |
6408 | Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; |
6409 | else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) |
6410 | Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; |
6411 | else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) |
6412 | Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; |
6413 | else |
6414 | break; |
6415 | |
6416 | SDValue Base, Scale, Index, Disp, Segment; |
6417 | if (!selectVectorAddr(Parent: Sc, BasePtr: Sc->getBasePtr(), IndexOp, ScaleOp: Sc->getScale(), |
6418 | Base, Scale, Index, Disp, Segment)) |
6419 | break; |
6420 | |
6421 | SDValue Mask = Sc->getMask(); |
6422 | SDValue Chain = Sc->getChain(); |
6423 | // Scatter instructions have a mask output not in the ISD node. |
6424 | SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); |
6425 | SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; |
6426 | |
6427 | MachineSDNode *NewNode = CurDAG->getMachineNode(Opcode: Opc, dl: SDLoc(dl), VTs, Ops); |
6428 | CurDAG->setNodeMemRefs(N: NewNode, NewMemRefs: {Sc->getMemOperand()}); |
6429 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(NewNode, 1)); |
6430 | CurDAG->RemoveDeadNode(N: Node); |
6431 | return; |
6432 | } |
6433 | case ISD::PREALLOCATED_SETUP: { |
6434 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6435 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6436 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6437 | SDValue Chain = Node->getOperand(Num: 0); |
6438 | SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); |
6439 | MachineSDNode *New = CurDAG->getMachineNode( |
6440 | TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); |
6441 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Chain |
6442 | CurDAG->RemoveDeadNode(N: Node); |
6443 | return; |
6444 | } |
6445 | case ISD::PREALLOCATED_ARG: { |
6446 | auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); |
6447 | auto CallId = MFI->getPreallocatedIdForCallSite( |
6448 | CS: cast<SrcValueSDNode>(Val: Node->getOperand(Num: 1))->getValue()); |
6449 | SDValue Chain = Node->getOperand(Num: 0); |
6450 | SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); |
6451 | SDValue ArgIndex = Node->getOperand(Num: 2); |
6452 | SDValue Ops[3]; |
6453 | Ops[0] = CallIdValue; |
6454 | Ops[1] = ArgIndex; |
6455 | Ops[2] = Chain; |
6456 | MachineSDNode *New = CurDAG->getMachineNode( |
6457 | TargetOpcode::PREALLOCATED_ARG, dl, |
6458 | CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), |
6459 | MVT::Other), |
6460 | Ops); |
6461 | ReplaceUses(F: SDValue(Node, 0), T: SDValue(New, 0)); // Arg pointer |
6462 | ReplaceUses(F: SDValue(Node, 1), T: SDValue(New, 1)); // Chain |
6463 | CurDAG->RemoveDeadNode(N: Node); |
6464 | return; |
6465 | } |
6466 | case X86ISD::AESENCWIDE128KL: |
6467 | case X86ISD::AESDECWIDE128KL: |
6468 | case X86ISD::AESENCWIDE256KL: |
6469 | case X86ISD::AESDECWIDE256KL: { |
6470 | if (!Subtarget->hasWIDEKL()) |
6471 | break; |
6472 | |
6473 | unsigned Opcode; |
6474 | switch (Node->getOpcode()) { |
6475 | default: |
6476 | llvm_unreachable("Unexpected opcode!" ); |
6477 | case X86ISD::AESENCWIDE128KL: |
6478 | Opcode = X86::AESENCWIDE128KL; |
6479 | break; |
6480 | case X86ISD::AESDECWIDE128KL: |
6481 | Opcode = X86::AESDECWIDE128KL; |
6482 | break; |
6483 | case X86ISD::AESENCWIDE256KL: |
6484 | Opcode = X86::AESENCWIDE256KL; |
6485 | break; |
6486 | case X86ISD::AESDECWIDE256KL: |
6487 | Opcode = X86::AESDECWIDE256KL; |
6488 | break; |
6489 | } |
6490 | |
6491 | SDValue Chain = Node->getOperand(Num: 0); |
6492 | SDValue Addr = Node->getOperand(Num: 1); |
6493 | |
6494 | SDValue Base, Scale, Index, Disp, Segment; |
6495 | if (!selectAddr(Parent: Node, N: Addr, Base, Scale, Index, Disp, Segment)) |
6496 | break; |
6497 | |
6498 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), |
6499 | SDValue()); |
6500 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), |
6501 | Chain.getValue(1)); |
6502 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), |
6503 | Chain.getValue(1)); |
6504 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), |
6505 | Chain.getValue(1)); |
6506 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), |
6507 | Chain.getValue(1)); |
6508 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), |
6509 | Chain.getValue(1)); |
6510 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), |
6511 | Chain.getValue(1)); |
6512 | Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), |
6513 | Chain.getValue(1)); |
6514 | |
6515 | MachineSDNode *Res = CurDAG->getMachineNode( |
6516 | Opcode, dl, VTs: Node->getVTList(), |
6517 | Ops: {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(R: 1)}); |
6518 | CurDAG->setNodeMemRefs(N: Res, NewMemRefs: cast<MemSDNode>(Val: Node)->getMemOperand()); |
6519 | ReplaceNode(F: Node, T: Res); |
6520 | return; |
6521 | } |
6522 | } |
6523 | |
6524 | SelectCode(Node); |
6525 | } |
6526 | |
6527 | bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand( |
6528 | const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, |
6529 | std::vector<SDValue> &OutOps) { |
6530 | SDValue Op0, Op1, Op2, Op3, Op4; |
6531 | switch (ConstraintID) { |
6532 | default: |
6533 | llvm_unreachable("Unexpected asm memory constraint" ); |
6534 | case InlineAsm::ConstraintCode::o: // offsetable ?? |
6535 | case InlineAsm::ConstraintCode::v: // not offsetable ?? |
6536 | case InlineAsm::ConstraintCode::m: // memory |
6537 | case InlineAsm::ConstraintCode::X: |
6538 | case InlineAsm::ConstraintCode::p: // address |
6539 | if (!selectAddr(Parent: nullptr, N: Op, Base&: Op0, Scale&: Op1, Index&: Op2, Disp&: Op3, Segment&: Op4)) |
6540 | return true; |
6541 | break; |
6542 | } |
6543 | |
6544 | OutOps.push_back(x: Op0); |
6545 | OutOps.push_back(x: Op1); |
6546 | OutOps.push_back(x: Op2); |
6547 | OutOps.push_back(x: Op3); |
6548 | OutOps.push_back(x: Op4); |
6549 | return false; |
6550 | } |
6551 | |
6552 | /// This pass converts a legalized DAG into a X86-specific DAG, |
6553 | /// ready for instruction scheduling. |
6554 | FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, |
6555 | CodeGenOptLevel OptLevel) { |
6556 | return new X86DAGToDAGISel(TM, OptLevel); |
6557 | } |
6558 | |