1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
14#include "AArch64CallingConvention.h"
15#include "AArch64ExpandImm.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64PerfectShuffle.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
20#include "MCTargetDesc/AArch64AddressingModes.h"
21#include "Utils/AArch64BaseInfo.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
27#include "llvm/ADT/SmallVector.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
31#include "llvm/Analysis/LoopInfo.h"
32#include "llvm/Analysis/MemoryLocation.h"
33#include "llvm/Analysis/ObjCARCUtil.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Analysis/TargetTransformInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/Analysis/VectorUtils.h"
38#include "llvm/CodeGen/Analysis.h"
39#include "llvm/CodeGen/CallingConvLower.h"
40#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
41#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
42#include "llvm/CodeGen/GlobalISel/Utils.h"
43#include "llvm/CodeGen/ISDOpcodes.h"
44#include "llvm/CodeGen/MachineBasicBlock.h"
45#include "llvm/CodeGen/MachineFrameInfo.h"
46#include "llvm/CodeGen/MachineFunction.h"
47#include "llvm/CodeGen/MachineInstr.h"
48#include "llvm/CodeGen/MachineInstrBuilder.h"
49#include "llvm/CodeGen/MachineMemOperand.h"
50#include "llvm/CodeGen/MachineRegisterInfo.h"
51#include "llvm/CodeGen/RuntimeLibcalls.h"
52#include "llvm/CodeGen/SelectionDAG.h"
53#include "llvm/CodeGen/SelectionDAGNodes.h"
54#include "llvm/CodeGen/TargetCallingConv.h"
55#include "llvm/CodeGen/TargetInstrInfo.h"
56#include "llvm/CodeGen/TargetOpcodes.h"
57#include "llvm/CodeGen/ValueTypes.h"
58#include "llvm/CodeGenTypes/MachineValueType.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
63#include "llvm/IR/DerivedTypes.h"
64#include "llvm/IR/Function.h"
65#include "llvm/IR/GetElementPtrTypeIterator.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
69#include "llvm/IR/Instructions.h"
70#include "llvm/IR/IntrinsicInst.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
74#include "llvm/IR/PatternMatch.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
78#include "llvm/MC/MCRegisterInfo.h"
79#include "llvm/Support/AtomicOrdering.h"
80#include "llvm/Support/Casting.h"
81#include "llvm/Support/CodeGen.h"
82#include "llvm/Support/CommandLine.h"
83#include "llvm/Support/Debug.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/InstructionCost.h"
86#include "llvm/Support/KnownBits.h"
87#include "llvm/Support/MathExtras.h"
88#include "llvm/Support/raw_ostream.h"
89#include "llvm/Target/TargetMachine.h"
90#include "llvm/Target/TargetOptions.h"
91#include "llvm/TargetParser/Triple.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
117cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(Val: false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(Val: true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(Val: true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(Val: true));
141
142// All of the XOR, OR and CMP use ALU ports, and data dependency will become the
143// bottleneck after this transform on high end CPU. So this max leaf node
144// limitation is guard cmp+ccmp will be profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(Val: 16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
158ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
159
160ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
187static inline EVT getPackedSVEVectorVT(ElementCount EC) {
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
202static inline EVT getPromotedVTForPredicate(EVT VT) {
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
225 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
228 VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
237 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
238 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
239 case AArch64ISD::REVH_MERGE_PASSTHRU:
240 case AArch64ISD::REVW_MERGE_PASSTHRU:
241 case AArch64ISD::REVD_MERGE_PASSTHRU:
242 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
243 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
244 case AArch64ISD::DUP_MERGE_PASSTHRU:
245 case AArch64ISD::ABS_MERGE_PASSTHRU:
246 case AArch64ISD::NEG_MERGE_PASSTHRU:
247 case AArch64ISD::FNEG_MERGE_PASSTHRU:
248 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
249 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
250 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
251 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
252 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
253 case AArch64ISD::FRINT_MERGE_PASSTHRU:
254 case AArch64ISD::FROUND_MERGE_PASSTHRU:
255 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
256 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
257 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
258 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
259 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
260 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
261 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
262 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
263 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
264 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
265 case AArch64ISD::FABS_MERGE_PASSTHRU:
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
271static bool isZeroingInactiveLanes(SDValue Op) {
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
276 case ISD::SPLAT_VECTOR:
277 case AArch64ISD::PTRUE:
278 case AArch64ISD::SETCC_MERGE_ZERO:
279 return true;
280 case ISD::INTRINSIC_WO_CHAIN:
281 switch (Op.getConstantOperandVal(i: 0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
332AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
337 setBooleanContents(ZeroOrOneBooleanContent);
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
340 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
349 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
409 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
410 if (useSVEForFixedLengthVectorVT(VT))
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
413 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
414 if (useSVEForFixedLengthVectorVT(VT))
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
429 computeRegisterProperties(Subtarget->getRegisterInfo());
430
431 // Provide all sorts of operation actions
432 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
433 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
434 setOperationAction(ISD::SETCC, MVT::i32, Custom);
435 setOperationAction(ISD::SETCC, MVT::i64, Custom);
436 setOperationAction(ISD::SETCC, MVT::bf16, Custom);
437 setOperationAction(ISD::SETCC, MVT::f16, Custom);
438 setOperationAction(ISD::SETCC, MVT::f32, Custom);
439 setOperationAction(ISD::SETCC, MVT::f64, Custom);
440 setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
441 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
442 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
443 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
444 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
445 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
446 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
447 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
448 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
449 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
450 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
451 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
452 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
453 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
454 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
455 setOperationAction(ISD::SELECT, MVT::i32, Custom);
456 setOperationAction(ISD::SELECT, MVT::i64, Custom);
457 setOperationAction(ISD::SELECT, MVT::f16, Custom);
458 setOperationAction(ISD::SELECT, MVT::bf16, Custom);
459 setOperationAction(ISD::SELECT, MVT::f32, Custom);
460 setOperationAction(ISD::SELECT, MVT::f64, Custom);
461 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
462 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
463 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
464 setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
465 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
466 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
467 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
468 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
469 setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
470
471 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
472 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
473 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
474
475 setOperationAction(ISD::FREM, MVT::f32, Expand);
476 setOperationAction(ISD::FREM, MVT::f64, Expand);
477 setOperationAction(ISD::FREM, MVT::f80, Expand);
478
479 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
483 setOperationAction(ISD::XOR, MVT::i32, Custom);
484 setOperationAction(ISD::XOR, MVT::i64, Custom);
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
488 setOperationAction(ISD::FABS, MVT::f128, Expand);
489 setOperationAction(ISD::FADD, MVT::f128, LibCall);
490 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
491 setOperationAction(ISD::FCOS, MVT::f128, Expand);
492 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
493 setOperationAction(ISD::FMA, MVT::f128, Expand);
494 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
495 setOperationAction(ISD::FNEG, MVT::f128, Expand);
496 setOperationAction(ISD::FPOW, MVT::f128, Expand);
497 setOperationAction(ISD::FREM, MVT::f128, Expand);
498 setOperationAction(ISD::FRINT, MVT::f128, Expand);
499 setOperationAction(ISD::FSIN, MVT::f128, Expand);
500 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
501 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
502 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
503 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
504 setOperationAction(ISD::SETCC, MVT::f128, Custom);
505 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
506 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
507 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
508 setOperationAction(ISD::SELECT, MVT::f128, Custom);
509 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
510 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
516 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
517 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
518 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
519 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
520 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
521 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
522 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
523 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
524 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
525 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
526 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
527 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
528 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
529 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
530 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
531 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
532 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
533 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
534 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
535 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
536 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
537 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
538 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
539 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
540 if (Subtarget->hasFPARMv8()) {
541 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
542 setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
543 }
544 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
545 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
546 if (Subtarget->hasFPARMv8()) {
547 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
548 setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
549 }
550 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
551 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
552
553 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
554 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
555 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
556 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
557
558 // Variable arguments.
559 setOperationAction(ISD::VASTART, MVT::Other, Custom);
560 setOperationAction(ISD::VAARG, MVT::Other, Custom);
561 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
562 setOperationAction(ISD::VAEND, MVT::Other, Expand);
563
564 // Variable-sized objects.
565 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
566 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
567
568 // Lowering Funnel Shifts to EXTR
569 setOperationAction(ISD::FSHR, MVT::i32, Custom);
570 setOperationAction(ISD::FSHR, MVT::i64, Custom);
571 setOperationAction(ISD::FSHL, MVT::i32, Custom);
572 setOperationAction(ISD::FSHL, MVT::i64, Custom);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
575
576 // Constant pool entries
577 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
578
579 // BlockAddress
580 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
581
582 // AArch64 lacks both left-rotate and popcount instructions.
583 setOperationAction(ISD::ROTL, MVT::i32, Expand);
584 setOperationAction(ISD::ROTL, MVT::i64, Expand);
585 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
586 setOperationAction(ISD::ROTL, VT, Expand);
587 setOperationAction(ISD::ROTR, VT, Expand);
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
591 setOperationAction(ISD::MULHU, MVT::i32, Expand);
592 setOperationAction(ISD::MULHS, MVT::i32, Expand);
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
595 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
596 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
597 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
598 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
599
600 if (Subtarget->hasCSSC()) {
601 setOperationAction(ISD::CTPOP, MVT::i32, Legal);
602 setOperationAction(ISD::CTPOP, MVT::i64, Legal);
603 setOperationAction(ISD::CTPOP, MVT::i128, Expand);
604
605 setOperationAction(ISD::PARITY, MVT::i128, Expand);
606
607 setOperationAction(ISD::CTTZ, MVT::i32, Legal);
608 setOperationAction(ISD::CTTZ, MVT::i64, Legal);
609 setOperationAction(ISD::CTTZ, MVT::i128, Expand);
610
611 setOperationAction(ISD::ABS, MVT::i32, Legal);
612 setOperationAction(ISD::ABS, MVT::i64, Legal);
613
614 setOperationAction(ISD::SMAX, MVT::i32, Legal);
615 setOperationAction(ISD::SMAX, MVT::i64, Legal);
616 setOperationAction(ISD::UMAX, MVT::i32, Legal);
617 setOperationAction(ISD::UMAX, MVT::i64, Legal);
618
619 setOperationAction(ISD::SMIN, MVT::i32, Legal);
620 setOperationAction(ISD::SMIN, MVT::i64, Legal);
621 setOperationAction(ISD::UMIN, MVT::i32, Legal);
622 setOperationAction(ISD::UMIN, MVT::i64, Legal);
623 } else {
624 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
625 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
626 setOperationAction(ISD::CTPOP, MVT::i128, Custom);
627
628 setOperationAction(ISD::PARITY, MVT::i64, Custom);
629 setOperationAction(ISD::PARITY, MVT::i128, Custom);
630
631 setOperationAction(ISD::ABS, MVT::i32, Custom);
632 setOperationAction(ISD::ABS, MVT::i64, Custom);
633 }
634
635 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
636 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
637 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
638 setOperationAction(ISD::SDIVREM, VT, Expand);
639 setOperationAction(ISD::UDIVREM, VT, Expand);
640 }
641 setOperationAction(ISD::SREM, MVT::i32, Expand);
642 setOperationAction(ISD::SREM, MVT::i64, Expand);
643 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
644 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
645 setOperationAction(ISD::UREM, MVT::i32, Expand);
646 setOperationAction(ISD::UREM, MVT::i64, Expand);
647
648 // Custom lower Add/Sub/Mul with overflow.
649 setOperationAction(ISD::SADDO, MVT::i32, Custom);
650 setOperationAction(ISD::SADDO, MVT::i64, Custom);
651 setOperationAction(ISD::UADDO, MVT::i32, Custom);
652 setOperationAction(ISD::UADDO, MVT::i64, Custom);
653 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
654 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
655 setOperationAction(ISD::USUBO, MVT::i32, Custom);
656 setOperationAction(ISD::USUBO, MVT::i64, Custom);
657 setOperationAction(ISD::SMULO, MVT::i32, Custom);
658 setOperationAction(ISD::SMULO, MVT::i64, Custom);
659 setOperationAction(ISD::UMULO, MVT::i32, Custom);
660 setOperationAction(ISD::UMULO, MVT::i64, Custom);
661
662 setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
663 setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
664 setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
665 setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
666 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
667 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
668 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
669 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
670
671 setOperationAction(ISD::FSIN, MVT::f32, Expand);
672 setOperationAction(ISD::FSIN, MVT::f64, Expand);
673 setOperationAction(ISD::FCOS, MVT::f32, Expand);
674 setOperationAction(ISD::FCOS, MVT::f64, Expand);
675 setOperationAction(ISD::FPOW, MVT::f32, Expand);
676 setOperationAction(ISD::FPOW, MVT::f64, Expand);
677 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
678 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
679 if (Subtarget->hasFullFP16()) {
680 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
681 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
682 } else {
683 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
684 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
688 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
689 ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
690 ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
691 ISD::STRICT_FREM,
692 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
693 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
694 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
705 ISD::SETCC,
706 ISD::SELECT_CC,
707 ISD::BR_CC,
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
713 ISD::FCEIL,
714 ISD::FSQRT,
715 ISD::FFLOOR,
716 ISD::FNEARBYINT,
717 ISD::FRINT,
718 ISD::FROUND,
719 ISD::FROUNDEVEN,
720 ISD::FTRUNC,
721 ISD::FMINNUM,
722 ISD::FMAXNUM,
723 ISD::FMINIMUM,
724 ISD::FMAXIMUM,
725 ISD::STRICT_FADD,
726 ISD::STRICT_FSUB,
727 ISD::STRICT_FMUL,
728 ISD::STRICT_FDIV,
729 ISD::STRICT_FMA,
730 ISD::STRICT_FCEIL,
731 ISD::STRICT_FFLOOR,
732 ISD::STRICT_FSQRT,
733 ISD::STRICT_FRINT,
734 ISD::STRICT_FNEARBYINT,
735 ISD::STRICT_FROUND,
736 ISD::STRICT_FTRUNC,
737 ISD::STRICT_FROUNDEVEN,
738 ISD::STRICT_FMINNUM,
739 ISD::STRICT_FMAXNUM,
740 ISD::STRICT_FMINIMUM,
741 ISD::STRICT_FMAXIMUM,
742 })
743 setOperationAction(Op, VT: ScalarVT, Action: Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, VT: ScalarVT, Action: Legal);
747
748 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
750 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
751 ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
752 ISD::STRICT_LLRINT})
753 setOperationAction(Op, VT: ScalarVT, Action: Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
769 setOperationAction(Op: ISD::FABS, VT: V4Narrow, Action: Legal);
770 setOperationAction(Op: ISD::FNEG, VT: V4Narrow, Action: Legal);
771 setOperationAction(Op: ISD::FMA, VT: V4Narrow, Action: Expand);
772 setOperationAction(Op: ISD::SETCC, VT: V4Narrow, Action: Custom);
773 setOperationAction(Op: ISD::BR_CC, VT: V4Narrow, Action: Expand);
774 setOperationAction(Op: ISD::SELECT, VT: V4Narrow, Action: Expand);
775 setOperationAction(Op: ISD::SELECT_CC, VT: V4Narrow, Action: Expand);
776 setOperationAction(Op: ISD::FCOPYSIGN, VT: V4Narrow, Action: Custom);
777 setOperationAction(Op: ISD::FSQRT, VT: V4Narrow, Action: Expand);
778
779 auto V8Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 8);
780 setOperationAction(Op: ISD::FABS, VT: V8Narrow, Action: Legal);
781 setOperationAction(Op: ISD::FADD, VT: V8Narrow, Action: Legal);
782 setOperationAction(Op: ISD::FCEIL, VT: V8Narrow, Action: Legal);
783 setOperationAction(Op: ISD::FCOPYSIGN, VT: V8Narrow, Action: Custom);
784 setOperationAction(Op: ISD::FDIV, VT: V8Narrow, Action: Legal);
785 setOperationAction(Op: ISD::FFLOOR, VT: V8Narrow, Action: Legal);
786 setOperationAction(Op: ISD::FMA, VT: V8Narrow, Action: Expand);
787 setOperationAction(Op: ISD::FMUL, VT: V8Narrow, Action: Legal);
788 setOperationAction(Op: ISD::FNEARBYINT, VT: V8Narrow, Action: Legal);
789 setOperationAction(Op: ISD::FNEG, VT: V8Narrow, Action: Legal);
790 setOperationAction(Op: ISD::FROUND, VT: V8Narrow, Action: Legal);
791 setOperationAction(Op: ISD::FROUNDEVEN, VT: V8Narrow, Action: Legal);
792 setOperationAction(Op: ISD::FRINT, VT: V8Narrow, Action: Legal);
793 setOperationAction(Op: ISD::FSQRT, VT: V8Narrow, Action: Expand);
794 setOperationAction(Op: ISD::FSUB, VT: V8Narrow, Action: Legal);
795 setOperationAction(Op: ISD::FTRUNC, VT: V8Narrow, Action: Legal);
796 setOperationAction(Op: ISD::SETCC, VT: V8Narrow, Action: Expand);
797 setOperationAction(Op: ISD::BR_CC, VT: V8Narrow, Action: Expand);
798 setOperationAction(Op: ISD::SELECT, VT: V8Narrow, Action: Expand);
799 setOperationAction(Op: ISD::SELECT_CC, VT: V8Narrow, Action: Expand);
800 setOperationAction(Op: ISD::FP_EXTEND, VT: V8Narrow, Action: Expand);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
807 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
808 setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
812 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
813 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
814 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
815 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
816 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
817 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
818 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
819 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
820 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
821 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
822 for (MVT Ty : {MVT::f32, MVT::f64})
823 setOperationAction(Op, Ty, Legal);
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
829 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
830 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
831 for (MVT Ty : {MVT::f32, MVT::f64})
832 setOperationAction(Op, Ty, Legal);
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
839 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
840
841 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
842
843 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
844 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
845
846 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
847 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
848 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
849 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
850 } else {
851 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
852 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
853 }
854 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
855 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
856
857 // Generate outline atomics library calls only if LSE was not specified for
858 // subtarget
859 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
860 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
861 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
862 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
863 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
864 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
865 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
866 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
867 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
868 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
869 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
870 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
871 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
872 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
873 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
874 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
875 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
876 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
877 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
878 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
879 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
880 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
881 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
882 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
883 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
884 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
885#define LCALLNAMES(A, B, N) \
886 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
887 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
888 setLibcallName(A##N##_REL, #B #N "_rel"); \
889 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
890#define LCALLNAME4(A, B) \
891 LCALLNAMES(A, B, 1) \
892 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
893#define LCALLNAME5(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) \
896 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
897 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
898 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
899 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
900 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
903#undef LCALLNAMES
904#undef LCALLNAME4
905#undef LCALLNAME5
906 }
907
908 if (Subtarget->hasLSE128()) {
909 // Custom lowering because i128 is not legal. Must be replaced by 2x64
910 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
911 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
912 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
913 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
914 }
915
916 // 128-bit loads and stores can be done without expanding
917 setOperationAction(ISD::LOAD, MVT::i128, Custom);
918 setOperationAction(ISD::STORE, MVT::i128, Custom);
919
920 // Aligned 128-bit loads and stores are single-copy atomic according to the
921 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
922 if (Subtarget->hasLSE2()) {
923 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
924 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
925 }
926
927 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
928 // custom lowering, as there are no un-paired non-temporal stores and
929 // legalization will break up 256 bit inputs.
930 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
931 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
932 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
933 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
934 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
935 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
936 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
937 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
938
939 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
940 // custom lowering, as there are no un-paired non-temporal loads legalization
941 // will break up 256 bit inputs.
942 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
943 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
944 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
945 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
946 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
947 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
948 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
949 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
950
951 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
952 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
953
954 if (getLibcallName(Call: RTLIB::SINCOS_STRET_F32) != nullptr &&
955 getLibcallName(Call: RTLIB::SINCOS_STRET_F64) != nullptr) {
956 // Issue __sincos_stret if available.
957 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
958 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
959 } else {
960 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
961 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
962 }
963
964 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
965 // MSVCRT doesn't have powi; fall back to pow
966 setLibcallName(Call: RTLIB::POWI_F32, Name: nullptr);
967 setLibcallName(Call: RTLIB::POWI_F64, Name: nullptr);
968 }
969
970 // Make floating-point constants legal for the large code model, so they don't
971 // become loads from the constant pool.
972 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
973 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
974 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
975 }
976
977 // AArch64 does not have floating-point extending loads, i1 sign-extending
978 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
979 for (MVT VT : MVT::fp_valuetypes()) {
980 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
981 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
982 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
985 }
986 for (MVT VT : MVT::integer_valuetypes())
987 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
988
989 for (MVT WideVT : MVT::fp_valuetypes()) {
990 for (MVT NarrowVT : MVT::fp_valuetypes()) {
991 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
992 setTruncStoreAction(WideVT, NarrowVT, Expand);
993 }
994 }
995 }
996
997 if (Subtarget->hasFPARMv8()) {
998 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
999 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1000 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1001 }
1002
1003 // Indexed loads and stores are supported.
1004 for (unsigned im = (unsigned)ISD::PRE_INC;
1005 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1006 setIndexedLoadAction(im, MVT::i8, Legal);
1007 setIndexedLoadAction(im, MVT::i16, Legal);
1008 setIndexedLoadAction(im, MVT::i32, Legal);
1009 setIndexedLoadAction(im, MVT::i64, Legal);
1010 setIndexedLoadAction(im, MVT::f64, Legal);
1011 setIndexedLoadAction(im, MVT::f32, Legal);
1012 setIndexedLoadAction(im, MVT::f16, Legal);
1013 setIndexedLoadAction(im, MVT::bf16, Legal);
1014 setIndexedStoreAction(im, MVT::i8, Legal);
1015 setIndexedStoreAction(im, MVT::i16, Legal);
1016 setIndexedStoreAction(im, MVT::i32, Legal);
1017 setIndexedStoreAction(im, MVT::i64, Legal);
1018 setIndexedStoreAction(im, MVT::f64, Legal);
1019 setIndexedStoreAction(im, MVT::f32, Legal);
1020 setIndexedStoreAction(im, MVT::f16, Legal);
1021 setIndexedStoreAction(im, MVT::bf16, Legal);
1022 }
1023
1024 // Trap.
1025 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1026 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1027 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1028
1029 // We combine OR nodes for bitfield operations.
1030 setTargetDAGCombine(ISD::OR);
1031 // Try to create BICs for vector ANDs.
1032 setTargetDAGCombine(ISD::AND);
1033
1034 // Vector add and sub nodes may conceal a high-half opportunity.
1035 // Also, try to fold ADD into CSINC/CSINV..
1036 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1037 ISD::UINT_TO_FP});
1038
1039 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1040 ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV});
1041
1042 // Try and combine setcc with csel
1043 setTargetDAGCombine(ISD::SETCC);
1044
1045 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1046
1047 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1048 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
1049 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
1050 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
1051 setTargetDAGCombine(ISD::TRUNCATE);
1052 setTargetDAGCombine(ISD::LOAD);
1053
1054 setTargetDAGCombine(ISD::MSTORE);
1055
1056 setTargetDAGCombine(ISD::MUL);
1057
1058 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1059
1060 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1061 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1062 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1063
1064 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
1065
1066 setTargetDAGCombine(ISD::FP_EXTEND);
1067
1068 setTargetDAGCombine(ISD::GlobalAddress);
1069
1070 setTargetDAGCombine(ISD::CTLZ);
1071
1072 setTargetDAGCombine(ISD::VECREDUCE_AND);
1073 setTargetDAGCombine(ISD::VECREDUCE_OR);
1074 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1075
1076 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1077
1078 // In case of strict alignment, avoid an excessive number of byte wide stores.
1079 MaxStoresPerMemsetOptSize = 8;
1080 MaxStoresPerMemset =
1081 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1082
1083 MaxGluedStoresPerMemcpy = 4;
1084 MaxStoresPerMemcpyOptSize = 4;
1085 MaxStoresPerMemcpy =
1086 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1087
1088 MaxStoresPerMemmoveOptSize = 4;
1089 MaxStoresPerMemmove = 4;
1090
1091 MaxLoadsPerMemcmpOptSize = 4;
1092 MaxLoadsPerMemcmp =
1093 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1094
1095 setStackPointerRegisterToSaveRestore(AArch64::SP);
1096
1097 setSchedulingPreference(Sched::Hybrid);
1098
1099 EnableExtLdPromotion = true;
1100
1101 // Set required alignment.
1102 setMinFunctionAlignment(Align(4));
1103 // Set preferred alignments.
1104
1105 // Don't align loops on Windows. The SEH unwind info generation needs to
1106 // know the exact length of functions before the alignments have been
1107 // expanded.
1108 if (!Subtarget->isTargetWindows())
1109 setPrefLoopAlignment(STI.getPrefLoopAlignment());
1110 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1111 setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1112
1113 // Only change the limit for entries in a jump table if specified by
1114 // the sub target, but not at the command line.
1115 unsigned MaxJT = STI.getMaximumJumpTableSize();
1116 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1117 setMaximumJumpTableSize(MaxJT);
1118
1119 setHasExtractBitsInsn(true);
1120
1121 setMaxDivRemBitWidthSupported(128);
1122
1123 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1124
1125 if (Subtarget->hasNEON()) {
1126 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1127 // silliness like this:
1128 for (auto Op :
1129 {ISD::SELECT, ISD::SELECT_CC,
1130 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1131 ISD::FMUL, ISD::FDIV, ISD::FMA,
1132 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1133 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1134 ISD::FSIN, ISD::FCOS, ISD::FPOW,
1135 ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
1136 ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
1137 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
1138 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
1139 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
1140 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
1141 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
1142 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
1143 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
1144 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
1145 ISD::STRICT_FMAXIMUM})
1146 setOperationAction(Op, MVT::v1f64, Expand);
1147
1148 for (auto Op :
1149 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1150 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1151 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1152 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1153 setOperationAction(Op, MVT::v1i64, Expand);
1154
1155 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1156 // elements smaller than i32, so promote the input to i32 first.
1157 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1158 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1159
1160 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1161 // Or, direct i32 -> f16 vector conversion. Set it so custom, so the
1162 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1163 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1164 ISD::STRICT_UINT_TO_FP})
1165 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1166 setOperationAction(Op, VT, Custom);
1167
1168 if (Subtarget->hasFullFP16()) {
1169 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
1170 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
1171
1172 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
1173 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1174 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1176 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1177 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1178 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1179 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1180 } else {
1181 // when AArch64 doesn't have fullfp16 support, promote the input
1182 // to i32 first.
1183 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1184 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1185 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1188 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1191 }
1192
1193 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1194 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1195 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1196 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1197 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1199 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1200 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1201 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1202 setOperationAction(ISD::UMAX, VT, Custom);
1203 setOperationAction(ISD::SMAX, VT, Custom);
1204 setOperationAction(ISD::UMIN, VT, Custom);
1205 setOperationAction(ISD::SMIN, VT, Custom);
1206 }
1207
1208 // Custom handling for some quad-vector types to detect MULL.
1209 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1210 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1211 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1212 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1215
1216 // Saturates
1217 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1218 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SADDSAT, VT, Legal);
1220 setOperationAction(ISD::UADDSAT, VT, Legal);
1221 setOperationAction(ISD::SSUBSAT, VT, Legal);
1222 setOperationAction(ISD::USUBSAT, VT, Legal);
1223 }
1224
1225 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1226 MVT::v4i32}) {
1227 setOperationAction(ISD::AVGFLOORS, VT, Legal);
1228 setOperationAction(ISD::AVGFLOORU, VT, Legal);
1229 setOperationAction(ISD::AVGCEILS, VT, Legal);
1230 setOperationAction(ISD::AVGCEILU, VT, Legal);
1231 setOperationAction(ISD::ABDS, VT, Legal);
1232 setOperationAction(ISD::ABDU, VT, Legal);
1233 }
1234
1235 // Vector reductions
1236 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1237 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1238 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1239 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1240 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1241 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1242 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1243
1244 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1245 }
1246 }
1247 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1248 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1249 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1250 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1251 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1252 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1253 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1254 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1255 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1256 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1257 }
1258 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1259 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1260 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1261 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1262
1263 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1264 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1265 // Likewise, narrowing and extending vector loads/stores aren't handled
1266 // directly.
1267 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1268 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1269
1270 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1271 setOperationAction(ISD::MULHS, VT, Legal);
1272 setOperationAction(ISD::MULHU, VT, Legal);
1273 } else {
1274 setOperationAction(ISD::MULHS, VT, Expand);
1275 setOperationAction(ISD::MULHU, VT, Expand);
1276 }
1277 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1278 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1279
1280 setOperationAction(ISD::BSWAP, VT, Expand);
1281 setOperationAction(ISD::CTTZ, VT, Expand);
1282
1283 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1284 setTruncStoreAction(VT, InnerVT, Expand);
1285 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1286 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1287 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1288 }
1289 }
1290
1291 // AArch64 has implementations of a lot of rounding-like FP operations.
1292 for (auto Op :
1293 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1294 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1295 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1296 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1297 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1298 setOperationAction(Op, Ty, Legal);
1299 if (Subtarget->hasFullFP16())
1300 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1301 setOperationAction(Op, Ty, Legal);
1302 }
1303
1304 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1305
1306 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1307 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1308 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1309 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1310
1311 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1312 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1313 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1314
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1316 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1317 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1319 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1320 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1321
1322 // ADDP custom lowering
1323 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1324 setOperationAction(ISD::ADD, VT, Custom);
1325 // FADDP custom lowering
1326 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1327 setOperationAction(ISD::FADD, VT, Custom);
1328 }
1329
1330 if (Subtarget->hasSME()) {
1331 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1332 }
1333
1334 // FIXME: Move lowering for more nodes here if those are common between
1335 // SVE and SME.
1336 if (Subtarget->hasSVEorSME()) {
1337 for (auto VT :
1338 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1339 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1341 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1342 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1343 }
1344 }
1345
1346 if (Subtarget->hasSVEorSME()) {
1347 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1348 setOperationAction(ISD::BITREVERSE, VT, Custom);
1349 setOperationAction(ISD::BSWAP, VT, Custom);
1350 setOperationAction(ISD::CTLZ, VT, Custom);
1351 setOperationAction(ISD::CTPOP, VT, Custom);
1352 setOperationAction(ISD::CTTZ, VT, Custom);
1353 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1354 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1355 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1356 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1357 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1358 setOperationAction(ISD::MGATHER, VT, Custom);
1359 setOperationAction(ISD::MSCATTER, VT, Custom);
1360 setOperationAction(ISD::MLOAD, VT, Custom);
1361 setOperationAction(ISD::MUL, VT, Custom);
1362 setOperationAction(ISD::MULHS, VT, Custom);
1363 setOperationAction(ISD::MULHU, VT, Custom);
1364 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1365 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1366 setOperationAction(ISD::SELECT, VT, Custom);
1367 setOperationAction(ISD::SETCC, VT, Custom);
1368 setOperationAction(ISD::SDIV, VT, Custom);
1369 setOperationAction(ISD::UDIV, VT, Custom);
1370 setOperationAction(ISD::SMIN, VT, Custom);
1371 setOperationAction(ISD::UMIN, VT, Custom);
1372 setOperationAction(ISD::SMAX, VT, Custom);
1373 setOperationAction(ISD::UMAX, VT, Custom);
1374 setOperationAction(ISD::SHL, VT, Custom);
1375 setOperationAction(ISD::SRL, VT, Custom);
1376 setOperationAction(ISD::SRA, VT, Custom);
1377 setOperationAction(ISD::ABS, VT, Custom);
1378 setOperationAction(ISD::ABDS, VT, Custom);
1379 setOperationAction(ISD::ABDU, VT, Custom);
1380 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1381 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1382 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1383 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1384 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1385 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1386 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1387 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1388 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1389 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1390
1391 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1392 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1393 setOperationAction(ISD::SELECT_CC, VT, Expand);
1394 setOperationAction(ISD::ROTL, VT, Expand);
1395 setOperationAction(ISD::ROTR, VT, Expand);
1396
1397 setOperationAction(ISD::SADDSAT, VT, Legal);
1398 setOperationAction(ISD::UADDSAT, VT, Legal);
1399 setOperationAction(ISD::SSUBSAT, VT, Legal);
1400 setOperationAction(ISD::USUBSAT, VT, Legal);
1401 setOperationAction(ISD::UREM, VT, Expand);
1402 setOperationAction(ISD::SREM, VT, Expand);
1403 setOperationAction(ISD::SDIVREM, VT, Expand);
1404 setOperationAction(ISD::UDIVREM, VT, Expand);
1405
1406 setOperationAction(ISD::AVGFLOORS, VT, Custom);
1407 setOperationAction(ISD::AVGFLOORU, VT, Custom);
1408 setOperationAction(ISD::AVGCEILS, VT, Custom);
1409 setOperationAction(ISD::AVGCEILU, VT, Custom);
1410
1411 if (!Subtarget->isLittleEndian())
1412 setOperationAction(ISD::BITCAST, VT, Expand);
1413
1414 if (Subtarget->hasSVE2orSME())
1415 // For SLI/SRI.
1416 setOperationAction(ISD::OR, VT, Custom);
1417 }
1418
1419 // Illegal unpacked integer vector types.
1420 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1421 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1422 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1423 }
1424
1425 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1426 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1427 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1428 setOperationAction(ISD::BITCAST, VT, Custom);
1429
1430 for (auto VT :
1431 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1432 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1433 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1434
1435 for (auto VT :
1436 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1437 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1438 setOperationAction(ISD::SELECT, VT, Custom);
1439 setOperationAction(ISD::SETCC, VT, Custom);
1440 setOperationAction(ISD::TRUNCATE, VT, Custom);
1441 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1442 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1443 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1444
1445 setOperationAction(ISD::SELECT_CC, VT, Expand);
1446 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1447 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1448
1449 // There are no legal MVT::nxv16f## based types.
1450 if (VT != MVT::nxv16i1) {
1451 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1452 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1453 }
1454 }
1455
1456 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1457 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1458 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1459 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1460 setOperationAction(ISD::MLOAD, VT, Custom);
1461 setOperationAction(ISD::MSTORE, VT, Custom);
1462 setOperationAction(ISD::MGATHER, VT, Custom);
1463 setOperationAction(ISD::MSCATTER, VT, Custom);
1464 }
1465
    // Firstly, exclude all scalable vector extending loads/truncating stores,
    // covering both integer and floating-point scalable vector types.
1468 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1469 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1470 setTruncStoreAction(VT, InnerVT, Expand);
1471 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1472 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1473 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1474 }
1475 }
1476
1477 // Then, selectively enable those which we directly support.
1478 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1479 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1480 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1481 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1482 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1483 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1484 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1485 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1486 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1487 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1488 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1489 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1490 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1491 }
1492
    // SVE supports truncating stores of 64-bit and 128-bit vectors.
1494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1497 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1498 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1499
1500 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1501 MVT::nxv4f32, MVT::nxv2f64}) {
1502 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1503 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1504 setOperationAction(ISD::MGATHER, VT, Custom);
1505 setOperationAction(ISD::MSCATTER, VT, Custom);
1506 setOperationAction(ISD::MLOAD, VT, Custom);
1507 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1508 setOperationAction(ISD::SELECT, VT, Custom);
1509 setOperationAction(ISD::SETCC, VT, Custom);
1510 setOperationAction(ISD::FADD, VT, Custom);
1511 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1512 setOperationAction(ISD::FDIV, VT, Custom);
1513 setOperationAction(ISD::FMA, VT, Custom);
1514 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1515 setOperationAction(ISD::FMAXNUM, VT, Custom);
1516 setOperationAction(ISD::FMINIMUM, VT, Custom);
1517 setOperationAction(ISD::FMINNUM, VT, Custom);
1518 setOperationAction(ISD::FMUL, VT, Custom);
1519 setOperationAction(ISD::FNEG, VT, Custom);
1520 setOperationAction(ISD::FSUB, VT, Custom);
1521 setOperationAction(ISD::FCEIL, VT, Custom);
1522 setOperationAction(ISD::FFLOOR, VT, Custom);
1523 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1524 setOperationAction(ISD::FRINT, VT, Custom);
1525 setOperationAction(ISD::FROUND, VT, Custom);
1526 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1527 setOperationAction(ISD::FTRUNC, VT, Custom);
1528 setOperationAction(ISD::FSQRT, VT, Custom);
1529 setOperationAction(ISD::FABS, VT, Custom);
1530 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1531 setOperationAction(ISD::FP_ROUND, VT, Custom);
1532 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1533 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1534 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1535 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1536 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1537 if (Subtarget->isSVEAvailable())
1538 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1539 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1540 setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
1541 setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
1542
1543 setOperationAction(ISD::SELECT_CC, VT, Expand);
1544 setOperationAction(ISD::FREM, VT, Expand);
1545 setOperationAction(ISD::FPOW, VT, Expand);
1546 setOperationAction(ISD::FPOWI, VT, Expand);
1547 setOperationAction(ISD::FCOS, VT, Expand);
1548 setOperationAction(ISD::FSIN, VT, Expand);
1549 setOperationAction(ISD::FSINCOS, VT, Expand);
1550 setOperationAction(ISD::FEXP, VT, Expand);
1551 setOperationAction(ISD::FEXP2, VT, Expand);
1552 setOperationAction(ISD::FEXP10, VT, Expand);
1553 setOperationAction(ISD::FLOG, VT, Expand);
1554 setOperationAction(ISD::FLOG2, VT, Expand);
1555 setOperationAction(ISD::FLOG10, VT, Expand);
1556
1557 setCondCodeAction(ISD::SETO, VT, Expand);
1558 setCondCodeAction(ISD::SETOLT, VT, Expand);
1559 setCondCodeAction(ISD::SETLT, VT, Expand);
1560 setCondCodeAction(ISD::SETOLE, VT, Expand);
1561 setCondCodeAction(ISD::SETLE, VT, Expand);
1562 setCondCodeAction(ISD::SETULT, VT, Expand);
1563 setCondCodeAction(ISD::SETULE, VT, Expand);
1564 setCondCodeAction(ISD::SETUGE, VT, Expand);
1565 setCondCodeAction(ISD::SETUGT, VT, Expand);
1566 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1567 setCondCodeAction(ISD::SETONE, VT, Expand);
1568
1569 if (!Subtarget->isLittleEndian())
1570 setOperationAction(ISD::BITCAST, VT, Expand);
1571 }
1572
1573 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1574 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1575 setOperationAction(ISD::MGATHER, VT, Custom);
1576 setOperationAction(ISD::MSCATTER, VT, Custom);
1577 setOperationAction(ISD::MLOAD, VT, Custom);
1578 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1579 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1580
1581 if (!Subtarget->isLittleEndian())
1582 setOperationAction(ISD::BITCAST, VT, Expand);
1583 }
1584
1585 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1586 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1587
1588 // NEON doesn't support integer divides, but SVE does
1589 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1590 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1591 setOperationAction(ISD::SDIV, VT, Custom);
1592 setOperationAction(ISD::UDIV, VT, Custom);
1593 }
1594
1595 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1596 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1597 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1598
1599 if (Subtarget->isSVEAvailable()) {
1600 // NEON doesn't support across-vector reductions, but SVE does.
1601 for (auto VT :
1602 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1603 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1604 }
1605
1606 // NOTE: Currently this has to happen after computeRegisterProperties rather
1607 // than the preferred option of combining it with the addRegisterClass call.
1608 if (Subtarget->useSVEForFixedLengthVectors()) {
1609 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1610 if (useSVEForFixedLengthVectorVT(
1611 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1612 addTypeForFixedLengthSVE(VT);
1613 }
1614 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1615 if (useSVEForFixedLengthVectorVT(
1616 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1617 addTypeForFixedLengthSVE(VT);
1618 }
1619
      // 64-bit results can come from an input that is wider than NEON supports.
1621 for (auto VT : {MVT::v8i8, MVT::v4i16})
1622 setOperationAction(ISD::TRUNCATE, VT, Custom);
1623 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1624
      // 128-bit results imply an input wider than NEON supports.
1626 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1627 setOperationAction(ISD::TRUNCATE, VT, Custom);
1628 for (auto VT : {MVT::v8f16, MVT::v4f32})
1629 setOperationAction(ISD::FP_ROUND, VT, Custom);
1630
1631 // These operations are not supported on NEON but SVE can do them.
1632 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1633 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1634 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1635 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1636 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1637 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1638 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1639 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1640 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1641 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1642 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1643 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1644 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1645 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1646 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1647 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1648 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1649 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1650 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1651 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1652
1653 // Int operations with no NEON support.
1654 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1655 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1656 setOperationAction(ISD::BITREVERSE, VT, Custom);
1657 setOperationAction(ISD::CTTZ, VT, Custom);
1658 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1659 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1660 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1661 setOperationAction(ISD::MULHS, VT, Custom);
1662 setOperationAction(ISD::MULHU, VT, Custom);
1663 }
1664
1666 // Use SVE for vectors with more than 2 elements.
1667 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1668 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1669 }
1670
1671 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1672 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1673 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1674 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1675
1676 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1677 }
1678
1679 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1680 // Only required for llvm.aarch64.mops.memset.tag
1681 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1682 }
1683
1684 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1685
1686 if (Subtarget->hasSVE()) {
1687 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
1688 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
1689 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
1690 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
1691 }
1692
1693 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1694
1695 IsStrictFPEnabled = true;
1696 setMaxAtomicSizeInBitsSupported(128);
1697
1698 if (Subtarget->isWindowsArm64EC()) {
1699 // FIXME: are there intrinsics we need to exclude from this?
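    // The loop below simply prefixes every known libcall name with '#'; as an
    // illustrative (not exhaustive) example, a libcall named "memcpy" would be
    // renamed to "#memcpy". The actual set of names comes from RTLIB.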
1700 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1701 auto code = static_cast<RTLIB::Libcall>(i);
      auto libcallName = getLibcallName(code);
      if ((libcallName != nullptr) && (libcallName[0] != '#')) {
        setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1705 }
1706 }
1707 }
1708}
1709
1710void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1711 assert(VT.isVector() && "VT should be a vector type");
1712
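  // Illustrative example of the promotion below: a v2f32 load/store is handled
  // as a v2i32 load/store, since changeVectorElementTypeToInteger() maps each
  // FP element type to the integer type of the same width.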
1713 if (VT.isFloatingPoint()) {
1714 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1717 }
1718
1719 // Mark vector float intrinsics as expand.
1720 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FEXP10, VT, Expand);
1730 }
1731
1732 // But we do support custom-lowering for FCOPYSIGN.
1733 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1734 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1735 VT == MVT::v8f16) &&
1736 Subtarget->hasFullFP16()))
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  setOperationAction(ISD::SRA, VT, Custom);
  setOperationAction(ISD::SRL, VT, Custom);
  setOperationAction(ISD::SHL, VT, Custom);
  setOperationAction(ISD::OR, VT, Custom);
  setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
1755 for (MVT InnerVT : MVT::all_valuetypes())
1756 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1757
  // The CNT instruction supports only B (byte) element sizes; wider element
  // types are custom lowered using CNT followed by UADDLP to widen the result.
1759 if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT, Custom);

  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
1767
1768 for (unsigned Opcode :
1769 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1770 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
    setOperationAction(Opcode, VT, Custom);
1772
1773 if (!VT.isFloatingPoint())
    setOperationAction(ISD::ABS, VT, Legal);
1775
1776 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1777 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1778 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
1780
1781 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1782 // NEON types.
1783 if (VT.isFloatingPoint() &&
1784 VT.getVectorElementType() != MVT::bf16 &&
1785 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1786 for (unsigned Opcode :
1787 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1788 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1789 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1790 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1791 ISD::STRICT_FSQRT})
      setOperationAction(Opcode, VT, Legal);
1793
1794 // Strict fp extend and trunc are legal
1795 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1799
  // FIXME: We could potentially make use of the vector comparison instructions
  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
  // complications:
1803 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1804 // so we would need to expand when the condition code doesn't match the
1805 // kind of comparison.
1806 // * Some kinds of comparison require more than one FCMXY instruction so
1807 // would need to be expanded instead.
1808 // * The lowering of the non-strict versions involves target-specific ISD
1809 // nodes so we would likely need to add strict versions of all of them and
1810 // handle them appropriately.
  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1813
1814 if (Subtarget->isLittleEndian()) {
1815 for (unsigned im = (unsigned)ISD::PRE_INC;
1816 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
1819 }
1820 }
1821
1822 if (Subtarget->hasD128()) {
1823 setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom);
1824 setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom);
1825 }
1826}
1827
1828bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1829 EVT OpVT) const {
1830 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
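  // (illustrative example: a get.active.lane.mask producing nxv4i1 from two
  // i64 operands passes the checks below and is therefore not expanded)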
1831 if (!Subtarget->hasSVE())
1832 return true;
1833
1834 // We can only support legal predicate result types. We can use the SVE
1835 // whilelo instruction for generating fixed-width predicates too.
1836 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1837 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1838 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1839 return true;
1840
1841 // The whilelo instruction only works with i32 or i64 scalar inputs.
1842 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1843 return true;
1844
1845 return false;
1846}
1847
1848bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
1849 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1850}
1851
1852void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1853 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1854
1855 // By default everything must be expanded.
1856 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
    setOperationAction(Op, VT, Expand);
1858
1859 if (VT.isFloatingPoint()) {
    setCondCodeAction(ISD::SETO, VT, Expand);
    setCondCodeAction(ISD::SETOLT, VT, Expand);
    setCondCodeAction(ISD::SETOLE, VT, Expand);
    setCondCodeAction(ISD::SETULT, VT, Expand);
    setCondCodeAction(ISD::SETULE, VT, Expand);
    setCondCodeAction(ISD::SETUGE, VT, Expand);
    setCondCodeAction(ISD::SETUGT, VT, Expand);
    setCondCodeAction(ISD::SETUEQ, VT, Expand);
    setCondCodeAction(ISD::SETONE, VT, Expand);
1869 }
1870
1871 TargetLoweringBase::LegalizeAction Default =
1872 VT == MVT::v1f64 ? Expand : Custom;
1873
1874 // Mark integer truncating stores/extending loads as having custom lowering
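  // (illustrative example: for VT == v4i32 the loop below walks InnerVT through
  // v4i8 and v4i16, marking the corresponding truncating stores and extending
  // loads with the action chosen above)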
1875 if (VT.isInteger()) {
1876 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1877 while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Default);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1884 }
1885 }
1886
1887 // Mark floating-point truncating stores/extending loads as having custom
1888 // lowering
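  // (illustrative example: for VT == v2f64 the loop below marks the v2f16 and
  // v2f32 cases before InnerVT reaches v2f64 and the loop terminates)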
1889 if (VT.isFloatingPoint()) {
1890 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1891 while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1896 }
1897 }
1898
1899 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1900 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1901
1902 // Lower fixed length vector operations to scalable equivalents.
  setOperationAction(ISD::ABS, VT, Default);
  setOperationAction(ISD::ADD, VT, Default);
  setOperationAction(ISD::AND, VT, Default);
  setOperationAction(ISD::ANY_EXTEND, VT, Default);
  setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::BITREVERSE, VT, Default);
  setOperationAction(ISD::BSWAP, VT, Default);
  setOperationAction(ISD::BUILD_VECTOR, VT, Default);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Default);
  setOperationAction(ISD::CTLZ, VT, Default);
  setOperationAction(ISD::CTPOP, VT, Default);
  setOperationAction(ISD::CTTZ, VT, Default);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Default);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Default);
  setOperationAction(ISD::FABS, VT, Default);
  setOperationAction(ISD::FADD, VT, Default);
  setOperationAction(ISD::FCEIL, VT, Default);
  setOperationAction(ISD::FCOPYSIGN, VT, Default);
  setOperationAction(ISD::FDIV, VT, Default);
  setOperationAction(ISD::FFLOOR, VT, Default);
  setOperationAction(ISD::FMA, VT, Default);
  setOperationAction(ISD::FMAXIMUM, VT, Default);
  setOperationAction(ISD::FMAXNUM, VT, Default);
  setOperationAction(ISD::FMINIMUM, VT, Default);
  setOperationAction(ISD::FMINNUM, VT, Default);
  setOperationAction(ISD::FMUL, VT, Default);
  setOperationAction(ISD::FNEARBYINT, VT, Default);
  setOperationAction(ISD::FNEG, VT, Default);
  setOperationAction(ISD::FP_EXTEND, VT, Default);
  setOperationAction(ISD::FP_ROUND, VT, Default);
  setOperationAction(ISD::FP_TO_SINT, VT, Default);
  setOperationAction(ISD::FP_TO_UINT, VT, Default);
  setOperationAction(ISD::FRINT, VT, Default);
  setOperationAction(ISD::FROUND, VT, Default);
  setOperationAction(ISD::FROUNDEVEN, VT, Default);
  setOperationAction(ISD::FSQRT, VT, Default);
  setOperationAction(ISD::FSUB, VT, Default);
  setOperationAction(ISD::FTRUNC, VT, Default);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Default);
  setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::MLOAD, VT, Default);
  setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::MSTORE, VT, Default);
  setOperationAction(ISD::MUL, VT, Default);
  setOperationAction(ISD::MULHS, VT, Default);
  setOperationAction(ISD::MULHU, VT, Default);
  setOperationAction(ISD::OR, VT, Default);
  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, PreferNEON ? Legal : Expand);
  setOperationAction(ISD::SDIV, VT, Default);
  setOperationAction(ISD::SELECT, VT, Default);
  setOperationAction(ISD::SETCC, VT, Default);
  setOperationAction(ISD::SHL, VT, Default);
  setOperationAction(ISD::SIGN_EXTEND, VT, Default);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Default);
  setOperationAction(ISD::SINT_TO_FP, VT, Default);
  setOperationAction(ISD::SMAX, VT, Default);
  setOperationAction(ISD::SMIN, VT, Default);
  setOperationAction(ISD::SPLAT_VECTOR, VT, Default);
  setOperationAction(ISD::SRA, VT, Default);
  setOperationAction(ISD::SRL, VT, Default);
  setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
  setOperationAction(ISD::SUB, VT, Default);
  setOperationAction(ISD::TRUNCATE, VT, Default);
  setOperationAction(ISD::UDIV, VT, Default);
  setOperationAction(ISD::UINT_TO_FP, VT, Default);
  setOperationAction(ISD::UMAX, VT, Default);
  setOperationAction(ISD::UMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
  setOperationAction(ISD::VECREDUCE_AND, VT, Default);
  setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
  setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
  setOperationAction(ISD::VECREDUCE_OR, VT, Default);
  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
  setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
  setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
  setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Default);
  setOperationAction(ISD::VECTOR_SPLICE, VT, Default);
  setOperationAction(ISD::VSELECT, VT, Default);
  setOperationAction(ISD::XOR, VT, Default);
  setOperationAction(ISD::ZERO_EXTEND, VT, Default);
1990}
1991
1992void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1993 addRegisterClass(VT, &AArch64::FPR64RegClass);
1994 addTypeForNEON(VT);
1995}
1996
1997void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1998 addRegisterClass(VT, &AArch64::FPR128RegClass);
1999 addTypeForNEON(VT);
2000}
2001
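// A sketch of the mapping implemented below (examples are illustrative, not
// exhaustive): scalar types give an i32 result, a scalable vector such as
// nxv4f32 gives nxv4i1, and a fixed-length vector such as v4f32 gives v4i32
// via changeVectorElementTypeToInteger().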
2002EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2003 LLVMContext &C, EVT VT) const {
2004 if (!VT.isVector())
2005 return MVT::i32;
2006 if (VT.isScalableVector())
2007 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2008 return VT.changeVectorElementTypeToInteger();
2009}
2010
2011// isIntImmediate - This method tests to see if the node is a constant
// operand. If so, Imm will receive the value.
2013static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2015 Imm = C->getZExtValue();
2016 return true;
2017 }
2018 return false;
2019}
2020
2021// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so, Imm will receive the value.
2024static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2025 uint64_t &Imm) {
2026 return N->getOpcode() == Opc &&
         isIntImmediate(N->getOperand(1).getNode(), Imm);
2028}
2029
2030static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2031 const APInt &Demanded,
2032 TargetLowering::TargetLoweringOpt &TLO,
2033 unsigned NewOpc) {
2034 uint64_t OldImm = Imm, NewImm, Enc;
2035 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2036
2037 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2038 // bimm64.
2039 if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
2041 return false;
2042
2043 unsigned EltSize = Size;
2044 uint64_t DemandedBits = Demanded.getZExtValue();
2045
2046 // Clear bits that are not demanded.
2047 Imm &= DemandedBits;
2048
2049 while (true) {
2050 // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of transitions between 0 and 1. In order to achieve this goal,
2052 // we set the non-demanded bits to the value of the preceding demanded bits.
2053 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2054 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2055 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2056 // The final result is 0b11000011.
2057 uint64_t NonDemandedBits = ~DemandedBits;
2058 uint64_t InvertedImm = ~Imm & DemandedBits;
2059 uint64_t RotatedImm =
2060 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2061 NonDemandedBits;
2062 uint64_t Sum = RotatedImm + NonDemandedBits;
2063 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2064 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2065 NewImm = (Imm | Ones) & Mask;
2066
2067 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2068 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2069 // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2071 break;
2072
2073 // We cannot shrink the element size any further if it is 2-bits.
2074 if (EltSize == 2)
2075 return false;
2076
2077 EltSize /= 2;
2078 Mask >>= EltSize;
2079 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2080
2081 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2082 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2083 return false;
2084
2085 // Merge the upper and lower halves of Imm and DemandedBits.
2086 Imm |= Hi;
2087 DemandedBits |= DemandedBitsHi;
2088 }
2089
2090 ++NumOptimizedImms;
2091
2092 // Replicate the element across the register width.
2093 while (EltSize < Size) {
2094 NewImm |= NewImm << EltSize;
2095 EltSize *= 2;
2096 }
2097
2098 (void)OldImm;
2099 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2100 "demanded bits should never be altered");
2101 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2102
2103 // Create the new constant immediate node.
2104 EVT VT = Op.getValueType();
2105 SDLoc DL(Op);
2106 SDValue New;
2107
2108 // If the new constant immediate is all-zeros or all-ones, let the target
2109 // independent DAG combine optimize this node.
2110 if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
2113 // Otherwise, create a machine node so that target independent DAG combine
2114 // doesn't undo this optimization.
2115 } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2120 }
2121
  return TLO.CombineTo(Op, New);
2123}
2124
2125bool AArch64TargetLowering::targetShrinkDemandedConstant(
2126 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2127 TargetLoweringOpt &TLO) const {
2128 // Delay this optimization to as late as possible.
2129 if (!TLO.LegalOps)
2130 return false;
2131
2132 if (!EnableOptimizeLogicalImm)
2133 return false;
2134
2135 EVT VT = Op.getValueType();
2136 if (VT.isVector())
2137 return false;
2138
2139 unsigned Size = VT.getSizeInBits();
2140 assert((Size == 32 || Size == 64) &&
2141 "i32 or i64 is expected after legalization.");
2142
2143 // Exit early if we demand all bits.
2144 if (DemandedBits.popcount() == Size)
2145 return false;
2146
2147 unsigned NewOpc;
2148 switch (Op.getOpcode()) {
2149 default:
2150 return false;
2151 case ISD::AND:
2152 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2153 break;
2154 case ISD::OR:
2155 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2156 break;
2157 case ISD::XOR:
2158 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2159 break;
2160 }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2162 if (!C)
2163 return false;
2164 uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2166}
2167
2168/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in Known.
2170void AArch64TargetLowering::computeKnownBitsForTargetNode(
2171 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2172 const SelectionDAG &DAG, unsigned Depth) const {
2173 switch (Op.getOpcode()) {
2174 default:
2175 break;
2176 case AArch64ISD::DUP: {
2177 SDValue SrcOp = Op.getOperand(i: 0);
    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2179 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2180 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2181 "Expected DUP implicit truncation");
      Known = Known.trunc(Op.getScalarValueSizeInBits());
2183 }
2184 break;
2185 }
2186 case AArch64ISD::CSEL: {
2187 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = Known.intersectWith(Known2);
2191 break;
2192 }
2193 case AArch64ISD::BICi: {
2194 // Compute the bit cleared value.
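    // As an illustrative example, a BICi with immediate 0xff and shift 8
    // produces Mask == ~0xff00, i.e. bits [15:8] of the first operand become
    // known zero in the result.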
2195 uint64_t Mask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2199 break;
2200 }
2201 case AArch64ISD::VLSHR: {
2202 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::lshr(Known, Known2);
2206 break;
2207 }
2208 case AArch64ISD::VASHR: {
2209 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::ashr(Known, Known2);
2213 break;
2214 }
2215 case AArch64ISD::VSHL: {
2216 KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::shl(Known, Known2);
2220 break;
2221 }
2222 case AArch64ISD::MOVI: {
    Known = KnownBits::makeConstant(
        APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2225 break;
2226 }
2227 case AArch64ISD::LOADgot:
2228 case AArch64ISD::ADDlow: {
2229 if (!Subtarget->isTargetILP32())
2230 break;
2231 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
2233 break;
2234 }
2235 case AArch64ISD::ASSERT_ZEXT_BOOL: {
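    // The 0xFE mask below encodes that a zero-extended boolean can only have
    // bit 0 set, so bits [7:1] are known zero in addition to whatever is
    // already known about the operand.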
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2237 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2238 break;
2239 }
2240 case ISD::INTRINSIC_W_CHAIN: {
2241 Intrinsic::ID IntID =
        static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2243 switch (IntID) {
2244 default: return;
2245 case Intrinsic::aarch64_ldaxr:
2246 case Intrinsic::aarch64_ldxr: {
2247 unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2251 return;
2252 }
2253 }
2254 break;
2255 }
2256 case ISD::INTRINSIC_WO_CHAIN:
2257 case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Op.getConstantOperandVal(0);
2259 switch (IntNo) {
2260 default:
2261 break;
2262 case Intrinsic::aarch64_neon_uaddlv: {
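      // Rough bound used below: summing 8 (v8i8) or 16 (v16i8) byte-sized
      // values needs at most 8 + 3 = 11 or 8 + 4 = 12 bits, so the bits above
      // that bound in the result are known zero.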
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2264 unsigned BitWidth = Known.getBitWidth();
2265 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2266 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2267 assert(BitWidth >= Bound && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2269 Known.Zero |= Mask;
2270 }
2271 break;
2272 }
2273 case Intrinsic::aarch64_neon_umaxv:
2274 case Intrinsic::aarch64_neon_uminv: {
2275 // Figure out the datatype of the vector operand. The UMINV instruction
2276 // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
2278 // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2280 unsigned BitWidth = Known.getBitWidth();
2281 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2282 assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2284 Known.Zero |= Mask;
2285 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2286 assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2288 Known.Zero |= Mask;
2289 }
2290 break;
2291 } break;
2292 }
2293 }
2294 }
2295}
2296
2297unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2298 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2299 unsigned Depth) const {
2300 EVT VT = Op.getValueType();
2301 unsigned VTBits = VT.getScalarSizeInBits();
2302 unsigned Opcode = Op.getOpcode();
2303 switch (Opcode) {
2304 case AArch64ISD::CMEQ:
2305 case AArch64ISD::CMGE:
2306 case AArch64ISD::CMGT:
2307 case AArch64ISD::CMHI:
2308 case AArch64ISD::CMHS:
2309 case AArch64ISD::FCMEQ:
2310 case AArch64ISD::FCMGE:
2311 case AArch64ISD::FCMGT:
2312 case AArch64ISD::CMEQz:
2313 case AArch64ISD::CMGEz:
2314 case AArch64ISD::CMGTz:
2315 case AArch64ISD::CMLEz:
2316 case AArch64ISD::CMLTz:
2317 case AArch64ISD::FCMEQz:
2318 case AArch64ISD::FCMGEz:
2319 case AArch64ISD::FCMGTz:
2320 case AArch64ISD::FCMLEz:
2321 case AArch64ISD::FCMLTz:
2322 // Compares return either 0 or all-ones
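    // (every result bit is a copy of the comparison outcome, so all VTBits
    // bits count as sign bits)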
2323 return VTBits;
2324 }
2325
2326 return 1;
2327}
2328
2329MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2330 EVT) const {
2331 return MVT::i64;
2332}
2333
2334bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2335 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2336 unsigned *Fast) const {
2337 if (Subtarget->requiresStrictAlign())
2338 return false;
2339
2340 if (Fast) {
2341 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2342 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2343 // See comments in performSTORECombine() for more details about
2344 // these conditions.
2345
2346 // Code that uses clang vector extensions can mark that it
2347 // wants unaligned accesses to be treated as fast by
2348 // underspecifying alignment to be 1 or 2.
2349 Alignment <= 2 ||
2350
2351 // Disregard v2i64. Memcpy lowering produces those and splitting
2352 // them regresses performance on micro-benchmarks and olden/bh.
2353 VT == MVT::v2i64;
2354 }
2355 return true;
2356}
2357
2358// Same as above but handling LLTs instead.
2359bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2360 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2361 unsigned *Fast) const {
2362 if (Subtarget->requiresStrictAlign())
2363 return false;
2364
2365 if (Fast) {
2366 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2367 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2368 Ty.getSizeInBytes() != 16 ||
2369 // See comments in performSTORECombine() for more details about
2370 // these conditions.
2371
2372 // Code that uses clang vector extensions can mark that it
2373 // wants unaligned accesses to be treated as fast by
2374 // underspecifying alignment to be 1 or 2.
2375 Alignment <= 2 ||
2376
2377 // Disregard v2i64. Memcpy lowering produces those and splitting
2378 // them regresses performance on micro-benchmarks and olden/bh.
2379 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
2380 }
2381 return true;
2382}
2383
2384FastISel *
2385AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2386 const TargetLibraryInfo *libInfo) const {
2387 return AArch64::createFastISel(funcInfo, libInfo);
2388}
2389
2390const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2391#define MAKE_CASE(V) \
2392 case V: \
2393 return #V;
2394 switch ((AArch64ISD::NodeType)Opcode) {
2395 case AArch64ISD::FIRST_NUMBER:
2396 break;
2397 MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2398 MAKE_CASE(AArch64ISD::SMSTART)
2399 MAKE_CASE(AArch64ISD::SMSTOP)
2400 MAKE_CASE(AArch64ISD::RESTORE_ZA)
2401 MAKE_CASE(AArch64ISD::RESTORE_ZT)
2402 MAKE_CASE(AArch64ISD::SAVE_ZT)
2403 MAKE_CASE(AArch64ISD::CALL)
2404 MAKE_CASE(AArch64ISD::ADRP)
2405 MAKE_CASE(AArch64ISD::ADR)
2406 MAKE_CASE(AArch64ISD::ADDlow)
2407 MAKE_CASE(AArch64ISD::LOADgot)
2408 MAKE_CASE(AArch64ISD::RET_GLUE)
2409 MAKE_CASE(AArch64ISD::BRCOND)
2410 MAKE_CASE(AArch64ISD::CSEL)
2411 MAKE_CASE(AArch64ISD::CSINV)
2412 MAKE_CASE(AArch64ISD::CSNEG)
2413 MAKE_CASE(AArch64ISD::CSINC)
2414 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2415 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2416 MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
2417 MAKE_CASE(AArch64ISD::ABDS_PRED)
2418 MAKE_CASE(AArch64ISD::ABDU_PRED)
2419 MAKE_CASE(AArch64ISD::HADDS_PRED)
2420 MAKE_CASE(AArch64ISD::HADDU_PRED)
2421 MAKE_CASE(AArch64ISD::MUL_PRED)
2422 MAKE_CASE(AArch64ISD::MULHS_PRED)
2423 MAKE_CASE(AArch64ISD::MULHU_PRED)
2424 MAKE_CASE(AArch64ISD::RHADDS_PRED)
2425 MAKE_CASE(AArch64ISD::RHADDU_PRED)
2426 MAKE_CASE(AArch64ISD::SDIV_PRED)
2427 MAKE_CASE(AArch64ISD::SHL_PRED)
2428 MAKE_CASE(AArch64ISD::SMAX_PRED)
2429 MAKE_CASE(AArch64ISD::SMIN_PRED)
2430 MAKE_CASE(AArch64ISD::SRA_PRED)
2431 MAKE_CASE(AArch64ISD::SRL_PRED)
2432 MAKE_CASE(AArch64ISD::UDIV_PRED)
2433 MAKE_CASE(AArch64ISD::UMAX_PRED)
2434 MAKE_CASE(AArch64ISD::UMIN_PRED)
2435 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2436 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2437 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2438 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2439 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2440 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2441 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2442 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2443 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2444 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2445 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2446 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2447 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2448 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2449 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2450 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2451 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2452 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2453 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2454 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2455 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2456 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2457 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2458 MAKE_CASE(AArch64ISD::ADC)
2459 MAKE_CASE(AArch64ISD::SBC)
2460 MAKE_CASE(AArch64ISD::ADDS)
2461 MAKE_CASE(AArch64ISD::SUBS)
2462 MAKE_CASE(AArch64ISD::ADCS)
2463 MAKE_CASE(AArch64ISD::SBCS)
2464 MAKE_CASE(AArch64ISD::ANDS)
2465 MAKE_CASE(AArch64ISD::CCMP)
2466 MAKE_CASE(AArch64ISD::CCMN)
2467 MAKE_CASE(AArch64ISD::FCCMP)
2468 MAKE_CASE(AArch64ISD::FCMP)
2469 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2470 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2471 MAKE_CASE(AArch64ISD::FCVTXN)
2472 MAKE_CASE(AArch64ISD::SME_ZA_LDR)
2473 MAKE_CASE(AArch64ISD::SME_ZA_STR)
2474 MAKE_CASE(AArch64ISD::DUP)
2475 MAKE_CASE(AArch64ISD::DUPLANE8)
2476 MAKE_CASE(AArch64ISD::DUPLANE16)
2477 MAKE_CASE(AArch64ISD::DUPLANE32)
2478 MAKE_CASE(AArch64ISD::DUPLANE64)
2479 MAKE_CASE(AArch64ISD::DUPLANE128)
2480 MAKE_CASE(AArch64ISD::MOVI)
2481 MAKE_CASE(AArch64ISD::MOVIshift)
2482 MAKE_CASE(AArch64ISD::MOVIedit)
2483 MAKE_CASE(AArch64ISD::MOVImsl)
2484 MAKE_CASE(AArch64ISD::FMOV)
2485 MAKE_CASE(AArch64ISD::MVNIshift)
2486 MAKE_CASE(AArch64ISD::MVNImsl)
2487 MAKE_CASE(AArch64ISD::BICi)
2488 MAKE_CASE(AArch64ISD::ORRi)
2489 MAKE_CASE(AArch64ISD::BSP)
2490 MAKE_CASE(AArch64ISD::ZIP1)
2491 MAKE_CASE(AArch64ISD::ZIP2)
2492 MAKE_CASE(AArch64ISD::UZP1)
2493 MAKE_CASE(AArch64ISD::UZP2)
2494 MAKE_CASE(AArch64ISD::TRN1)
2495 MAKE_CASE(AArch64ISD::TRN2)
2496 MAKE_CASE(AArch64ISD::REV16)
2497 MAKE_CASE(AArch64ISD::REV32)
2498 MAKE_CASE(AArch64ISD::REV64)
2499 MAKE_CASE(AArch64ISD::EXT)
2500 MAKE_CASE(AArch64ISD::SPLICE)
2501 MAKE_CASE(AArch64ISD::VSHL)
2502 MAKE_CASE(AArch64ISD::VLSHR)
2503 MAKE_CASE(AArch64ISD::VASHR)
2504 MAKE_CASE(AArch64ISD::VSLI)
2505 MAKE_CASE(AArch64ISD::VSRI)
2506 MAKE_CASE(AArch64ISD::CMEQ)
2507 MAKE_CASE(AArch64ISD::CMGE)
2508 MAKE_CASE(AArch64ISD::CMGT)
2509 MAKE_CASE(AArch64ISD::CMHI)
2510 MAKE_CASE(AArch64ISD::CMHS)
2511 MAKE_CASE(AArch64ISD::FCMEQ)
2512 MAKE_CASE(AArch64ISD::FCMGE)
2513 MAKE_CASE(AArch64ISD::FCMGT)
2514 MAKE_CASE(AArch64ISD::CMEQz)
2515 MAKE_CASE(AArch64ISD::CMGEz)
2516 MAKE_CASE(AArch64ISD::CMGTz)
2517 MAKE_CASE(AArch64ISD::CMLEz)
2518 MAKE_CASE(AArch64ISD::CMLTz)
2519 MAKE_CASE(AArch64ISD::FCMEQz)
2520 MAKE_CASE(AArch64ISD::FCMGEz)
2521 MAKE_CASE(AArch64ISD::FCMGTz)
2522 MAKE_CASE(AArch64ISD::FCMLEz)
2523 MAKE_CASE(AArch64ISD::FCMLTz)
2524 MAKE_CASE(AArch64ISD::SADDV)
2525 MAKE_CASE(AArch64ISD::UADDV)
2526 MAKE_CASE(AArch64ISD::UADDLV)
2527 MAKE_CASE(AArch64ISD::SADDLV)
2528 MAKE_CASE(AArch64ISD::SDOT)
2529 MAKE_CASE(AArch64ISD::UDOT)
2530 MAKE_CASE(AArch64ISD::SMINV)
2531 MAKE_CASE(AArch64ISD::UMINV)
2532 MAKE_CASE(AArch64ISD::SMAXV)
2533 MAKE_CASE(AArch64ISD::UMAXV)
2534 MAKE_CASE(AArch64ISD::SADDV_PRED)
2535 MAKE_CASE(AArch64ISD::UADDV_PRED)
2536 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2537 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2538 MAKE_CASE(AArch64ISD::SMINV_PRED)
2539 MAKE_CASE(AArch64ISD::UMINV_PRED)
2540 MAKE_CASE(AArch64ISD::ORV_PRED)
2541 MAKE_CASE(AArch64ISD::EORV_PRED)
2542 MAKE_CASE(AArch64ISD::ANDV_PRED)
2543 MAKE_CASE(AArch64ISD::CLASTA_N)
2544 MAKE_CASE(AArch64ISD::CLASTB_N)
2545 MAKE_CASE(AArch64ISD::LASTA)
2546 MAKE_CASE(AArch64ISD::LASTB)
2547 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2548 MAKE_CASE(AArch64ISD::LS64_BUILD)
2549 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2550 MAKE_CASE(AArch64ISD::TBL)
2551 MAKE_CASE(AArch64ISD::FADD_PRED)
2552 MAKE_CASE(AArch64ISD::FADDA_PRED)
2553 MAKE_CASE(AArch64ISD::FADDV_PRED)
2554 MAKE_CASE(AArch64ISD::FDIV_PRED)
2555 MAKE_CASE(AArch64ISD::FMA_PRED)
2556 MAKE_CASE(AArch64ISD::FMAX_PRED)
2557 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2558 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2559 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2560 MAKE_CASE(AArch64ISD::FMIN_PRED)
2561 MAKE_CASE(AArch64ISD::FMINV_PRED)
2562 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2563 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2564 MAKE_CASE(AArch64ISD::FMUL_PRED)
2565 MAKE_CASE(AArch64ISD::FSUB_PRED)
2566 MAKE_CASE(AArch64ISD::RDSVL)
2567 MAKE_CASE(AArch64ISD::BIC)
2568 MAKE_CASE(AArch64ISD::CBZ)
2569 MAKE_CASE(AArch64ISD::CBNZ)
2570 MAKE_CASE(AArch64ISD::TBZ)
2571 MAKE_CASE(AArch64ISD::TBNZ)
2572 MAKE_CASE(AArch64ISD::TC_RETURN)
2573 MAKE_CASE(AArch64ISD::PREFETCH)
2574 MAKE_CASE(AArch64ISD::SITOF)
2575 MAKE_CASE(AArch64ISD::UITOF)
2576 MAKE_CASE(AArch64ISD::NVCAST)
2577 MAKE_CASE(AArch64ISD::MRS)
2578 MAKE_CASE(AArch64ISD::SQSHL_I)
2579 MAKE_CASE(AArch64ISD::UQSHL_I)
2580 MAKE_CASE(AArch64ISD::SRSHR_I)
2581 MAKE_CASE(AArch64ISD::URSHR_I)
2582 MAKE_CASE(AArch64ISD::SQSHLU_I)
2583 MAKE_CASE(AArch64ISD::WrapperLarge)
2584 MAKE_CASE(AArch64ISD::LD2post)
2585 MAKE_CASE(AArch64ISD::LD3post)
2586 MAKE_CASE(AArch64ISD::LD4post)
2587 MAKE_CASE(AArch64ISD::ST2post)
2588 MAKE_CASE(AArch64ISD::ST3post)
2589 MAKE_CASE(AArch64ISD::ST4post)
2590 MAKE_CASE(AArch64ISD::LD1x2post)
2591 MAKE_CASE(AArch64ISD::LD1x3post)
2592 MAKE_CASE(AArch64ISD::LD1x4post)
2593 MAKE_CASE(AArch64ISD::ST1x2post)
2594 MAKE_CASE(AArch64ISD::ST1x3post)
2595 MAKE_CASE(AArch64ISD::ST1x4post)
2596 MAKE_CASE(AArch64ISD::LD1DUPpost)
2597 MAKE_CASE(AArch64ISD::LD2DUPpost)
2598 MAKE_CASE(AArch64ISD::LD3DUPpost)
2599 MAKE_CASE(AArch64ISD::LD4DUPpost)
2600 MAKE_CASE(AArch64ISD::LD1LANEpost)
2601 MAKE_CASE(AArch64ISD::LD2LANEpost)
2602 MAKE_CASE(AArch64ISD::LD3LANEpost)
2603 MAKE_CASE(AArch64ISD::LD4LANEpost)
2604 MAKE_CASE(AArch64ISD::ST2LANEpost)
2605 MAKE_CASE(AArch64ISD::ST3LANEpost)
2606 MAKE_CASE(AArch64ISD::ST4LANEpost)
2607 MAKE_CASE(AArch64ISD::SMULL)
2608 MAKE_CASE(AArch64ISD::UMULL)
2609 MAKE_CASE(AArch64ISD::PMULL)
2610 MAKE_CASE(AArch64ISD::FRECPE)
2611 MAKE_CASE(AArch64ISD::FRECPS)
2612 MAKE_CASE(AArch64ISD::FRSQRTE)
2613 MAKE_CASE(AArch64ISD::FRSQRTS)
2614 MAKE_CASE(AArch64ISD::STG)
2615 MAKE_CASE(AArch64ISD::STZG)
2616 MAKE_CASE(AArch64ISD::ST2G)
2617 MAKE_CASE(AArch64ISD::STZ2G)
2618 MAKE_CASE(AArch64ISD::SUNPKHI)
2619 MAKE_CASE(AArch64ISD::SUNPKLO)
2620 MAKE_CASE(AArch64ISD::UUNPKHI)
2621 MAKE_CASE(AArch64ISD::UUNPKLO)
2622 MAKE_CASE(AArch64ISD::INSR)
2623 MAKE_CASE(AArch64ISD::PTEST)
2624 MAKE_CASE(AArch64ISD::PTEST_ANY)
2625 MAKE_CASE(AArch64ISD::PTRUE)
2626 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2627 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2628 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2629 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2630 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2631 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2632 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2633 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2634 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2635 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2636 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2637 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2638 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2639 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2640 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2641 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2642 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2643 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2644 MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
2645 MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
2646 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2647 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2648 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2649 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2650 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2651 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2652 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2653 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2654 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2655 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2656 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2657 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2658 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2659 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2660 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2661 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2662 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2663 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2664 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2665 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2666 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2667 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2668 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2669 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2670 MAKE_CASE(AArch64ISD::SST1Q_PRED)
2671 MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
2672 MAKE_CASE(AArch64ISD::ST1_PRED)
2673 MAKE_CASE(AArch64ISD::SST1_PRED)
2674 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2675 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2676 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2677 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2678 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2679 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2680 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2681 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2682 MAKE_CASE(AArch64ISD::LDP)
2683 MAKE_CASE(AArch64ISD::LDIAPP)
2684 MAKE_CASE(AArch64ISD::LDNP)
2685 MAKE_CASE(AArch64ISD::STP)
2686 MAKE_CASE(AArch64ISD::STILP)
2687 MAKE_CASE(AArch64ISD::STNP)
2688 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2689 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2690 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2691 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2692 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2693 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2694 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2695 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2696 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2697 MAKE_CASE(AArch64ISD::ADDP)
2698 MAKE_CASE(AArch64ISD::SADDLP)
2699 MAKE_CASE(AArch64ISD::UADDLP)
2700 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2701 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2702 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2703 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2704 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2705 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2706 MAKE_CASE(AArch64ISD::CALL_BTI)
2707 MAKE_CASE(AArch64ISD::MRRS)
2708 MAKE_CASE(AArch64ISD::MSRR)
2709 MAKE_CASE(AArch64ISD::RSHRNB_I)
2710 MAKE_CASE(AArch64ISD::CTTZ_ELTS)
2711 MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64)
2712 MAKE_CASE(AArch64ISD::URSHR_I_PRED)
2713 }
2714#undef MAKE_CASE
2715 return nullptr;
2716}
2717
2718MachineBasicBlock *
2719AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2720 MachineBasicBlock *MBB) const {
2721 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2722 // phi node:
2723
2724 // OrigBB:
2725 // [... previous instrs leading to comparison ...]
2726 // b.ne TrueBB
2727 // b EndBB
2728 // TrueBB:
2729 // ; Fallthrough
2730 // EndBB:
2731 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2732
2733 MachineFunction *MF = MBB->getParent();
2734 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2735 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2736 DebugLoc DL = MI.getDebugLoc();
2737 MachineFunction::iterator It = ++MBB->getIterator();
2738
2739 Register DestReg = MI.getOperand(i: 0).getReg();
2740 Register IfTrueReg = MI.getOperand(i: 1).getReg();
2741 Register IfFalseReg = MI.getOperand(i: 2).getReg();
2742 unsigned CondCode = MI.getOperand(i: 3).getImm();
2743 bool NZCVKilled = MI.getOperand(i: 4).isKill();
2744
2745 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2746 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2747 MF->insert(MBBI: It, MBB: TrueBB);
2748 MF->insert(MBBI: It, MBB: EndBB);
2749
  // Transfer the rest of the current basic block to EndBB.
2751 EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)),
2752 To: MBB->end());
2753 EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
2754
2755 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2756 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2757 MBB->addSuccessor(Succ: TrueBB);
2758 MBB->addSuccessor(Succ: EndBB);
2759
2760 // TrueBB falls through to the end.
2761 TrueBB->addSuccessor(Succ: EndBB);
2762
2763 if (!NZCVKilled) {
2764 TrueBB->addLiveIn(AArch64::NZCV);
2765 EndBB->addLiveIn(AArch64::NZCV);
2766 }
2767
2768 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2769 .addReg(IfTrueReg)
2770 .addMBB(TrueBB)
2771 .addReg(IfFalseReg)
2772 .addMBB(MBB);
2773
2774 MI.eraseFromParent();
2775 return EndBB;
2776}
2777
2778MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2779 MachineInstr &MI, MachineBasicBlock *BB) const {
2780 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2781 BB->getParent()->getFunction().getPersonalityFn())) &&
2782 "SEH does not use catchret!");
2783 return BB;
2784}
2785
2786MachineBasicBlock *
2787AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2788 MachineBasicBlock *MBB) const {
2789 MachineFunction &MF = *MBB->getParent();
2790 MachineBasicBlock::iterator MBBI = MI.getIterator();
2791 DebugLoc DL = MBB->findDebugLoc(MBBI);
2792 const AArch64InstrInfo &TII =
2793 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2794 Register TargetReg = MI.getOperand(i: 0).getReg();
2795 MachineBasicBlock::iterator NextInst =
2796 TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false);
2797
2798 MI.eraseFromParent();
2799 return NextInst->getParent();
2800}
2801
2802MachineBasicBlock *
2803AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2804 MachineInstr &MI,
2805 MachineBasicBlock *BB) const {
2806 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2807 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2808
2809 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2810 MIB.add(MO: MI.getOperand(i: 1)); // slice index register
2811 MIB.add(MO: MI.getOperand(i: 2)); // slice index offset
2812 MIB.add(MO: MI.getOperand(i: 3)); // pg
2813 MIB.add(MO: MI.getOperand(i: 4)); // base
2814 MIB.add(MO: MI.getOperand(i: 5)); // offset
2815
2816 MI.eraseFromParent(); // The pseudo is gone now.
2817 return BB;
2818}
2819
2820MachineBasicBlock *
2821AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2822 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2823 MachineInstrBuilder MIB =
2824 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2825
2826 MIB.addReg(AArch64::ZA, RegState::Define);
2827 MIB.add(MO: MI.getOperand(i: 0)); // Vector select register
2828 MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset
2829 MIB.add(MO: MI.getOperand(i: 2)); // Base
2830 MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset
2831
2832 MI.eraseFromParent(); // The pseudo is gone now.
2833 return BB;
2834}
2835
2836MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2837 MachineBasicBlock *BB,
2838 unsigned Opcode,
2839 bool Op0IsDef) const {
2840 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2841 MachineInstrBuilder MIB;
2842
2843 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode))
2844 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0);
2845 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2846 MIB.add(MO: MI.getOperand(i: I));
2847
2848 MI.eraseFromParent(); // The pseudo is gone now.
2849 return BB;
2850}
2851
2852MachineBasicBlock *
2853AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2854 MachineInstr &MI,
2855 MachineBasicBlock *BB, bool HasTile) const {
2856 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2857 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2858 unsigned StartIdx = 0;
2859
2860 if (HasTile) {
2861 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2862 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm());
2863 StartIdx = 1;
2864 } else
2865 MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg);
2866
2867 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2868 MIB.add(MO: MI.getOperand(i: I));
2869
2870 MI.eraseFromParent(); // The pseudo is gone now.
2871 return BB;
2872}
2873
2874MachineBasicBlock *
2875AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2876 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2877 MachineInstrBuilder MIB =
2878 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2879 MIB.add(MO: MI.getOperand(i: 0)); // Mask
2880
2881 unsigned Mask = MI.getOperand(i: 0).getImm();
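  // Each set bit in the mask selects one 64-bit tile; e.g. a mask of
  // 0b00000101 marks ZAD0 and ZAD2 as implicitly defined by the zero.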
2882 for (unsigned I = 0; I < 8; I++) {
2883 if (Mask & (1 << I))
2884 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2885 }
2886
2887 MI.eraseFromParent(); // The pseudo is gone now.
2888 return BB;
2889}
2890
2891MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2892 MachineInstr &MI, MachineBasicBlock *BB) const {
2893
2894 int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode());
2895 if (SMEOrigInstr != -1) {
2896 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2897 uint64_t SMEMatrixType =
2898 TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2899 switch (SMEMatrixType) {
2900 case (AArch64::SMEMatrixArray):
2901 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2902 case (AArch64::SMEMatrixTileB):
2903 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2904 case (AArch64::SMEMatrixTileH):
2905 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2906 case (AArch64::SMEMatrixTileS):
2907 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2908 case (AArch64::SMEMatrixTileD):
2909 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2910 case (AArch64::SMEMatrixTileQ):
2911 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2912 }
2913 }
2914
2915 switch (MI.getOpcode()) {
2916 default:
2917#ifndef NDEBUG
2918 MI.dump();
2919#endif
2920 llvm_unreachable("Unexpected instruction for custom inserter!");
2921
2922 case AArch64::F128CSEL:
2923 return EmitF128CSEL(MI, MBB: BB);
2924 case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
    // while the BL call instruction it is eventually lowered to has an
    // implicit def of LR. That def is early-clobber, since LR is written at
    // the moment of the call, before any use is read.
    // Add this implicit dead def here as a workaround.
2930 MI.addOperand(*MI.getMF(),
2931 MachineOperand::CreateReg(
2932 AArch64::LR, /*isDef*/ true,
2933 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2934 /*isUndef*/ false, /*isEarlyClobber*/ true));
2935 [[fallthrough]];
2936 case TargetOpcode::STACKMAP:
2937 case TargetOpcode::PATCHPOINT:
2938 return emitPatchPoint(MI, MBB: BB);
2939
2940 case TargetOpcode::PATCHABLE_EVENT_CALL:
2941 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2942 return BB;
2943
2944 case AArch64::CATCHRET:
2945 return EmitLoweredCatchRet(MI, BB);
2946
2947 case AArch64::PROBED_STACKALLOC_DYN:
2948 return EmitDynamicProbedAlloc(MI, MBB: BB);
2949
2950 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2951 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2952 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2953 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2954 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2955 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2956 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2957 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2958 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2959 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2960 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2961 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2962 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2963 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2964 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2965 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2966 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2967 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2968 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2969 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2970 case AArch64::LDR_ZA_PSEUDO:
2971 return EmitFill(MI, BB);
2972 case AArch64::LDR_TX_PSEUDO:
2973 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2974 case AArch64::STR_TX_PSEUDO:
2975 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2976 case AArch64::ZERO_M_PSEUDO:
2977 return EmitZero(MI, BB);
2978 case AArch64::ZERO_T_PSEUDO:
2979 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2980 }
2981}
2982
2983//===----------------------------------------------------------------------===//
2984// AArch64 Lowering private implementation.
2985//===----------------------------------------------------------------------===//
2986
2987//===----------------------------------------------------------------------===//
2988// Lowering Code
2989//===----------------------------------------------------------------------===//
2990
2991// Forward declarations of SVE fixed length lowering helpers
2992static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2993static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2994static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2995static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2996 SelectionDAG &DAG);
2997static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
2998static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2999 EVT VT);
3000
3001/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3002static bool isZerosVector(const SDNode *N) {
3003 // Look through a bit convert.
3004 while (N->getOpcode() == ISD::BITCAST)
3005 N = N->getOperand(Num: 0).getNode();
3006
3007 if (ISD::isConstantSplatVectorAllZeros(N))
3008 return true;
3009
3010 if (N->getOpcode() != AArch64ISD::DUP)
3011 return false;
3012
3013 auto Opnd0 = N->getOperand(Num: 0);
3014 return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0);
3015}
3016
3017/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3018/// CC
3019static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3020 switch (CC) {
3021 default:
3022 llvm_unreachable("Unknown condition code!");
3023 case ISD::SETNE:
3024 return AArch64CC::NE;
3025 case ISD::SETEQ:
3026 return AArch64CC::EQ;
3027 case ISD::SETGT:
3028 return AArch64CC::GT;
3029 case ISD::SETGE:
3030 return AArch64CC::GE;
3031 case ISD::SETLT:
3032 return AArch64CC::LT;
3033 case ISD::SETLE:
3034 return AArch64CC::LE;
3035 case ISD::SETUGT:
3036 return AArch64CC::HI;
3037 case ISD::SETUGE:
3038 return AArch64CC::HS;
3039 case ISD::SETULT:
3040 return AArch64CC::LO;
3041 case ISD::SETULE:
3042 return AArch64CC::LS;
3043 }
3044}
3045
3046/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3047static void changeFPCCToAArch64CC(ISD::CondCode CC,
3048 AArch64CC::CondCode &CondCode,
3049 AArch64CC::CondCode &CondCode2) {
3050 CondCode2 = AArch64CC::AL;
3051 switch (CC) {
3052 default:
3053 llvm_unreachable("Unknown FP condition!");
3054 case ISD::SETEQ:
3055 case ISD::SETOEQ:
3056 CondCode = AArch64CC::EQ;
3057 break;
3058 case ISD::SETGT:
3059 case ISD::SETOGT:
3060 CondCode = AArch64CC::GT;
3061 break;
3062 case ISD::SETGE:
3063 case ISD::SETOGE:
3064 CondCode = AArch64CC::GE;
3065 break;
3066 case ISD::SETOLT:
3067 CondCode = AArch64CC::MI;
3068 break;
3069 case ISD::SETOLE:
3070 CondCode = AArch64CC::LS;
3071 break;
3072 case ISD::SETONE:
3073 CondCode = AArch64CC::MI;
3074 CondCode2 = AArch64CC::GT;
3075 break;
3076 case ISD::SETO:
3077 CondCode = AArch64CC::VC;
3078 break;
3079 case ISD::SETUO:
3080 CondCode = AArch64CC::VS;
3081 break;
3082 case ISD::SETUEQ:
3083 CondCode = AArch64CC::EQ;
3084 CondCode2 = AArch64CC::VS;
3085 break;
3086 case ISD::SETUGT:
3087 CondCode = AArch64CC::HI;
3088 break;
3089 case ISD::SETUGE:
3090 CondCode = AArch64CC::PL;
3091 break;
3092 case ISD::SETLT:
3093 case ISD::SETULT:
3094 CondCode = AArch64CC::LT;
3095 break;
3096 case ISD::SETLE:
3097 case ISD::SETULE:
3098 CondCode = AArch64CC::LE;
3099 break;
3100 case ISD::SETNE:
3101 case ISD::SETUNE:
3102 CondCode = AArch64CC::NE;
3103 break;
3104 }
3105}
3106
3107/// Convert a DAG fp condition code to an AArch64 CC.
3108/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3109/// should be AND'ed instead of OR'ed.
3110static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3111 AArch64CC::CondCode &CondCode,
3112 AArch64CC::CondCode &CondCode2) {
3113 CondCode2 = AArch64CC::AL;
3114 switch (CC) {
3115 default:
3116 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3117 assert(CondCode2 == AArch64CC::AL);
3118 break;
3119 case ISD::SETONE:
3120 // (a one b)
3121 // == ((a olt b) || (a ogt b))
3122 // == ((a ord b) && (a une b))
3123 CondCode = AArch64CC::VC;
3124 CondCode2 = AArch64CC::NE;
3125 break;
3126 case ISD::SETUEQ:
3127 // (a ueq b)
3128 // == ((a uno b) || (a oeq b))
3129 // == ((a ule b) && (a uge b))
3130 CondCode = AArch64CC::PL;
3131 CondCode2 = AArch64CC::LE;
3132 break;
3133 }
3134}
3135
3136/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3137/// CC usable with the vector instructions. Fewer operations are available
3138/// without a real NZCV register, so we have to use less efficient combinations
3139/// to get the same effect.
3140static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3141 AArch64CC::CondCode &CondCode,
3142 AArch64CC::CondCode &CondCode2,
3143 bool &Invert) {
3144 Invert = false;
3145 switch (CC) {
3146 default:
3147 // Mostly the scalar mappings work fine.
3148 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3149 break;
3150 case ISD::SETUO:
3151 Invert = true;
3152 [[fallthrough]];
3153 case ISD::SETO:
3154 CondCode = AArch64CC::MI;
3155 CondCode2 = AArch64CC::GE;
3156 break;
3157 case ISD::SETUEQ:
3158 case ISD::SETULT:
3159 case ISD::SETULE:
3160 case ISD::SETUGT:
3161 case ISD::SETUGE:
3162 // All of the compare-mask comparisons are ordered, but we can switch
3163 // between the two by a double inversion. E.g. ULE == !OGT.
3164 Invert = true;
3165 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3166 CondCode, CondCode2);
3167 break;
3168 }
3169}
3170
3171static bool isLegalArithImmed(uint64_t C) {
3172 // Matches AArch64DAGToDAGISel::SelectArithImmed().
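  // Legal values are 12-bit immediates, optionally shifted left by 12 bits:
  // e.g. 4095 (0xfff) and 0xfff000 are accepted, while 0x1001 is rejected
  // because it needs more than 12 bits and is not a multiple of 0x1000.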
3173 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3174 LLVM_DEBUG(dbgs() << "Is imm " << C
3175 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3176 return IsLegal;
3177}
3178
// Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
3181// can be set differently by this operation. It comes down to whether
3182// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3183// everything is fine. If not then the optimization is wrong. Thus general
3184// comparisons are only valid if op2 != 0.
3185//
3186// So, finally, the only LLVM-native comparisons that don't mention C and V
3187// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3188// the absence of information about op2.
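//
// As an illustration of why op2 == 0 is problematic: "cmp x0, #0" computes
// x0 + ~0 + 1 and therefore always sets C, whereas "cmn x0, #0" computes
// x0 + 0 and never does, so any unsigned comparison relying on C would flip.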
3189static bool isCMN(SDValue Op, ISD::CondCode CC) {
3190 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) &&
3191 (CC == ISD::SETEQ || CC == ISD::SETNE);
3192}
3193
3194static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3195 SelectionDAG &DAG, SDValue Chain,
3196 bool IsSignaling) {
3197 EVT VT = LHS.getValueType();
3198 assert(VT != MVT::f128);
3199
3200 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3201
3202 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3203 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3204 {Chain, LHS});
3205 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3206 {LHS.getValue(1), RHS});
3207 Chain = RHS.getValue(R: 1);
3208 VT = MVT::f32;
3209 }
3210 unsigned Opcode =
3211 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3212 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3213}
3214
3215static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3216 const SDLoc &dl, SelectionDAG &DAG) {
3217 EVT VT = LHS.getValueType();
3218 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3219
3220 if (VT.isFloatingPoint()) {
3221 assert(VT != MVT::f128);
3222 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3223 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3224 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3225 VT = MVT::f32;
3226 }
3227 return DAG.getNode(Opcode: AArch64ISD::FCMP, DL: dl, VT, N1: LHS, N2: RHS);
3228 }
3229
3230 // The CMP instruction is just an alias for SUBS, and representing it as
3231 // SUBS means that it's possible to get CSE with subtract operations.
3232 // A later phase can perform the optimization of setting the destination
3233 // register to WZR/XZR if it ends up being unused.
3234 unsigned Opcode = AArch64ISD::SUBS;
3235
3236 if (isCMN(Op: RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3238 Opcode = AArch64ISD::ADDS;
3239 RHS = RHS.getOperand(i: 1);
3240 } else if (isCMN(Op: LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3243 Opcode = AArch64ISD::ADDS;
3244 LHS = LHS.getOperand(i: 1);
3245 } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) {
3246 if (LHS.getOpcode() == ISD::AND) {
3247 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3248 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3249 // of the signed comparisons.
3250 const SDValue ANDSNode = DAG.getNode(Opcode: AArch64ISD::ANDS, DL: dl,
3251 VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC),
3252 N1: LHS.getOperand(i: 0),
3253 N2: LHS.getOperand(i: 1));
3254 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3255 DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode);
3256 return ANDSNode.getValue(R: 1);
3257 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3258 // Use result of ANDS
3259 return LHS.getValue(R: 1);
3260 }
3261 }
3262
3263 return DAG.getNode(Opcode, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS)
3264 .getValue(R: 1);
3265}
3266
3267/// \defgroup AArch64CCMP CMP;CCMP matching
3268///
3269/// These functions deal with the formation of CMP;CCMP;... sequences.
3270/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3271/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows us to express arbitrary conjunctions, for
3273/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3274/// expressed as:
3275/// cmp A
3276/// ccmp B, inv(CB), CA
3277/// check for CB flags
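///
/// As a concrete (illustrative) instance, a source-level test such as
/// "a == 0 && b > 5" can be emitted as:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   ; if a != 0, force NZCV=0b0100 so "gt" fails
///   b.gt <target>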
3278///
3279/// This naturally lets us implement chains of AND operations with SETCC
3280/// operands. And we can even implement some other situations by transforming
3281/// them:
3282/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3283/// negating the flags used in a CCMP/FCCMP operations.
3284/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3285/// by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCMP ...) can be implemented.
3287/// - Note that we can only ever negate all previously processed results.
3288/// What we can not implement by flipping the flags to test is a negation
3289/// of two sub-trees (because the negation affects all sub-trees emitted so
3290/// far, so the 2nd sub-tree we emit would also affect the first).
3291/// With those tools we can implement some OR operations:
3292/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3293/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3294/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3295/// elimination rules from earlier to implement the whole thing as a
3296/// CCMP/FCCMP chain.
3297///
3298/// As complete example:
3299/// or (or (setCA (cmp A)) (setCB (cmp B)))
3300/// (and (setCC (cmp C)) (setCD (cmp D)))"
3301/// can be reassociated to:
3302/// or (and (setCC (cmp C)) setCD (cmp D))
///             (or (setCA (cmp A)) (setCB (cmp B)))
3304/// can be transformed to:
3305/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3306/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3307/// which can be implemented as:
3308/// cmp C
3309/// ccmp D, inv(CD), CC
3310/// ccmp A, CA, inv(CD)
3311/// ccmp B, CB, inv(CA)
3312/// check for CB flags
3313///
3314/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
/// can only implement one of the inner (not) operations, but not both!
3317/// @{
3318
3319/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3320static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3321 ISD::CondCode CC, SDValue CCOp,
3322 AArch64CC::CondCode Predicate,
3323 AArch64CC::CondCode OutCC,
3324 const SDLoc &DL, SelectionDAG &DAG) {
3325 unsigned Opcode = 0;
3326 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3327
3328 if (LHS.getValueType().isFloatingPoint()) {
3329 assert(LHS.getValueType() != MVT::f128);
3330 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3331 LHS.getValueType() == MVT::bf16) {
3332 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3333 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3334 }
3335 Opcode = AArch64ISD::FCCMP;
3336 } else if (RHS.getOpcode() == ISD::SUB) {
3337 SDValue SubOp0 = RHS.getOperand(i: 0);
3338 if (isNullConstant(V: SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3339 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3340 Opcode = AArch64ISD::CCMN;
3341 RHS = RHS.getOperand(i: 1);
3342 }
3343 }
3344 if (Opcode == 0)
3345 Opcode = AArch64ISD::CCMP;
3346
3347 SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC);
3348 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3349 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
3350 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3351 return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp);
3352}
3353
3354/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3355/// expressed as a conjunction. See \ref AArch64CCMP.
3356/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3357/// changing the conditions on the SETCC tests.
3358/// (this means we can call emitConjunctionRec() with
3359/// Negate==true on this sub-tree)
3360/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3361/// cannot do the negation naturally. We are required to
3362/// emit the subtree first in this case.
/// \param WillNegate Is true if we are called when the result of this
3364/// subexpression must be negated. This happens when the
3365/// outer expression is an OR. We can use this fact to know
3366/// that we have a double negation (or (or ...) ...) that
3367/// can be implemented for free.
3368static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3369 bool &MustBeFirst, bool WillNegate,
3370 unsigned Depth = 0) {
3371 if (!Val.hasOneUse())
3372 return false;
3373 unsigned Opcode = Val->getOpcode();
3374 if (Opcode == ISD::SETCC) {
3375 if (Val->getOperand(0).getValueType() == MVT::f128)
3376 return false;
3377 CanNegate = true;
3378 MustBeFirst = false;
3379 return true;
3380 }
3381 // Protect against exponential runtime and stack overflow.
3382 if (Depth > 6)
3383 return false;
3384 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3385 bool IsOR = Opcode == ISD::OR;
3386 SDValue O0 = Val->getOperand(Num: 0);
3387 SDValue O1 = Val->getOperand(Num: 1);
3388 bool CanNegateL;
3389 bool MustBeFirstL;
3390 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1))
3391 return false;
3392 bool CanNegateR;
3393 bool MustBeFirstR;
3394 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1))
3395 return false;
3396
3397 if (MustBeFirstL && MustBeFirstR)
3398 return false;
3399
3400 if (IsOR) {
3401 // For an OR expression we need to be able to naturally negate at least
3402 // one side or we cannot do the transformation at all.
3403 if (!CanNegateL && !CanNegateR)
3404 return false;
      // If the result of the OR will be negated and we can naturally negate
      // the leaves, then this sub-tree as a whole negates naturally.
3407 CanNegate = WillNegate && CanNegateL && CanNegateR;
3408 // If we cannot naturally negate the whole sub-tree, then this must be
3409 // emitted first.
3410 MustBeFirst = !CanNegate;
3411 } else {
3412 assert(Opcode == ISD::AND && "Must be OR or AND");
3413 // We cannot naturally negate an AND operation.
3414 CanNegate = false;
3415 MustBeFirst = MustBeFirstL || MustBeFirstR;
3416 }
3417 return true;
3418 }
3419 return false;
3420}
3421
3422/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val into a series of
/// compare and conditional compare operations. @returns an NZCV flags
/// producing node and sets @p OutCC to the flags that should be tested, or
/// returns SDValue() if the transformation was not possible.
/// \p Negate is true if we want this sub-tree to be negated just by changing
3429/// SETCC conditions.
3430static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3431 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3432 AArch64CC::CondCode Predicate) {
3433 // We're at a tree leaf, produce a conditional comparison operation.
3434 unsigned Opcode = Val->getOpcode();
3435 if (Opcode == ISD::SETCC) {
3436 SDValue LHS = Val->getOperand(Num: 0);
3437 SDValue RHS = Val->getOperand(Num: 1);
3438 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get();
3439 bool isInteger = LHS.getValueType().isInteger();
3440 if (Negate)
3441 CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3442 SDLoc DL(Val);
3443 // Determine OutCC and handle FP special case.
3444 if (isInteger) {
3445 OutCC = changeIntCCToAArch64CC(CC);
3446 } else {
3447 assert(LHS.getValueType().isFloatingPoint());
3448 AArch64CC::CondCode ExtraCC;
3449 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
3450 // Some floating point conditions can't be tested with a single condition
3451 // code. Construct an additional comparison in this case.
3452 if (ExtraCC != AArch64CC::AL) {
3453 SDValue ExtraCmp;
3454 if (!CCOp.getNode())
3455 ExtraCmp = emitComparison(LHS, RHS, CC, dl: DL, DAG);
3456 else
3457 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3458 OutCC: ExtraCC, DL, DAG);
3459 CCOp = ExtraCmp;
3460 Predicate = ExtraCC;
3461 }
3462 }
3463
3464 // Produce a normal comparison if we are first in the chain
3465 if (!CCOp)
3466 return emitComparison(LHS, RHS, CC, dl: DL, DAG);
3467 // Otherwise produce a ccmp.
3468 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3469 DAG);
3470 }
3471 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3472
3473 bool IsOR = Opcode == ISD::OR;
3474
3475 SDValue LHS = Val->getOperand(Num: 0);
3476 bool CanNegateL;
3477 bool MustBeFirstL;
3478 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR);
3479 assert(ValidL && "Valid conjunction/disjunction tree");
3480 (void)ValidL;
3481
3482 SDValue RHS = Val->getOperand(Num: 1);
3483 bool CanNegateR;
3484 bool MustBeFirstR;
3485 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR);
3486 assert(ValidR && "Valid conjunction/disjunction tree");
3487 (void)ValidR;
3488
3489 // Swap sub-tree that must come first to the right side.
3490 if (MustBeFirstL) {
3491 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3492 std::swap(a&: LHS, b&: RHS);
3493 std::swap(a&: CanNegateL, b&: CanNegateR);
3494 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
3495 }
3496
3497 bool NegateR;
3498 bool NegateAfterR;
3499 bool NegateL;
3500 bool NegateAfterAll;
3501 if (Opcode == ISD::OR) {
3502 // Swap the sub-tree that we can negate naturally to the left.
3503 if (!CanNegateL) {
3504 assert(CanNegateR && "at least one side must be negatable");
3505 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3506 assert(!Negate);
3507 std::swap(a&: LHS, b&: RHS);
3508 NegateR = false;
3509 NegateAfterR = true;
3510 } else {
3511 // Negate the left sub-tree if possible, otherwise negate the result.
3512 NegateR = CanNegateR;
3513 NegateAfterR = !CanNegateR;
3514 }
3515 NegateL = true;
3516 NegateAfterAll = !Negate;
3517 } else {
3518 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3519 assert(!Negate && "Valid conjunction/disjunction tree");
3520
3521 NegateL = false;
3522 NegateR = false;
3523 NegateAfterR = false;
3524 NegateAfterAll = false;
3525 }
3526
3527 // Emit sub-trees.
3528 AArch64CC::CondCode RHSCC;
3529 SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate);
3530 if (NegateAfterR)
3531 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
3532 SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC);
3533 if (NegateAfterAll)
3534 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3535 return CmpL;
3536}
3537
/// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3539/// In some cases this is even possible with OR operations in the expression.
3540/// See \ref AArch64CCMP.
3541/// \see emitConjunctionRec().
3542static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3543 AArch64CC::CondCode &OutCC) {
3544 bool DummyCanNegate;
3545 bool DummyMustBeFirst;
3546 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false))
3547 return SDValue();
3548
3549 return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL);
3550}
3551
3552/// @}
3553
3554/// Returns how profitable it is to fold a comparison's operand's shift and/or
3555/// extension operations.
3556static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3557 auto isSupportedExtend = [&](SDValue V) {
3558 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3559 return true;
3560
3561 if (V.getOpcode() == ISD::AND)
3562 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) {
3563 uint64_t Mask = MaskCst->getZExtValue();
3564 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3565 }
3566
3567 return false;
3568 };
3569
3570 if (!Op.hasOneUse())
3571 return 0;
3572
3573 if (isSupportedExtend(Op))
3574 return 1;
3575
3576 unsigned Opc = Op.getOpcode();
3577 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3578 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
3579 uint64_t Shift = ShiftCst->getZExtValue();
3580 if (isSupportedExtend(Op.getOperand(i: 0)))
3581 return (Shift <= 4) ? 2 : 1;
3582 EVT VT = Op.getValueType();
3583 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3584 return 1;
3585 }
3586
3587 return 0;
3588}
3589
3590static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3591 SDValue &AArch64cc, SelectionDAG &DAG,
3592 const SDLoc &dl) {
3593 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
3594 EVT VT = RHS.getValueType();
3595 uint64_t C = RHSC->getZExtValue();
3596 if (!isLegalArithImmed(C)) {
3597 // Constant does not fit, try adjusting it by one?
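      // For example (illustrative), "x < 0x1001" cannot use 0x1001 directly,
      // but it is equivalent to "x <= 0x1000", and 0x1000 is encodable.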
3598 switch (CC) {
3599 default:
3600 break;
3601 case ISD::SETLT:
3602 case ISD::SETGE:
3603 if ((VT == MVT::i32 && C != 0x80000000 &&
3604 isLegalArithImmed((uint32_t)(C - 1))) ||
3605 (VT == MVT::i64 && C != 0x80000000ULL &&
3606 isLegalArithImmed(C - 1ULL))) {
3607 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3608 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3609 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3610 }
3611 break;
3612 case ISD::SETULT:
3613 case ISD::SETUGE:
3614 if ((VT == MVT::i32 && C != 0 &&
3615 isLegalArithImmed((uint32_t)(C - 1))) ||
3616 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3617 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3618 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3619 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3620 }
3621 break;
3622 case ISD::SETLE:
3623 case ISD::SETGT:
3624 if ((VT == MVT::i32 && C != INT32_MAX &&
3625 isLegalArithImmed((uint32_t)(C + 1))) ||
3626 (VT == MVT::i64 && C != INT64_MAX &&
3627 isLegalArithImmed(C + 1ULL))) {
3628 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3629 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3630 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3631 }
3632 break;
3633 case ISD::SETULE:
3634 case ISD::SETUGT:
3635 if ((VT == MVT::i32 && C != UINT32_MAX &&
3636 isLegalArithImmed((uint32_t)(C + 1))) ||
3637 (VT == MVT::i64 && C != UINT64_MAX &&
3638 isLegalArithImmed(C + 1ULL))) {
3639 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3640 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3641 RHS = DAG.getConstant(Val: C, DL: dl, VT);
3642 }
3643 break;
3644 }
3645 }
3646 }
3647
3648 // Comparisons are canonicalized so that the RHS operand is simpler than the
3649 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3650 // can fold some shift+extend operations on the RHS operand, so swap the
3651 // operands if that can be done.
3652 //
3653 // For example:
3654 // lsl w13, w11, #1
3655 // cmp w13, w12
3656 // can be turned into:
3657 // cmp w12, w11, lsl #1
3658 if (!isa<ConstantSDNode>(Val: RHS) || !isLegalArithImmed(C: RHS->getAsZExtVal())) {
3659 SDValue TheLHS = isCMN(Op: LHS, CC) ? LHS.getOperand(i: 1) : LHS;
3660
3661 if (getCmpOperandFoldingProfit(Op: TheLHS) > getCmpOperandFoldingProfit(Op: RHS)) {
3662 std::swap(a&: LHS, b&: RHS);
3663 CC = ISD::getSetCCSwappedOperands(Operation: CC);
3664 }
3665 }
3666
3667 SDValue Cmp;
3668 AArch64CC::CondCode AArch64CC;
3669 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(Val: RHS)) {
3670 const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS);
3671
3672 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3673 // For the i8 operand, the largest immediate is 255, so this can be easily
3674 // encoded in the compare instruction. For the i16 operand, however, the
3675 // largest immediate cannot be encoded in the compare.
3676 // Therefore, use a sign extending load and cmn to avoid materializing the
3677 // -1 constant. For example,
3678 // movz w1, #65535
3679 // ldrh w0, [x0, #0]
3680 // cmp w0, w1
3681 // >
3682 // ldrsh w0, [x0, #0]
3683 // cmn w0, #1
    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3685 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3686 // ensure both the LHS and RHS are truly zero extended and to make sure the
3687 // transformation is profitable.
3688 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3689 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3690 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3691 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3692 int16_t ValueofRHS = RHS->getAsZExtVal();
3693 if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) {
3694 SDValue SExt =
3695 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3696 DAG.getValueType(MVT::i16));
3697 Cmp = emitComparison(LHS: SExt, RHS: DAG.getConstant(Val: ValueofRHS, DL: dl,
3698 VT: RHS.getValueType()),
3699 CC, dl, DAG);
3700 AArch64CC = changeIntCCToAArch64CC(CC);
3701 }
3702 }
3703
3704 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3705 if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) {
3706 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3707 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
3708 }
3709 }
3710 }
3711
3712 if (!Cmp) {
3713 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3714 AArch64CC = changeIntCCToAArch64CC(CC);
3715 }
3716 AArch64cc = DAG.getConstant(Val: AArch64CC, DL: dl, VT: MVT_CC);
3717 return Cmp;
3718}
3719
3720static std::pair<SDValue, SDValue>
3721getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3722 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3723 "Unsupported value type");
3724 SDValue Value, Overflow;
3725 SDLoc DL(Op);
3726 SDValue LHS = Op.getOperand(i: 0);
3727 SDValue RHS = Op.getOperand(i: 1);
3728 unsigned Opc = 0;
3729 switch (Op.getOpcode()) {
3730 default:
3731 llvm_unreachable("Unknown overflow instruction!");
3732 case ISD::SADDO:
3733 Opc = AArch64ISD::ADDS;
3734 CC = AArch64CC::VS;
3735 break;
3736 case ISD::UADDO:
3737 Opc = AArch64ISD::ADDS;
3738 CC = AArch64CC::HS;
3739 break;
3740 case ISD::SSUBO:
3741 Opc = AArch64ISD::SUBS;
3742 CC = AArch64CC::VS;
3743 break;
3744 case ISD::USUBO:
3745 Opc = AArch64ISD::SUBS;
3746 CC = AArch64CC::LO;
3747 break;
  // Multiply needs a little bit of extra work.
3749 case ISD::SMULO:
3750 case ISD::UMULO: {
3751 CC = AArch64CC::NE;
3752 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3753 if (Op.getValueType() == MVT::i32) {
3754 // Extend to 64-bits, then perform a 64-bit multiply.
3755 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3756 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3757 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3758 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3759 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3760
3761 // Check that the result fits into a 32-bit integer.
3762 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3763 if (IsSigned) {
3764 // cmp xreg, wreg, sxtw
3765 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3766 Overflow =
3767 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1);
3768 } else {
3769 // tst xreg, #0xffffffff00000000
3770 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3771 Overflow =
3772 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1);
3773 }
3774 break;
3775 }
3776 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64-bit multiply, check for overflow using the high half of the
    // product.
3778 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3779 if (IsSigned) {
3780 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3781 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3782 DAG.getConstant(63, DL, MVT::i64));
3783 // It is important that LowerBits is last, otherwise the arithmetic
3784 // shift will not be folded into the compare (SUBS).
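      // i.e. this lets the compare be selected as a SUBS with an ASR-shifted
      // register operand (a single "cmp reg, reg, asr #63") instead of a
      // separate shift followed by a plain compare.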
3785 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3786 Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits)
3787 .getValue(R: 1);
3788 } else {
3789 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3790 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3791 Overflow =
3792 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3793 DAG.getConstant(0, DL, MVT::i64),
3794 UpperBits).getValue(1);
3795 }
3796 break;
3797 }
3798 } // switch (...)
3799
3800 if (Opc) {
3801 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3802
3803 // Emit the AArch64 operation with overflow check.
3804 Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS);
3805 Overflow = Value.getValue(R: 1);
3806 }
3807 return std::make_pair(x&: Value, y&: Overflow);
3808}
3809
3810SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3811 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
3812 OverrideNEON: !Subtarget->isNeonAvailable()))
3813 return LowerToScalableOp(Op, DAG);
3814
3815 SDValue Sel = Op.getOperand(i: 0);
3816 SDValue Other = Op.getOperand(i: 1);
3817 SDLoc dl(Sel);
3818
3819 // If the operand is an overflow checking operation, invert the condition
3820 // code and kill the Not operation. I.e., transform:
3821 // (xor (overflow_op_bool, 1))
3822 // -->
3823 // (csel 1, 0, invert(cc), overflow_op_bool)
3824 // ... which later gets transformed to just a cset instruction with an
3825 // inverted condition code, rather than a cset + eor sequence.
3826 if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) {
3827 // Only lower legal XALUO ops.
3828 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0)))
3829 return SDValue();
3830
3831 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3832 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3833 AArch64CC::CondCode CC;
3834 SDValue Value, Overflow;
3835 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG);
3836 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3837 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Op.getValueType(), N1: TVal, N2: FVal,
3838 N3: CCVal, N4: Overflow);
3839 }
3840 // If neither operand is a SELECT_CC, give up.
3841 if (Sel.getOpcode() != ISD::SELECT_CC)
3842 std::swap(a&: Sel, b&: Other);
3843 if (Sel.getOpcode() != ISD::SELECT_CC)
3844 return Op;
3845
3846 // The folding we want to perform is:
3847 // (xor x, (select_cc a, b, cc, 0, -1) )
3848 // -->
3849 // (csel x, (xor x, -1), cc ...)
3850 //
3851 // The latter will get matched to a CSINV instruction.
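  //
  // For reference (illustrative operands): "csinv w0, w1, w1, <cc>" yields w1
  // when <cc> holds and ~w1 otherwise, which is exactly the pattern above.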
3852
3853 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get();
3854 SDValue LHS = Sel.getOperand(i: 0);
3855 SDValue RHS = Sel.getOperand(i: 1);
3856 SDValue TVal = Sel.getOperand(i: 2);
3857 SDValue FVal = Sel.getOperand(i: 3);
3858
3859 // FIXME: This could be generalized to non-integer comparisons.
3860 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3861 return Op;
3862
3863 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
3864 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
3865
3866 // The values aren't constants, this isn't the pattern we're looking for.
3867 if (!CFVal || !CTVal)
3868 return Op;
3869
3870 // We can commute the SELECT_CC by inverting the condition. This
3871 // might be needed to make this fit into a CSINV pattern.
3872 if (CTVal->isAllOnes() && CFVal->isZero()) {
3873 std::swap(a&: TVal, b&: FVal);
3874 std::swap(a&: CTVal, b&: CFVal);
3875 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3876 }
3877
3878 // If the constants line up, perform the transform!
3879 if (CTVal->isZero() && CFVal->isAllOnes()) {
3880 SDValue CCVal;
3881 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
3882
3883 FVal = Other;
3884 TVal = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: Other.getValueType(), N1: Other,
3885 N2: DAG.getConstant(Val: -1ULL, DL: dl, VT: Other.getValueType()));
3886
3887 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Sel.getValueType(), N1: FVal, N2: TVal,
3888 N3: CCVal, N4: Cmp);
3889 }
3890
3891 return Op;
3892}
3893
3894// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3895// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3896// sets 'C' bit to 0.
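// With Invert == false this boils down to a SUBS of (Value - 1), whose carry
// is set exactly when Value is non-zero; with Invert == true it computes
// (0 - Value), whose carry is set only when Value is zero.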
3897static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3898 SDLoc DL(Value);
3899 EVT VT = Value.getValueType();
3900 SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value;
3901 SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT);
3902 SDValue Cmp =
3903 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3904 return Cmp.getValue(R: 1);
3905}
3906
3907// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3908// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3909static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3910 bool Invert) {
3911 assert(Glue.getResNo() == 1);
3912 SDLoc DL(Glue);
3913 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
3914 SDValue One = DAG.getConstant(Val: 1, DL, VT);
3915 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3916 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3917 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
3918}
3919
3920// Value is 1 if 'V' bit of NZCV is 1, else 0
3921static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3922 assert(Glue.getResNo() == 1);
3923 SDLoc DL(Glue);
3924 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
3925 SDValue One = DAG.getConstant(Val: 1, DL, VT);
3926 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3927 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
3928}
3929
3930// This lowering is inefficient, but it will get cleaned up by
3931// `foldOverflowCheck`
3932static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3933 unsigned Opcode, bool IsSigned) {
3934 EVT VT0 = Op.getValue(R: 0).getValueType();
3935 EVT VT1 = Op.getValue(R: 1).getValueType();
3936
3937 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3938 return SDValue();
3939
3940 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3941 SDValue OpLHS = Op.getOperand(i: 0);
3942 SDValue OpRHS = Op.getOperand(i: 1);
3943 SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry);
3944
3945 SDLoc DL(Op);
3946 SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1);
3947
3948 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3949 OpRHS, OpCarryIn);
3950
3951 SDValue OutFlag =
3952 IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG)
3953 : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry);
3954
3955 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag);
3956}
3957
3958static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3959 // Let legalize expand this if it isn't a legal type yet.
3960 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
3961 return SDValue();
3962
3963 SDLoc dl(Op);
3964 AArch64CC::CondCode CC;
3965 // The actual operation that sets the overflow or carry flag.
3966 SDValue Value, Overflow;
3967 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3968
3969 // We use 0 and 1 as false and true values.
3970 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3971 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3972
3973 // We use an inverted condition, because the conditional select is inverted
3974 // too. This will allow it to be selected to a single instruction:
3975 // CSINC Wd, WZR, WZR, invert(cond).
3976 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3977 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3978 CCVal, Overflow);
3979
3980 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3981 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow);
3982}
3983
3984// Prefetch operands are:
3985// 1: Address to prefetch
3986// 2: bool isWrite
3987// 3: int locality (0 = no locality ... 3 = extreme locality)
3988// 4: bool isDataCache
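// For example (illustrative), a read prefetch of data with locality 3 encodes
// as PLDL1KEEP (PrfOp 0b00000), while locality 0 selects the streaming hint
// PLDL1STRM (PrfOp 0b00001).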
3989static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3990 SDLoc DL(Op);
3991 unsigned IsWrite = Op.getConstantOperandVal(i: 2);
3992 unsigned Locality = Op.getConstantOperandVal(i: 3);
3993 unsigned IsData = Op.getConstantOperandVal(i: 4);
3994
3995 bool IsStream = !Locality;
3996 // When the locality number is set
3997 if (Locality) {
3998 // The front-end should have filtered out the out-of-range values
3999 assert(Locality <= 3 && "Prefetch locality out-of-range");
    // A higher locality value targets a closer (faster) cache level, but the
    // prefetch encoding numbers the levels starting at 0 for L1, so invert
    // the value.
4003 Locality = 3 - Locality;
4004 }
4005
  // Build the mask value encoding the expected behavior.
4007 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4008 (!IsData << 3) | // IsDataCache bit
4009 (Locality << 1) | // Cache level bits
4010 (unsigned)IsStream; // Stream bit
4011 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4012 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4013 Op.getOperand(1));
4014}
4015
4016SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4017 SelectionDAG &DAG) const {
4018 EVT VT = Op.getValueType();
4019 if (VT.isScalableVector())
4020 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4021
4022 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
4023 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4024
4025 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4026 return SDValue();
4027}
4028
4029SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4030 SelectionDAG &DAG) const {
4031 EVT VT = Op.getValueType();
4032 if (VT.isScalableVector())
4033 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4034
4035 bool IsStrict = Op->isStrictFPOpcode();
4036 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4037 EVT SrcVT = SrcVal.getValueType();
4038 bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1;
4039
4040 if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4041 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4042
4043 // Expand cases where the result type is BF16 but we don't have hardware
4044 // instructions to lower it.
4045 if (VT.getScalarType() == MVT::bf16 &&
4046 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4047 Subtarget->hasBF16())) {
4048 SDLoc dl(Op);
4049 SDValue Narrow = SrcVal;
4050 SDValue NaN;
4051 EVT I32 = SrcVT.changeElementType(MVT::i32);
4052 EVT F32 = SrcVT.changeElementType(MVT::f32);
4053 if (SrcVT.getScalarType() == MVT::f32) {
4054 bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow);
4055 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4056 if (!NeverSNaN) {
4057 // Set the quiet bit.
4058 NaN = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: I32, N1: Narrow,
4059 N2: DAG.getConstant(Val: 0x400000, DL: dl, VT: I32));
4060 }
4061 } else if (SrcVT.getScalarType() == MVT::f64) {
4062 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL: dl, VT: F32, Operand: Narrow);
4063 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow);
4064 } else {
4065 return SDValue();
4066 }
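    // Round to nearest even by adding 0x7fff plus the current lsb of the
    // truncated result. For example (illustrative), the f32 bit pattern
    // 0x3f808000 (exactly half way between two bf16 values) rounds down to
    // 0x3f80, while 0x3f818000 rounds up to 0x3f82.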
4067 if (!Trunc) {
4068 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: I32);
4069 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4070 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4071 Lsb = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: I32, N1: Lsb, N2: One);
4072 SDValue RoundingBias =
4073 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL: dl, VT: I32), N2: Lsb);
4074 Narrow = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: Narrow, N2: RoundingBias);
4075 }
4076
    // Don't round if we had a NaN: we don't want to turn 0x7fffffff into
    // 0x80000000.
4079 if (NaN) {
4080 SDValue IsNaN = DAG.getSetCC(
4081 DL: dl, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT),
4082 LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4083 Narrow = DAG.getSelect(DL: dl, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4084 }
4085
4086 // Now that we have rounded, shift the bits into position.
4087 Narrow = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow,
4088 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl));
4089 if (VT.isVector()) {
4090 EVT I16 = I32.changeVectorElementType(MVT::i16);
4091 Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: I16, Operand: Narrow);
4092 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Narrow);
4093 }
4094 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: F32, Operand: Narrow);
4095 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4096 return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl)
4097 : Result;
4098 }
4099
4100 if (SrcVT != MVT::f128) {
4101 // Expand cases where the input is a vector bigger than NEON.
4102 if (useSVEForFixedLengthVectorVT(VT: SrcVT))
4103 return SDValue();
4104
4105 // It's legal except when f128 is involved
4106 return Op;
4107 }
4108
4109 return SDValue();
4110}
4111
4112SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4113 SelectionDAG &DAG) const {
4114 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4115 // Any additional optimization in this function should be recorded
4116 // in the cost tables.
4117 bool IsStrict = Op->isStrictFPOpcode();
4118 EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType();
4119 EVT VT = Op.getValueType();
4120
4121 if (VT.isScalableVector()) {
4122 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4123 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4124 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4125 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4126 }
4127
4128 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4129 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4130 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4131
4132 unsigned NumElts = InVT.getVectorNumElements();
4133
  // f16 conversions are promoted to f32 when full fp16 is not supported;
  // bf16 conversions are always promoted.
4135 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4136 InVT.getVectorElementType() == MVT::bf16) {
4137 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4138 SDLoc dl(Op);
4139 if (IsStrict) {
4140 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4141 {Op.getOperand(0), Op.getOperand(1)});
4142 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4143 {Ext.getValue(1), Ext.getValue(0)});
4144 }
4145 return DAG.getNode(
4146 Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(),
4147 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: NewVT, Operand: Op.getOperand(i: 0)));
4148 }
4149
4150 uint64_t VTSize = VT.getFixedSizeInBits();
4151 uint64_t InVTSize = InVT.getFixedSizeInBits();
4152 if (VTSize < InVTSize) {
4153 SDLoc dl(Op);
4154 if (IsStrict) {
4155 InVT = InVT.changeVectorElementTypeToInteger();
4156 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4157 {Op.getOperand(0), Op.getOperand(1)});
4158 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4159 return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl);
4160 }
4161 SDValue Cv =
4162 DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: InVT.changeVectorElementTypeToInteger(),
4163 Operand: Op.getOperand(i: 0));
4164 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv);
4165 }
4166
4167 if (VTSize > InVTSize) {
4168 SDLoc dl(Op);
4169 MVT ExtVT =
4170 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()),
4171 NumElements: VT.getVectorNumElements());
4172 if (IsStrict) {
4173 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4174 {Op.getOperand(0), Op.getOperand(1)});
4175 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4176 {Ext.getValue(1), Ext.getValue(0)});
4177 }
4178 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: ExtVT, Operand: Op.getOperand(i: 0));
4179 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT, Operand: Ext);
4180 }
4181
4182 // Use a scalar operation for conversions between single-element vectors of
4183 // the same size.
4184 if (NumElts == 1) {
4185 SDLoc dl(Op);
4186 SDValue Extract = DAG.getNode(
4187 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4188 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4189 EVT ScalarVT = VT.getScalarType();
4190 if (IsStrict)
4191 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4192 {Op.getOperand(0), Extract});
4193 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4194 }
4195
4196 // Type changing conversions are illegal.
4197 return Op;
4198}
4199
4200SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4201 SelectionDAG &DAG) const {
4202 bool IsStrict = Op->isStrictFPOpcode();
4203 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4204
4205 if (SrcVal.getValueType().isVector())
4206 return LowerVectorFP_TO_INT(Op, DAG);
4207
  // f16 conversions are promoted to f32 when full fp16 is not supported;
  // bf16 conversions are always promoted.
4209 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4210 SrcVal.getValueType() == MVT::bf16) {
4211 SDLoc dl(Op);
4212 if (IsStrict) {
4213 SDValue Ext =
4214 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4215 {Op.getOperand(0), SrcVal});
4216 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4217 {Ext.getValue(1), Ext.getValue(0)});
4218 }
4219 return DAG.getNode(
4220 Op.getOpcode(), dl, Op.getValueType(),
4221 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4222 }
4223
4224 if (SrcVal.getValueType() != MVT::f128) {
4225 // It's legal except when f128 is involved
4226 return Op;
4227 }
4228
4229 return SDValue();
4230}
4231
4232SDValue
4233AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4234 SelectionDAG &DAG) const {
4235 // AArch64 FP-to-int conversions saturate to the destination element size, so
4236 // we can lower common saturating conversions to simple instructions.
4237 SDValue SrcVal = Op.getOperand(i: 0);
4238 EVT SrcVT = SrcVal.getValueType();
4239 EVT DstVT = Op.getValueType();
4240 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4241
4242 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4243 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4244 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4245 assert(SatWidth <= DstElementWidth &&
4246 "Saturation width cannot exceed result width");
4247
4248 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4249 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4250 // types, so this is hard to reach.
4251 if (DstVT.isScalableVector())
4252 return SDValue();
4253
4254 EVT SrcElementVT = SrcVT.getVectorElementType();
4255
4256 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4257 if ((SrcElementVT == MVT::f16 &&
4258 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4259 SrcElementVT == MVT::bf16) {
4260 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4261 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: F32VT, Operand: SrcVal);
4262 SrcVT = F32VT;
4263 SrcElementVT = MVT::f32;
4264 SrcElementWidth = 32;
4265 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4266 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4267 return SDValue();
4268
4269 SDLoc DL(Op);
4270 // Cases that we can emit directly.
4271 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4272 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4273 N2: DAG.getValueType(DstVT.getScalarType()));
4274
  // Otherwise emit a cvt that saturates to the wider, native bit width, then
  // clamp the result down to the requested saturation width. This is only
  // valid if the native cvt is at least as wide as the saturate width. For
  // f64, as we don't have vector MIN/MAX for 64-bit elements, it can be
  // simpler to scalarize (at least until sqxtn is selected).
4279 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4280 return SDValue();
4281
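  // For example (illustrative): a v4f32 fptosi.sat to v4i16 becomes a
  // saturating conversion to v4i32 (fcvtzs), an smin with 32767, an smax with
  // -32768, and a truncate to v4i16.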
4282 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4283 SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal,
4284 N2: DAG.getValueType(IntVT.getScalarType()));
4285 SDValue Sat;
4286 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4287 SDValue MinC = DAG.getConstant(
4288 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4289 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4290 SDValue MaxC = DAG.getConstant(
4291 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4292 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC);
4293 } else {
4294 SDValue MinC = DAG.getConstant(
4295 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT);
4296 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4297 }
4298
4299 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4300}
4301
4302SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4303 SelectionDAG &DAG) const {
4304 // AArch64 FP-to-int conversions saturate to the destination register size, so
4305 // we can lower common saturating conversions to simple instructions.
4306 SDValue SrcVal = Op.getOperand(i: 0);
4307 EVT SrcVT = SrcVal.getValueType();
4308
4309 if (SrcVT.isVector())
4310 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4311
4312 EVT DstVT = Op.getValueType();
4313 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4314 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4315 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4316 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4317
4318 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4319 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4320 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4321 SrcVT = MVT::f32;
4322 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4323 SrcVT != MVT::bf16)
4324 return SDValue();
4325
4326 SDLoc DL(Op);
4327 // Cases that we can emit directly.
4328 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4329 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4330 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4331 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4332 N2: DAG.getValueType(DstVT));
4333
4334 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4335 // result. This is only valid if the legal cvt is larger than the saturate
4336 // width.
4337 if (DstWidth < SatWidth)
4338 return SDValue();
4339
4340 SDValue NativeCvt =
4341 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT));
4342 SDValue Sat;
4343 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4344 SDValue MinC = DAG.getConstant(
4345 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4346 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4347 SDValue MaxC = DAG.getConstant(
4348 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4349 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC);
4350 } else {
4351 SDValue MinC = DAG.getConstant(
4352 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT);
4353 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4354 }
4355
4356 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4357}
4358
4359SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4360 SelectionDAG &DAG) const {
4361 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4362 // Any additional optimization in this function should be recorded
4363 // in the cost tables.
4364 bool IsStrict = Op->isStrictFPOpcode();
4365 EVT VT = Op.getValueType();
4366 SDLoc dl(Op);
4367 SDValue In = Op.getOperand(i: IsStrict ? 1 : 0);
4368 EVT InVT = In.getValueType();
4369 unsigned Opc = Op.getOpcode();
4370 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4371
4372 if (VT.isScalableVector()) {
4373 if (InVT.getVectorElementType() == MVT::i1) {
      // There is no direct conversion from an SVE predicate; extend the
      // predicate to an integer vector first.
4375 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4376 EVT CastVT = getPromotedVTForPredicate(VT: InVT);
4377 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4378 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4379 }
4380
4381 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4382 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4383 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4384 }
4385
4386 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4387 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4388 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4389
4390 // Promote bf16 conversions to f32.
4391 if (VT.getVectorElementType() == MVT::bf16) {
4392 EVT F32 = VT.changeElementType(MVT::f32);
4393 if (IsStrict) {
4394 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4395 {Op.getOperand(0), In});
4396 return DAG.getNode(
4397 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4398 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4399 }
4400 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4401 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: F32, Operand: In),
4402 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4403 }
4404
4405 uint64_t VTSize = VT.getFixedSizeInBits();
4406 uint64_t InVTSize = InVT.getFixedSizeInBits();
4407 if (VTSize < InVTSize) {
4408 MVT CastVT =
4409 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()),
4410 NumElements: InVT.getVectorNumElements());
4411 if (IsStrict) {
4412 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4413 {Op.getOperand(0), In});
4414 return DAG.getNode(
4415 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4416 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4417 }
4418 In = DAG.getNode(Opcode: Opc, DL: dl, VT: CastVT, Operand: In);
4419 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: In,
4420 N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true));
4421 }
4422
4423 if (VTSize > InVTSize) {
4424 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4425 EVT CastVT = VT.changeVectorElementTypeToInteger();
4426 In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In);
4427 if (IsStrict)
4428 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4429 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In);
4430 }
4431
4432 // Use a scalar operation for conversions between single-element vectors of
4433 // the same size.
4434 if (VT.getVectorNumElements() == 1) {
4435 SDValue Extract = DAG.getNode(
4436 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4437 In, DAG.getConstant(0, dl, MVT::i64));
4438 EVT ScalarVT = VT.getScalarType();
4439 if (IsStrict)
4440 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4441 {Op.getOperand(0), Extract});
4442 return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract);
4443 }
4444
4445 return Op;
4446}
4447
4448SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4449 SelectionDAG &DAG) const {
4450 if (Op.getValueType().isVector())
4451 return LowerVectorINT_TO_FP(Op, DAG);
4452
4453 bool IsStrict = Op->isStrictFPOpcode();
4454 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4455
4456 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4457 Op->getOpcode() == ISD::SINT_TO_FP;
4458
4459 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4460 SDLoc dl(Op);
4461 if (IsStrict) {
4462 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4463 {Op.getOperand(0), SrcVal});
4464 return DAG.getNode(
4465 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4466 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4467 }
4468 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(),
4469 N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromoteVT, Operand: SrcVal),
4470 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
4471 };
4472
4473 if (Op.getValueType() == MVT::bf16) {
4474 unsigned MaxWidth = IsSigned
4475 ? DAG.ComputeMaxSignificantBits(Op: SrcVal)
4476 : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits();
    // bf16 conversions are promoted to f32 when the value fits in f32's
    // 24-bit significand (e.g. when converting from i16).
4478 if (MaxWidth <= 24) {
4479 return IntToFpViaPromotion(MVT::f32);
4480 }
4481
    // bf16 conversions are promoted to f64 when the value fits in f64's
    // 53-bit significand (e.g. when converting from i32).
4483 if (MaxWidth <= 53) {
4484 return IntToFpViaPromotion(MVT::f64);
4485 }
4486
    // We need to be careful about i64 -> bf16 because of double rounding.
    // Consider, by analogy, the i32 value 22216703. It cannot be represented
    // exactly as an f32, so an itofp turns it into 22216704.0, and an fptrunc
    // to bf16 then turns that into 22282240.0. However, the correct bf16
    // result is 22151168.0. We need to use sticky rounding to get this right.
4493 if (SrcVal.getValueType() == MVT::i64) {
4494 SDLoc DL(Op);
4495 // This algorithm is equivalent to the following:
4496 // uint64_t SrcHi = SrcVal & ~0xfffull;
4497 // uint64_t SrcLo = SrcVal & 0xfffull;
4498 // uint64_t Highest = SrcVal >> 53;
4499 // bool HasHighest = Highest != 0;
4500 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4501 // double Rounded = static_cast<double>(ToRound);
4502 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4503 // uint64_t HasLo = SrcLo != 0;
4504 // bool NeedsAdjustment = HasHighest & HasLo;
4505 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4506 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4507 // return static_cast<__bf16>(Adjusted);
4508 //
4509 // Essentially, what happens is that SrcVal either fits perfectly in a
4510 // double-precision value or it is too big. If it is sufficiently small,
4511 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4512 // ensure that u64 -> double has no rounding error by only using the 52
4513 // MSB of the input. The low order bits will get merged into a sticky bit
4514 // which will avoid issues incurred by double rounding.
4515
4516 // Signed conversion is more or less like so:
4517 // copysign((__bf16)abs(SrcVal), SrcVal)
4518 SDValue SignBit;
4519 if (IsSigned) {
4520 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4521 DAG.getConstant(1ull << 63, DL, MVT::i64));
4522 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4523 }
4524 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4525 DAG.getConstant(~0xfffull, DL, MVT::i64));
4526 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4527 DAG.getConstant(0xfffull, DL, MVT::i64));
4528 SDValue Highest =
4529 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4530 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4531 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4532 SDValue ToRound =
4533 DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE);
4534 SDValue Rounded =
4535 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4536 {Op.getOperand(0), ToRound})
4537 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4538
4539 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4540 if (SignBit) {
4541 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4542 }
4543
4544 SDValue HasHighest = DAG.getSetCC(
4545 DL,
4546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4547 Highest, Zero64, ISD::SETNE);
4548
4549 SDValue HasLo = DAG.getSetCC(
4550 DL,
4551 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4552 SrcLo, Zero64, ISD::SETNE);
4553
4554 SDValue NeedsAdjustment =
4555 DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo);
4556 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4557
4558 SDValue AdjustedBits =
4559 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4560 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4561 return IsStrict
4562 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4563 {Op.getValueType(), MVT::Other},
4564 {Rounded.getValue(1), Adjusted,
4565 DAG.getIntPtrConstant(0, DL)})
4566 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4567 DAG.getIntPtrConstant(0, DL, true));
4568 }
4569 }
4570
4571 // f16 conversions are promoted to f32 when full fp16 is not supported.
4572 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4573 return IntToFpViaPromotion(MVT::f32);
4574 }
4575
4576 // i128 conversions are libcalls.
4577 if (SrcVal.getValueType() == MVT::i128)
4578 return SDValue();
4579
4580 // Other conversions are legal, unless it's to the completely software-based
4581 // fp128.
4582 if (Op.getValueType() != MVT::f128)
4583 return Op;
4584 return SDValue();
4585}
4586
4587SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4588 SelectionDAG &DAG) const {
4589 // For iOS, we want to call an alternative entry point: __sincos_stret,
4590 // which returns the values in two S / D registers.
4591 SDLoc dl(Op);
4592 SDValue Arg = Op.getOperand(i: 0);
4593 EVT ArgVT = Arg.getValueType();
4594 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
4595
4596 ArgListTy Args;
4597 ArgListEntry Entry;
4598
4599 Entry.Node = Arg;
4600 Entry.Ty = ArgTy;
4601 Entry.IsSExt = false;
4602 Entry.IsZExt = false;
4603 Args.push_back(x: Entry);
4604
4605 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4606 : RTLIB::SINCOS_STRET_F32;
4607 const char *LibcallName = getLibcallName(Call: LC);
4608 SDValue Callee =
4609 DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout()));
4610
4611 StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
4612 TargetLowering::CallLoweringInfo CLI(DAG);
4613 CLI.setDebugLoc(dl)
4614 .setChain(DAG.getEntryNode())
4615 .setLibCallee(CC: CallingConv::Fast, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
4616
4617 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4618 return CallResult.first;
4619}
4620
4621static MVT getSVEContainerType(EVT ContentTy);
4622
4623SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4624 SelectionDAG &DAG) const {
4625 EVT OpVT = Op.getValueType();
4626 EVT ArgVT = Op.getOperand(i: 0).getValueType();
4627
4628 if (useSVEForFixedLengthVectorVT(VT: OpVT))
4629 return LowerFixedLengthBitcastToSVE(Op, DAG);
4630
4631 if (OpVT.isScalableVector()) {
4632 // Bitcasting between unpacked vector types of different element counts is
4633 // not a NOP because the live elements are laid out differently.
4634 // 01234567
4635 // e.g. nxv2i32 = XX??XX??
4636 // nxv4f16 = X?X?X?X?
4637 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4638 return SDValue();
4639
4640 if (isTypeLegal(VT: OpVT) && !isTypeLegal(VT: ArgVT)) {
4641 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4642 "Expected int->fp bitcast!");
4643 SDValue ExtResult =
4644 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT),
4645 Operand: Op.getOperand(i: 0));
4646 return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG);
4647 }
4648 return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG);
4649 }
4650
4651 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4652 return SDValue();
4653
4654 // Bitcasts between f16 and bf16 are legal.
4655 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4656 return Op;
4657
4658 assert(ArgVT == MVT::i16);
4659 SDLoc DL(Op);
4660
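  // Materialize the half-precision value by extending to i32, bitcasting to
  // f32 and extracting the 16-bit 'hsub' sub-register.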
4661 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4662 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4663 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4664}
4665
4666static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4667 if (OrigVT.getSizeInBits() >= 64)
4668 return OrigVT;
4669
4670 assert(OrigVT.isSimple() && "Expecting a simple value type");
4671
4672 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4673 switch (OrigSimpleTy) {
4674 default: llvm_unreachable("Unexpected Vector Type");
4675 case MVT::v2i8:
4676 case MVT::v2i16:
4677 return MVT::v2i32;
4678 case MVT::v4i8:
4679 return MVT::v4i16;
4680 }
4681}
4682
4683static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4684 const EVT &OrigTy,
4685 const EVT &ExtTy,
4686 unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect ExtTy to be 128 bits total. If OrigTy is less than 64 bits we
  // need to insert a new extension so that it will be 64 bits.
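  // For example (illustrative): a v4i8 value that was extended to v4i32 is
  // re-extended to v4i16 here, giving a 64-bit operand suitable for S/UMULL.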
4690 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4691 if (OrigTy.getSizeInBits() >= 64)
4692 return N;
4693
4694 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4695 EVT NewVT = getExtensionTo64Bits(OrigVT: OrigTy);
4696
4697 return DAG.getNode(Opcode: ExtOpcode, DL: SDLoc(N), VT: NewVT, Operand: N);
4698}
4699
4700// Returns lane if Op extracts from a two-element vector and lane is constant
4701// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4702static std::optional<uint64_t>
4703getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4704 SDNode *OpNode = Op.getNode();
4705 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4706 return std::nullopt;
4707
4708 EVT VT = OpNode->getOperand(Num: 0).getValueType();
4709 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1));
4710 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4711 return std::nullopt;
4712
4713 return C->getZExtValue();
4714}
4715
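// Returns true if N is a BUILD_VECTOR of constants that all fit in half the
// element width (as signed or unsigned values, per isSigned), so the build
// vector can be treated as an extended operand when forming S/UMULL.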
4716static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4717 bool isSigned) {
4718 EVT VT = N.getValueType();
4719
4720 if (N.getOpcode() != ISD::BUILD_VECTOR)
4721 return false;
4722
4723 for (const SDValue &Elt : N->op_values()) {
4724 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
4725 unsigned EltSize = VT.getScalarSizeInBits();
4726 unsigned HalfSize = EltSize / 2;
4727 if (isSigned) {
4728 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
4729 return false;
4730 } else {
4731 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
4732 return false;
4733 }
4734 continue;
4735 }
4736 return false;
4737 }
4738
4739 return true;
4740}
4741
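// Strip the extension from a 128-bit S/UMULL operand, returning the
// corresponding 64-bit half-width vector (looking through extends, truncating
// values whose high half is known zero, and rebuilding constant BUILD_VECTORs
// at the narrower type).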
4742static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4743 EVT VT = N.getValueType();
4744 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4745
4746 unsigned NumElts = VT.getVectorNumElements();
4747 unsigned OrigEltSize = VT.getScalarSizeInBits();
4748 unsigned EltSize = OrigEltSize / 2;
4749 MVT TruncVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
4750
4751 APInt HiBits = APInt::getHighBitsSet(numBits: OrigEltSize, hiBitsSet: EltSize);
4752 if (DAG.MaskedValueIsZero(Op: N, Mask: HiBits))
4753 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: TruncVT, Operand: N);
4754
4755 if (ISD::isExtOpcode(Opcode: N.getOpcode()))
4756 return addRequiredExtensionForVectorMULL(N: N.getOperand(i: 0), DAG,
4757 OrigTy: N.getOperand(i: 0).getValueType(), ExtTy: VT,
4758 ExtOpcode: N.getOpcode());
4759
4760 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4761 SDLoc dl(N);
4762 SmallVector<SDValue, 8> Ops;
4763 for (unsigned i = 0; i != NumElts; ++i) {
4764 const APInt &CInt = N.getConstantOperandAPInt(i);
4765 // Element types smaller than 32 bits are not legal, so use i32 elements.
4766 // The values are implicitly truncated so sext vs. zext doesn't matter.
4767 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4768 }
4769 return DAG.getBuildVector(VT: TruncVT, DL: dl, Ops);
4770}
4771
4772static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4773 return N.getOpcode() == ISD::SIGN_EXTEND ||
4774 N.getOpcode() == ISD::ANY_EXTEND ||
4775 isExtendedBUILD_VECTOR(N, DAG, isSigned: true);
4776}
4777
4778static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4779 return N.getOpcode() == ISD::ZERO_EXTEND ||
4780 N.getOpcode() == ISD::ANY_EXTEND ||
4781 isExtendedBUILD_VECTOR(N, DAG, isSigned: false);
4782}
4783
4784static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4785 unsigned Opcode = N.getOpcode();
4786 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4787 SDValue N0 = N.getOperand(i: 0);
4788 SDValue N1 = N.getOperand(i: 1);
4789 return N0->hasOneUse() && N1->hasOneUse() &&
4790 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
4791 }
4792 return false;
4793}
4794
4795static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4796 unsigned Opcode = N.getOpcode();
4797 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4798 SDValue N0 = N.getOperand(i: 0);
4799 SDValue N1 = N.getOperand(i: 1);
4800 return N0->hasOneUse() && N1->hasOneUse() &&
4801 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
4802 }
4803 return false;
4804}
4805
4806SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4807 SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPCR.
  // The mapping from the AArch64 rounding mode value to FLT_ROUNDS is
  // 0->1, 1->2, 2->3, 3->0. The formula we use to implement this is
  // ((FPCR + (1 << 22)) >> 22) & 3, so that the shift and mask can be
  // folded into a bitfield extract.
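  // For example (illustrative): FPCR.RMode == 0b11 (round toward zero) yields
  // ((3 + 1) & 3) == 0, which is FLT_ROUNDS' encoding for round toward zero.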
4812 SDLoc dl(Op);
4813
4814 SDValue Chain = Op.getOperand(i: 0);
4815 SDValue FPCR_64 = DAG.getNode(
4816 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4817 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4818 Chain = FPCR_64.getValue(R: 1);
4819 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4820 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4821 DAG.getConstant(1U << 22, dl, MVT::i32));
4822 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4823 DAG.getConstant(22, dl, MVT::i32));
4824 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4825 DAG.getConstant(3, dl, MVT::i32));
4826 return DAG.getMergeValues(Ops: {AND, Chain}, dl);
4827}
4828
4829SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4830 SelectionDAG &DAG) const {
4831 SDLoc DL(Op);
4832 SDValue Chain = Op->getOperand(Num: 0);
4833 SDValue RMValue = Op->getOperand(Num: 1);
4834
  // The rounding mode is in bits 23:22 of the FPCR.
  // The mapping from the llvm.set.rounding argument to the rounding mode in
  // FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // (((arg - 1) & 3) << 22).
  //
  // The argument of llvm.set.rounding must be in the range [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is the responsibility of
  // the code that generated the llvm.set.rounding call to ensure this
  // condition.
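  // For example (illustrative): an argument of 0 (round toward zero) maps to
  // ((0 - 1) & 3) == 3, the FPCR encoding for round toward zero (RZ).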
4843
4844 // Calculate new value of FPCR[23:22].
4845 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4846 DAG.getConstant(1, DL, MVT::i32));
4847 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4848 DAG.getConstant(0x3, DL, MVT::i32));
4849 RMValue =
4850 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4851 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4852 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4853
4854 // Get current value of FPCR.
4855 SDValue Ops[] = {
4856 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4857 SDValue FPCR =
4858 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4859 Chain = FPCR.getValue(R: 1);
4860 FPCR = FPCR.getValue(R: 0);
4861
  // Put the new rounding mode into FPCR[23:22].
4863 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4864 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4865 DAG.getConstant(RMMask, DL, MVT::i64));
4866 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4867 SDValue Ops2[] = {
4868 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4869 FPCR};
4870 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4871}
4872
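// Pick the widening multiply node (AArch64ISD::SMULL or UMULL) that can
// implement a MUL of N0 and N1, possibly rewriting the operands, or return 0
// if none applies. IsMLA is set when the multiply should instead be split into
// a pair of widening multiplies feeding an add/sub.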
4873static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4874 SDLoc DL, bool &IsMLA) {
4875 bool IsN0SExt = isSignExtended(N: N0, DAG);
4876 bool IsN1SExt = isSignExtended(N: N1, DAG);
4877 if (IsN0SExt && IsN1SExt)
4878 return AArch64ISD::SMULL;
4879
4880 bool IsN0ZExt = isZeroExtended(N: N0, DAG);
4881 bool IsN1ZExt = isZeroExtended(N: N1, DAG);
4882
4883 if (IsN0ZExt && IsN1ZExt)
4884 return AArch64ISD::UMULL;
4885
4886 // Select SMULL if we can replace zext with sext.
4887 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4888 !isExtendedBUILD_VECTOR(N: N0, DAG, isSigned: false) &&
4889 !isExtendedBUILD_VECTOR(N: N1, DAG, isSigned: false)) {
4890 SDValue ZextOperand;
4891 if (IsN0ZExt)
4892 ZextOperand = N0.getOperand(i: 0);
4893 else
4894 ZextOperand = N1.getOperand(i: 0);
4895 if (DAG.SignBitIsZero(Op: ZextOperand)) {
4896 SDValue NewSext =
4897 DAG.getSExtOrTrunc(Op: ZextOperand, DL, VT: N0.getValueType());
4898 if (IsN0ZExt)
4899 N0 = NewSext;
4900 else
4901 N1 = NewSext;
4902 return AArch64ISD::SMULL;
4903 }
4904 }
4905
4906 // Select UMULL if we can replace the other operand with an extend.
4907 if (IsN0ZExt || IsN1ZExt) {
4908 EVT VT = N0.getValueType();
4909 APInt Mask = APInt::getHighBitsSet(numBits: VT.getScalarSizeInBits(),
4910 hiBitsSet: VT.getScalarSizeInBits() / 2);
4911 if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask))
4912 return AArch64ISD::UMULL;
4913 }
4914
4915 if (!IsN1SExt && !IsN1ZExt)
4916 return 0;
4917
4918 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4919 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4920 if (IsN1SExt && isAddSubSExt(N: N0, DAG)) {
4921 IsMLA = true;
4922 return AArch64ISD::SMULL;
4923 }
4924 if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) {
4925 IsMLA = true;
4926 return AArch64ISD::UMULL;
4927 }
4928 if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) {
4929 std::swap(a&: N0, b&: N1);
4930 IsMLA = true;
4931 return AArch64ISD::UMULL;
4932 }
4933 return 0;
4934}
4935
4936SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4937 EVT VT = Op.getValueType();
4938
4939 bool OverrideNEON = !Subtarget->isNeonAvailable();
4940 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4941 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4942
4943 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
4944 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
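  // For example (illustrative): a v8i16 multiply whose operands are both
  // sign-extended from v8i8 is lowered to an AArch64ISD::SMULL of the two
  // v8i8 values.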
4945 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
4946 "unexpected type for custom-lowering ISD::MUL");
4947 SDValue N0 = Op.getOperand(i: 0);
4948 SDValue N1 = Op.getOperand(i: 1);
4949 bool isMLA = false;
4950 EVT OVT = VT;
4951 if (VT.is64BitVector()) {
4952 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4953 isNullConstant(V: N0.getOperand(i: 1)) &&
4954 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4955 isNullConstant(V: N1.getOperand(i: 1))) {
4956 N0 = N0.getOperand(i: 0);
4957 N1 = N1.getOperand(i: 0);
4958 VT = N0.getValueType();
4959 } else {
4960 if (VT == MVT::v1i64) {
4961 if (Subtarget->hasSVE())
4962 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4963 // Fall through to expand this. It is not legal.
4964 return SDValue();
4965 } else
4966 // Other vector multiplications are legal.
4967 return Op;
4968 }
4969 }
4970
4971 SDLoc DL(Op);
4972 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA);
4973
4974 if (!NewOpc) {
4975 if (VT.getVectorElementType() == MVT::i64) {
4976 // If SVE is available then i64 vector multiplications can also be made
4977 // legal.
4978 if (Subtarget->hasSVE())
4979 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
4980 // Fall through to expand this. It is not legal.
4981 return SDValue();
4982 } else
4983 // Other vector multiplications are legal.
4984 return Op;
4985 }
4986
4987 // Legalize to a S/UMULL instruction
4988 SDValue Op0;
4989 SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG);
4990 if (!isMLA) {
4991 Op0 = skipExtensionForVectorMULL(N: N0, DAG);
4992 assert(Op0.getValueType().is64BitVector() &&
4993 Op1.getValueType().is64BitVector() &&
4994 "unexpected types for extended operands to VMULL");
4995 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
4996 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
4997 DAG.getConstant(0, DL, MVT::i64));
4998 }
  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
  // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
  // This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
5002 SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG);
5003 SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG);
5004 EVT Op1VT = Op1.getValueType();
5005 return DAG.getNode(
5006 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5007 DAG.getNode(N0.getOpcode(), DL, VT,
5008 DAG.getNode(NewOpc, DL, VT,
5009 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5010 DAG.getNode(NewOpc, DL, VT,
5011 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5012 DAG.getConstant(0, DL, MVT::i64));
5013}
5014
5015static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5016 int Pattern) {
5017 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5018 return DAG.getConstant(1, DL, MVT::nxv1i1);
5019 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5020 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5021}
5022
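// Lower an SVE while{lo,ls,lt,le} intrinsic with constant operands to a PTRUE
// with an equivalent predicate pattern, when the number of active elements is
// known and does not exceed what the minimum SVE vector length can hold.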
5023static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5024 bool IsSigned, bool IsEqual) {
5025 if (!isa<ConstantSDNode>(Val: Op.getOperand(i: 1)) ||
5026 !isa<ConstantSDNode>(Val: Op.getOperand(i: 2)))
5027 return SDValue();
5028
5029 SDLoc dl(Op);
5030 APInt X = Op.getConstantOperandAPInt(i: 1);
5031 APInt Y = Op.getConstantOperandAPInt(i: 2);
5032 bool Overflow;
5033 APInt NumActiveElems =
5034 IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow);
5035
5036 if (Overflow)
5037 return SDValue();
5038
5039 if (IsEqual) {
5040 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5041 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow)
5042 : NumActiveElems.uadd_ov(RHS: One, Overflow);
5043 if (Overflow)
5044 return SDValue();
5045 }
5046
5047 std::optional<unsigned> PredPattern =
5048 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5049 unsigned MinSVEVectorSize = std::max(
5050 a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u);
5051 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5052 if (PredPattern != std::nullopt &&
5053 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5054 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: *PredPattern);
5055
5056 return SDValue();
5057}
5058
5059// Returns a safe bitcast between two scalable vector predicates, where
5060// any newly created lanes from a widening bitcast are defined as zero.
5061static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5062 SDLoc DL(Op);
5063 EVT InVT = Op.getValueType();
5064
5065 assert(InVT.getVectorElementType() == MVT::i1 &&
5066 VT.getVectorElementType() == MVT::i1 &&
5067 "Expected a predicate-to-predicate bitcast");
5068 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5069 InVT.isScalableVector() &&
5070 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5071 "Only expect to cast between legal scalable predicate types!");
5072
5073 // Return the operand if the cast isn't changing type,
5074 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5075 if (InVT == VT)
5076 return Op;
5077
5078 SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
5079
5080 // We only have to zero the lanes if new lanes are being defined, e.g. when
5081 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5082 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5083 // we can return here.
5084 if (InVT.bitsGT(VT))
5085 return Reinterpret;
5086
5087 // Check if the other lanes are already known to be zeroed by
5088 // construction.
5089 if (isZeroingInactiveLanes(Op))
5090 return Reinterpret;
5091
5092 // Zero the newly introduced lanes.
5093 SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT);
5094 Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask);
5095 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask);
5096}
5097
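// Query the runtime value of PSTATE.SM by calling __arm_sme_state and masking
// bit 0 of the first result register.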
5098SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5099 SDValue Chain, SDLoc DL,
5100 EVT VT) const {
5101 SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state",
5102 VT: getPointerTy(DL: DAG.getDataLayout()));
5103 Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext());
5104 Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty);
5105 TargetLowering::CallLoweringInfo CLI(DAG);
5106 ArgListTy Args;
5107 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5108 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5109 ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5110 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5111 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5112 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5113 Mask);
5114}
5115
5116// Lower an SME LDR/STR ZA intrinsic
5117// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5118// folded into the instruction
5119// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5120// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5121// and tile slice registers
5122// ldr(%tileslice, %ptr, %vecnum)
5123// ->
5124// %svl = rdsvl
5125// %ptr2 = %ptr + %svl * %vecnum
5126// %tileslice2 = %tileslice + %vecnum
5127// ldr [%tileslice2, 0], [%ptr2, 0]
// Case 3: If the vecnum is an immediate out of range, then the same is done as
// case 2, but the base and slice registers are advanced by the largest
// multiple of 16 not exceeding the vecnum, and the remainder (0-15) is folded
// into the instruction. This means that successive loads and stores that are
// offset from each other can share the same base and slice register updates.
// ldr(%tileslice, %ptr, 22)
// ldr(%tileslice, %ptr, 23)
// ->
// %svl = rdsvl
// %ptr2 = %ptr + %svl * 16
// %tileslice2 = %tileslice + 16
// ldr [%tileslice2, 6], [%ptr2, 6]
// ldr [%tileslice2, 7], [%ptr2, 7]
5141// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5142// operand and the immediate can be folded into the instruction, like case 2.
5143// ldr(%tileslice, %ptr, %vecnum + 7)
5144// ldr(%tileslice, %ptr, %vecnum + 8)
5145// ->
5146// %svl = rdsvl
5147// %ptr2 = %ptr + %svl * %vecnum
5148// %tileslice2 = %tileslice + %vecnum
5149// ldr [%tileslice2, 7], [%ptr2, 7]
5150// ldr [%tileslice2, 8], [%ptr2, 8]
5151// Case 5: The vecnum being an add of an immediate out of range is also handled,
5152// in which case the same remainder logic as case 3 is used.
5153SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5154 SDLoc DL(N);
5155
5156 SDValue TileSlice = N->getOperand(Num: 2);
5157 SDValue Base = N->getOperand(Num: 3);
5158 SDValue VecNum = N->getOperand(Num: 4);
5159 int32_t ConstAddend = 0;
5160 SDValue VarAddend = VecNum;
5161
5162 // If the vnum is an add of an immediate, we can fold it into the instruction
5163 if (VecNum.getOpcode() == ISD::ADD &&
5164 isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) {
5165 ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue();
5166 VarAddend = VecNum.getOperand(i: 0);
5167 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) {
5168 ConstAddend = ImmNode->getSExtValue();
5169 VarAddend = SDValue();
5170 }
5171
5172 int32_t ImmAddend = ConstAddend % 16;
5173 if (int32_t C = (ConstAddend - ImmAddend)) {
5174 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5175 VarAddend = VarAddend
5176 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5177 : CVal;
5178 }
5179
5180 if (VarAddend) {
5181 // Get the vector length that will be multiplied by vnum
5182 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5183 DAG.getConstant(1, DL, MVT::i32));
5184
5185 // Multiply SVL and vnum then add it to the base
5186 SDValue Mul = DAG.getNode(
5187 ISD::MUL, DL, MVT::i64,
5188 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5189 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5190 // Just add vnum to the tileslice
5191 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5192 }
5193
5194 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5195 DL, MVT::Other,
5196 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5197 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5198}
5199
5200SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5201 SelectionDAG &DAG) const {
5202 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5203 SDLoc DL(Op);
5204 switch (IntNo) {
5205 default:
5206 return SDValue(); // Don't custom lower most intrinsics.
5207 case Intrinsic::aarch64_prefetch: {
5208 SDValue Chain = Op.getOperand(i: 0);
5209 SDValue Addr = Op.getOperand(i: 2);
5210
5211 unsigned IsWrite = Op.getConstantOperandVal(i: 3);
5212 unsigned Locality = Op.getConstantOperandVal(i: 4);
5213 unsigned IsStream = Op.getConstantOperandVal(i: 5);
5214 unsigned IsData = Op.getConstantOperandVal(i: 6);
5215 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5216 (!IsData << 3) | // IsDataCache bit
5217 (Locality << 1) | // Cache level bits
5218 (unsigned)IsStream; // Stream bit
5219
5220 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5221 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5222 }
5223 case Intrinsic::aarch64_sme_str:
5224 case Intrinsic::aarch64_sme_ldr: {
5225 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5226 }
5227 case Intrinsic::aarch64_sme_za_enable:
5228 return DAG.getNode(
5229 AArch64ISD::SMSTART, DL, MVT::Other,
5230 Op->getOperand(0), // Chain
5231 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5232 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5233 case Intrinsic::aarch64_sme_za_disable:
5234 return DAG.getNode(
5235 AArch64ISD::SMSTOP, DL, MVT::Other,
5236 Op->getOperand(0), // Chain
5237 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5238 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5239 }
5240}
5241
5242SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5243 SelectionDAG &DAG) const {
5244 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5245 SDLoc DL(Op);
5246 switch (IntNo) {
5247 default:
5248 return SDValue(); // Don't custom lower most intrinsics.
5249 case Intrinsic::aarch64_mops_memset_tag: {
5250 auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode());
5251 SDValue Chain = Node->getChain();
5252 SDValue Dst = Op.getOperand(i: 2);
5253 SDValue Val = Op.getOperand(i: 3);
5254 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5255 SDValue Size = Op.getOperand(i: 4);
5256 auto Alignment = Node->getMemOperand()->getAlign();
5257 bool IsVol = Node->isVolatile();
5258 auto DstPtrInfo = Node->getPointerInfo();
5259
5260 const auto &SDI =
5261 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5262 SDValue MS =
5263 SDI.EmitMOPS(SDOpcode: AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, SrcOrValue: Val,
5264 Size, Alignment, isVolatile: IsVol, DstPtrInfo, SrcPtrInfo: MachinePointerInfo{});
5265
5266 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5267 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5268 // LowerOperationWrapper will complain that the number of results has
5269 // changed.
5270 return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL);
5271 }
5272 }
5273}
5274
5275SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5276 SelectionDAG &DAG) const {
5277 unsigned IntNo = Op.getConstantOperandVal(i: 0);
5278 SDLoc dl(Op);
5279 switch (IntNo) {
5280 default: return SDValue(); // Don't custom lower most intrinsics.
5281 case Intrinsic::thread_pointer: {
5282 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
5283 return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL: dl, VT: PtrVT);
5284 }
5285 case Intrinsic::aarch64_neon_abs: {
5286 EVT Ty = Op.getValueType();
5287 if (Ty == MVT::i64) {
5288 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5289 Op.getOperand(1));
5290 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5291 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5292 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) {
5293 return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: Ty, Operand: Op.getOperand(i: 1));
5294 } else {
      report_fatal_error(reason: "Unexpected type for AArch64 NEON intrinsic");
5296 }
5297 }
5298 case Intrinsic::aarch64_neon_pmull64: {
5299 SDValue LHS = Op.getOperand(i: 1);
5300 SDValue RHS = Op.getOperand(i: 2);
5301
5302 std::optional<uint64_t> LHSLane =
5303 getConstantLaneNumOfExtractHalfOperand(Op&: LHS);
5304 std::optional<uint64_t> RHSLane =
5305 getConstantLaneNumOfExtractHalfOperand(Op&: RHS);
5306
5307 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5308 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5309
    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, this generates a ldr into d*
    // registers rather than a GPR load followed by a fmov.
5314 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5315 std::optional<uint64_t> OtherLane,
5316 const SDLoc &dl,
5317 SelectionDAG &DAG) -> SDValue {
      // If the operand is a higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 can re-use the
      // dag-combiner function shared with aarch64_neon_{pmull,smull,umull}.
5321 if (NLane && *NLane == 1)
5322 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5323 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5324
5325 // Operand N is not a higher half but the other operand is.
5326 if (OtherLane && *OtherLane == 1) {
5327 // If this operand is a lower half, rewrite it to
5328 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5329 // align lanes of two operands. A roundtrip sequence (to move from lane
5330 // 1 to lane 0) is like this:
5331 // mov x8, v0.d[1]
5332 // fmov d0, x8
5333 if (NLane && *NLane == 0)
5334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5335 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5336 N.getOperand(0),
5337 DAG.getConstant(0, dl, MVT::i64)),
5338 DAG.getConstant(1, dl, MVT::i64));
5339
5340 // Otherwise just dup from main to all lanes.
5341 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5342 }
5343
      // Neither operand is an extract of the higher half, so codegen may just
      // use the non-high version of the PMULL instruction. Use v1i64 to
      // represent i64.
5346 assert(N.getValueType() == MVT::i64 &&
5347 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5348 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5349 };
5350
5351 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5352 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5353
5354 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
5355 }
5356 case Intrinsic::aarch64_neon_smax:
5357 return DAG.getNode(Opcode: ISD::SMAX, DL: dl, VT: Op.getValueType(),
5358 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5359 case Intrinsic::aarch64_neon_umax:
5360 return DAG.getNode(Opcode: ISD::UMAX, DL: dl, VT: Op.getValueType(),
5361 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5362 case Intrinsic::aarch64_neon_smin:
5363 return DAG.getNode(Opcode: ISD::SMIN, DL: dl, VT: Op.getValueType(),
5364 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5365 case Intrinsic::aarch64_neon_umin:
5366 return DAG.getNode(Opcode: ISD::UMIN, DL: dl, VT: Op.getValueType(),
5367 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5368 case Intrinsic::aarch64_neon_scalar_sqxtn:
5369 case Intrinsic::aarch64_neon_scalar_sqxtun:
5370 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5371 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5372 if (Op.getValueType() == MVT::i32)
5373 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5374 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5375 Op.getOperand(0),
5376 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5377 Op.getOperand(1))));
5378 return SDValue();
5379 }
5380 case Intrinsic::aarch64_sve_whilelo:
5381 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5382 /*IsEqual=*/false);
5383 case Intrinsic::aarch64_sve_whilelt:
5384 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5385 /*IsEqual=*/false);
5386 case Intrinsic::aarch64_sve_whilels:
5387 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5388 /*IsEqual=*/true);
5389 case Intrinsic::aarch64_sve_whilele:
5390 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5391 /*IsEqual=*/true);
5392 case Intrinsic::aarch64_sve_sunpkhi:
5393 return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL: dl, VT: Op.getValueType(),
5394 Operand: Op.getOperand(i: 1));
5395 case Intrinsic::aarch64_sve_sunpklo:
5396 return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL: dl, VT: Op.getValueType(),
5397 Operand: Op.getOperand(i: 1));
5398 case Intrinsic::aarch64_sve_uunpkhi:
5399 return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL: dl, VT: Op.getValueType(),
5400 Operand: Op.getOperand(i: 1));
5401 case Intrinsic::aarch64_sve_uunpklo:
5402 return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL: dl, VT: Op.getValueType(),
5403 Operand: Op.getOperand(i: 1));
5404 case Intrinsic::aarch64_sve_clasta_n:
5405 return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL: dl, VT: Op.getValueType(),
5406 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5407 case Intrinsic::aarch64_sve_clastb_n:
5408 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: dl, VT: Op.getValueType(),
5409 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5410 case Intrinsic::aarch64_sve_lasta:
5411 return DAG.getNode(Opcode: AArch64ISD::LASTA, DL: dl, VT: Op.getValueType(),
5412 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5413 case Intrinsic::aarch64_sve_lastb:
5414 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: dl, VT: Op.getValueType(),
5415 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5416 case Intrinsic::aarch64_sve_rev:
5417 return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL: dl, VT: Op.getValueType(),
5418 Operand: Op.getOperand(i: 1));
5419 case Intrinsic::aarch64_sve_tbl:
5420 return DAG.getNode(Opcode: AArch64ISD::TBL, DL: dl, VT: Op.getValueType(),
5421 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5422 case Intrinsic::aarch64_sve_trn1:
5423 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT: Op.getValueType(),
5424 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5425 case Intrinsic::aarch64_sve_trn2:
5426 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT: Op.getValueType(),
5427 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5428 case Intrinsic::aarch64_sve_uzp1:
5429 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT: Op.getValueType(),
5430 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5431 case Intrinsic::aarch64_sve_uzp2:
5432 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT: Op.getValueType(),
5433 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5434 case Intrinsic::aarch64_sve_zip1:
5435 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: Op.getValueType(),
5436 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5437 case Intrinsic::aarch64_sve_zip2:
5438 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT: Op.getValueType(),
5439 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5440 case Intrinsic::aarch64_sve_splice:
5441 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL: dl, VT: Op.getValueType(),
5442 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5443 case Intrinsic::aarch64_sve_ptrue:
5444 return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1));
5445 case Intrinsic::aarch64_sve_clz:
5446 return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5447 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
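  // The SME counting intrinsics below all start from RDSVL #1, i.e. the
  // streaming vector length in bytes; halfword, word and doubleword counts
  // are formed by shifting that byte count right by 1, 2 and 3 respectively.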
5448 case Intrinsic::aarch64_sme_cntsb:
5449 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5450 DAG.getConstant(1, dl, MVT::i32));
5451 case Intrinsic::aarch64_sme_cntsh: {
5452 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5453 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), Operand: One);
5454 return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, N2: One);
5455 }
5456 case Intrinsic::aarch64_sme_cntsw: {
5457 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5458 DAG.getConstant(1, dl, MVT::i32));
5459 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5460 DAG.getConstant(2, dl, MVT::i32));
5461 }
5462 case Intrinsic::aarch64_sme_cntsd: {
5463 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5464 DAG.getConstant(1, dl, MVT::i32));
5465 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5466 DAG.getConstant(3, dl, MVT::i32));
5467 }
5468 case Intrinsic::aarch64_sve_cnt: {
5469 SDValue Data = Op.getOperand(i: 3);
5470 // CTPOP only supports integer operands.
5471 if (Data.getValueType().isFloatingPoint())
5472 Data = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Data);
5473 return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5474 N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1));
5475 }
5476 case Intrinsic::aarch64_sve_dupq_lane:
5477 return LowerDUPQLane(Op, DAG);
5478 case Intrinsic::aarch64_sve_convert_from_svbool:
5479 if (Op.getValueType() == MVT::aarch64svcount)
5480 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
5481 return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG);
5482 case Intrinsic::aarch64_sve_convert_to_svbool:
5483 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5484 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5485 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5486 case Intrinsic::aarch64_sve_fneg:
5487 return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5488 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5489 case Intrinsic::aarch64_sve_frintp:
5490 return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5491 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5492 case Intrinsic::aarch64_sve_frintm:
5493 return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5494 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5495 case Intrinsic::aarch64_sve_frinti:
5496 return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5497 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5498 case Intrinsic::aarch64_sve_frintx:
5499 return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5500 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5501 case Intrinsic::aarch64_sve_frinta:
5502 return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5503 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5504 case Intrinsic::aarch64_sve_frintn:
5505 return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5506 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5507 case Intrinsic::aarch64_sve_frintz:
5508 return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5509 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5510 case Intrinsic::aarch64_sve_ucvtf:
5511 return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5512 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5513 N3: Op.getOperand(i: 1));
5514 case Intrinsic::aarch64_sve_scvtf:
5515 return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL: dl,
5516 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5517 N3: Op.getOperand(i: 1));
5518 case Intrinsic::aarch64_sve_fcvtzu:
5519 return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL: dl,
5520 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5521 N3: Op.getOperand(i: 1));
5522 case Intrinsic::aarch64_sve_fcvtzs:
5523 return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL: dl,
5524 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5525 N3: Op.getOperand(i: 1));
5526 case Intrinsic::aarch64_sve_fsqrt:
5527 return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5528 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5529 case Intrinsic::aarch64_sve_frecpx:
5530 return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5531 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5532 case Intrinsic::aarch64_sve_frecpe_x:
5533 return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL: dl, VT: Op.getValueType(),
5534 Operand: Op.getOperand(i: 1));
5535 case Intrinsic::aarch64_sve_frecps_x:
5536 return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL: dl, VT: Op.getValueType(),
5537 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5538 case Intrinsic::aarch64_sve_frsqrte_x:
5539 return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL: dl, VT: Op.getValueType(),
5540 Operand: Op.getOperand(i: 1));
5541 case Intrinsic::aarch64_sve_frsqrts_x:
5542 return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL: dl, VT: Op.getValueType(),
5543 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
5544 case Intrinsic::aarch64_sve_fabs:
5545 return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5546 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5547 case Intrinsic::aarch64_sve_abs:
5548 return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5549 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5550 case Intrinsic::aarch64_sve_neg:
5551 return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5552 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5553 case Intrinsic::aarch64_sve_insr: {
5554 SDValue Scalar = Op.getOperand(i: 2);
5555 EVT ScalarTy = Scalar.getValueType();
5556 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5557 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5558
5559 return DAG.getNode(Opcode: AArch64ISD::INSR, DL: dl, VT: Op.getValueType(),
5560 N1: Op.getOperand(i: 1), N2: Scalar);
5561 }
5562 case Intrinsic::aarch64_sve_rbit:
5563 return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL: dl,
5564 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
5565 N3: Op.getOperand(i: 1));
5566 case Intrinsic::aarch64_sve_revb:
5567 return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5568 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5569 case Intrinsic::aarch64_sve_revh:
5570 return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5571 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5572 case Intrinsic::aarch64_sve_revw:
5573 return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5574 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5575 case Intrinsic::aarch64_sve_revd:
5576 return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(),
5577 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
5578 case Intrinsic::aarch64_sve_sxtb:
5579 return DAG.getNode(
5580 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5581 Op.getOperand(2), Op.getOperand(3),
5582 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5583 Op.getOperand(1));
5584 case Intrinsic::aarch64_sve_sxth:
5585 return DAG.getNode(
5586 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5587 Op.getOperand(2), Op.getOperand(3),
5588 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5589 Op.getOperand(1));
5590 case Intrinsic::aarch64_sve_sxtw:
5591 return DAG.getNode(
5592 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5593 Op.getOperand(2), Op.getOperand(3),
5594 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5595 Op.getOperand(1));
5596 case Intrinsic::aarch64_sve_uxtb:
5597 return DAG.getNode(
5598 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5599 Op.getOperand(2), Op.getOperand(3),
5600 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5601 Op.getOperand(1));
5602 case Intrinsic::aarch64_sve_uxth:
5603 return DAG.getNode(
5604 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5605 Op.getOperand(2), Op.getOperand(3),
5606 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5607 Op.getOperand(1));
5608 case Intrinsic::aarch64_sve_uxtw:
5609 return DAG.getNode(
5610 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5611 Op.getOperand(2), Op.getOperand(3),
5612 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5613 Op.getOperand(1));
5614 case Intrinsic::localaddress: {
5615 const auto &MF = DAG.getMachineFunction();
5616 const auto *RegInfo = Subtarget->getRegisterInfo();
5617 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5618 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg,
5619 VT: Op.getSimpleValueType());
5620 }
5621
5622 case Intrinsic::eh_recoverfp: {
5623 // FIXME: This needs to be implemented to correctly handle highly aligned
5624 // stack objects. For now we simply return the incoming FP. Refer D53541
5625 // for more details.
5626 SDValue FnOp = Op.getOperand(i: 1);
5627 SDValue IncomingFPOp = Op.getOperand(i: 2);
5628 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp);
5629 auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr);
5630 if (!Fn)
5631 report_fatal_error(
5632 reason: "llvm.eh.recoverfp must take a function as the first argument");
5633 return IncomingFPOp;
5634 }
5635
5636 case Intrinsic::aarch64_neon_vsri:
5637 case Intrinsic::aarch64_neon_vsli:
5638 case Intrinsic::aarch64_sve_sri:
5639 case Intrinsic::aarch64_sve_sli: {
5640 EVT Ty = Op.getValueType();
5641
5642 if (!Ty.isVector())
5643 report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli");
5644
5645 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5646
5647 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5648 IntNo == Intrinsic::aarch64_sve_sri;
5649 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5650 return DAG.getNode(Opcode, DL: dl, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
5651 N3: Op.getOperand(i: 3));
5652 }
5653
5654 case Intrinsic::aarch64_neon_srhadd:
5655 case Intrinsic::aarch64_neon_urhadd:
5656 case Intrinsic::aarch64_neon_shadd:
5657 case Intrinsic::aarch64_neon_uhadd: {
5658 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5659 IntNo == Intrinsic::aarch64_neon_shadd);
5660 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5661 IntNo == Intrinsic::aarch64_neon_urhadd);
5662 unsigned Opcode = IsSignedAdd
5663 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5664 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5665 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5666 N2: Op.getOperand(i: 2));
5667 }
5668 case Intrinsic::aarch64_neon_saddlp:
5669 case Intrinsic::aarch64_neon_uaddlp: {
5670 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5671 ? AArch64ISD::UADDLP
5672 : AArch64ISD::SADDLP;
5673 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
5674 }
5675 case Intrinsic::aarch64_neon_sdot:
5676 case Intrinsic::aarch64_neon_udot:
5677 case Intrinsic::aarch64_sve_sdot:
5678 case Intrinsic::aarch64_sve_udot: {
5679 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5680 IntNo == Intrinsic::aarch64_sve_udot)
5681 ? AArch64ISD::UDOT
5682 : AArch64ISD::SDOT;
5683 return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5684 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
5685 }
5686 case Intrinsic::get_active_lane_mask: {
5687 SDValue ID =
5688 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5689 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: Op.getValueType(), N1: ID,
5690 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
5691 }
5692 case Intrinsic::aarch64_neon_uaddlv: {
5693 EVT OpVT = Op.getOperand(i: 1).getValueType();
5694 EVT ResVT = Op.getValueType();
5695 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5696 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
      // In order to avoid insert_subvector, use v4i32 rather than v2i32.
5698 SDValue UADDLV =
5699 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5700 SDValue EXTRACT_VEC_ELT =
5701 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5702 DAG.getConstant(0, dl, MVT::i64));
5703 return EXTRACT_VEC_ELT;
5704 }
5705 return SDValue();
5706 }
5707 case Intrinsic::experimental_cttz_elts: {
5708 SDValue NewCttzElts =
5709 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5710
5711 return DAG.getZExtOrTrunc(Op: NewCttzElts, DL: dl, VT: Op.getValueType());
5712 }
5713 }
5714}
5715
5716bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5717 if (VT.getVectorElementType() == MVT::i8 ||
5718 VT.getVectorElementType() == MVT::i16) {
5719 EltTy = MVT::i32;
5720 return true;
5721 }
5722 return false;
5723}
5724
5725bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5726 EVT DataVT) const {
5727 const EVT IndexVT = Extend.getOperand(i: 0).getValueType();
5728 // SVE only supports implicit extension of 32-bit indices.
5729 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5730 return false;
5731
5732 // Indices cannot be smaller than the main data type.
5733 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5734 return false;
5735
5736 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5737 // element container type, which would violate the previous clause.
5738 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5739}
5740
5741bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5742 EVT ExtVT = ExtVal.getValueType();
5743 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5744 return false;
5745
5746 // It may be worth creating extending masked loads if there are multiple
5747 // masked loads using the same predicate. That way we'll end up creating
5748 // extending masked loads that may then get split by the legaliser. This
5749 // results in just one set of predicate unpacks at the start, instead of
5750 // multiple sets of vector unpacks after each load.
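  // A rough illustration (hypothetical IR): two llvm.masked.load calls that
  // share the same predicate and are each followed by a zext. Turning them
  // into extending masked loads means the predicate is unpacked once up
  // front instead of unpacking every loaded vector afterwards.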
5751 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) {
5752 if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) {
5753 // Disable extending masked loads for fixed-width for now, since the code
5754 // quality doesn't look great.
5755 if (!ExtVT.isScalableVector())
5756 return false;
5757
5758 unsigned NumExtMaskedLoads = 0;
5759 for (auto *U : Ld->getMask()->uses())
5760 if (isa<MaskedLoadSDNode>(Val: U))
5761 NumExtMaskedLoads++;
5762
5763 if (NumExtMaskedLoads <= 1)
5764 return false;
5765 }
5766 }
5767
5768 return true;
5769}
5770
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
  };
  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
  return AddrModes.find(Key)->second;
}
5793
5794unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5795 switch (Opcode) {
5796 default:
5797 llvm_unreachable("unimplemented opcode");
5798 return Opcode;
5799 case AArch64ISD::GLD1_MERGE_ZERO:
5800 return AArch64ISD::GLD1S_MERGE_ZERO;
5801 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5802 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5803 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5804 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5805 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5806 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5807 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5808 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5809 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5810 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5811 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5812 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5813 }
5814}
5815
5816SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5817 SelectionDAG &DAG) const {
5818 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op);
5819
5820 SDLoc DL(Op);
5821 SDValue Chain = MGT->getChain();
5822 SDValue PassThru = MGT->getPassThru();
5823 SDValue Mask = MGT->getMask();
5824 SDValue BasePtr = MGT->getBasePtr();
5825 SDValue Index = MGT->getIndex();
5826 SDValue Scale = MGT->getScale();
5827 EVT VT = Op.getValueType();
5828 EVT MemVT = MGT->getMemoryVT();
5829 ISD::LoadExtType ExtType = MGT->getExtensionType();
5830 ISD::MemIndexType IndexType = MGT->getIndexType();
5831
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                            MGT->getMemOperand(), IndexType, ExtType);
    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  }
5842
5843 bool IsScaled = MGT->isIndexScaled();
5844 bool IsSigned = MGT->isIndexSigned();
5845
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
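  // For example, a gather of i64 elements with a scale of 16 takes this
  // path: the index vector is shifted left by 4 and the scale is reset to 1.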
5848 uint64_t ScaleVal = Scale->getAsZExtVal();
5849 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5850 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5851 EVT IndexVT = Index.getValueType();
5852 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
5853 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
5854 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
5855
5856 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5857 return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
5858 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
5859 }
5860
5861 // Lower fixed length gather to a scalable equivalent.
5862 if (VT.isFixedLengthVector()) {
5863 assert(Subtarget->useSVEForFixedLengthVectors() &&
5864 "Cannot lower when not using SVE for fixed vectors!");
5865
5866 // NOTE: Handle floating-point as if integer then bitcast the result.
5867 EVT DataVT = VT.changeVectorElementTypeToInteger();
5868 MemVT = MemVT.changeVectorElementTypeToInteger();
5869
5870 // Find the smallest integer fixed length vector we can use for the gather.
5871 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5872 if (DataVT.getVectorElementType() == MVT::i64 ||
5873 Index.getValueType().getVectorElementType() == MVT::i64 ||
5874 Mask.getValueType().getVectorElementType() == MVT::i64)
5875 PromotedVT = VT.changeVectorElementType(MVT::i64);
5876
5877 // Promote vector operands except for passthrough, which we know is either
5878 // undef or zero, and thus best constructed directly.
5879 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5880 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
5881 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
5882
5883 // A promoted result type forces the need for an extending load.
5884 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5885 ExtType = ISD::EXTLOAD;
5886
5887 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
5888
5889 // Convert fixed length vector operands to scalable.
5890 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
5891 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
5892 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5893 PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT)
5894 : DAG.getConstant(Val: 0, DL, VT: ContainerVT);
5895
5896 // Emit equivalent scalable vector gather.
5897 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5898 SDValue Load =
5899 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5900 Ops, MGT->getMemOperand(), IndexType, ExtType);
5901
5902 // Extract fixed length data then convert to the required result type.
5903 SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load);
5904 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result);
5905 if (VT.isFloatingPoint())
5906 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result);
5907
5908 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
5909 }
5910
5911 // Everything else is legal.
5912 return Op;
5913}
5914
5915SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5916 SelectionDAG &DAG) const {
5917 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op);
5918
5919 SDLoc DL(Op);
5920 SDValue Chain = MSC->getChain();
5921 SDValue StoreVal = MSC->getValue();
5922 SDValue Mask = MSC->getMask();
5923 SDValue BasePtr = MSC->getBasePtr();
5924 SDValue Index = MSC->getIndex();
5925 SDValue Scale = MSC->getScale();
5926 EVT VT = StoreVal.getValueType();
5927 EVT MemVT = MSC->getMemoryVT();
5928 ISD::MemIndexType IndexType = MSC->getIndexType();
5929 bool Truncating = MSC->isTruncatingStore();
5930
5931 bool IsScaled = MSC->isIndexScaled();
5932 bool IsSigned = MSC->isIndexSigned();
5933
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
5936 uint64_t ScaleVal = Scale->getAsZExtVal();
5937 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5938 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5939 EVT IndexVT = Index.getValueType();
5940 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
5941 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
5942 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
5943
5944 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5945 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
5946 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
5947 }
5948
5949 // Lower fixed length scatter to a scalable equivalent.
5950 if (VT.isFixedLengthVector()) {
5951 assert(Subtarget->useSVEForFixedLengthVectors() &&
5952 "Cannot lower when not using SVE for fixed vectors!");
5953
5954 // Once bitcast we treat floating-point scatters as if integer.
5955 if (VT.isFloatingPoint()) {
5956 VT = VT.changeVectorElementTypeToInteger();
5957 MemVT = MemVT.changeVectorElementTypeToInteger();
5958 StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal);
5959 }
5960
5961 // Find the smallest integer fixed length vector we can use for the scatter.
5962 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5963 if (VT.getVectorElementType() == MVT::i64 ||
5964 Index.getValueType().getVectorElementType() == MVT::i64 ||
5965 Mask.getValueType().getVectorElementType() == MVT::i64)
5966 PromotedVT = VT.changeVectorElementType(MVT::i64);
5967
5968 // Promote vector operands.
5969 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5970 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
5971 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
5972 StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal);
5973
5974 // A promoted value type forces the need for a truncating store.
5975 if (PromotedVT != VT)
5976 Truncating = true;
5977
5978 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
5979
5980 // Convert fixed length vector operands to scalable.
5981 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
5982 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
5983 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5984 StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal);
5985
5986 // Emit equivalent scalable vector scatter.
5987 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5988 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
5989 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
5990 }
5991
5992 // Everything else is legal.
5993 return Op;
5994}
5995
SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a masked load node");
  EVT VT = Op->getValueType(0);

  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

  SDValue PassThru = LoadNode->getPassThru();
  SDValue Mask = LoadNode->getMask();

  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
    return Op;

  SDValue Load = DAG.getMaskedLoad(
      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
      LoadNode->getExtensionType());

  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);

  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
}
6021
6022// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6023static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6024 EVT VT, EVT MemVT,
6025 SelectionDAG &DAG) {
6026 assert(VT.isVector() && "VT should be a vector type");
6027 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6028
6029 SDValue Value = ST->getValue();
6030
  // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
  // the word lane that represents the v4i8 subvector. This optimizes the
  // store to:
6034 //
6035 // xtn v0.8b, v0.8h
6036 // str s0, [x0]
6037
6038 SDValue Undef = DAG.getUNDEF(MVT::i16);
6039 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6040 {Undef, Undef, Undef, Undef});
6041
6042 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6043 Value, UndefVec);
6044 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6045
6046 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6047 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6048 Trunc, DAG.getConstant(0, DL, MVT::i64));
6049
6050 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc,
6051 Ptr: ST->getBasePtr(), MMO: ST->getMemOperand());
6052}
6053
// Custom lowering for any store, vector or scalar, with or without a
// truncate operation. Currently we only custom lower truncating stores from
// vector v4i16 to v4i8 and volatile stores of i128.
6057SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6058 SelectionDAG &DAG) const {
6059 SDLoc Dl(Op);
6060 StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
6061 assert (StoreNode && "Can only custom lower store nodes");
6062
6063 SDValue Value = StoreNode->getValue();
6064
6065 EVT VT = Value.getValueType();
6066 EVT MemVT = StoreNode->getMemoryVT();
6067
6068 if (VT.isVector()) {
6069 if (useSVEForFixedLengthVectorVT(
6070 VT,
6071 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6072 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6073
6074 unsigned AS = StoreNode->getAddressSpace();
6075 Align Alignment = StoreNode->getAlign();
6076 if (Alignment < MemVT.getStoreSize() &&
6077 !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment,
6078 Flags: StoreNode->getMemOperand()->getFlags(),
6079 Fast: nullptr)) {
6080 return scalarizeVectorStore(ST: StoreNode, DAG);
6081 }
6082
6083 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6084 MemVT == MVT::v4i8) {
6085 return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG);
6086 }
6087 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6088 // the custom lowering, as there are no un-paired non-temporal stores and
6089 // legalization will break up 256 bit inputs.
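    // For example, a non-temporal store of v8i32 is split into two v4i32
    // halves and emitted as a single paired store, e.g. "stnp q0, q1, [x0]".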
6090 ElementCount EC = MemVT.getVectorElementCount();
6091 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6092 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6093 (MemVT.getScalarSizeInBits() == 8u ||
6094 MemVT.getScalarSizeInBits() == 16u ||
6095 MemVT.getScalarSizeInBits() == 32u ||
6096 MemVT.getScalarSizeInBits() == 64u)) {
6097 SDValue Lo =
6098 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6099 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6100 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6101 SDValue Hi =
6102 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6103 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6104 StoreNode->getValue(),
6105 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6106 SDValue Result = DAG.getMemIntrinsicNode(
6107 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6108 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6109 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6110 return Result;
6111 }
6112 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6113 return LowerStore128(Op, DAG);
6114 } else if (MemVT == MVT::i64x8) {
6115 SDValue Value = StoreNode->getValue();
6116 assert(Value->getValueType(0) == MVT::i64x8);
6117 SDValue Chain = StoreNode->getChain();
6118 SDValue Base = StoreNode->getBasePtr();
6119 EVT PtrVT = Base.getValueType();
6120 for (unsigned i = 0; i < 8; i++) {
6121 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6122 Value, DAG.getConstant(i, Dl, MVT::i32));
6123 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base,
6124 N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT));
6125 Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(),
6126 Alignment: StoreNode->getOriginalAlign());
6127 }
6128 return Chain;
6129 }
6130
6131 return SDValue();
6132}
6133
6134/// Lower atomic or volatile 128-bit stores to a single STP instruction.
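/// The i128 value is split into two i64 halves (swapped on big-endian
/// targets) and stored with a single STP, or STILP when release ordering is
/// required.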
6135SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6136 SelectionDAG &DAG) const {
6137 MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op);
6138 assert(StoreNode->getMemoryVT() == MVT::i128);
6139 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6140
6141 bool IsStoreRelease =
6142 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6143 if (StoreNode->isAtomic())
6144 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6145 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6146 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6147 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6148
6149 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6150 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6151 ? StoreNode->getOperand(Num: 1)
6152 : StoreNode->getOperand(Num: 2);
6153 SDLoc DL(Op);
6154 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6155 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6156 if (DAG.getDataLayout().isBigEndian())
6157 std::swap(StoreValue.first, StoreValue.second);
6158 SDValue Result = DAG.getMemIntrinsicNode(
6159 Opcode, DL, DAG.getVTList(MVT::Other),
6160 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6161 StoreNode->getBasePtr()},
6162 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6163 return Result;
6164}
6165
6166SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6167 SelectionDAG &DAG) const {
6168 SDLoc DL(Op);
6169 LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
6170 assert(LoadNode && "Expected custom lowering of a load node");
6171
6172 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6173 SmallVector<SDValue, 8> Ops;
6174 SDValue Base = LoadNode->getBasePtr();
6175 SDValue Chain = LoadNode->getChain();
6176 EVT PtrVT = Base.getValueType();
6177 for (unsigned i = 0; i < 8; i++) {
6178 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base,
6179 N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT));
6180 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6181 LoadNode->getPointerInfo(),
6182 LoadNode->getOriginalAlign());
6183 Ops.push_back(Elt: Part);
6184 Chain = SDValue(Part.getNode(), 1);
6185 }
6186 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6187 return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL);
6188 }
6189
6190 // Custom lowering for extending v4i8 vector loads.
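  // The approach, in outline: load the four bytes as an f32 scalar, bitcast
  // the containing 64-bit vector to v8i8, extend it to v8i16 and keep the low
  // v4i16 half, extending once more to v4i32 when that is the result type.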
6191 EVT VT = Op->getValueType(ResNo: 0);
6192 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6193
6194 if (LoadNode->getMemoryVT() != MVT::v4i8)
6195 return SDValue();
6196
6197 unsigned ExtType;
6198 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6199 ExtType = ISD::SIGN_EXTEND;
6200 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6201 LoadNode->getExtensionType() == ISD::EXTLOAD)
6202 ExtType = ISD::ZERO_EXTEND;
6203 else
6204 return SDValue();
6205
6206 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6207 LoadNode->getBasePtr(), MachinePointerInfo());
6208 SDValue Chain = Load.getValue(R: 1);
6209 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6210 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6211 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6212 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6213 DAG.getConstant(0, DL, MVT::i64));
6214 if (VT == MVT::v4i32)
6215 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6216 return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL);
6217}
6218
6219// Generate SUBS and CSEL for integer abs.
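// Conceptually abs(x) == (x >= 0) ? x : (0 - x): SUBS sets the flags for the
// comparison against zero and CSEL selects on the PL condition.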
6220SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6221 MVT VT = Op.getSimpleValueType();
6222
6223 if (VT.isVector())
6224 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU);
6225
6226 SDLoc DL(Op);
6227 SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
6228 N2: Op.getOperand(i: 0));
6229 // Generate SUBS & CSEL.
6230 SDValue Cmp =
6231 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6232 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6233 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6234 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6235 Cmp.getValue(1));
6236}
6237
6238static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6239 SDValue Chain = Op.getOperand(i: 0);
6240 SDValue Cond = Op.getOperand(i: 1);
6241 SDValue Dest = Op.getOperand(i: 2);
6242
6243 AArch64CC::CondCode CC;
6244 if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) {
6245 SDLoc dl(Op);
6246 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6247 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6248 Cmp);
6249 }
6250
6251 return SDValue();
6252}
6253
// Treat FSHR with constant shifts as a legal operation; otherwise it is
// expanded. FSHL is converted to FSHR before deciding what to do with it.
6256static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6257 SDValue Shifts = Op.getOperand(i: 2);
  // Check if the shift amount is a constant; if the opcode is FSHL, convert
  // it to FSHR first.
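  // For example, on i64 "fshl(a, b, 3)" is rewritten as "fshr(a, b, 61)".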
6260 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) {
6261 SDLoc DL(Op);
6262 MVT VT = Op.getSimpleValueType();
6263
6264 if (Op.getOpcode() == ISD::FSHL) {
6265 unsigned int NewShiftNo =
6266 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6267 return DAG.getNode(
6268 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
6269 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
6270 } else if (Op.getOpcode() == ISD::FSHR) {
6271 return Op;
6272 }
6273 }
6274
6275 return SDValue();
6276}
6277
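// Lower a scalar FLDEXP by moving the operands into lane 0 of SVE vectors,
// applying the aarch64.sve.fscale intrinsic under an all-true predicate and
// extracting lane 0 of the result; f16/bf16 inputs are extended to f32 first
// and rounded back at the end.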
6278static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6279 SDValue X = Op.getOperand(i: 0);
6280 EVT XScalarTy = X.getValueType();
6281 SDValue Exp = Op.getOperand(i: 1);
6282
6283 SDLoc DL(Op);
6284 EVT XVT, ExpVT;
6285 switch (Op.getSimpleValueType().SimpleTy) {
6286 default:
6287 return SDValue();
6288 case MVT::bf16:
6289 case MVT::f16:
6290 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6291 [[fallthrough]];
6292 case MVT::f32:
6293 XVT = MVT::nxv4f32;
6294 ExpVT = MVT::nxv4i32;
6295 break;
6296 case MVT::f64:
6297 XVT = MVT::nxv2f64;
6298 ExpVT = MVT::nxv2i64;
6299 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6300 break;
6301 }
6302
6303 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6304 SDValue VX =
6305 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero);
6306 SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT,
6307 N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero);
6308 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6309 AArch64SVEPredPattern::all);
6310 SDValue FScale =
6311 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6312 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6313 VPg, VX, VExp);
6314 SDValue Final =
6315 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero);
6316 if (X.getValueType() != XScalarTy)
6317 Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final,
6318 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op)));
6319 return Final;
6320}
6321
6322SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6323 SelectionDAG &DAG) const {
6324 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6325 LLVM_DEBUG(Op.dump());
6326
6327 switch (Op.getOpcode()) {
6328 default:
6329 llvm_unreachable("unimplemented operand");
6330 return SDValue();
6331 case ISD::BITCAST:
6332 return LowerBITCAST(Op, DAG);
6333 case ISD::GlobalAddress:
6334 return LowerGlobalAddress(Op, DAG);
6335 case ISD::GlobalTLSAddress:
6336 return LowerGlobalTLSAddress(Op, DAG);
6337 case ISD::SETCC:
6338 case ISD::STRICT_FSETCC:
6339 case ISD::STRICT_FSETCCS:
6340 return LowerSETCC(Op, DAG);
6341 case ISD::SETCCCARRY:
6342 return LowerSETCCCARRY(Op, DAG);
6343 case ISD::BRCOND:
6344 return LowerBRCOND(Op, DAG);
6345 case ISD::BR_CC:
6346 return LowerBR_CC(Op, DAG);
6347 case ISD::SELECT:
6348 return LowerSELECT(Op, DAG);
6349 case ISD::SELECT_CC:
6350 return LowerSELECT_CC(Op, DAG);
6351 case ISD::JumpTable:
6352 return LowerJumpTable(Op, DAG);
6353 case ISD::BR_JT:
6354 return LowerBR_JT(Op, DAG);
6355 case ISD::ConstantPool:
6356 return LowerConstantPool(Op, DAG);
6357 case ISD::BlockAddress:
6358 return LowerBlockAddress(Op, DAG);
6359 case ISD::VASTART:
6360 return LowerVASTART(Op, DAG);
6361 case ISD::VACOPY:
6362 return LowerVACOPY(Op, DAG);
6363 case ISD::VAARG:
6364 return LowerVAARG(Op, DAG);
6365 case ISD::UADDO_CARRY:
6366 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/);
6367 case ISD::USUBO_CARRY:
6368 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/);
6369 case ISD::SADDO_CARRY:
6370 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/);
6371 case ISD::SSUBO_CARRY:
6372 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/);
6373 case ISD::SADDO:
6374 case ISD::UADDO:
6375 case ISD::SSUBO:
6376 case ISD::USUBO:
6377 case ISD::SMULO:
6378 case ISD::UMULO:
6379 return LowerXALUO(Op, DAG);
6380 case ISD::FADD:
6381 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED);
6382 case ISD::FSUB:
6383 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED);
6384 case ISD::FMUL:
6385 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED);
6386 case ISD::FMA:
6387 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED);
6388 case ISD::FDIV:
6389 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED);
6390 case ISD::FNEG:
6391 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU);
6392 case ISD::FCEIL:
6393 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU);
6394 case ISD::FFLOOR:
6395 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6396 case ISD::FNEARBYINT:
6397 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6398 case ISD::FRINT:
6399 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU);
6400 case ISD::FROUND:
6401 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU);
6402 case ISD::FROUNDEVEN:
6403 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6404 case ISD::FTRUNC:
6405 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6406 case ISD::FSQRT:
6407 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU);
6408 case ISD::FABS:
6409 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU);
6410 case ISD::FP_ROUND:
6411 case ISD::STRICT_FP_ROUND:
6412 return LowerFP_ROUND(Op, DAG);
6413 case ISD::FP_EXTEND:
6414 return LowerFP_EXTEND(Op, DAG);
6415 case ISD::FRAMEADDR:
6416 return LowerFRAMEADDR(Op, DAG);
6417 case ISD::SPONENTRY:
6418 return LowerSPONENTRY(Op, DAG);
6419 case ISD::RETURNADDR:
6420 return LowerRETURNADDR(Op, DAG);
6421 case ISD::ADDROFRETURNADDR:
6422 return LowerADDROFRETURNADDR(Op, DAG);
6423 case ISD::CONCAT_VECTORS:
6424 return LowerCONCAT_VECTORS(Op, DAG);
6425 case ISD::INSERT_VECTOR_ELT:
6426 return LowerINSERT_VECTOR_ELT(Op, DAG);
6427 case ISD::EXTRACT_VECTOR_ELT:
6428 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6429 case ISD::BUILD_VECTOR:
6430 return LowerBUILD_VECTOR(Op, DAG);
6431 case ISD::ZERO_EXTEND_VECTOR_INREG:
6432 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6433 case ISD::VECTOR_SHUFFLE:
6434 return LowerVECTOR_SHUFFLE(Op, DAG);
6435 case ISD::SPLAT_VECTOR:
6436 return LowerSPLAT_VECTOR(Op, DAG);
6437 case ISD::EXTRACT_SUBVECTOR:
6438 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6439 case ISD::INSERT_SUBVECTOR:
6440 return LowerINSERT_SUBVECTOR(Op, DAG);
6441 case ISD::SDIV:
6442 case ISD::UDIV:
6443 return LowerDIV(Op, DAG);
6444 case ISD::SMIN:
6445 case ISD::UMIN:
6446 case ISD::SMAX:
6447 case ISD::UMAX:
6448 return LowerMinMax(Op, DAG);
6449 case ISD::SRA:
6450 case ISD::SRL:
6451 case ISD::SHL:
6452 return LowerVectorSRA_SRL_SHL(Op, DAG);
6453 case ISD::SHL_PARTS:
6454 case ISD::SRL_PARTS:
6455 case ISD::SRA_PARTS:
6456 return LowerShiftParts(Op, DAG);
6457 case ISD::CTPOP:
6458 case ISD::PARITY:
6459 return LowerCTPOP_PARITY(Op, DAG);
6460 case ISD::FCOPYSIGN:
6461 return LowerFCOPYSIGN(Op, DAG);
6462 case ISD::OR:
6463 return LowerVectorOR(Op, DAG);
6464 case ISD::XOR:
6465 return LowerXOR(Op, DAG);
6466 case ISD::PREFETCH:
6467 return LowerPREFETCH(Op, DAG);
6468 case ISD::SINT_TO_FP:
6469 case ISD::UINT_TO_FP:
6470 case ISD::STRICT_SINT_TO_FP:
6471 case ISD::STRICT_UINT_TO_FP:
6472 return LowerINT_TO_FP(Op, DAG);
6473 case ISD::FP_TO_SINT:
6474 case ISD::FP_TO_UINT:
6475 case ISD::STRICT_FP_TO_SINT:
6476 case ISD::STRICT_FP_TO_UINT:
6477 return LowerFP_TO_INT(Op, DAG);
6478 case ISD::FP_TO_SINT_SAT:
6479 case ISD::FP_TO_UINT_SAT:
6480 return LowerFP_TO_INT_SAT(Op, DAG);
6481 case ISD::FSINCOS:
6482 return LowerFSINCOS(Op, DAG);
6483 case ISD::GET_ROUNDING:
6484 return LowerGET_ROUNDING(Op, DAG);
6485 case ISD::SET_ROUNDING:
6486 return LowerSET_ROUNDING(Op, DAG);
6487 case ISD::MUL:
6488 return LowerMUL(Op, DAG);
6489 case ISD::MULHS:
6490 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED);
6491 case ISD::MULHU:
6492 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED);
6493 case ISD::INTRINSIC_W_CHAIN:
6494 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6495 case ISD::INTRINSIC_WO_CHAIN:
6496 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6497 case ISD::INTRINSIC_VOID:
6498 return LowerINTRINSIC_VOID(Op, DAG);
6499 case ISD::ATOMIC_STORE:
6500 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6501 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6502 return LowerStore128(Op, DAG);
6503 }
6504 return SDValue();
6505 case ISD::STORE:
6506 return LowerSTORE(Op, DAG);
6507 case ISD::MSTORE:
6508 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6509 case ISD::MGATHER:
6510 return LowerMGATHER(Op, DAG);
6511 case ISD::MSCATTER:
6512 return LowerMSCATTER(Op, DAG);
6513 case ISD::VECREDUCE_SEQ_FADD:
6514 return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG);
6515 case ISD::VECREDUCE_ADD:
6516 case ISD::VECREDUCE_AND:
6517 case ISD::VECREDUCE_OR:
6518 case ISD::VECREDUCE_XOR:
6519 case ISD::VECREDUCE_SMAX:
6520 case ISD::VECREDUCE_SMIN:
6521 case ISD::VECREDUCE_UMAX:
6522 case ISD::VECREDUCE_UMIN:
6523 case ISD::VECREDUCE_FADD:
6524 case ISD::VECREDUCE_FMAX:
6525 case ISD::VECREDUCE_FMIN:
6526 case ISD::VECREDUCE_FMAXIMUM:
6527 case ISD::VECREDUCE_FMINIMUM:
6528 return LowerVECREDUCE(Op, DAG);
6529 case ISD::ATOMIC_LOAD_AND:
6530 return LowerATOMIC_LOAD_AND(Op, DAG);
6531 case ISD::DYNAMIC_STACKALLOC:
6532 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6533 case ISD::VSCALE:
6534 return LowerVSCALE(Op, DAG);
6535 case ISD::ANY_EXTEND:
6536 case ISD::SIGN_EXTEND:
6537 case ISD::ZERO_EXTEND:
6538 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6539 case ISD::SIGN_EXTEND_INREG: {
6540 // Only custom lower when ExtraVT has a legal byte based element type.
6541 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
6542 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6543 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6544 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6545 return SDValue();
6546
6547 return LowerToPredicatedOp(Op, DAG,
6548 NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6549 }
6550 case ISD::TRUNCATE:
6551 return LowerTRUNCATE(Op, DAG);
6552 case ISD::MLOAD:
6553 return LowerMLOAD(Op, DAG);
6554 case ISD::LOAD:
6555 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
6556 OverrideNEON: !Subtarget->isNeonAvailable()))
6557 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6558 return LowerLOAD(Op, DAG);
6559 case ISD::ADD:
6560 case ISD::AND:
6561 case ISD::SUB:
6562 return LowerToScalableOp(Op, DAG);
6563 case ISD::FMAXIMUM:
6564 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED);
6565 case ISD::FMAXNUM:
6566 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED);
6567 case ISD::FMINIMUM:
6568 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED);
6569 case ISD::FMINNUM:
6570 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED);
6571 case ISD::VSELECT:
6572 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6573 case ISD::ABS:
6574 return LowerABS(Op, DAG);
6575 case ISD::ABDS:
6576 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED);
6577 case ISD::ABDU:
6578 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED);
6579 case ISD::AVGFLOORS:
6580 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED);
6581 case ISD::AVGFLOORU:
6582 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED);
6583 case ISD::AVGCEILS:
6584 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED);
6585 case ISD::AVGCEILU:
6586 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED);
6587 case ISD::BITREVERSE:
6588 return LowerBitreverse(Op, DAG);
6589 case ISD::BSWAP:
6590 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU);
6591 case ISD::CTLZ:
6592 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU);
6593 case ISD::CTTZ:
6594 return LowerCTTZ(Op, DAG);
6595 case ISD::VECTOR_SPLICE:
6596 return LowerVECTOR_SPLICE(Op, DAG);
6597 case ISD::VECTOR_DEINTERLEAVE:
6598 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6599 case ISD::VECTOR_INTERLEAVE:
6600 return LowerVECTOR_INTERLEAVE(Op, DAG);
6601 case ISD::LROUND:
6602 case ISD::LLROUND:
6603 case ISD::LRINT:
6604 case ISD::LLRINT: {
6605 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6606 Op.getOperand(0).getValueType() == MVT::bf16) &&
6607 "Expected custom lowering of rounding operations only for f16");
6608 SDLoc DL(Op);
6609 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6610 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext);
6611 }
6612 case ISD::STRICT_LROUND:
6613 case ISD::STRICT_LLROUND:
6614 case ISD::STRICT_LRINT:
6615 case ISD::STRICT_LLRINT: {
6616 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6617 Op.getOperand(1).getValueType() == MVT::bf16) &&
6618 "Expected custom lowering of rounding operations only for f16");
6619 SDLoc DL(Op);
6620 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6621 {Op.getOperand(0), Op.getOperand(1)});
6622 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6623 {Ext.getValue(1), Ext.getValue(0)});
6624 }
6625 case ISD::WRITE_REGISTER: {
6626 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6627 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6628 SDLoc DL(Op);
6629
6630 SDValue Chain = Op.getOperand(i: 0);
6631 SDValue SysRegName = Op.getOperand(i: 1);
6632 std::pair<SDValue, SDValue> Pair =
6633 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6634
6635 // chain = MSRR(chain, sysregname, lo, hi)
6636 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6637 SysRegName, Pair.first, Pair.second);
6638
6639 return Result;
6640 }
6641 case ISD::FSHL:
6642 case ISD::FSHR:
6643 return LowerFunnelShift(Op, DAG);
6644 case ISD::FLDEXP:
6645 return LowerFLDEXP(Op, DAG);
6646 }
6647}
6648
6649bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
6650 return !Subtarget->useSVEForFixedLengthVectors();
6651}
6652
6653bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
6654 EVT VT, bool OverrideNEON) const {
6655 if (!VT.isFixedLengthVector() || !VT.isSimple())
6656 return false;
6657
6658 // Don't use SVE for vectors we cannot scalarize if required.
6659 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6660 // Fixed length predicates should be promoted to i8.
6661 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6662 case MVT::i1:
6663 default:
6664 return false;
6665 case MVT::i8:
6666 case MVT::i16:
6667 case MVT::i32:
6668 case MVT::i64:
6669 case MVT::f16:
6670 case MVT::f32:
6671 case MVT::f64:
6672 break;
6673 }
6674
6675 // NEON-sized vectors can be emulated using SVE instructions.
6676 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6677 return Subtarget->hasSVEorSME();
6678
6679 // Ensure NEON MVTs only belong to a single register class.
6680 if (VT.getFixedSizeInBits() <= 128)
6681 return false;
6682
6683 // Ensure wider than NEON code generation is enabled.
6684 if (!Subtarget->useSVEForFixedLengthVectors())
6685 return false;
6686
6687 // Don't use SVE for types that don't fit.
6688 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6689 return false;
6690
6691 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6692 // the base fixed length SVE support in place.
6693 if (!VT.isPow2VectorType())
6694 return false;
6695
6696 return true;
6697}
6698
6699//===----------------------------------------------------------------------===//
6700// Calling Convention Implementation
6701//===----------------------------------------------------------------------===//
6702
6703static unsigned getIntrinsicID(const SDNode *N) {
6704 unsigned Opcode = N->getOpcode();
6705 switch (Opcode) {
6706 default:
6707 return Intrinsic::not_intrinsic;
6708 case ISD::INTRINSIC_WO_CHAIN: {
6709 unsigned IID = N->getConstantOperandVal(Num: 0);
6710 if (IID < Intrinsic::num_intrinsics)
6711 return IID;
6712 return Intrinsic::not_intrinsic;
6713 }
6714 }
6715}
6716
6717bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
6718 SDValue N1) const {
6719 if (!N0.hasOneUse())
6720 return false;
6721
6722 unsigned IID = getIntrinsicID(N: N1.getNode());
6723 // Avoid reassociating expressions that can be lowered to smlal/umlal.
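  // For example, "add(x, umull(a, b))" can be selected as a single UMLAL;
  // reassociating the addend chain would break that pairing.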
6724 if (IID == Intrinsic::aarch64_neon_umull ||
6725 N1.getOpcode() == AArch64ISD::UMULL ||
6726 IID == Intrinsic::aarch64_neon_smull ||
6727 N1.getOpcode() == AArch64ISD::SMULL)
6728 return N0.getOpcode() != ISD::ADD;
6729
6730 return true;
6731}
6732
6733/// Selects the correct CCAssignFn for a given CallingConvention value.
6734CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
6735 bool IsVarArg) const {
6736 switch (CC) {
6737 default:
6738 report_fatal_error(reason: "Unsupported calling convention.");
6739 case CallingConv::GHC:
6740 return CC_AArch64_GHC;
6741 case CallingConv::C:
6742 case CallingConv::Fast:
6743 case CallingConv::PreserveMost:
6744 case CallingConv::PreserveAll:
6745 case CallingConv::CXX_FAST_TLS:
6746 case CallingConv::Swift:
6747 case CallingConv::SwiftTail:
6748 case CallingConv::Tail:
6749 case CallingConv::GRAAL:
6750 if (Subtarget->isTargetWindows()) {
6751 if (IsVarArg) {
6752 if (Subtarget->isWindowsArm64EC())
6753 return CC_AArch64_Arm64EC_VarArg;
6754 return CC_AArch64_Win64_VarArg;
6755 }
6756 return CC_AArch64_Win64PCS;
6757 }
6758 if (!Subtarget->isTargetDarwin())
6759 return CC_AArch64_AAPCS;
6760 if (!IsVarArg)
6761 return CC_AArch64_DarwinPCS;
6762 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
6763 : CC_AArch64_DarwinPCS_VarArg;
6764 case CallingConv::Win64:
6765 if (IsVarArg) {
6766 if (Subtarget->isWindowsArm64EC())
6767 return CC_AArch64_Arm64EC_VarArg;
6768 return CC_AArch64_Win64_VarArg;
6769 }
6770 return CC_AArch64_Win64PCS;
6771 case CallingConv::CFGuard_Check:
6772 if (Subtarget->isWindowsArm64EC())
6773 return CC_AArch64_Arm64EC_CFGuard_Check;
6774 return CC_AArch64_Win64_CFGuard_Check;
6775 case CallingConv::AArch64_VectorCall:
6776 case CallingConv::AArch64_SVE_VectorCall:
6777 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
6778 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
6779 return CC_AArch64_AAPCS;
6780 case CallingConv::ARM64EC_Thunk_X64:
6781 return CC_AArch64_Arm64EC_Thunk;
6782 case CallingConv::ARM64EC_Thunk_Native:
6783 return CC_AArch64_Arm64EC_Thunk_Native;
6784 }
6785}
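// For example, a C function on Darwin selects CC_AArch64_DarwinPCS, while a
// variadic one selects CC_AArch64_DarwinPCS_VarArg (or its ILP32 variant),
// because Darwin's ABI passes anonymous (variadic) arguments on the stack
// rather than following the AAPCS register rules for them.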
6786
6787CCAssignFn *
6788AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
6789 switch (CC) {
6790 default:
6791 return RetCC_AArch64_AAPCS;
6792 case CallingConv::ARM64EC_Thunk_X64:
6793 return RetCC_AArch64_Arm64EC_Thunk;
6794 case CallingConv::CFGuard_Check:
6795 if (Subtarget->isWindowsArm64EC())
6796 return RetCC_AArch64_Arm64EC_CFGuard_Check;
6797 return RetCC_AArch64_AAPCS;
6798 }
6799}
6800
6802unsigned
6803AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6804 SelectionDAG &DAG) const {
6805 MachineFunction &MF = DAG.getMachineFunction();
6806 MachineFrameInfo &MFI = MF.getFrameInfo();
6807
6808 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6809 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6810 DAG.getConstant(1, DL, MVT::i32));
6811 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6812 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6813 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6814 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6815 Chain = Buffer.getValue(R: 1);
6816 MFI.CreateVariableSizedObject(Alignment: Align(1), Alloca: nullptr);
6817
6818 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6819 unsigned TPIDR2Obj = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
6820
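  // Layout of the 16-byte TPIDR2 block built below (per the SME lazy-save
  // scheme):
  //   bytes 0-7    pointer to the lazy-save buffer
  //   bytes 8-9    number of ZA save slices (written at call sites)
  //   bytes 10-15  reserved, must be zero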
6821 // Store the buffer pointer to the TPIDR2 stack object.
6822 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, Offset: TPIDR2Obj);
6823 SDValue Ptr = DAG.getFrameIndex(
6824 FI: TPIDR2Obj,
6825 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
6826 Chain = DAG.getStore(Chain, dl: DL, Val: Buffer, Ptr, PtrInfo: MPI);
6827
6828 // Set the reserved bytes (10-15) to zero
6829 EVT PtrTy = Ptr.getValueType();
6830 SDValue ReservedPtr =
6831 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrTy, N1: Ptr, N2: DAG.getConstant(Val: 10, DL, VT: PtrTy));
6832 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6833 MPI);
6834 ReservedPtr =
6835 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrTy, N1: Ptr, N2: DAG.getConstant(Val: 12, DL, VT: PtrTy));
6836 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6837 MPI);
6838
6839 return TPIDR2Obj;
6840}
6841
6842static bool isPassedInFPR(EVT VT) {
6843 return VT.isFixedLengthVector() ||
6844 (VT.isFloatingPoint() && !VT.isScalableVector());
6845}
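// For example, f64, v4f32 and v16i8 report true here (they travel in
// FPR/NEON registers), whereas i64 and scalable types such as nxv4f32
// report false.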
6846
6847SDValue AArch64TargetLowering::LowerFormalArguments(
6848 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6849 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6850 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6851 MachineFunction &MF = DAG.getMachineFunction();
6852 const Function &F = MF.getFunction();
6853 MachineFrameInfo &MFI = MF.getFrameInfo();
6854 bool IsWin64 = Subtarget->isCallingConvWin64(CC: F.getCallingConv());
6855 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6856 (isVarArg && Subtarget->isWindowsArm64EC());
6857 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6858
6859 SmallVector<ISD::OutputArg, 4> Outs;
6860 GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs,
6861 TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout());
6862 if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6863 FuncInfo->setIsSVECC(true);
6864
6865 // Assign locations to all of the incoming arguments.
6866 SmallVector<CCValAssign, 16> ArgLocs;
6867 DenseMap<unsigned, SDValue> CopiedRegs;
6868 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6869
6870 // At this point, Ins[].VT may already be promoted to i32. To correctly
6871 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6872 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6873 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6874 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6875 // LocVT.
6876 unsigned NumArgs = Ins.size();
6877 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6878 unsigned CurArgIdx = 0;
6879 for (unsigned i = 0; i != NumArgs; ++i) {
6880 MVT ValVT = Ins[i].VT;
6881 if (Ins[i].isOrigArg()) {
6882 std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx);
6883 CurArgIdx = Ins[i].getOrigArgIndex();
6884
6885 // Get type of the original argument.
6886 EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(),
6887 /*AllowUnknown*/ true);
6888 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6889 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6890 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6891 ValVT = MVT::i8;
6892 else if (ActualMVT == MVT::i16)
6893 ValVT = MVT::i16;
6894 }
6895 bool UseVarArgCC = false;
6896 if (IsWin64)
6897 UseVarArgCC = isVarArg;
6898 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC);
6899 bool Res =
6900 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6901 assert(!Res && "Call operand has unhandled type");
6902 (void)Res;
6903 }
6904
6905 SMEAttrs Attrs(MF.getFunction());
6906 bool IsLocallyStreaming =
6907 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6908 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6909 SDValue Glue = Chain.getValue(R: 1);
6910
6911 SmallVector<SDValue, 16> ArgValues;
6912 unsigned ExtraArgLocs = 0;
6913 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6914 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6915
6916 if (Ins[i].Flags.isByVal()) {
6917 // Byval is used for HFAs in the PCS, but the system should work in a
6918 // non-compliant manner for larger structs.
6919 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
6920 int Size = Ins[i].Flags.getByValSize();
6921 unsigned NumRegs = (Size + 7) / 8;
6922
6923       // FIXME: This works on big-endian for composite byvals, which are the
6924       // common case. It should also work for fundamental types too.
6925 unsigned FrameIdx =
6926 MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
6927 SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
6928 InVals.push_back(Elt: FrameIdxN);
6929
6930 continue;
6931 }
6932
6933 if (Ins[i].Flags.isSwiftAsync())
6934 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6935
6936 SDValue ArgValue;
6937 if (VA.isRegLoc()) {
6938 // Arguments stored in registers.
6939 EVT RegVT = VA.getLocVT();
6940 const TargetRegisterClass *RC;
6941
6942 if (RegVT == MVT::i32)
6943 RC = &AArch64::GPR32RegClass;
6944 else if (RegVT == MVT::i64)
6945 RC = &AArch64::GPR64RegClass;
6946 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6947 RC = &AArch64::FPR16RegClass;
6948 else if (RegVT == MVT::f32)
6949 RC = &AArch64::FPR32RegClass;
6950 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6951 RC = &AArch64::FPR64RegClass;
6952 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6953 RC = &AArch64::FPR128RegClass;
6954 else if (RegVT.isScalableVector() &&
6955 RegVT.getVectorElementType() == MVT::i1) {
6956 FuncInfo->setIsSVECC(true);
6957 RC = &AArch64::PPRRegClass;
6958 } else if (RegVT == MVT::aarch64svcount) {
6959 FuncInfo->setIsSVECC(true);
6960 RC = &AArch64::PPRRegClass;
6961 } else if (RegVT.isScalableVector()) {
6962 FuncInfo->setIsSVECC(true);
6963 RC = &AArch64::ZPRRegClass;
6964 } else
6965 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6966
6967 // Transform the arguments in physical registers into virtual ones.
6968 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
6969
6970 if (IsLocallyStreaming) {
6971 // LocallyStreamingFunctions must insert the SMSTART in the correct
6972 // position, so we use Glue to ensure no instructions can be scheduled
6973 // between the chain of:
6974 // t0: ch,glue = EntryNode
6975 // t1: res,ch,glue = CopyFromReg
6976 // ...
6977 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6978 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6979 // ^^^^^^
6980 // This will be the new Chain/Root node.
6981 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue);
6982 Glue = ArgValue.getValue(R: 2);
6983 if (isPassedInFPR(VT: ArgValue.getValueType())) {
6984 ArgValue =
6985 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
6986 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
6987 {ArgValue, Glue});
6988 Glue = ArgValue.getValue(R: 1);
6989 }
6990 } else
6991 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT);
6992
6993 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6994 // to 64 bits. Insert an assert[sz]ext to capture this, then
6995 // truncate to the right size.
6996 switch (VA.getLocInfo()) {
6997 default:
6998 llvm_unreachable("Unknown loc info!");
6999 case CCValAssign::Full:
7000 break;
7001 case CCValAssign::Indirect:
7002 assert(
7003 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7004 "Indirect arguments should be scalable on most subtargets");
7005 break;
7006 case CCValAssign::BCvt:
7007 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue);
7008 break;
7009 case CCValAssign::AExt:
7010 case CCValAssign::SExt:
7011 case CCValAssign::ZExt:
7012 break;
7013 case CCValAssign::AExtUpper:
7014 ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue,
7015 N2: DAG.getConstant(Val: 32, DL, VT: RegVT));
7016 ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT());
7017 break;
7018 }
7019 } else { // VA.isRegLoc()
7020 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7021 unsigned ArgOffset = VA.getLocMemOffset();
7022 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7023 ? VA.getLocVT().getSizeInBits()
7024 : VA.getValVT().getSizeInBits()) / 8;
7025
7026 uint32_t BEAlign = 0;
7027 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7028 !Ins[i].Flags.isInConsecutiveRegs())
7029 BEAlign = 8 - ArgSize;
7030
7031 SDValue FIN;
7032 MachinePointerInfo PtrInfo;
7033 if (StackViaX4) {
7034 // In both the ARM64EC varargs convention and the thunk convention,
7035 // arguments on the stack are accessed relative to x4, not sp. In
7036 // the thunk convention, there's an additional offset of 32 bytes
7037 // to account for the shadow store.
7038 unsigned ObjOffset = ArgOffset + BEAlign;
7039 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7040 ObjOffset += 32;
7041 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7042 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7043 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7044 DAG.getConstant(ObjOffset, DL, MVT::i64));
7045 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7046 } else {
7047 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true);
7048
7049 // Create load nodes to retrieve arguments from the stack.
7050 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
7051 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7052 }
7053
7054       // For NON_EXTLOAD, the generic code in getLoad asserts ValVT == MemVT.
7055 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7056 MVT MemVT = VA.getValVT();
7057
7058 switch (VA.getLocInfo()) {
7059 default:
7060 break;
7061 case CCValAssign::Trunc:
7062 case CCValAssign::BCvt:
7063 MemVT = VA.getLocVT();
7064 break;
7065 case CCValAssign::Indirect:
7066 assert((VA.getValVT().isScalableVector() ||
7067 Subtarget->isWindowsArm64EC()) &&
7068 "Indirect arguments should be scalable on most subtargets");
7069 MemVT = VA.getLocVT();
7070 break;
7071 case CCValAssign::SExt:
7072 ExtType = ISD::SEXTLOAD;
7073 break;
7074 case CCValAssign::ZExt:
7075 ExtType = ISD::ZEXTLOAD;
7076 break;
7077 case CCValAssign::AExt:
7078 ExtType = ISD::EXTLOAD;
7079 break;
7080 }
7081
7082 ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo,
7083 MemVT);
7084 }
7085
7086 if (VA.getLocInfo() == CCValAssign::Indirect) {
7087 assert((VA.getValVT().isScalableVT() ||
7088 Subtarget->isWindowsArm64EC()) &&
7089 "Indirect arguments should be scalable on most subtargets");
7090
7091 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7092 unsigned NumParts = 1;
7093 if (Ins[i].Flags.isInConsecutiveRegs()) {
7094 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7095 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7096 ++NumParts;
7097 }
7098
7099 MVT PartLoad = VA.getValVT();
7100 SDValue Ptr = ArgValue;
7101
7102 // Ensure we generate all loads for each tuple part, whilst updating the
7103 // pointer after each load correctly using vscale.
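      // For example, an SVE tuple such as svint32x2_t arrives as two indirect
      // parts: each load reads PartSize == 16 known-min bytes and the pointer
      // is advanced by vscale * 16 bytes between the parts.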
7104 while (NumParts > 0) {
7105 ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo());
7106 InVals.push_back(Elt: ArgValue);
7107 NumParts--;
7108 if (NumParts > 0) {
7109 SDValue BytesIncrement;
7110 if (PartLoad.isScalableVector()) {
7111 BytesIncrement = DAG.getVScale(
7112 DL, VT: Ptr.getValueType(),
7113 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7114 } else {
7115 BytesIncrement = DAG.getConstant(
7116 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7117 VT: Ptr.getValueType());
7118 }
7119 SDNodeFlags Flags;
7120 Flags.setNoUnsignedWrap(true);
7121 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
7122 N2: BytesIncrement, Flags);
7123 ExtraArgLocs++;
7124 i++;
7125 }
7126 }
7127 } else {
7128 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7129 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7130 ArgValue, DAG.getValueType(MVT::i32));
7131
7132 // i1 arguments are zero-extended to i8 by the caller. Emit a
7133 // hint to reflect this.
7134 if (Ins[i].isOrigArg()) {
7135 Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex());
7136 if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) {
7137 if (!Ins[i].Flags.isZExt()) {
7138 ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7139 VT: ArgValue.getValueType(), Operand: ArgValue);
7140 }
7141 }
7142 }
7143
7144 InVals.push_back(Elt: ArgValue);
7145 }
7146 }
7147 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7148
7149 // Insert the SMSTART if this is a locally streaming function and
7150 // make sure it is Glued to the last CopyFromReg value.
7151 if (IsLocallyStreaming) {
7152 SDValue PStateSM;
7153 if (Attrs.hasStreamingCompatibleInterface()) {
7154 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7155 Register Reg = MF.getRegInfo().createVirtualRegister(
7156 RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT()));
7157 FuncInfo->setPStateSMReg(Reg);
7158 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM);
7159 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7160 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
7161 } else
7162 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
7163 Condition: AArch64SME::Always);
7164
7165     // Re-copy each argument value through the post-SMSTART Chain so that the
7166     // SMSTART is ordered before any use of the incoming argument values.
7167     for (unsigned I = 0; I < InVals.size(); ++I) {
7168 Register Reg = MF.getRegInfo().createVirtualRegister(
7169 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
7170 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]);
7171 InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg,
7172 VT: InVals[I].getValueType());
7173 }
7174 }
7175
7176 // varargs
7177 if (isVarArg) {
7178 if (!Subtarget->isTargetDarwin() || IsWin64) {
7179 // The AAPCS variadic function ABI is identical to the non-variadic
7180 // one. As a result there may be more arguments in registers and we should
7181 // save them for future reference.
7182 // Win64 variadic functions also pass arguments in registers, but all float
7183 // arguments are passed in integer registers.
7184 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7185 }
7186
7187 // This will point to the next argument passed via stack.
7188 unsigned VarArgsOffset = CCInfo.getStackSize();
7189 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7190 VarArgsOffset = alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8);
7191 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7192 FuncInfo->setVarArgsStackIndex(
7193 MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true));
7194
7195 if (MFI.hasMustTailInVarArgFunc()) {
7196 SmallVector<MVT, 2> RegParmTypes;
7197 RegParmTypes.push_back(MVT::i64);
7198 RegParmTypes.push_back(MVT::f128);
7199 // Compute the set of forwarded registers. The rest are scratch.
7200 SmallVectorImpl<ForwardedRegister> &Forwards =
7201 FuncInfo->getForwardedMustTailRegParms();
7202 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7203 Fn: CC_AArch64_AAPCS);
7204
7205 // Conservatively forward X8, since it might be used for aggregate return.
7206 if (!CCInfo.isAllocated(AArch64::X8)) {
7207 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7208 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7209 }
7210 }
7211 }
7212
7213 // On Windows, InReg pointers must be returned, so record the pointer in a
7214 // virtual register at the start of the function so it can be returned in the
7215 // epilogue.
7216 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7217 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7218 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7219 Ins[I].Flags.isInReg()) &&
7220 Ins[I].Flags.isSRet()) {
7221 assert(!FuncInfo->getSRetReturnReg());
7222
7223 MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
7224 Register Reg =
7225 MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
7226 FuncInfo->setSRetReturnReg(Reg);
7227
7228 SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]);
7229 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7230 break;
7231 }
7232 }
7233 }
7234
7235 unsigned StackArgSize = CCInfo.getStackSize();
7236 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7237 if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) {
7238 // This is a non-standard ABI so by fiat I say we're allowed to make full
7239 // use of the stack area to be popped, which must be aligned to 16 bytes in
7240 // any case:
7241 StackArgSize = alignTo(Value: StackArgSize, Align: 16);
7242
7243 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7244 // a multiple of 16.
7245 FuncInfo->setArgumentStackToRestore(StackArgSize);
7246
7247 // This realignment carries over to the available bytes below. Our own
7248 // callers will guarantee the space is free by giving an aligned value to
7249 // CALLSEQ_START.
7250 }
7251 // Even if we're not expected to free up the space, it's useful to know how
7252 // much is there while considering tail calls (because we can reuse it).
7253 FuncInfo->setBytesInStackArgArea(StackArgSize);
7254
7255 if (Subtarget->hasCustomCallingConv())
7256 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7257
7258 // Conservatively assume the function requires the lazy-save mechanism.
7259 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7260 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7261 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7262 }
7263
7264 return Chain;
7265}
7266
7267void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7268 SelectionDAG &DAG,
7269 const SDLoc &DL,
7270 SDValue &Chain) const {
7271 MachineFunction &MF = DAG.getMachineFunction();
7272 MachineFrameInfo &MFI = MF.getFrameInfo();
7273 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7274 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
7275 bool IsWin64 = Subtarget->isCallingConvWin64(CC: MF.getFunction().getCallingConv());
7276
7277 SmallVector<SDValue, 8> MemOps;
7278
7279 auto GPRArgRegs = AArch64::getGPRArgRegs();
7280 unsigned NumGPRArgRegs = GPRArgRegs.size();
7281 if (Subtarget->isWindowsArm64EC()) {
7282 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7283 // functions.
7284 NumGPRArgRegs = 4;
7285 }
7286 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs);
7287
7288 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7289 int GPRIdx = 0;
7290 if (GPRSaveSize != 0) {
7291 if (IsWin64) {
7292 GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false);
7293 if (GPRSaveSize & 15)
7294 // The extra size here, if triggered, will always be 8.
7295 MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15), SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16), IsImmutable: false);
7296 } else
7297 GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false);
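    // For example, in an AAPCS varargs function with three named GPR
    // arguments, FirstVariadicGPR is 3 and x3-x7 are saved, so GPRSaveSize is
    // 40 bytes; on Win64 the fixed object is padded with a further 8 bytes so
    // the save area stays 16-byte aligned.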
7298
7299 SDValue FIN;
7300 if (Subtarget->isWindowsArm64EC()) {
7301 // With the Arm64EC ABI, we reserve the save area as usual, but we
7302 // compute its address relative to x4. For a normal AArch64->AArch64
7303 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7304 // different address.
7305 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7306 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7307 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7308 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7309 } else {
7310 FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT);
7311 }
7312
7313 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7314 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7315 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7316 SDValue Store =
7317 DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7318 PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack(
7319 MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8)
7320 : MachinePointerInfo::getStack(MF, Offset: i * 8));
7321 MemOps.push_back(Elt: Store);
7322 FIN =
7323 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT));
7324 }
7325 }
7326 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7327 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7328
7329 if (Subtarget->hasFPARMv8() && !IsWin64) {
7330 auto FPRArgRegs = AArch64::getFPRArgRegs();
7331 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7332 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs);
7333
7334 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7335 int FPRIdx = 0;
7336 if (FPRSaveSize != 0) {
7337 FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false);
7338
7339 SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT);
7340
7341 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7342 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7343 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7344
7345 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
7346 PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16));
7347 MemOps.push_back(Elt: Store);
7348 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN,
7349 N2: DAG.getConstant(Val: 16, DL, VT: PtrVT));
7350 }
7351 }
7352 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7353 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7354 }
7355
7356 if (!MemOps.empty()) {
7357 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7358 }
7359}
7360
7361/// LowerCallResult - Lower the result values of a call into the
7362/// appropriate copies out of appropriate physical registers.
7363SDValue AArch64TargetLowering::LowerCallResult(
7364 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7365 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7367 SDValue ThisVal, bool RequiresSMChange) const {
7368 DenseMap<unsigned, SDValue> CopiedRegs;
7369 // Copy all of the result registers out of their specified physreg.
7370 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7371 CCValAssign VA = RVLocs[i];
7372
7373 // Pass 'this' value directly from the argument to return value, to avoid
7374 // reg unit interference
7375 if (i == 0 && isThisReturn) {
7376 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7377 "unexpected return calling convention register assignment");
7378 InVals.push_back(Elt: ThisVal);
7379 continue;
7380 }
7381
7382 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7383 // allows one use of a physreg per block.
7384 SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg());
7385 if (!Val) {
7386 Val =
7387 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
7388 Chain = Val.getValue(R: 1);
7389 InGlue = Val.getValue(R: 2);
7390 CopiedRegs[VA.getLocReg()] = Val;
7391 }
7392
7393 switch (VA.getLocInfo()) {
7394 default:
7395 llvm_unreachable("Unknown loc info!");
7396 case CCValAssign::Full:
7397 break;
7398 case CCValAssign::BCvt:
7399 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
7400 break;
7401 case CCValAssign::AExtUpper:
7402 Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val,
7403 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
7404 [[fallthrough]];
7405 case CCValAssign::AExt:
7406 [[fallthrough]];
7407 case CCValAssign::ZExt:
7408 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT());
7409 break;
7410 }
7411
7412 if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT()))
7413 Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, VT: Val.getValueType(),
7414 Operand: Val);
7415
7416 InVals.push_back(Elt: Val);
7417 }
7418
7419 return Chain;
7420}
7421
7422/// Return true if the calling convention is one that we can guarantee TCO for.
7423static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7424 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7425 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7426}
7427
7428/// Return true if we might ever do TCO for calls with this calling convention.
7429static bool mayTailCallThisCC(CallingConv::ID CC) {
7430 switch (CC) {
7431 case CallingConv::C:
7432 case CallingConv::AArch64_SVE_VectorCall:
7433 case CallingConv::PreserveMost:
7434 case CallingConv::PreserveAll:
7435 case CallingConv::Swift:
7436 case CallingConv::SwiftTail:
7437 case CallingConv::Tail:
7438 case CallingConv::Fast:
7439 return true;
7440 default:
7441 return false;
7442 }
7443}
7444
7445static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7446 const AArch64Subtarget *Subtarget,
7447 const TargetLowering::CallLoweringInfo &CLI,
7448 CCState &CCInfo) {
7449 const SelectionDAG &DAG = CLI.DAG;
7450 CallingConv::ID CalleeCC = CLI.CallConv;
7451 bool IsVarArg = CLI.IsVarArg;
7452 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7453 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC);
7454
7455 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7456 // for the shadow store.
7457 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7458 CCInfo.AllocateStack(Size: 32, Alignment: Align(16));
7459
7460 unsigned NumArgs = Outs.size();
7461 for (unsigned i = 0; i != NumArgs; ++i) {
7462 MVT ArgVT = Outs[i].VT;
7463 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7464
7465 bool UseVarArgCC = false;
7466 if (IsVarArg) {
7467 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7468 // too, so use the vararg CC to force them to integer registers.
7469 if (IsCalleeWin64) {
7470 UseVarArgCC = true;
7471 } else {
7472 UseVarArgCC = !Outs[i].IsFixed;
7473 }
7474 }
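    // For example, when calling printf on Win64 a double argument is passed
    // in x1 rather than d0, because the vararg convention forces
    // floating-point values into integer registers.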
7475
7476 if (!UseVarArgCC) {
7477 // Get type of the original argument.
7478 EVT ActualVT =
7479 TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty,
7480 /*AllowUnknown*/ true);
7481 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7482 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7483 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7484 ArgVT = MVT::i8;
7485 else if (ActualMVT == MVT::i16)
7486 ArgVT = MVT::i16;
7487 }
7488
7489 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC);
7490 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7491 assert(!Res && "Call operand has unhandled type");
7492 (void)Res;
7493 }
7494}
7495
7496bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7497 const CallLoweringInfo &CLI) const {
7498 CallingConv::ID CalleeCC = CLI.CallConv;
7499 if (!mayTailCallThisCC(CC: CalleeCC))
7500 return false;
7501
7502 SDValue Callee = CLI.Callee;
7503 bool IsVarArg = CLI.IsVarArg;
7504 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7505 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7506 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7507 const SelectionDAG &DAG = CLI.DAG;
7508 MachineFunction &MF = DAG.getMachineFunction();
7509 const Function &CallerF = MF.getFunction();
7510 CallingConv::ID CallerCC = CallerF.getCallingConv();
7511
7512 // SME Streaming functions are not eligible for TCO as they may require
7513 // the streaming mode or ZA to be restored after returning from the call.
7514 SMEAttrs CallerAttrs(MF.getFunction());
7515 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7516 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
7517 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
7518 CallerAttrs.hasStreamingBody())
7519 return false;
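  // For example, a streaming-compatible caller invoking a non-streaming
  // callee may need to smstop before the call and smstart after it returns
  // (and possibly restore ZA), which is incompatible with a tail call.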
7520
7521 // Functions using the C or Fast calling convention that have an SVE signature
7522 // preserve more registers and should assume the SVE_VectorCall CC.
7523 // The check for matching callee-saved regs will determine whether it is
7524 // eligible for TCO.
7525 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7526 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7527 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7528
7529 bool CCMatch = CallerCC == CalleeCC;
7530
7531 // When using the Windows calling convention on a non-windows OS, we want
7532 // to back up and restore X18 in such functions; we can't do a tail call
7533 // from those functions.
7534 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7535 CalleeCC != CallingConv::Win64)
7536 return false;
7537
7538 // Byval parameters hand the function a pointer directly into the stack area
7539 // we want to reuse during a tail call. Working around this *is* possible (see
7540 // X86) but less efficient and uglier in LowerCall.
7541 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7542 e = CallerF.arg_end();
7543 i != e; ++i) {
7544 if (i->hasByValAttr())
7545 return false;
7546
7547 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7548 // In this case, it is necessary to save/restore X0 in the callee. Tail
7549 // call opt interferes with this. So we disable tail call opt when the
7550 // caller has an argument with "inreg" attribute.
7551
7552 // FIXME: Check whether the callee also has an "inreg" argument.
7553 if (i->hasInRegAttr())
7554 return false;
7555 }
7556
7557 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
7558 return CCMatch;
7559
7560 // Externally-defined functions with weak linkage should not be
7561 // tail-called on AArch64 when the OS does not support dynamic
7562 // pre-emption of symbols, as the AAELF spec requires normal calls
7563 // to undefined weak functions to be replaced with a NOP or jump to the
7564 // next instruction. The behaviour of branch instructions in this
7565 // situation (as used for tail calls) is implementation-defined, so we
7566 // cannot rely on the linker replacing the tail call with a return.
7567 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
7568 const GlobalValue *GV = G->getGlobal();
7569 const Triple &TT = getTargetMachine().getTargetTriple();
7570 if (GV->hasExternalWeakLinkage() &&
7571 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7572 return false;
7573 }
7574
7575 // Now we search for cases where we can use a tail call without changing the
7576 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7577 // concept.
7578
7579 // I want anyone implementing a new calling convention to think long and hard
7580 // about this assert.
7581 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7582 "Unexpected variadic calling convention");
7583
7584 LLVMContext &C = *DAG.getContext();
7585 // Check that the call results are passed in the same way.
7586 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7587 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
7588 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
7589 return false;
7590 // The callee has to preserve all registers the caller needs to preserve.
7591 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7592 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7593 if (!CCMatch) {
7594 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7595 if (Subtarget->hasCustomCallingConv()) {
7596 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved);
7597 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved);
7598 }
7599 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7600 return false;
7601 }
7602
7603 // Nothing more to check if the callee is taking no arguments
7604 if (Outs.empty())
7605 return true;
7606
7607 SmallVector<CCValAssign, 16> ArgLocs;
7608 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7609
7610 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
7611
7612 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7613     // Musttail calls are validated elsewhere and skip this check.
7614 // At least two cases here: if caller is fastcc then we can't have any
7615 // memory arguments (we'd be expected to clean up the stack afterwards). If
7616 // caller is C then we could potentially use its argument area.
7617
7618 // FIXME: for now we take the most conservative of these in both cases:
7619 // disallow all variadic memory operands.
7620 for (const CCValAssign &ArgLoc : ArgLocs)
7621 if (!ArgLoc.isRegLoc())
7622 return false;
7623 }
7624
7625 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7626
7627   // If any of the arguments is passed indirectly, it must be SVE, so the
7628   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7629   // allocate space on the stack. That is why we explicitly decide here that
7630   // such a call cannot be a tail call.
7631 if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) {
7632 assert((A.getLocInfo() != CCValAssign::Indirect ||
7633 A.getValVT().isScalableVector() ||
7634 Subtarget->isWindowsArm64EC()) &&
7635 "Expected value to be scalable");
7636 return A.getLocInfo() == CCValAssign::Indirect;
7637 }))
7638 return false;
7639
7640 // If the stack arguments for this call do not fit into our own save area then
7641 // the call cannot be made tail.
7642 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7643 return false;
7644
7645 const MachineRegisterInfo &MRI = MF.getRegInfo();
7646 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
7647 return false;
7648
7649 return true;
7650}
7651
7652SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7653 SelectionDAG &DAG,
7654 MachineFrameInfo &MFI,
7655 int ClobberedFI) const {
7656 SmallVector<SDValue, 8> ArgChains;
7657 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
7658 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
7659
7660 // Include the original chain at the beginning of the list. When this is
7661 // used by target LowerCall hooks, this helps legalize find the
7662 // CALLSEQ_BEGIN node.
7663 ArgChains.push_back(Elt: Chain);
7664
7665   // Add a chain value for each stack-argument load that overlaps ClobberedFI.
7666 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7667 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U))
7668 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr()))
7669 if (FI->getIndex() < 0) {
7670 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
7671 int64_t InLastByte = InFirstByte;
7672 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
7673
7674 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7675 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7676 ArgChains.push_back(Elt: SDValue(L, 1));
7677 }
7678
7679 // Build a tokenfactor for all the chains.
7680 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7681}
7682
7683bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7684 bool TailCallOpt) const {
7685 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7686 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7687}
7688
7689// Check if the value is zero-extended from i1 to i8
7690static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7691 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7692 if (SizeInBits < 8)
7693 return false;
7694
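  // Only bit 0 of the low byte may be unknown or set: if bits [7:1] (mask
  // 0xFE) are all known to be zero, the value already looks like a
  // zero-extended i1 and the caller need not emit an explicit zero-extension.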
7695   APInt RequiredZero(SizeInBits, 0xFE);
7696   KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
7697   bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7698 return ZExtBool;
7699}
7700
7701void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7702 SDNode *Node) const {
7703 // Live-in physreg copies that are glued to SMSTART are applied as
7704 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7705 // register allocator to pass call args in callee saved regs, without extra
7706 // copies to avoid these fake clobbers of actually-preserved GPRs.
7707 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7708 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7709 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7710 if (MachineOperand &MO = MI.getOperand(i: I);
7711 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7712 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7713 AArch64::GPR64RegClass.contains(MO.getReg())))
7714 MI.removeOperand(OpNo: I);
7715
7716 // The SVE vector length can change when entering/leaving streaming mode.
7717 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7718 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7719 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7720 /*IsImplicit=*/true));
7721 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7722 /*IsImplicit=*/true));
7723 }
7724 }
7725
7726 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7727 // have nothing to do with VG, were it not that they are used to materialise a
7728 // frame-address. If they contain a frame-index to a scalable vector, this
7729 // will likely require an ADDVL instruction to materialise the address, thus
7730 // reading VG.
7731 const MachineFunction &MF = *MI.getMF();
7732 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7733 (MI.getOpcode() == AArch64::ADDXri ||
7734 MI.getOpcode() == AArch64::SUBXri)) {
7735 const MachineOperand &MO = MI.getOperand(i: 1);
7736 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7737 TargetStackID::ScalableVector)
7738 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7739 /*IsImplicit=*/true));
7740 }
7741}
7742
7743SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7744 bool Enable, SDValue Chain,
7745 SDValue InGlue,
7746 unsigned Condition,
7747 SDValue PStateSM) const {
7748 MachineFunction &MF = DAG.getMachineFunction();
7749 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7750 FuncInfo->setHasStreamingModeChanges(true);
7751
7752 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7753 SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask());
7754 SDValue MSROp =
7755 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7756 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7757 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7758 if (Condition != AArch64SME::Always) {
7759 assert(PStateSM && "PStateSM should be defined");
7760 Ops.push_back(Elt: PStateSM);
7761 }
7762 Ops.push_back(Elt: RegMask);
7763
7764 if (InGlue)
7765 Ops.push_back(Elt: InGlue);
7766
7767 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7768 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7769}
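// The node built above is typically selected to one of the MSR pstate pseudos
// named earlier (MSRpstatesvcrImm1 or, when conditional on PStateSM,
// MSRpstatePseudo), which ultimately become smstart sm / smstop sm; the
// attached register mask models the registers clobbered by the mode switch.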
7770
7771static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7772 const SMEAttrs &CalleeAttrs) {
7773 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7774 CallerAttrs.hasStreamingBody())
7775 return AArch64SME::Always;
7776 if (CalleeAttrs.hasNonStreamingInterface())
7777 return AArch64SME::IfCallerIsStreaming;
7778 if (CalleeAttrs.hasStreamingInterface())
7779 return AArch64SME::IfCallerIsNonStreaming;
7780
7781 llvm_unreachable("Unsupported attributes");
7782}
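// For example, a streaming-compatible caller calling a streaming callee
// returns IfCallerIsNonStreaming: the smstart is only needed when the caller
// happened to be in non-streaming mode at run time (as recorded in PStateSM).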
7783
7784/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7785/// and add input and output parameter nodes.
7786SDValue
7787AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7788 SmallVectorImpl<SDValue> &InVals) const {
7789 SelectionDAG &DAG = CLI.DAG;
7790 SDLoc &DL = CLI.DL;
7791 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7792 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7793 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7794 SDValue Chain = CLI.Chain;
7795 SDValue Callee = CLI.Callee;
7796 bool &IsTailCall = CLI.IsTailCall;
7797 CallingConv::ID &CallConv = CLI.CallConv;
7798 bool IsVarArg = CLI.IsVarArg;
7799
7800 MachineFunction &MF = DAG.getMachineFunction();
7801 MachineFunction::CallSiteInfo CSInfo;
7802 bool IsThisReturn = false;
7803
7804 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7805 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7806 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7807 bool IsSibCall = false;
7808 bool GuardWithBTI = false;
7809
7810 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7811 !Subtarget->noBTIAtReturnTwice()) {
7812 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7813 }
7814
7815 // Analyze operands of the call, assigning locations to each operand.
7816 SmallVector<CCValAssign, 16> ArgLocs;
7817 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7818
7819 if (IsVarArg) {
7820 unsigned NumArgs = Outs.size();
7821
7822 for (unsigned i = 0; i != NumArgs; ++i) {
7823 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7824 report_fatal_error(reason: "Passing SVE types to variadic functions is "
7825 "currently not supported");
7826 }
7827 }
7828
7829 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
7830
7831 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
7832 // Assign locations to each value returned by this call.
7833 SmallVector<CCValAssign, 16> RVLocs;
7834 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7835 *DAG.getContext());
7836 RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
7837
7838 // Check callee args/returns for SVE registers and set calling convention
7839 // accordingly.
7840 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7841 auto HasSVERegLoc = [](CCValAssign &Loc) {
7842 if (!Loc.isRegLoc())
7843 return false;
7844 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7845 AArch64::PPRRegClass.contains(Loc.getLocReg());
7846 };
7847 if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc))
7848 CallConv = CallingConv::AArch64_SVE_VectorCall;
7849 }
7850
7851 if (IsTailCall) {
7852 // Check if it's really possible to do a tail call.
7853 IsTailCall = isEligibleForTailCallOptimization(CLI);
7854
7855 // A sibling call is one where we're under the usual C ABI and not planning
7856 // to change that but can still do a tail call:
7857 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7858 CallConv != CallingConv::SwiftTail)
7859 IsSibCall = true;
7860
7861 if (IsTailCall)
7862 ++NumTailCalls;
7863 }
7864
7865 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7866 report_fatal_error(reason: "failed to perform tail call elimination on a call "
7867 "site marked musttail");
7868
7869 // Get a count of how many bytes are to be pushed on the stack.
7870 unsigned NumBytes = CCInfo.getStackSize();
7871
7872 if (IsSibCall) {
7873 // Since we're not changing the ABI to make this a tail call, the memory
7874 // operands are already available in the caller's incoming argument space.
7875 NumBytes = 0;
7876 }
7877
7878 // FPDiff is the byte offset of the call's argument area from the callee's.
7879 // Stores to callee stack arguments will be placed in FixedStackSlots offset
7880 // by this amount for a tail call. In a sibling call it must be 0 because the
7881 // caller will deallocate the entire stack and the callee still expects its
7882 // arguments to begin at SP+0. Completely unused for non-tail calls.
7883 int FPDiff = 0;
7884
7885 if (IsTailCall && !IsSibCall) {
7886 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
7887
7888 // Since callee will pop argument stack as a tail call, we must keep the
7889 // popped size 16-byte aligned.
7890 NumBytes = alignTo(Value: NumBytes, Align: 16);
7891
7892 // FPDiff will be negative if this tail call requires more space than we
7893 // would automatically have in our incoming argument space. Positive if we
7894 // can actually shrink the stack.
7895 FPDiff = NumReusableBytes - NumBytes;
7896
7897 // Update the required reserved area if this is the tail call requiring the
7898 // most argument stack space.
7899 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
7900 FuncInfo->setTailCallReservedStack(-FPDiff);
7901
7902 // The stack pointer must be 16-byte aligned at all times it's used for a
7903 // memory operation, which in practice means at *all* times and in
7904 // particular across call boundaries. Therefore our own arguments started at
7905 // a 16-byte aligned SP and the delta applied for the tail call should
7906 // satisfy the same constraint.
7907 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
7908 }
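  // For example, if the caller's own incoming argument area is 32 bytes but
  // this tail call needs 48 bytes of stack arguments, FPDiff is -16 and the
  // function must reserve an extra 16 bytes (kept 16-byte aligned) for the
  // outgoing arguments of the tail call.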
7909
7910 // Determine whether we need any streaming mode changes.
7911 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
7912 if (CLI.CB)
7913 CalleeAttrs = SMEAttrs(*CLI.CB);
7914 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
7915 CalleeAttrs = SMEAttrs(ES->getSymbol());
7916
7917 auto DescribeCallsite =
7918 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
7919 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
7920 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
7921 R << ore::NV("Callee", ES->getSymbol());
7922 else if (CLI.CB && CLI.CB->getCalledFunction())
7923 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
7924 else
7925 R << "unknown callee";
7926 R << "'";
7927 return R;
7928 };
7929
7930 bool RequiresLazySave = CallerAttrs.requiresLazySave(Callee: CalleeAttrs);
7931 if (RequiresLazySave) {
7932 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
7933 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, Offset: TPIDR2Obj);
7934 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(FI: TPIDR2Obj,
7935 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
7936 SDValue NumZaSaveSlicesAddr =
7937 DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr,
7938 N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType()));
7939 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7940 DAG.getConstant(1, DL, MVT::i32));
7941 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
7942 MPI, MVT::i16);
7943 Chain = DAG.getNode(
7944 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7945 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7946 TPIDR2ObjAddr);
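    // With TPIDR2_EL0 now pointing at the TPIDR2 block, any callee that uses
    // ZA is expected (per the SME lazy-save protocol) to dump ZA into the
    // buffer and clear TPIDR2_EL0, so the caller can lazily restore ZA after
    // the call.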
7947 OptimizationRemarkEmitter ORE(&MF.getFunction());
7948 ORE.emit(RemarkBuilder: [&]() {
7949 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7950 CLI.CB)
7951 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
7952 &MF.getFunction());
7953 return DescribeCallsite(R) << " sets up a lazy save for ZA";
7954 });
7955 }
7956
7957 SDValue PStateSM;
7958 bool RequiresSMChange = CallerAttrs.requiresSMChange(Callee: CalleeAttrs);
7959 if (RequiresSMChange) {
7960 if (CallerAttrs.hasStreamingInterfaceOrBody())
7961 PStateSM = DAG.getConstant(1, DL, MVT::i64);
7962 else if (CallerAttrs.hasNonStreamingInterface())
7963 PStateSM = DAG.getConstant(0, DL, MVT::i64);
7964 else
7965 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7966 OptimizationRemarkEmitter ORE(&MF.getFunction());
7967 ORE.emit(RemarkBuilder: [&]() {
7968 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
7969 CLI.CB)
7970 : OptimizationRemarkAnalysis("sme", "SMETransition",
7971 &MF.getFunction());
7972 DescribeCallsite(R) << " requires a streaming mode transition";
7973 return R;
7974 });
7975 }
7976
7977 SDValue ZTFrameIdx;
7978 MachineFrameInfo &MFI = MF.getFrameInfo();
7979 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs);
7980
7981 // If the caller has ZT0 state which will not be preserved by the callee,
7982 // spill ZT0 before the call.
7983 if (ShouldPreserveZT0) {
7984 unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16));
7985 ZTFrameIdx = DAG.getFrameIndex(
7986 FI: ZTObj,
7987 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
7988
7989 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
7990 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
7991 }
7992
7993 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
7994 // PSTATE.ZA before the call if there is no lazy-save active.
7995 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(Callee: CalleeAttrs);
7996 assert((!DisableZA || !RequiresLazySave) &&
7997 "Lazy-save should have PSTATE.SM=1 on entry to the function");
7998
7999 if (DisableZA)
8000 Chain = DAG.getNode(
8001 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8002 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8003 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8004
8005 // Adjust the stack pointer for the new arguments...
8006 // These operations are automatically eliminated by the prolog/epilog pass
8007 if (!IsSibCall)
8008 Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL);
8009
8010 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8011 getPointerTy(DAG.getDataLayout()));
8012
8013 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8014 SmallSet<unsigned, 8> RegsUsed;
8015 SmallVector<SDValue, 8> MemOpChains;
8016 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
8017
8018 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8019 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8020 for (const auto &F : Forwards) {
8021 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT);
8022 RegsToPass.emplace_back(Args: F.PReg, Args&: Val);
8023 }
8024 }
8025
8026 // Walk the register/memloc assignments, inserting copies/loads.
8027 unsigned ExtraArgLocs = 0;
8028 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8029 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8030 SDValue Arg = OutVals[i];
8031 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8032
8033 // Promote the value if needed.
8034 switch (VA.getLocInfo()) {
8035 default:
8036 llvm_unreachable("Unknown loc info!");
8037 case CCValAssign::Full:
8038 break;
8039 case CCValAssign::SExt:
8040 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8041 break;
8042 case CCValAssign::ZExt:
8043 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8044 break;
8045 case CCValAssign::AExt:
8046 if (Outs[i].ArgVT == MVT::i1) {
8047 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8048 //
8049 // Check if we actually have to do this, because the value may
8050 // already be zero-extended.
8051 //
8052 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8053 // and rely on DAGCombiner to fold this, because the following
8054 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8055 //
8056 // (ext (zext x)) -> (zext x)
8057 //
8058 // This will give us (zext i32), which we cannot remove, so
8059 // try to check this beforehand.
8060 if (!checkZExtBool(Arg, DAG)) {
8061 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8062 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8063 }
8064 }
8065 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8066 break;
8067 case CCValAssign::AExtUpper:
8068 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8069 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8070 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
8071 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8072 break;
8073 case CCValAssign::BCvt:
8074 Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg);
8075 break;
8076 case CCValAssign::Trunc:
8077 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8078 break;
8079 case CCValAssign::FPExt:
8080 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8081 break;
8082 case CCValAssign::Indirect:
8083 bool isScalable = VA.getValVT().isScalableVT();
8084 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8085 "Indirect arguments should be scalable on most subtargets");
8086
8087 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8088 uint64_t PartSize = StoreSize;
8089 unsigned NumParts = 1;
8090 if (Outs[i].Flags.isInConsecutiveRegs()) {
8091 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8092 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8093 ++NumParts;
8094 StoreSize *= NumParts;
8095 }
8096
8097 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext());
8098 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8099 MachineFrameInfo &MFI = MF.getFrameInfo();
8100 int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false);
8101 if (isScalable)
8102 MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector);
8103
8104 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8105 SDValue Ptr = DAG.getFrameIndex(
8106 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8107 SDValue SpillSlot = Ptr;
8108
8109 // Ensure we generate all stores for each tuple part, whilst updating the
8110 // pointer after each store correctly using vscale.
8111 while (NumParts) {
8112 SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI);
8113 MemOpChains.push_back(Elt: Store);
8114
8115 NumParts--;
8116 if (NumParts > 0) {
8117 SDValue BytesIncrement;
8118 if (isScalable) {
8119 BytesIncrement = DAG.getVScale(
8120 DL, VT: Ptr.getValueType(),
8121 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8122 } else {
8123 BytesIncrement = DAG.getConstant(
8124 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8125 VT: Ptr.getValueType());
8126 }
8127 SDNodeFlags Flags;
8128 Flags.setNoUnsignedWrap(true);
8129
8130 MPI = MachinePointerInfo(MPI.getAddrSpace());
8131 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
8132 N2: BytesIncrement, Flags);
8133 ExtraArgLocs++;
8134 i++;
8135 }
8136 }
8137
8138 Arg = SpillSlot;
8139 break;
8140 }
8141
8142 if (VA.isRegLoc()) {
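 // An argument with the 'returned' attribute is passed back unchanged by
 // the callee, so the call result can reuse the outgoing X0 copy (see the
 // 'this'-return register mask and LowerCallResult below).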
8143 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8144 Outs[0].VT == MVT::i64) {
8145 assert(VA.getLocVT() == MVT::i64 &&
8146 "unexpected calling convention register assignment");
8147 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8148 "unexpected use of 'returned'");
8149 IsThisReturn = true;
8150 }
8151 if (RegsUsed.count(V: VA.getLocReg())) {
8152 // If this register has already been used then we're trying to pack
8153 // parts of an [N x i32] into an X-register. The extension type will
8154 // take care of putting the two halves in the right place but we have to
8155 // combine them.
8156 SDValue &Bits =
8157 llvm::find_if(Range&: RegsToPass,
8158 P: [=](const std::pair<unsigned, SDValue> &Elt) {
8159 return Elt.first == VA.getLocReg();
8160 })
8161 ->second;
8162 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
8163 // Call site info is used for entry-value tracking of the function's
8164 // parameters. For now we only track the simple cases where a parameter
8165 // is transferred through a whole register.
8166 llvm::erase_if(C&: CSInfo.ArgRegPairs,
8167 P: [&VA](MachineFunction::ArgRegPair ArgReg) {
8168 return ArgReg.Reg == VA.getLocReg();
8169 });
8170 } else {
8171 // Add an extra level of indirection for streaming mode changes by
8172 // using a pseudo copy node that cannot be rematerialised between a
8173 // smstart/smstop and the call by the simple register coalescer.
8174 if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType()))
8175 Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8176 VT: Arg.getValueType(), Operand: Arg);
8177 RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg);
8178 RegsUsed.insert(V: VA.getLocReg());
8179 const TargetOptions &Options = DAG.getTarget().Options;
8180 if (Options.EmitCallSiteInfo)
8181 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
8182 }
8183 } else {
8184 assert(VA.isMemLoc());
8185
8186 SDValue DstAddr;
8187 MachinePointerInfo DstInfo;
8188
8189 // FIXME: This works on big-endian for composite byvals, which are the
8190 // common case. It should also work for fundamental types.
8191 uint32_t BEAlign = 0;
8192 unsigned OpSize;
8193 if (VA.getLocInfo() == CCValAssign::Indirect ||
8194 VA.getLocInfo() == CCValAssign::Trunc)
8195 OpSize = VA.getLocVT().getFixedSizeInBits();
8196 else
8197 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8198 : VA.getValVT().getSizeInBits();
8199 OpSize = (OpSize + 7) / 8;
8200 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8201 !Flags.isInConsecutiveRegs()) {
8202 if (OpSize < 8)
8203 BEAlign = 8 - OpSize;
8204 }
8205 unsigned LocMemOffset = VA.getLocMemOffset();
8206 int32_t Offset = LocMemOffset + BEAlign;
8207 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8208 PtrOff = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8209
8210 if (IsTailCall) {
8211 Offset = Offset + FPDiff;
8212 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
8213
8214 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
8215 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8216
8217 // Make sure any stack arguments overlapping with where we're storing
8218 // are loaded before this eventual operation. Otherwise they'll be
8219 // clobbered.
8220 Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI);
8221 } else {
8222 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
8223
8224 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
8225 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
8226 }
8227
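 // byval arguments are copied onto the stack as a whole aggregate; anything
 // else is stored directly, truncating back to the original small integer
 // type first if necessary.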
8228 if (Outs[i].Flags.isByVal()) {
8229 SDValue SizeNode =
8230 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8231 SDValue Cpy = DAG.getMemcpy(
8232 Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
8233 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
8234 /*isVol = */ false, /*AlwaysInline = */ false,
8235 /*isTailCall = */ false, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo());
8236
8237 MemOpChains.push_back(Elt: Cpy);
8238 } else {
8239 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8240 // promoted to a legal register type i32, we should truncate Arg back to
8241 // i1/i8/i16.
8242 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8243 VA.getValVT() == MVT::i16)
8244 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg);
8245
8246 SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
8247 MemOpChains.push_back(Elt: Store);
8248 }
8249 }
8250 }
8251
8252 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8253 SDValue ParamPtr = StackPtr;
8254 if (IsTailCall) {
8255 // Create a dummy object at the top of the stack that can be used to get
8256 // the SP after the epilogue
8257 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true);
8258 ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT);
8259 }
8260
8261 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8262 // describing the argument list. x4 contains the address of the
8263 // first stack parameter. x5 contains the size in bytes of all parameters
8264 // passed on the stack.
8265 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8266 RegsToPass.emplace_back(AArch64::X5,
8267 DAG.getConstant(NumBytes, DL, MVT::i64));
8268 }
8269
8270 if (!MemOpChains.empty())
8271 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8272
8273 SDValue InGlue;
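 // If the callee's streaming-SVE interface differs from the caller's, switch
 // the streaming mode before copying arguments into registers and making the
 // call; it is switched back after the call below.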
8274 if (RequiresSMChange) {
8275 SDValue NewChain = changeStreamingMode(
8276 DAG, DL, Enable: CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8277 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8278 Chain = NewChain.getValue(R: 0);
8279 InGlue = NewChain.getValue(R: 1);
8280 }
8281
8282 // Build a sequence of copy-to-reg nodes chained together with token chain
8283 // and flag operands which copy the outgoing args into the appropriate regs.
8284 for (auto &RegToPass : RegsToPass) {
8285 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
8286 N: RegToPass.second, Glue: InGlue);
8287 InGlue = Chain.getValue(R: 1);
8288 }
8289
8290 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8291 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8292 // node so that legalize doesn't hack it.
8293 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8294 auto GV = G->getGlobal();
8295 unsigned OpFlags =
8296 Subtarget->classifyGlobalFunctionReference(GV, TM: getTargetMachine());
8297 if (OpFlags & AArch64II::MO_GOT) {
8298 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8299 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8300 } else {
8301 const GlobalValue *GV = G->getGlobal();
8302 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
8303 }
8304 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
8305 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8306 Subtarget->isTargetMachO()) ||
8307 MF.getFunction().getParent()->getRtLibUseGOT();
8308 const char *Sym = S->getSymbol();
8309 if (UseGot) {
8310 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT);
8311 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
8312 } else {
8313 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0);
8314 }
8315 }
8316
8317 // We don't usually want to end the call-sequence here because we would tidy
8318 // the frame up *after* the call. However, in the ABI-changing tail-call case
8319 // we've carefully laid out the parameters so that when sp is reset they'll be
8320 // in the correct location.
8321 if (IsTailCall && !IsSibCall) {
8322 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL);
8323 InGlue = Chain.getValue(R: 1);
8324 }
8325
8326 std::vector<SDValue> Ops;
8327 Ops.push_back(x: Chain);
8328 Ops.push_back(x: Callee);
8329
8330 if (IsTailCall) {
8331 // Each tail call may have to adjust the stack by a different amount, so
8332 // this information must travel along with the operation for eventual
8333 // consumption by emitEpilogue.
8334 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8335 }
8336
8337 // Add argument registers to the end of the list so that they are known live
8338 // into the call.
8339 for (auto &RegToPass : RegsToPass)
8340 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
8341 VT: RegToPass.second.getValueType()));
8342
8343 // Add a register mask operand representing the call-preserved registers.
8344 const uint32_t *Mask;
8345 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8346 if (IsThisReturn) {
8347 // For 'this' returns, use the X0-preserving mask if applicable
8348 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8349 if (!Mask) {
8350 IsThisReturn = false;
8351 Mask = TRI->getCallPreservedMask(MF, CallConv);
8352 }
8353 } else
8354 Mask = TRI->getCallPreservedMask(MF, CallConv);
8355
8356 if (Subtarget->hasCustomCallingConv())
8357 TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask);
8358
8359 if (TRI->isAnyArgRegReserved(MF))
8360 TRI->emitReservedArgRegCallError(MF);
8361
8362 assert(Mask && "Missing call preserved mask for calling convention");
8363 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
8364
8365 if (InGlue.getNode())
8366 Ops.push_back(x: InGlue);
8367
8368 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8369
8370 // If we're doing a tail call, use a TC_RETURN here rather than an
8371 // actual call instruction.
8372 if (IsTailCall) {
8373 MF.getFrameInfo().setHasTailCall();
8374 SDValue Ret = DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VTList: NodeTys, Ops);
8375
8376 if (IsCFICall)
8377 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8378
8379 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
8380 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
8381 return Ret;
8382 }
8383
8384 unsigned CallOpc = AArch64ISD::CALL;
8385 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8386 // be expanded to the call, directly followed by a special marker sequence and
8387 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8388 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
8389 assert(!IsTailCall &&
8390 "tail calls cannot be marked with clang.arc.attachedcall");
8391 CallOpc = AArch64ISD::CALL_RVMARKER;
8392
8393 // Add a target global address for the retainRV/claimRV runtime function
8394 // just before the call target.
8395 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
8396 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT);
8397 Ops.insert(position: Ops.begin() + 1, x: GA);
8398 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8399 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8400 } else if (GuardWithBTI) {
8401 CallOpc = AArch64ISD::CALL_BTI;
8402 }
8403
8404 // Returns a chain and a flag for retval copy to use.
8405 Chain = DAG.getNode(Opcode: CallOpc, DL, VTList: NodeTys, Ops);
8406
8407 if (IsCFICall)
8408 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8409
8410 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
8411 InGlue = Chain.getValue(R: 1);
8412 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
8413
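 // If the calling convention makes the callee responsible for popping its own
 // stack arguments (e.g. fastcc with guaranteed tail-call optimisation),
 // report the 16-byte-aligned amount to CALLSEQ_END.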
8414 uint64_t CalleePopBytes =
8415 DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0;
8416
8417 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL);
8418 InGlue = Chain.getValue(R: 1);
8419
8420 // Handle result values, copying them out of physregs into vregs that we
8421 // return.
8422 SDValue Result = LowerCallResult(
8423 Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn,
8424 ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8425
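 // Thread the glue from the result copies into the streaming-mode change
 // below so that it is ordered after them.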
8426 if (!Ins.empty())
8427 InGlue = Result.getValue(R: Result->getNumValues() - 1);
8428
8429 if (RequiresSMChange) {
8430 assert(PStateSM && "Expected a PStateSM to be set");
8431 Result = changeStreamingMode(
8432 DAG, DL, Enable: !CalleeAttrs.hasStreamingInterface(), Chain: Result, InGlue,
8433 Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8434 }
8435
8436 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8437 // Unconditionally resume ZA.
8438 Result = DAG.getNode(
8439 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8440 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8441 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8442
8443 if (ShouldPreserveZT0)
8444 Result =
8445 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8446 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8447
8448 if (RequiresLazySave) {
8449 // Conditionally restore the lazy save using a pseudo node.
8450 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8451 SDValue RegMask = DAG.getRegisterMask(
8452 RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8453 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8454 Sym: "__arm_tpidr2_restore", VT: getPointerTy(DL: DAG.getDataLayout()));
8455 SDValue TPIDR2_EL0 = DAG.getNode(
8456 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8457 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8458
8459 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8460 // RESTORE_ZA pseudo.
8461 SDValue Glue;
8462 SDValue TPIDR2Block = DAG.getFrameIndex(
8463 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8464 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8465 Result =
8466 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8467 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8468 RestoreRoutine, RegMask, Result.getValue(1)});
8469
8470 // Finally reset the TPIDR2_EL0 register to 0.
8471 Result = DAG.getNode(
8472 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8473 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8474 DAG.getConstant(0, DL, MVT::i64));
8475 }
8476
8477 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8478 for (unsigned I = 0; I < InVals.size(); ++I) {
8479 // The smstart/smstop is chained as part of the call, but when the
8480 // resulting chain is discarded (which happens when the call is not part
8481 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8482 // smstart/smstop is chained to the result value. We can do that by doing
8483 // a vreg -> vreg copy.
8484 Register Reg = MF.getRegInfo().createVirtualRegister(
8485 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
8486 SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]);
8487 InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg,
8488 VT: InVals[I].getValueType());
8489 }
8490 }
8491
8492 return Result;
8493}
8494
8495bool AArch64TargetLowering::CanLowerReturn(
8496 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8497 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8498 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8499 SmallVector<CCValAssign, 16> RVLocs;
8500 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8501 return CCInfo.CheckReturn(Outs, Fn: RetCC);
8502}
8503
8504SDValue
8505AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8506 bool isVarArg,
8507 const SmallVectorImpl<ISD::OutputArg> &Outs,
8508 const SmallVectorImpl<SDValue> &OutVals,
8509 const SDLoc &DL, SelectionDAG &DAG) const {
8510 auto &MF = DAG.getMachineFunction();
8511 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8512
8513 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8514 SmallVector<CCValAssign, 16> RVLocs;
8515 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8516 CCInfo.AnalyzeReturn(Outs, Fn: RetCC);
8517
8518 // Copy the result values into the output registers.
8519 SDValue Glue;
8520 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8521 SmallSet<unsigned, 4> RegsUsed;
8522 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8523 ++i, ++realRVLocIdx) {
8524 CCValAssign &VA = RVLocs[i];
8525 assert(VA.isRegLoc() && "Can only return in registers!");
8526 SDValue Arg = OutVals[realRVLocIdx];
8527
8528 switch (VA.getLocInfo()) {
8529 default:
8530 llvm_unreachable("Unknown loc info!");
8531 case CCValAssign::Full:
8532 if (Outs[i].ArgVT == MVT::i1) {
8533 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8534 // value. This is strictly redundant on Darwin (which uses "zeroext
8535 // i1"), but will be optimised out before ISel.
8536 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8537 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
8538 }
8539 break;
8540 case CCValAssign::BCvt:
8541 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
8542 break;
8543 case CCValAssign::AExt:
8544 case CCValAssign::ZExt:
8545 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8546 break;
8547 case CCValAssign::AExtUpper:
8548 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8549 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
8550 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
8551 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8552 break;
8553 }
8554
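 // As in LowerCall, a repeated location register means parts of an
 // [N x i32] return are being packed into one X register; OR the pieces
 // together.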
8555 if (RegsUsed.count(V: VA.getLocReg())) {
8556 SDValue &Bits =
8557 llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) {
8558 return Elt.first == VA.getLocReg();
8559 })->second;
8560 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
8561 } else {
8562 RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg);
8563 RegsUsed.insert(V: VA.getLocReg());
8564 }
8565 }
8566
8567 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8568
8569 // Emit SMSTOP before returning from a locally streaming function
8570 SMEAttrs FuncAttrs(MF.getFunction());
8571 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8572 if (FuncAttrs.hasStreamingCompatibleInterface()) {
8573 Register Reg = FuncInfo->getPStateSMReg();
8574 assert(Reg.isValid() && "PStateSM Register is invalid");
8575 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8576 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8577 /*Glue*/ InGlue: SDValue(),
8578 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
8579 } else
8580 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8581 /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always);
8582 Glue = Chain.getValue(R: 1);
8583 }
8584
8585 SmallVector<SDValue, 4> RetOps(1, Chain);
8586 for (auto &RetVal : RetVals) {
8587 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8588 isPassedInFPR(VT: RetVal.second.getValueType()))
8589 RetVal.second = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8590 VT: RetVal.second.getValueType(), Operand: RetVal.second);
8591 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue);
8592 Glue = Chain.getValue(R: 1);
8593 RetOps.push_back(
8594 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
8595 }
8596
8597 // Windows AArch64 ABIs require that for returning structs by value we copy
8598 // the sret argument into X0 for the return.
8599 // We saved the argument into a virtual register in the entry block,
8600 // so now we copy the value out and into X0.
8601 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8602 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg,
8603 VT: getPointerTy(DL: MF.getDataLayout()));
8604
8605 unsigned RetValReg = AArch64::X0;
8606 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8607 RetValReg = AArch64::X8;
8608 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue);
8609 Glue = Chain.getValue(R: 1);
8610
8611 RetOps.push_back(
8612 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
8613 }
8614
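 // Make any callee-saved registers that were preserved via copies implicit
 // operands of the return so they remain live until the return.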
8615 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF);
8616 if (I) {
8617 for (; *I; ++I) {
8618 if (AArch64::GPR64RegClass.contains(*I))
8619 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8620 else if (AArch64::FPR64RegClass.contains(*I))
8621 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
8622 else
8623 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8624 }
8625 }
8626
8627 RetOps[0] = Chain; // Update chain.
8628
8629 // Add the glue if we have it.
8630 if (Glue.getNode())
8631 RetOps.push_back(Elt: Glue);
8632
8633 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8634 // ARM64EC entry thunks use a special return sequence: instead of a regular
8635 // "ret" instruction, they need to explicitly call the emulator.
8636 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8637 SDValue Arm64ECRetDest =
8638 DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret", VT: PtrVT);
8639 Arm64ECRetDest =
8640 getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0);
8641 Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest,
8642 PtrInfo: MachinePointerInfo());
8643 RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest);
8644 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8645 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8646 }
8647
8648 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8649}
8650
8651//===----------------------------------------------------------------------===//
8652// Other Lowering Code
8653//===----------------------------------------------------------------------===//
8654
8655SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8656 SelectionDAG &DAG,
8657 unsigned Flag) const {
8658 return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty,
8659 offset: N->getOffset(), TargetFlags: Flag);
8660}
8661
8662SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8663 SelectionDAG &DAG,
8664 unsigned Flag) const {
8665 return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag);
8666}
8667
8668SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8669 SelectionDAG &DAG,
8670 unsigned Flag) const {
8671 return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(),
8672 Offset: N->getOffset(), TargetFlags: Flag);
8673}
8674
8675SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8676 SelectionDAG &DAG,
8677 unsigned Flag) const {
8678 return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag);
8679}
8680
8681SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8682 SelectionDAG &DAG,
8683 unsigned Flag) const {
8684 return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag);
8685}
8686
8687// (loadGOT sym)
8688template <class NodeTy>
8689SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8690 unsigned Flags) const {
8691 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8692 SDLoc DL(N);
8693 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8694 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8695 // FIXME: Once remat is capable of dealing with instructions with register
8696 // operands, expand this into two nodes instead of using a wrapper node.
8697 return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr);
8698}
8699
8700// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8701template <class NodeTy>
8702SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8703 unsigned Flags) const {
8704 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8705 SDLoc DL(N);
8706 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8707 const unsigned char MO_NC = AArch64II::MO_NC;
8708 return DAG.getNode(
8709 AArch64ISD::WrapperLarge, DL, Ty,
8710 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8711 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8712 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8713 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8714}
8715
8716// (addlow (adrp %hi(sym)) %lo(sym))
8717template <class NodeTy>
8718SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8719 unsigned Flags) const {
8720 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8721 SDLoc DL(N);
8722 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8723 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8724 SDValue Lo = getTargetNode(N, Ty, DAG,
8725 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8726 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi);
8727 return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo);
8728}
8729
8730// (adr sym)
8731template <class NodeTy>
8732SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8733 unsigned Flags) const {
8734 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8735 SDLoc DL(N);
8736 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
8737 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8738 return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym);
8739}
8740
8741SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8742 SelectionDAG &DAG) const {
8743 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op);
8744 const GlobalValue *GV = GN->getGlobal();
8745 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine());
8746
8747 if (OpFlags != AArch64II::MO_NO_FLAG)
8748 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8749 "unexpected offset in global node");
8750
8751 // This also catches the large code model case for Darwin, and tiny code
8752 // model with got relocations.
8753 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8754 return getGOT(N: GN, DAG, Flags: OpFlags);
8755 }
8756
8757 SDValue Result;
8758 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8759 !getTargetMachine().isPositionIndependent()) {
8760 Result = getAddrLarge(N: GN, DAG, Flags: OpFlags);
8761 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8762 Result = getAddrTiny(N: GN, DAG, Flags: OpFlags);
8763 } else {
8764 Result = getAddr(N: GN, DAG, Flags: OpFlags);
8765 }
8766 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8767 SDLoc DL(GN);
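 // dllimport and COFF-stub globals are referenced through a pointer, so an
 // extra load is needed to obtain the global's real address.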
8768 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8769 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
8770 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
8771 return Result;
8772}
8773
8774/// Convert a TLS address reference into the correct sequence of loads
8775/// and calls to compute the variable's address (for Darwin, currently) and
8776/// return an SDValue containing the final node.
8777///
8778/// Darwin only has one TLS scheme which must be capable of dealing with the
8779/// fully general situation, in the worst case. This means:
8780/// + "extern __thread" declaration.
8781/// + Defined in a possibly unknown dynamic library.
8782///
8783/// The general system is that each __thread variable has a [3 x i64] descriptor
8784/// which contains information used by the runtime to calculate the address. The
8785/// only part of this the compiler needs to know about is the first xword, which
8786/// contains a function pointer that must be called with the address of the
8787/// entire descriptor in "x0".
8788///
8789/// Since this descriptor may be in a different unit, in general even the
8790/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8791/// is:
8792/// adrp x0, _var@TLVPPAGE
8793/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8794/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8795/// ; the function pointer
8796/// blr x1 ; Uses descriptor address in x0
8797/// ; Address of _var is now in x0.
8798///
8799/// If the address of _var's descriptor *is* known to the linker, then it can
8800/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8801/// a slight efficiency gain.
8802SDValue
8803AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8804 SelectionDAG &DAG) const {
8805 assert(Subtarget->isTargetDarwin() &&
8806 "This function expects a Darwin target");
8807
8808 SDLoc DL(Op);
8809 MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8810 MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
8811 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
8812
8813 SDValue TLVPAddr =
8814 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
8815 SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr);
8816
8817 // The first entry in the descriptor is a function pointer that we must call
8818 // to obtain the address of the variable.
8819 SDValue Chain = DAG.getEntryNode();
8820 SDValue FuncTLVGet = DAG.getLoad(
8821 VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr,
8822 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()),
8823 Alignment: Align(PtrMemVT.getSizeInBits() / 8),
8824 MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8825 Chain = FuncTLVGet.getValue(R: 1);
8826
8827 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8828 FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT);
8829
8830 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8831 MFI.setAdjustsStack(true);
8832
8833 // TLS calls preserve all registers except those that absolutely must be
8834 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8835 // silly).
8836 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8837 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8838 if (Subtarget->hasCustomCallingConv())
8839 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
8840
8841 // Finally, we can make the call. This is just a degenerate version of a
8842 // normal AArch64 call node: x0 takes the address of the descriptor, and
8843 // returns the address of the variable in this thread.
8844 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8845 Chain =
8846 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8847 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8848 DAG.getRegisterMask(Mask), Chain.getValue(1));
8849 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8850}
8851
8852/// Convert a thread-local variable reference into a sequence of instructions to
8853/// compute the variable's address for the local exec TLS model of ELF targets.
8854/// The sequence depends on the maximum TLS area size.
8855SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8856 SDValue ThreadBase,
8857 const SDLoc &DL,
8858 SelectionDAG &DAG) const {
8859 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8860 SDValue TPOff, Addr;
8861
8862 switch (DAG.getTarget().Options.TLSSize) {
8863 default:
8864 llvm_unreachable("Unexpected TLS size");
8865
8866 case 12: {
8867 // mrs x0, TPIDR_EL0
8868 // add x0, x0, :tprel_lo12:a
8869 SDValue Var = DAG.getTargetGlobalAddress(
8870 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8871 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8872 Var,
8873 DAG.getTargetConstant(0, DL, MVT::i32)),
8874 0);
8875 }
8876
8877 case 24: {
8878 // mrs x0, TPIDR_EL0
8879 // add x0, x0, :tprel_hi12:a
8880 // add x0, x0, :tprel_lo12_nc:a
8881 SDValue HiVar = DAG.getTargetGlobalAddress(
8882 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
8883 SDValue LoVar = DAG.getTargetGlobalAddress(
8884 GV, DL, VT: PtrVT, offset: 0,
8885 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8886 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8887 HiVar,
8888 DAG.getTargetConstant(0, DL, MVT::i32)),
8889 0);
8890 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
8891 LoVar,
8892 DAG.getTargetConstant(0, DL, MVT::i32)),
8893 0);
8894 }
8895
8896 case 32: {
8897 // mrs x1, TPIDR_EL0
8898 // movz x0, #:tprel_g1:a
8899 // movk x0, #:tprel_g0_nc:a
8900 // add x0, x1, x0
8901 SDValue HiVar = DAG.getTargetGlobalAddress(
8902 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1);
8903 SDValue LoVar = DAG.getTargetGlobalAddress(
8904 GV, DL, VT: PtrVT, offset: 0,
8905 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8906 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8907 DAG.getTargetConstant(16, DL, MVT::i32)),
8908 0);
8909 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8910 DAG.getTargetConstant(0, DL, MVT::i32)),
8911 0);
8912 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
8913 }
8914
8915 case 48: {
8916 // mrs x1, TPIDR_EL0
8917 // movz x0, #:tprel_g2:a
8918 // movk x0, #:tprel_g1_nc:a
8919 // movk x0, #:tprel_g0_nc:a
8920 // add x0, x1, x0
8921 SDValue HiVar = DAG.getTargetGlobalAddress(
8922 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2);
8923 SDValue MiVar = DAG.getTargetGlobalAddress(
8924 GV, DL, VT: PtrVT, offset: 0,
8925 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
8926 SDValue LoVar = DAG.getTargetGlobalAddress(
8927 GV, DL, VT: PtrVT, offset: 0,
8928 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
8929 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
8930 DAG.getTargetConstant(32, DL, MVT::i32)),
8931 0);
8932 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
8933 DAG.getTargetConstant(16, DL, MVT::i32)),
8934 0);
8935 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
8936 DAG.getTargetConstant(0, DL, MVT::i32)),
8937 0);
8938 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
8939 }
8940 }
8941}
8942
8943/// When accessing thread-local variables under either the general-dynamic or
8944/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
8945/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
8946/// is a function pointer to carry out the resolution.
8947///
8948/// The sequence is:
8949/// adrp x0, :tlsdesc:var
8950/// ldr x1, [x0, #:tlsdesc_lo12:var]
8951/// add x0, x0, #:tlsdesc_lo12:var
8952/// .tlsdesccall var
8953/// blr x1
8954/// (TPIDR_EL0 offset now in x0)
8955///
8956/// The above sequence must be produced unscheduled, to enable the linker to
8957/// optimize/relax this sequence.
8958/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
8959/// above sequence, and expanded really late in the compilation flow, to ensure
8960/// the sequence is produced as per above.
8961SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
8962 const SDLoc &DL,
8963 SelectionDAG &DAG) const {
8964 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
8965
8966 SDValue Chain = DAG.getEntryNode();
8967 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8968
8969 Chain =
8970 DAG.getNode(Opcode: AArch64ISD::TLSDESC_CALLSEQ, DL, VTList: NodeTys, Ops: {Chain, SymAddr});
8971 SDValue Glue = Chain.getValue(R: 1);
8972
8973 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
8974}
8975
8976SDValue
8977AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
8978 SelectionDAG &DAG) const {
8979 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
8980
8981 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
8982
8983 TLSModel::Model Model = getTargetMachine().getTLSModel(GV: GA->getGlobal());
8984
8985 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
8986 if (Model == TLSModel::LocalDynamic)
8987 Model = TLSModel::GeneralDynamic;
8988 }
8989
8990 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8991 Model != TLSModel::LocalExec)
8992 report_fatal_error(reason: "ELF TLS only supported in small memory model or "
8993 "in local exec TLS model");
8994 // Different choices can be made for the maximum size of the TLS area for a
8995 // module. For the small address model, the default TLS size is 16MiB and the
8996 // maximum TLS size is 4GiB.
8997 // FIXME: add tiny and large code model support for TLS access models other
8998 // than local exec. We currently generate the same code as small for tiny,
8999 // which may be larger than needed.
9000
9001 SDValue TPOff;
9002 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9003 SDLoc DL(Op);
9004 const GlobalValue *GV = GA->getGlobal();
9005
9006 SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
9007
9008 if (Model == TLSModel::LocalExec) {
9009 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9010 } else if (Model == TLSModel::InitialExec) {
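 // Initial-exec: the variable's offset from the thread pointer is loaded
 // from the GOT.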
9011 TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9012 TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff);
9013 } else if (Model == TLSModel::LocalDynamic) {
9014 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9015 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9016 // the beginning of the module's TLS region, followed by a DTPREL offset
9017 // calculation.
9018
9019 // These accesses will need deduplicating if there's more than one.
9020 AArch64FunctionInfo *MFI =
9021 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9022 MFI->incNumLocalDynamicTLSAccesses();
9023
9024 // The call needs a relocation too for linker relaxation. It doesn't make
9025 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9026 // the address.
9027 SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_", VT: PtrVT,
9028 TargetFlags: AArch64II::MO_TLS);
9029
9030 // Now we can calculate the offset from TPIDR_EL0 to this module's
9031 // thread-local area.
9032 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9033
9034 // Now use :dtprel_whatever: operations to calculate this variable's offset
9035 // in its thread-storage area.
9036 SDValue HiVar = DAG.getTargetGlobalAddress(
9037 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9038 SDValue LoVar = DAG.getTargetGlobalAddress(
9039 GV, DL, MVT::i64, 0,
9040 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9041
9042 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9043 DAG.getTargetConstant(0, DL, MVT::i32)),
9044 0);
9045 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9046 DAG.getTargetConstant(0, DL, MVT::i32)),
9047 0);
9048 } else if (Model == TLSModel::GeneralDynamic) {
9049 // The call needs a relocation too for linker relaxation. It doesn't make
9050 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9051 // the address.
9052 SDValue SymAddr =
9053 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9054
9055 // Finally we can make a call to calculate the offset from tpidr_el0.
9056 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9057 } else
9058 llvm_unreachable("Unsupported ELF TLS access model");
9059
9060 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
9061}
9062
9063SDValue
9064AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9065 SelectionDAG &DAG) const {
9066 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9067
9068 SDValue Chain = DAG.getEntryNode();
9069 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9070 SDLoc DL(Op);
9071
9072 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9073
9074 // Load the ThreadLocalStoragePointer from the TEB
9075 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9076 SDValue TLSArray =
9077 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL));
9078 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
9079 Chain = TLSArray.getValue(R: 1);
9080
9081 // Load the TLS index from the C runtime.
9082 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9083 // This also does the same as LOADgot, but using a generic i32 load,
9084 // while LOADgot only loads i64.
9085 SDValue TLSIndexHi =
9086 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGE);
9087 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9088 Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9089 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi);
9090 SDValue TLSIndex =
9091 DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo);
9092 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9093 Chain = TLSIndex.getValue(R: 1);
9094
9095 // The pointer to the thread's TLS data area is found by scaling the TLS
9096 // index by the pointer size (8 bytes) and adding it to the TLS array base.
9097 TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex);
9098 SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex,
9099 N2: DAG.getConstant(Val: 3, DL, VT: PtrVT));
9100 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
9101 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
9102 PtrInfo: MachinePointerInfo());
9103 Chain = TLS.getValue(R: 1);
9104
9105 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9106 const GlobalValue *GV = GA->getGlobal();
9107 SDValue TGAHi = DAG.getTargetGlobalAddress(
9108 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
9109 SDValue TGALo = DAG.getTargetGlobalAddress(
9110 GV, DL, VT: PtrVT, offset: 0,
9111 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9112
9113 // Add the offset from the start of the .tls section (section base).
9114 SDValue Addr =
9115 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9116 DAG.getTargetConstant(0, DL, MVT::i32)),
9117 0);
9118 Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo);
9119 return Addr;
9120}
9121
9122SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9123 SelectionDAG &DAG) const {
9124 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
9125 if (DAG.getTarget().useEmulatedTLS())
9126 return LowerToTLSEmulatedModel(GA, DAG);
9127
9128 if (Subtarget->isTargetDarwin())
9129 return LowerDarwinGlobalTLSAddress(Op, DAG);
9130 if (Subtarget->isTargetELF())
9131 return LowerELFGlobalTLSAddress(Op, DAG);
9132 if (Subtarget->isTargetWindows())
9133 return LowerWindowsGlobalTLSAddress(Op, DAG);
9134
9135 llvm_unreachable("Unexpected platform trying to use TLS");
9136}
9137
9138// Looks through \param Val to determine the bit that can be used to
9139// check the sign of the value. It returns the unextended value and
9140// the sign bit position.
9141std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9142 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9143 return {Val.getOperand(i: 0),
9144 cast<VTSDNode>(Val: Val.getOperand(i: 1))->getVT().getFixedSizeInBits() -
9145 1};
9146
9147 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9148 return {Val.getOperand(i: 0),
9149 Val.getOperand(i: 0)->getValueType(ResNo: 0).getFixedSizeInBits() - 1};
9150
9151 return {Val, Val.getValueSizeInBits() - 1};
9152}
9153
9154SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9155 SDValue Chain = Op.getOperand(i: 0);
9156 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
9157 SDValue LHS = Op.getOperand(i: 2);
9158 SDValue RHS = Op.getOperand(i: 3);
9159 SDValue Dest = Op.getOperand(i: 4);
9160 SDLoc dl(Op);
9161
9162 MachineFunction &MF = DAG.getMachineFunction();
9163 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9164 // will not be produced, as they are conditional branch instructions that do
9165 // not set flags.
9166 bool ProduceNonFlagSettingCondBr =
9167 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9168
9169 // Handle f128 first, since lowering it will result in comparing the return
9170 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9171 // is expecting to deal with.
9172 if (LHS.getValueType() == MVT::f128) {
9173 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9174
9175 // If softenSetCCOperands returned a scalar, we need to compare the result
9176 // against zero to select between true and false values.
9177 if (!RHS.getNode()) {
9178 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
9179 CC = ISD::SETNE;
9180 }
9181 }
9182
9183 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9184 // instruction.
9185 if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) &&
9186 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9187 // Only lower legal XALUO ops.
9188 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
9189 return SDValue();
9190
9191 // The actual operation with overflow check.
9192 AArch64CC::CondCode OFCC;
9193 SDValue Value, Overflow;
9194 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG);
9195
9196 if (CC == ISD::SETNE)
9197 OFCC = getInvertedCondCode(Code: OFCC);
9198 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9199
9200 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9201 Overflow);
9202 }
9203
9204 if (LHS.getValueType().isInteger()) {
9205 assert((LHS.getValueType() == RHS.getValueType()) &&
9206 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9207
9208 // If the RHS of the comparison is zero, we can potentially fold this
9209 // to a specialized branch.
9210 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
9211 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9212 if (CC == ISD::SETEQ) {
9213 // See if we can use a TBZ to fold in an AND as well.
9214 // TBZ has a smaller branch displacement than CBZ. If the offset is
9215 // out of bounds, a late MI-layer pass rewrites branches.
9216 // 403.gcc is an example that hits this case.
9217 if (LHS.getOpcode() == ISD::AND &&
9218 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9219 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9220 SDValue Test = LHS.getOperand(i: 0);
9221 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9222 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9223 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9224 Dest);
9225 }
9226
9227 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9228 } else if (CC == ISD::SETNE) {
9229 // See if we can use a TBZ to fold in an AND as well.
9230 // TBZ has a smaller branch displacement than CBZ. If the offset is
9231 // out of bounds, a late MI-layer pass rewrites branches.
9232 // 403.gcc is an example that hits this case.
9233 if (LHS.getOpcode() == ISD::AND &&
9234 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
9235 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
9236 SDValue Test = LHS.getOperand(i: 0);
9237 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
9238 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9239 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9240 Dest);
9241 }
9242
9243 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9244 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9245 // Don't combine AND since emitComparison converts the AND to an ANDS
9246 // (a.k.a. TST) and the test in the test bit and branch instruction
9247 // becomes redundant. This would also increase register pressure.
9248 uint64_t SignBitPos;
9249 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9250 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9251 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9252 }
9253 }
9254 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9255 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9256 // Don't combine AND since emitComparison converts the AND to an ANDS
9257 // (a.k.a. TST) and the test in the test bit and branch instruction
9258 // becomes redundant. This would also increase register pressure.
9259 uint64_t SignBitPos;
9260 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
9261 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9262 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9263 }
9264
9265 SDValue CCVal;
9266 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
9267 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9268 Cmp);
9269 }
9270
9271 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9272 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9273
9274 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9275 // clean. Some of them require two branches to implement.
9276 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9277 AArch64CC::CondCode CC1, CC2;
9278 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9279 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9280 SDValue BR1 =
9281 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9282 if (CC2 != AArch64CC::AL) {
9283 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9284 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9285 Cmp);
9286 }
9287
9288 return BR1;
9289}
9290
9291SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9292 SelectionDAG &DAG) const {
9293 if (!Subtarget->hasNEON())
9294 return SDValue();
9295
9296 EVT VT = Op.getValueType();
9297 EVT IntVT = VT.changeTypeToInteger();
9298 SDLoc DL(Op);
9299
9300 SDValue In1 = Op.getOperand(i: 0);
9301 SDValue In2 = Op.getOperand(i: 1);
9302 EVT SrcVT = In2.getValueType();
9303
9304 if (!SrcVT.bitsEq(VT))
9305 In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT);
9306
9307 if (VT.isScalableVector())
9308 IntVT =
9309 getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger());
9310
9311 if (VT.isFixedLengthVector() &&
9312 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
9313 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9314
9315 In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1);
9316 In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2);
9317
9318 SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2);
9319 return convertFromScalableVector(DAG, VT, V: Res);
9320 }
9321
9322 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9323 if (VT.isScalableVector())
9324 return getSVESafeBitCast(VT, Op, DAG);
9325
9326 return DAG.getBitcast(VT, V: Op);
9327 };
9328
9329 SDValue VecVal1, VecVal2;
9330 EVT VecVT;
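 // For scalar inputs the value is inserted into the right subregister of an
 // undef vector so the BSP (bitwise select) below operates on a full SIMD
 // register; vector inputs are simply bitcast to the integer vector type.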
9331 auto SetVecVal = [&](int Idx = -1) {
9332 if (!VT.isVector()) {
9333 VecVal1 =
9334 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1);
9335 VecVal2 =
9336 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2);
9337 } else {
9338 VecVal1 = BitCast(VecVT, In1, DAG);
9339 VecVal2 = BitCast(VecVT, In2, DAG);
9340 }
9341 };
9342 if (VT.isVector()) {
9343 VecVT = IntVT;
9344 SetVecVal();
9345 } else if (VT == MVT::f64) {
9346 VecVT = MVT::v2i64;
9347 SetVecVal(AArch64::dsub);
9348 } else if (VT == MVT::f32) {
9349 VecVT = MVT::v4i32;
9350 SetVecVal(AArch64::ssub);
9351 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9352 VecVT = MVT::v8i16;
9353 SetVecVal(AArch64::hsub);
9354 } else {
9355 llvm_unreachable("Invalid type for copysign!");
9356 }
9357
9358 unsigned BitWidth = In1.getScalarValueSizeInBits();
9359 SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT);
9360
9361 // We want to materialize a mask with every bit but the high bit set, but the
9362 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9363 // 64-bit elements. Instead, materialize all bits set and then negate that.
9364 if (VT == MVT::f64 || VT == MVT::v2f64) {
9365 SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT);
9366 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9367 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9368 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9369 }
9370
9371 SDValue BSP =
9372 DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2);
9373 if (VT == MVT::f16 || VT == MVT::bf16)
9374 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9375 if (VT == MVT::f32)
9376 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9377 if (VT == MVT::f64)
9378 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9379
9380 return BitCast(VT, BSP, DAG);
9381}
9382
9383SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9384 SelectionDAG &DAG) const {
9385 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9386 Attribute::NoImplicitFloat))
9387 return SDValue();
9388
9389 if (!Subtarget->hasNEON())
9390 return SDValue();
9391
9392 bool IsParity = Op.getOpcode() == ISD::PARITY;
9393 SDValue Val = Op.getOperand(i: 0);
9394 SDLoc DL(Op);
9395 EVT VT = Op.getValueType();
9396
9397 // For i32, computing parity with a short sequence of EORs is more efficient
9398 // than going through the floating-point/SIMD registers.
9399 if (VT == MVT::i32 && IsParity)
9400 return SDValue();
9401
9402 // If there is no CNT instruction available, GPR popcount can
9403 // be more efficiently lowered to the following sequence that uses
9404 // AdvSIMD registers/instructions as long as the copies to/from
9405 // the AdvSIMD registers are cheap.
9406 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9407 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9408 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9409 // UMOV X0, V0.B[0] // copy byte result back to integer reg
9410 if (VT == MVT::i32 || VT == MVT::i64) {
9411 if (VT == MVT::i32)
9412 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9413 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9414
9415 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9416 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9417 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9418 DAG.getConstant(0, DL, MVT::i64));
9419
9420 if (IsParity)
9421 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9422 DAG.getConstant(1, DL, MVT::i32));
9423
9424 if (VT == MVT::i64)
9425 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9426 return UaddLV;
9427 } else if (VT == MVT::i128) {
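 // For i128, pop-count all sixteen bytes and sum them with UADDLV, mirroring
 // the i32/i64 path above.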
9428 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9429
9430 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9431 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9432 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9433 DAG.getConstant(0, DL, MVT::i64));
9434
9435 if (IsParity)
9436 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9437 DAG.getConstant(1, DL, MVT::i32));
9438
9439 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9440 }
9441
9442 assert(!IsParity && "ISD::PARITY of vector types not supported");
9443
9444 if (VT.isScalableVector() ||
9445 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
9446 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU);
9447
9448 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9449 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9450 "Unexpected type for custom ctpop lowering");
9451
9452 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9453 Val = DAG.getBitcast(VT: VT8Bit, V: Val);
9454 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val);
9455
9456 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
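// For example (a sketch of the expected lowering for v4i32; exact registers
// may differ), the v16i8 CTPOP result is widened twice:
//   uaddlp v0.8h, v0.16b   // v16i8 -> v8i16 pairwise sums
//   uaddlp v0.4s, v0.8h    // v8i16 -> v4i32 pairwise sums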
9457 unsigned EltSize = 8;
9458 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9459 while (EltSize != VT.getScalarSizeInBits()) {
9460 EltSize *= 2;
9461 NumElts /= 2;
9462 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
9463 Val = DAG.getNode(
9464 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9465 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9466 }
9467
9468 return Val;
9469}
9470
9471SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9472 EVT VT = Op.getValueType();
9473 assert(VT.isScalableVector() ||
9474 useSVEForFixedLengthVectorVT(
9475 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9476
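// There is no dedicated CTTZ instruction for these types, but
// cttz(x) == ctlz(bitreverse(x)), and both BITREVERSE (RBIT) and CTLZ (CLZ)
// have predicated SVE forms, so lower via that identity.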
9477 SDLoc DL(Op);
9478 SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0));
9479 return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT);
9480}
9481
9482SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9483 SelectionDAG &DAG) const {
9484
9485 EVT VT = Op.getValueType();
9486 SDLoc DL(Op);
9487 unsigned Opcode = Op.getOpcode();
9488 ISD::CondCode CC;
9489 switch (Opcode) {
9490 default:
9491 llvm_unreachable("Wrong instruction");
9492 case ISD::SMAX:
9493 CC = ISD::SETGT;
9494 break;
9495 case ISD::SMIN:
9496 CC = ISD::SETLT;
9497 break;
9498 case ISD::UMAX:
9499 CC = ISD::SETUGT;
9500 break;
9501 case ISD::UMIN:
9502 CC = ISD::SETULT;
9503 break;
9504 }
9505
9506 if (VT.isScalableVector() ||
9507 useSVEForFixedLengthVectorVT(
9508 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9509 switch (Opcode) {
9510 default:
9511 llvm_unreachable("Wrong instruction");
9512 case ISD::SMAX:
9513 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED);
9514 case ISD::SMIN:
9515 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED);
9516 case ISD::UMAX:
9517 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED);
9518 case ISD::UMIN:
9519 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED);
9520 }
9521 }
9522
9523 SDValue Op0 = Op.getOperand(i: 0);
9524 SDValue Op1 = Op.getOperand(i: 1);
9525 SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC);
9526 return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1);
9527}
9528
9529SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9530 SelectionDAG &DAG) const {
9531 EVT VT = Op.getValueType();
9532
9533 if (VT.isScalableVector() ||
9534 useSVEForFixedLengthVectorVT(
9535 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9536 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9537
9538 SDLoc DL(Op);
9539 SDValue REVB;
9540 MVT VST;
9541
9542 switch (VT.getSimpleVT().SimpleTy) {
9543 default:
9544 llvm_unreachable("Invalid type for bitreverse!");
9545
9546 case MVT::v2i32: {
9547 VST = MVT::v8i8;
9548 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
9549
9550 break;
9551 }
9552
9553 case MVT::v4i32: {
9554 VST = MVT::v16i8;
9555 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
9556
9557 break;
9558 }
9559
9560 case MVT::v1i64: {
9561 VST = MVT::v8i8;
9562 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
9563
9564 break;
9565 }
9566
9567 case MVT::v2i64: {
9568 VST = MVT::v16i8;
9569 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
9570
9571 break;
9572 }
9573 }
9574
9575 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
9576 Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB));
9577}
9578
9579 // Check whether N forms a continuous comparison sequence (an OR-tree of XORs).
9580static bool
9581isOrXorChain(SDValue N, unsigned &Num,
9582 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9583 if (Num == MaxXors)
9584 return false;
9585
9586 // Skip the one-use zext
9587 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9588 N = N->getOperand(Num: 0);
9589
9590 // The leaf node must be XOR
9591 if (N->getOpcode() == ISD::XOR) {
9592 WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1)));
9593 Num++;
9594 return true;
9595 }
9596
9597 // All the non-leaf nodes must be OR.
9598 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9599 return false;
9600
9601 if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) &&
9602 isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList))
9603 return true;
9604 return false;
9605}
9606
9607 // Transform chains of ORs and XORs, which usually come from memcmp/bcmp expansion.
9608static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9609 SDValue LHS = N->getOperand(Num: 0);
9610 SDValue RHS = N->getOperand(Num: 1);
9611 SDLoc DL(N);
9612 EVT VT = N->getValueType(ResNo: 0);
9613 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9614
9615 // Only handle integer compares.
9616 if (N->getOpcode() != ISD::SETCC)
9617 return SDValue();
9618
9619 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
9620 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9621 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
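// For example, a 16-byte memcmp()==0 is typically expanded to roughly
//   (setcc (or (xor a0, b0), (xor a1, b1)), 0, seteq)
// which we rewrite below as
//   (and (setcc a0, b0, seteq), (setcc a1, b1, seteq))
// so that later combines can turn it into a cmp/ccmp/cset sequence.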
9622 unsigned NumXors = 0;
9623 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) &&
9624 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9625 isOrXorChain(N: LHS, Num&: NumXors, WorkList)) {
9626 SDValue XOR0, XOR1;
9627 std::tie(args&: XOR0, args&: XOR1) = WorkList[0];
9628 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9629 SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
9630 for (unsigned I = 1; I < WorkList.size(); I++) {
9631 std::tie(args&: XOR0, args&: XOR1) = WorkList[I];
9632 SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
9633 Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain);
9634 }
9635
9636 // Exit early by inverting the condition, which helps reduce indentation.
9637 return Cmp;
9638 }
9639
9640 return SDValue();
9641}
9642
9643SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9644
9645 if (Op.getValueType().isVector())
9646 return LowerVSETCC(Op, DAG);
9647
9648 bool IsStrict = Op->isStrictFPOpcode();
9649 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9650 unsigned OpNo = IsStrict ? 1 : 0;
9651 SDValue Chain;
9652 if (IsStrict)
9653 Chain = Op.getOperand(i: 0);
9654 SDValue LHS = Op.getOperand(i: OpNo + 0);
9655 SDValue RHS = Op.getOperand(i: OpNo + 1);
9656 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get();
9657 SDLoc dl(Op);
9658
9659 // We chose ZeroOrOneBooleanContents, so use zero and one.
9660 EVT VT = Op.getValueType();
9661 SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT);
9662 SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT);
9663
9664 // Handle f128 first, since one possible outcome is a normal integer
9665 // comparison which gets picked up by the next if statement.
9666 if (LHS.getValueType() == MVT::f128) {
9667 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9668 IsSignaling);
9669
9670 // If softenSetCCOperands returned a scalar, use it.
9671 if (!RHS.getNode()) {
9672 assert(LHS.getValueType() == Op.getValueType() &&
9673 "Unexpected setcc expansion!");
9674 return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl) : LHS;
9675 }
9676 }
9677
9678 if (LHS.getValueType().isInteger()) {
9679 SDValue CCVal;
9680 SDValue Cmp = getAArch64Cmp(
9681 LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, dl);
9682
9683 // Note that we inverted the condition above, so we reverse the order of
9684 // the true and false operands here. This will allow the setcc to be
9685 // matched to a single CSINC instruction.
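// For example, (setcc eq, x, y) on i32 becomes roughly:
//   cmp  w_x, w_y
//   cset w0, eq        // alias of csinc w0, wzr, wzr, ne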
9686 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp);
9687 return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl) : Res;
9688 }
9689
9690 // Now we know we're dealing with FP values.
9691 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9692 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9693
9694 // We'll need to perform an FCMP followed by a CSEL sequence. Go ahead
9695 // and do the comparison.
9696 SDValue Cmp;
9697 if (IsStrict)
9698 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9699 else
9700 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9701
9702 AArch64CC::CondCode CC1, CC2;
9703 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9704 SDValue Res;
9705 if (CC2 == AArch64CC::AL) {
9706 changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1,
9707 CondCode2&: CC2);
9708 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9709
9710 // Note that we inverted the condition above, so we reverse the order of
9711 // the true and false operands here. This will allow the setcc to be
9712 // matched to a single CSINC instruction.
9713 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp);
9714 } else {
9715 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9716 // totally clean. Some of them require two CSELs to implement. As in this
9717 // case, we emit the first CSEL and then emit a second using the output
9718 // of the first as the RHS. We're effectively OR'ing the two CC's together.
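// For example, SETUEQ maps to EQ with a second condition of VS, giving
//   CSEL(TVal, CSEL(TVal, FVal, EQ), VS)
// i.e. TVal if the operands compared equal or unordered, FVal otherwise.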
9719
9720 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9721 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9722 SDValue CS1 =
9723 DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
9724
9725 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9726 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
9727 }
9728 return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl) : Res;
9729}
9730
9731SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9732 SelectionDAG &DAG) const {
9733
9734 SDValue LHS = Op.getOperand(i: 0);
9735 SDValue RHS = Op.getOperand(i: 1);
9736 EVT VT = LHS.getValueType();
9737 if (VT != MVT::i32 && VT != MVT::i64)
9738 return SDValue();
9739
9740 SDLoc DL(Op);
9741 SDValue Carry = Op.getOperand(i: 2);
9742 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9743 SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true);
9744 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9745 LHS, RHS, InvCarry);
9746
9747 EVT OpVT = Op.getValueType();
9748 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT);
9749 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT);
9750
9751 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
9752 ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT);
9753 SDValue CCVal =
9754 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9755 // Inputs are swapped because the condition is inverted. This will allow
9756 // matching with a single CSINC instruction.
9757 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal,
9758 N4: Cmp.getValue(R: 1));
9759}
9760
9761SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9762 SDValue RHS, SDValue TVal,
9763 SDValue FVal, const SDLoc &dl,
9764 SelectionDAG &DAG) const {
9765 // Handle f128 first, because it will result in a comparison of some RTLIB
9766 // call result against zero.
9767 if (LHS.getValueType() == MVT::f128) {
9768 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9769
9770 // If softenSetCCOperands returned a scalar, we need to compare the result
9771 // against zero to select between true and false values.
9772 if (!RHS.getNode()) {
9773 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
9774 CC = ISD::SETNE;
9775 }
9776 }
9777
9778 // Also handle f16, for which we need to do a f32 comparison.
9779 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9780 LHS.getValueType() == MVT::bf16) {
9781 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9782 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9783 }
9784
9785 // Next, handle integers.
9786 if (LHS.getValueType().isInteger()) {
9787 assert((LHS.getValueType() == RHS.getValueType()) &&
9788 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9789
9790 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
9791 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
9792 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
9793 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9794 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9795 // supported types.
9796 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9797 CTVal->isOne() && CFVal->isAllOnes() &&
9798 LHS.getValueType() == TVal.getValueType()) {
9799 EVT VT = LHS.getValueType();
9800 SDValue Shift =
9801 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
9802 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
9803 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL: dl, VT));
9804 }
9805
9806 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9807 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9808 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9809 // Both require fewer instructions than a compare and conditional select.
9810 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9811 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9812 LHS.getValueType() == RHS.getValueType()) {
9813 EVT VT = LHS.getValueType();
9814 SDValue Shift =
9815 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS,
9816 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT));
9817
9818 if (CC == ISD::SETGT)
9819 Shift = DAG.getNOT(DL: dl, Val: Shift, VT);
9820
9821 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: LHS, N2: Shift);
9822 }
9823
9824 unsigned Opcode = AArch64ISD::CSEL;
9825
9826 // If both the TVal and the FVal are constants, see if we can swap them in
9827 // order to form a CSINV or CSINC out of them.
9828 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9829 std::swap(a&: TVal, b&: FVal);
9830 std::swap(a&: CTVal, b&: CFVal);
9831 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9832 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9833 std::swap(a&: TVal, b&: FVal);
9834 std::swap(a&: CTVal, b&: CFVal);
9835 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9836 } else if (TVal.getOpcode() == ISD::XOR) {
9837 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9838 // with a CSINV rather than a CSEL.
9839 if (isAllOnesConstant(V: TVal.getOperand(i: 1))) {
9840 std::swap(a&: TVal, b&: FVal);
9841 std::swap(a&: CTVal, b&: CFVal);
9842 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9843 }
9844 } else if (TVal.getOpcode() == ISD::SUB) {
9845 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9846 // that we can match with a CSNEG rather than a CSEL.
9847 if (isNullConstant(V: TVal.getOperand(i: 0))) {
9848 std::swap(a&: TVal, b&: FVal);
9849 std::swap(a&: CTVal, b&: CFVal);
9850 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9851 }
9852 } else if (CTVal && CFVal) {
9853 const int64_t TrueVal = CTVal->getSExtValue();
9854 const int64_t FalseVal = CFVal->getSExtValue();
9855 bool Swap = false;
9856
9857 // If both TVal and FVal are constants, see if FVal is the
9858 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9859 // instead of a CSEL in that case.
9860 if (TrueVal == ~FalseVal) {
9861 Opcode = AArch64ISD::CSINV;
9862 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9863 TrueVal == -FalseVal) {
9864 Opcode = AArch64ISD::CSNEG;
9865 } else if (TVal.getValueType() == MVT::i32) {
9866 // If our operands are only 32-bit wide, make sure we use 32-bit
9867 // arithmetic when checking whether we can use CSINC. This ensures that
9868 // the addition in the check will wrap around properly in case there is
9869 // an overflow (which would not be the case if we do the check with
9870 // 64-bit arithmetic).
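// For example, with TVal == INT32_MIN and FVal == INT32_MAX, FalseVal32 + 1
// wraps around to TrueVal32 and a CSINC can be used, whereas the same check
// on the sign-extended 64-bit values would miss the case.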
9871 const uint32_t TrueVal32 = CTVal->getZExtValue();
9872 const uint32_t FalseVal32 = CFVal->getZExtValue();
9873
9874 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9875 Opcode = AArch64ISD::CSINC;
9876
9877 if (TrueVal32 > FalseVal32) {
9878 Swap = true;
9879 }
9880 }
9881 } else {
9882 // 64-bit check whether we can use CSINC.
9883 const uint64_t TrueVal64 = TrueVal;
9884 const uint64_t FalseVal64 = FalseVal;
9885
9886 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
9887 Opcode = AArch64ISD::CSINC;
9888
9889 if (TrueVal > FalseVal) {
9890 Swap = true;
9891 }
9892 }
9893 }
9894
9895 // Swap TVal and FVal if necessary.
9896 if (Swap) {
9897 std::swap(a&: TVal, b&: FVal);
9898 std::swap(a&: CTVal, b&: CFVal);
9899 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
9900 }
9901
9902 if (Opcode != AArch64ISD::CSEL) {
9903 // Drop FVal since we can get its value by simply inverting/negating
9904 // TVal.
9905 FVal = TVal;
9906 }
9907 }
9908
9909 // Avoid materializing a constant when possible by reusing a known value in
9910 // a register. However, don't perform this optimization if the known value
9911 // is one, zero or negative one in the case of a CSEL. We can always
9912 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
9913 // FVal, respectively.
9914 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS);
9915 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
9916 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
9917 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9918 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
9919 // "a != C ? x : a" to avoid materializing C.
9920 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
9921 TVal = LHS;
9922 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
9923 FVal = LHS;
9924 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
9925 assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
9926 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
9927 // avoid materializing C.
9928 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
9929 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
9930 Opcode = AArch64ISD::CSINV;
9931 TVal = LHS;
9932 FVal = DAG.getConstant(Val: 0, DL: dl, VT: FVal.getValueType());
9933 }
9934 }
9935
9936 SDValue CCVal;
9937 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl);
9938 EVT VT = TVal.getValueType();
9939 return DAG.getNode(Opcode, DL: dl, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp);
9940 }
9941
9942 // Now we know we're dealing with FP values.
9943 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
9944 LHS.getValueType() == MVT::f64);
9945 assert(LHS.getValueType() == RHS.getValueType());
9946 EVT VT = TVal.getValueType();
9947 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9948
9949 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9950 // clean. Some of them require two CSELs to implement.
9951 AArch64CC::CondCode CC1, CC2;
9952 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
9953
9954 if (DAG.getTarget().Options.UnsafeFPMath) {
9955 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
9956 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
9957 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS);
9958 if (RHSVal && RHSVal->isZero()) {
9959 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal);
9960 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal);
9961
9962 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
9963 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
9964 TVal = LHS;
9965 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
9966 CFVal && CFVal->isZero() &&
9967 FVal.getValueType() == LHS.getValueType())
9968 FVal = LHS;
9969 }
9970 }
9971
9972 // Emit first, and possibly only, CSEL.
9973 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9974 SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
9975
9976 // If we need a second CSEL, emit it, using the output of the first as the
9977 // RHS. We're effectively OR'ing the two CC's together.
9978 if (CC2 != AArch64CC::AL) {
9979 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9980 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
9981 }
9982
9983 // Otherwise, return the output of the first CSEL.
9984 return CS1;
9985}
9986
9987SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
9988 SelectionDAG &DAG) const {
9989 EVT Ty = Op.getValueType();
9990 auto Idx = Op.getConstantOperandAPInt(i: 2);
9991 int64_t IdxVal = Idx.getSExtValue();
9992 assert(Ty.isScalableVector() &&
9993 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
9994
9995 // We can use the splice instruction for certain index values where we are
9996 // able to efficiently generate the correct predicate. The index will be
9997 // inverted and used directly as the input to the ptrue instruction, i.e.
9998 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
9999 // splice predicate. However, we can only do this if we can guarantee that
10000 // there are enough elements in the vector, hence we check the index <= min
10001 // number of elements.
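// For example, for a splice of the last two elements (IdxVal == -2) of an
// nxv4i32 pair, the expected sequence is roughly:
//   ptrue  p0.s, vl2
//   rev    p0.s, p0.s
//   splice z0.s, p0, z0.s, z1.s
// (a sketch; the element suffix depends on the element type).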
10002 std::optional<unsigned> PredPattern;
10003 if (Ty.isScalableVector() && IdxVal < 0 &&
10004 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10005 std::nullopt) {
10006 SDLoc DL(Op);
10007
10008 // Create a predicate where all but the last -IdxVal elements are false.
10009 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10010 SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern);
10011 Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred);
10012
10013 // Now splice the two inputs together using the predicate.
10014 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0),
10015 N3: Op.getOperand(i: 1));
10016 }
10017
10018 // This will select to an EXT instruction, which has a maximum immediate
10019 // value of 255, hence 2048 bits is the maximum value we can lower.
10020 if (IdxVal >= 0 &&
10021 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
10022 return Op;
10023
10024 return SDValue();
10025}
10026
10027SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10028 SelectionDAG &DAG) const {
10029 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
10030 SDValue LHS = Op.getOperand(i: 0);
10031 SDValue RHS = Op.getOperand(i: 1);
10032 SDValue TVal = Op.getOperand(i: 2);
10033 SDValue FVal = Op.getOperand(i: 3);
10034 SDLoc DL(Op);
10035 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10036}
10037
10038SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10039 SelectionDAG &DAG) const {
10040 SDValue CCVal = Op->getOperand(Num: 0);
10041 SDValue TVal = Op->getOperand(Num: 1);
10042 SDValue FVal = Op->getOperand(Num: 2);
10043 SDLoc DL(Op);
10044
10045 EVT Ty = Op.getValueType();
10046 if (Ty == MVT::aarch64svcount) {
10047 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10048 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10049 SDValue Sel =
10050 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10051 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel);
10052 }
10053
10054 if (Ty.isScalableVector()) {
10055 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10056 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal);
10057 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10058 }
10059
10060 if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) {
10061 // FIXME: Ideally this would be the same as above using i1 types, however
10062 // for the moment we can't deal with fixed i1 vector types properly, so
10063 // instead extend the predicate to a result type sized integer vector.
10064 MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits());
10065 MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount());
10066 SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT);
10067 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal);
10068 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
10069 }
10070
10071 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10072 // instruction.
10073 if (ISD::isOverflowIntrOpRes(Op: CCVal)) {
10074 // Only lower legal XALUO ops.
10075 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0)))
10076 return SDValue();
10077
10078 AArch64CC::CondCode OFCC;
10079 SDValue Value, Overflow;
10080 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG);
10081 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10082
10083 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
10084 N3: CCVal, N4: Overflow);
10085 }
10086
10087 // Lower it the same way as we would lower a SELECT_CC node.
10088 ISD::CondCode CC;
10089 SDValue LHS, RHS;
10090 if (CCVal.getOpcode() == ISD::SETCC) {
10091 LHS = CCVal.getOperand(i: 0);
10092 RHS = CCVal.getOperand(i: 1);
10093 CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get();
10094 } else {
10095 LHS = CCVal;
10096 RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType());
10097 CC = ISD::SETNE;
10098 }
10099
10100 // If we are lowering an f16/bf16 and do not have full fp16 support, convert
10101 // to an f32 in order to use FCSELSrrr.
10102 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10103 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10104 DAG.getUNDEF(MVT::f32), TVal);
10105 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10106 DAG.getUNDEF(MVT::f32), FVal);
10107 }
10108
10109 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG);
10110
10111 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10112 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10113 }
10114
10115 return Res;
10116}
10117
10118SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10119 SelectionDAG &DAG) const {
10120 // Jump table entries are PC-relative offsets. No additional tweaking is
10121 // necessary here; just get the address of the jump table.
10122 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
10123
10124 CodeModel::Model CM = getTargetMachine().getCodeModel();
10125 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10126 !Subtarget->isTargetMachO())
10127 return getAddrLarge(N: JT, DAG);
10128 if (CM == CodeModel::Tiny)
10129 return getAddrTiny(N: JT, DAG);
10130 return getAddr(N: JT, DAG);
10131}
10132
10133SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10134 SelectionDAG &DAG) const {
10135 // Jump table entries are PC-relative offsets. No additional tweaking is
10136 // necessary here; just get the address of the jump table.
10137 SDLoc DL(Op);
10138 SDValue JT = Op.getOperand(i: 1);
10139 SDValue Entry = Op.getOperand(i: 2);
10140 int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex();
10141
10142 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10143 AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
10144
10145 SDNode *Dest =
10146 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10147 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10148 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL);
10149 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10150}
10151
10152SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10153 SelectionDAG &DAG) const {
10154 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
10155 CodeModel::Model CM = getTargetMachine().getCodeModel();
10156 if (CM == CodeModel::Large) {
10157 // Use the GOT for the large code model on MachO targets (e.g. iOS).
10158 if (Subtarget->isTargetMachO()) {
10159 return getGOT(N: CP, DAG);
10160 }
10161 if (!getTargetMachine().isPositionIndependent())
10162 return getAddrLarge(N: CP, DAG);
10163 } else if (CM == CodeModel::Tiny) {
10164 return getAddrTiny(N: CP, DAG);
10165 }
10166 return getAddr(N: CP, DAG);
10167}
10168
10169SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10170 SelectionDAG &DAG) const {
10171 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Val&: Op);
10172 CodeModel::Model CM = getTargetMachine().getCodeModel();
10173 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10174 if (!getTargetMachine().isPositionIndependent())
10175 return getAddrLarge(N: BA, DAG);
10176 } else if (CM == CodeModel::Tiny) {
10177 return getAddrTiny(N: BA, DAG);
10178 }
10179 return getAddr(N: BA, DAG);
10180}
10181
10182SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10183 SelectionDAG &DAG) const {
10184 AArch64FunctionInfo *FuncInfo =
10185 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10186
10187 SDLoc DL(Op);
10188 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(),
10189 VT: getPointerTy(DL: DAG.getDataLayout()));
10190 FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout()));
10191 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10192 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10193 PtrInfo: MachinePointerInfo(SV));
10194}
10195
10196SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10197 SelectionDAG &DAG) const {
10198 MachineFunction &MF = DAG.getMachineFunction();
10199 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10200
10201 SDLoc DL(Op);
10202 SDValue FR;
10203 if (Subtarget->isWindowsArm64EC()) {
10204 // With the Arm64EC ABI, we compute the address of the varargs save area
10205 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10206 // but calls from an entry thunk can pass in a different address.
10207 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10208 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10209 uint64_t StackOffset;
10210 if (FuncInfo->getVarArgsGPRSize() > 0)
10211 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10212 else
10213 StackOffset = FuncInfo->getVarArgsStackOffset();
10214 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10215 DAG.getConstant(StackOffset, DL, MVT::i64));
10216 } else {
10217 FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0
10218 ? FuncInfo->getVarArgsGPRIndex()
10219 : FuncInfo->getVarArgsStackIndex(),
10220 VT: getPointerTy(DL: DAG.getDataLayout()));
10221 }
10222 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10223 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
10224 PtrInfo: MachinePointerInfo(SV));
10225}
10226
10227SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10228 SelectionDAG &DAG) const {
10229 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10230 // Standard, section B.3.
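// As a reminder, that layout is roughly equivalent to:
//   struct va_list {
//     void *__stack;   // next stack argument              (offset 0)
//     void *__gr_top;  // end of the GP register save area (offset 8, 4 on ILP32)
//     void *__vr_top;  // end of the FP register save area (offset 16, 8 on ILP32)
//     int   __gr_offs; // negative offset from __gr_top    (offset 24, 12 on ILP32)
//     int   __vr_offs; // negative offset from __vr_top    (offset 28, 16 on ILP32)
//   };
// The stores built below fill in these five fields in that order.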
10231 MachineFunction &MF = DAG.getMachineFunction();
10232 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10233 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10234 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
10235 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
10236 SDLoc DL(Op);
10237
10238 SDValue Chain = Op.getOperand(i: 0);
10239 SDValue VAList = Op.getOperand(i: 1);
10240 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10241 SmallVector<SDValue, 4> MemOps;
10242
10243 // void *__stack at offset 0
10244 unsigned Offset = 0;
10245 SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT);
10246 Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT);
10247 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList,
10248 PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize)));
10249
10250 // void *__gr_top at offset 8 (4 on ILP32)
10251 Offset += PtrSize;
10252 int GPRSize = FuncInfo->getVarArgsGPRSize();
10253 if (GPRSize > 0) {
10254 SDValue GRTop, GRTopAddr;
10255
10256 GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10257 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10258
10259 GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT);
10260 GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop,
10261 N2: DAG.getConstant(Val: GPRSize, DL, VT: PtrVT));
10262 GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT);
10263
10264 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr,
10265 PtrInfo: MachinePointerInfo(SV, Offset),
10266 Alignment: Align(PtrSize)));
10267 }
10268
10269 // void *__vr_top at offset 16 (8 on ILP32)
10270 Offset += PtrSize;
10271 int FPRSize = FuncInfo->getVarArgsFPRSize();
10272 if (FPRSize > 0) {
10273 SDValue VRTop, VRTopAddr;
10274 VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10275 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10276
10277 VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT);
10278 VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop,
10279 N2: DAG.getConstant(Val: FPRSize, DL, VT: PtrVT));
10280 VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT);
10281
10282 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr,
10283 PtrInfo: MachinePointerInfo(SV, Offset),
10284 Alignment: Align(PtrSize)));
10285 }
10286
10287 // int __gr_offs at offset 24 (12 on ILP32)
10288 Offset += PtrSize;
10289 SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10290 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10291 MemOps.push_back(
10292 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10293 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10294
10295 // int __vr_offs at offset 28 (16 on ILP32)
10296 Offset += 4;
10297 SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10298 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
10299 MemOps.push_back(
10300 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10301 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10302
10303 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10304}
10305
10306SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10307 SelectionDAG &DAG) const {
10308 MachineFunction &MF = DAG.getMachineFunction();
10309
10310 if (Subtarget->isCallingConvWin64(CC: MF.getFunction().getCallingConv()))
10311 return LowerWin64_VASTART(Op, DAG);
10312 else if (Subtarget->isTargetDarwin())
10313 return LowerDarwin_VASTART(Op, DAG);
10314 else
10315 return LowerAAPCS_VASTART(Op, DAG);
10316}
10317
10318SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10319 SelectionDAG &DAG) const {
10320 // The AAPCS va_list has three pointers and two ints (= 32 bytes, 20 on
10321 // ILP32); Darwin and Windows use a single pointer.
10322 SDLoc DL(Op);
10323 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10324 unsigned VaListSize =
10325 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10326 ? PtrSize
10327 : Subtarget->isTargetILP32() ? 20 : 32;
10328 const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue();
10329 const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
10330
10331 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10332 DAG.getConstant(VaListSize, DL, MVT::i32),
10333 Align(PtrSize), false, false, false,
10334 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10335}
10336
10337SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10338 assert(Subtarget->isTargetDarwin() &&
10339 "automatic va_arg instruction only works on Darwin");
10340
10341 const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
10342 EVT VT = Op.getValueType();
10343 SDLoc DL(Op);
10344 SDValue Chain = Op.getOperand(i: 0);
10345 SDValue Addr = Op.getOperand(i: 1);
10346 MaybeAlign Align(Op.getConstantOperandVal(i: 3));
10347 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10348 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
10349 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
10350 SDValue VAList =
10351 DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
10352 Chain = VAList.getValue(R: 1);
10353 VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT);
10354
10355 if (VT.isScalableVector())
10356 report_fatal_error(reason: "Passing SVE types to variadic functions is "
10357 "currently not supported");
10358
10359 if (Align && *Align > MinSlotSize) {
10360 VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10361 N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT));
10362 VAList = DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList,
10363 N2: DAG.getConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT));
10364 }
10365
10366 Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext());
10367 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy);
10368
10369 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10370 // up to 64 bits. At the very least, we have to increase the striding of the
10371 // vaargs list to match this, and for FP values we need to introduce
10372 // FP_ROUND nodes as well.
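// For example, an f32 va_arg occupies an 8-byte slot holding an f64 value, so
// below we load the slot as an f64 and FP_ROUND the result back down to f32.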
10373 if (VT.isInteger() && !VT.isVector())
10374 ArgSize = std::max(a: ArgSize, b: MinSlotSize);
10375 bool NeedFPTrunc = false;
10376 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10377 ArgSize = 8;
10378 NeedFPTrunc = true;
10379 }
10380
10381 // Increment the pointer, VAList, to the next vaarg
10382 SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
10383 N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT));
10384 VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT);
10385
10386 // Store the incremented VAList to the legalized pointer
10387 SDValue APStore =
10388 DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
10389
10390 // Load the actual argument out of the pointer VAList
10391 if (NeedFPTrunc) {
10392 // Load the value as an f64.
10393 SDValue WideFP =
10394 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10395 // Round the value down to an f32.
10396 SDValue NarrowFP =
10397 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0),
10398 N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true));
10399 SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) };
10400 // Merge the rounded value with the chain output of the load.
10401 return DAG.getMergeValues(Ops, dl: DL);
10402 }
10403
10404 return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
10405}
10406
10407SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10408 SelectionDAG &DAG) const {
10409 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10410 MFI.setFrameAddressIsTaken(true);
10411
10412 EVT VT = Op.getValueType();
10413 SDLoc DL(Op);
10414 unsigned Depth = Op.getConstantOperandVal(i: 0);
10415 SDValue FrameAddr =
10416 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10417 while (Depth--)
10418 FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
10419 PtrInfo: MachinePointerInfo());
10420
10421 if (Subtarget->isTargetILP32())
10422 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10423 DAG.getValueType(VT));
10424
10425 return FrameAddr;
10426}
10427
10428SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10429 SelectionDAG &DAG) const {
10430 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10431
10432 EVT VT = getPointerTy(DL: DAG.getDataLayout());
10433 SDLoc DL(Op);
10434 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
10435 return DAG.getFrameIndex(FI, VT);
10436}
10437
10438#define GET_REGISTER_MATCHER
10439#include "AArch64GenAsmMatcher.inc"
10440
10441// FIXME? Maybe this could be a TableGen attribute on some registers and
10442// this table could be generated automatically from RegInfo.
10443Register AArch64TargetLowering::
10444getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10445 Register Reg = MatchRegisterName(RegName);
10446 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10447 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10448 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10449 if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) &&
10450 !MRI->isReservedReg(MF, Reg))
10451 Reg = 0;
10452 }
10453 if (Reg)
10454 return Reg;
10455 report_fatal_error(reason: Twine("Invalid register name \""
10456 + StringRef(RegName) + "\"."));
10457}
10458
10459SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10460 SelectionDAG &DAG) const {
10461 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10462
10463 EVT VT = Op.getValueType();
10464 SDLoc DL(Op);
10465
10466 SDValue FrameAddr =
10467 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10468 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
10469
10470 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset);
10471}
10472
10473SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10474 SelectionDAG &DAG) const {
10475 MachineFunction &MF = DAG.getMachineFunction();
10476 MachineFrameInfo &MFI = MF.getFrameInfo();
10477 MFI.setReturnAddressIsTaken(true);
10478
10479 EVT VT = Op.getValueType();
10480 SDLoc DL(Op);
10481 unsigned Depth = Op.getConstantOperandVal(i: 0);
10482 SDValue ReturnAddress;
10483 if (Depth) {
10484 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10485 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
10486 ReturnAddress = DAG.getLoad(
10487 VT, dl: DL, Chain: DAG.getEntryNode(),
10488 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo());
10489 } else {
10490 // Return LR, which contains the return address. Mark it an implicit
10491 // live-in.
10492 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10493 ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
10494 }
10495
10496 // The XPACLRI instruction assembles to a hint-space instruction before
10497 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture. On
10498 // Armv8.3-A and onwards, XPACI is available, so use that instead.
10500 SDNode *St;
10501 if (Subtarget->hasPAuth()) {
10502 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10503 } else {
10504 // XPACLRI operates on LR therefore we must move the operand accordingly.
10505 SDValue Chain =
10506 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10507 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10508 }
10509 return SDValue(St, 0);
10510}
10511
10512 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10513 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
10514SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10515 SelectionDAG &DAG) const {
10516 SDValue Lo, Hi;
10517 expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
10518 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op));
10519}
10520
10521bool AArch64TargetLowering::isOffsetFoldingLegal(
10522 const GlobalAddressSDNode *GA) const {
10523 // Offsets are folded in the DAG combine rather than here so that we can
10524 // intelligently choose an offset based on the uses.
10525 return false;
10526}
10527
10528bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
10529 bool OptForSize) const {
10530 bool IsLegal = false;
10531 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
10532 // and for the 16-bit case when the target has full fp16 support.
10533 // We encode bf16 bit patterns as if they were fp16. This results in very
10534 // strange looking assembly but should populate the register with appropriate
10535 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10536 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10537 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10538 // FIXME: We should be able to handle f128 as well with a clever lowering.
10539 const APInt ImmInt = Imm.bitcastToAPInt();
10540 if (VT == MVT::f64)
10541 IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
10542 else if (VT == MVT::f32)
10543 IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
10544 else if (VT == MVT::f16 || VT == MVT::bf16)
10545 IsLegal =
10546 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) ||
10547 Imm.isPosZero();
10548
10549 // If we cannot materialize the value in the fmov immediate field, check if
10550 // it can be encoded as the immediate operand of a logical instruction.
10551 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10552 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10553 // generate that fmov.
10554 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10555 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10556 // however the mov+fmov sequence is always better because of the reduced
10557 // cache pressure. The timings are still the same if you consider
10558 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10559 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
10560 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10561 AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn);
10562 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10563 IsLegal = Insn.size() <= Limit;
10564 }
10565
10566 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10567 << " imm value: "; Imm.dump(););
10568 return IsLegal;
10569}
10570
10571//===----------------------------------------------------------------------===//
10572// AArch64 Optimization Hooks
10573//===----------------------------------------------------------------------===//
10574
10575static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10576 SDValue Operand, SelectionDAG &DAG,
10577 int &ExtraSteps) {
10578 EVT VT = Operand.getValueType();
10579 if ((ST->hasNEON() &&
10580 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10581 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10582 VT == MVT::v4f32)) ||
10583 (ST->hasSVE() &&
10584 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10585 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10586 // For the reciprocal estimates, convergence is quadratic, so the number
10587 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10588 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10589 // the result for float (23 mantissa bits) is 2 and for double (52
10590 // mantissa bits) is 3.
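// For example, for double: Log2_64_Ceil(53) - Log2_64_Ceil(8) == 6 - 3 == 3
// extra steps, and for float: Log2_64_Ceil(24) - 3 == 2, matching the counts
// above.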
10591 constexpr unsigned AccurateBits = 8;
10592 unsigned DesiredBits =
10593 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT));
10594 ExtraSteps = DesiredBits <= AccurateBits
10595 ? 0
10596 : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits);
10597 }
10598
10599 return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand);
10600 }
10601
10602 return SDValue();
10603}
10604
10605SDValue
10606AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10607 const DenormalMode &Mode) const {
10608 SDLoc DL(Op);
10609 EVT VT = Op.getValueType();
10610 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT);
10611 SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT);
10612 return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ);
10613}
10614
10615SDValue
10616AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10617 SelectionDAG &DAG) const {
10618 return Op;
10619}
10620
10621SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10622 SelectionDAG &DAG, int Enabled,
10623 int &ExtraSteps,
10624 bool &UseOneConst,
10625 bool Reciprocal) const {
10626 if (Enabled == ReciprocalEstimate::Enabled ||
10627 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10628 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand,
10629 DAG, ExtraSteps)) {
10630 SDLoc DL(Operand);
10631 EVT VT = Operand.getValueType();
10632
10633 SDNodeFlags Flags;
10634 Flags.setAllowReassociation(true);
10635
10636 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10637 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
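// FRSQRTS(a, b) computes (3 - a * b) / 2, so each step below forms
// Step = FRSQRTS(X, E * E) == 0.5 * (3 - X * E^2) and then refines the
// estimate as E = E * Step.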
10638 for (int i = ExtraSteps; i > 0; --i) {
10639 SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate,
10640 Flags);
10641 Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags);
10642 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
10643 }
10644 if (!Reciprocal)
10645 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags);
10646
10647 ExtraSteps = 0;
10648 return Estimate;
10649 }
10650
10651 return SDValue();
10652}
10653
10654SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10655 SelectionDAG &DAG, int Enabled,
10656 int &ExtraSteps) const {
10657 if (Enabled == ReciprocalEstimate::Enabled)
10658 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand,
10659 DAG, ExtraSteps)) {
10660 SDLoc DL(Operand);
10661 EVT VT = Operand.getValueType();
10662
10663 SDNodeFlags Flags;
10664 Flags.setAllowReassociation(true);
10665
10666 // Newton reciprocal iteration: E * (2 - X * E)
10667 // AArch64 reciprocal iteration instruction: (2 - M * N)
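// FRECPS(a, b) computes 2 - a * b, so each step below forms
// Step = FRECPS(X, E) == 2 - X * E and refines the estimate as E = E * Step.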
10668 for (int i = ExtraSteps; i > 0; --i) {
10669 SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand,
10670 N2: Estimate, Flags);
10671 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
10672 }
10673
10674 ExtraSteps = 0;
10675 return Estimate;
10676 }
10677
10678 return SDValue();
10679}
10680
10681//===----------------------------------------------------------------------===//
10682// AArch64 Inline Assembly Support
10683//===----------------------------------------------------------------------===//
10684
10685// Table of Constraints
10686// TODO: This is the current set of constraints supported by ARM for the
10687 // compiler; not all of them may make sense.
10688//
10689// r - A general register
10690// w - An FP/SIMD register of some size in the range v0-v31
10691// x - An FP/SIMD register of some size in the range v0-v15
10692// I - Constant that can be used with an ADD instruction
10693// J - Constant that can be used with a SUB instruction
10694// K - Constant that can be used with a 32-bit logical instruction
10695// L - Constant that can be used with a 64-bit logical instruction
10696// M - Constant that can be used as a 32-bit MOV immediate
10697// N - Constant that can be used as a 64-bit MOV immediate
10698// Q - A memory reference with base register and no offset
10699// S - A symbolic address
10700// Y - Floating point constant zero
10701// Z - Integer constant zero
10702//
10703// Note that general register operands will be output using their 64-bit x
10704// register name, whatever the size of the variable, unless the asm operand
10705// is prefixed by the %w modifier. Floating-point and SIMD register operands
10706// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10707// %q modifier.
10708const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10709 // At this point, we have to lower this constraint to something else, so we
10710 // lower it to an "r" or "w". However, by doing this we will force the result
10711 // to be in a register, while the X constraint is much more permissive.
10712 //
10713 // Although we are correct (we are free to emit anything, without
10714 // constraints), we might break use cases that would expect us to be more
10715 // efficient and emit something else.
10716 if (!Subtarget->hasFPARMv8())
10717 return "r";
10718
10719 if (ConstraintVT.isFloatingPoint())
10720 return "w";
10721
10722 if (ConstraintVT.isVector() &&
10723 (ConstraintVT.getSizeInBits() == 64 ||
10724 ConstraintVT.getSizeInBits() == 128))
10725 return "w";
10726
10727 return "r";
10728}
10729
10730enum class PredicateConstraint { Uph, Upl, Upa };
10731
10732static std::optional<PredicateConstraint>
10733parsePredicateConstraint(StringRef Constraint) {
10734 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10735 .Case(S: "Uph", Value: PredicateConstraint::Uph)
10736 .Case(S: "Upl", Value: PredicateConstraint::Upl)
10737 .Case(S: "Upa", Value: PredicateConstraint::Upa)
10738 .Default(Value: std::nullopt);
10739}
10740
10741static const TargetRegisterClass *
10742getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10743 if (VT != MVT::aarch64svcount &&
10744 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10745 return nullptr;
10746
10747 switch (Constraint) {
10748 case PredicateConstraint::Uph:
10749 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10750 : &AArch64::PPR_p8to15RegClass;
10751 case PredicateConstraint::Upl:
10752 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10753 : &AArch64::PPR_3bRegClass;
10754 case PredicateConstraint::Upa:
10755 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10756 : &AArch64::PPRRegClass;
10757 }
10758
10759 llvm_unreachable("Missing PredicateConstraint!");
10760}
10761
10762enum class ReducedGprConstraint { Uci, Ucj };
10763
10764static std::optional<ReducedGprConstraint>
10765parseReducedGprConstraint(StringRef Constraint) {
10766 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10767 .Case(S: "Uci", Value: ReducedGprConstraint::Uci)
10768 .Case(S: "Ucj", Value: ReducedGprConstraint::Ucj)
10769 .Default(Value: std::nullopt);
10770}
10771
10772static const TargetRegisterClass *
10773getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10774 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10775 return nullptr;
10776
10777 switch (Constraint) {
10778 case ReducedGprConstraint::Uci:
10779 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10780 case ReducedGprConstraint::Ucj:
10781 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10782 }
10783
10784 llvm_unreachable("Missing ReducedGprConstraint!");
10785}
10786
10787 // The set of cc codes supported is from
10788// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10789static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10790 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10791 .Case(S: "{@cchi}", Value: AArch64CC::HI)
10792 .Case(S: "{@cccs}", Value: AArch64CC::HS)
10793 .Case(S: "{@cclo}", Value: AArch64CC::LO)
10794 .Case(S: "{@ccls}", Value: AArch64CC::LS)
10795 .Case(S: "{@cccc}", Value: AArch64CC::LO)
10796 .Case(S: "{@cceq}", Value: AArch64CC::EQ)
10797 .Case(S: "{@ccgt}", Value: AArch64CC::GT)
10798 .Case(S: "{@ccge}", Value: AArch64CC::GE)
10799 .Case(S: "{@cclt}", Value: AArch64CC::LT)
10800 .Case(S: "{@ccle}", Value: AArch64CC::LE)
10801 .Case(S: "{@cchs}", Value: AArch64CC::HS)
10802 .Case(S: "{@ccne}", Value: AArch64CC::NE)
10803 .Case(S: "{@ccvc}", Value: AArch64CC::VC)
10804 .Case(S: "{@ccpl}", Value: AArch64CC::PL)
10805 .Case(S: "{@ccvs}", Value: AArch64CC::VS)
10806 .Case(S: "{@ccmi}", Value: AArch64CC::MI)
10807 .Default(Value: AArch64CC::Invalid);
10808 return Cond;
10809}
10810
10811/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10812/// WZR, invert(<cond>)'.
10813static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10814 SelectionDAG &DAG) {
10815 return DAG.getNode(
10816 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10817 DAG.getConstant(0, DL, MVT::i32),
10818 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10819}
10820
10821// Lower @cc flag output via getSETCC.
10822SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10823 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10824 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10825 AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode);
10826 if (Cond == AArch64CC::Invalid)
10827 return SDValue();
10828 // The output variable should be a scalar integer.
10829 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10830 OpInfo.ConstraintVT.getSizeInBits() < 8)
10831 report_fatal_error(reason: "Flag output operand is of invalid type");
10832
10833 // Get NZCV register. Only update chain when copyfrom is glued.
10834 if (Glue.getNode()) {
10835 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10836 Chain = Glue.getValue(R: 1);
10837 } else
10838 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10839 // Extract CC code.
10840 SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG);
10841
10842 SDValue Result;
10843
10844 // Truncate or ZERO_EXTEND based on value types.
10845 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10846 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC);
10847 else
10848 Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC);
10849
10850 return Result;
10851}
10852
10853/// getConstraintType - Given a constraint letter, return the type of
10854/// constraint it is for this target.
10855AArch64TargetLowering::ConstraintType
10856AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10857 if (Constraint.size() == 1) {
10858 switch (Constraint[0]) {
10859 default:
10860 break;
10861 case 'x':
10862 case 'w':
10863 case 'y':
10864 return C_RegisterClass;
10865 // An address with a single base register. Due to the way we
10866 // currently handle addresses it is the same as 'r'.
10867 case 'Q':
10868 return C_Memory;
10869 case 'I':
10870 case 'J':
10871 case 'K':
10872 case 'L':
10873 case 'M':
10874 case 'N':
10875 case 'Y':
10876 case 'Z':
10877 return C_Immediate;
10878 case 'z':
10879 case 'S': // A symbol or label reference with a constant offset
10880 return C_Other;
10881 }
10882 } else if (parsePredicateConstraint(Constraint))
10883 return C_RegisterClass;
10884 else if (parseReducedGprConstraint(Constraint))
10885 return C_RegisterClass;
10886 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
10887 return C_Other;
10888 return TargetLowering::getConstraintType(Constraint);
10889}
10890
10891/// Examine constraint type and operand type and determine a weight value.
10892/// This object must already have been set up with the operand type
10893/// and the current alternative constraint selected.
10894TargetLowering::ConstraintWeight
10895AArch64TargetLowering::getSingleConstraintMatchWeight(
10896 AsmOperandInfo &info, const char *constraint) const {
10897 ConstraintWeight weight = CW_Invalid;
10898 Value *CallOperandVal = info.CallOperandVal;
10899 // If we don't have a value, we can't do a match,
10900 // but allow it at the lowest weight.
10901 if (!CallOperandVal)
10902 return CW_Default;
10903 Type *type = CallOperandVal->getType();
10904 // Look at the constraint type.
10905 switch (*constraint) {
10906 default:
10907 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
10908 break;
10909 case 'x':
10910 case 'w':
10911 case 'y':
10912 if (type->isFloatingPointTy() || type->isVectorTy())
10913 weight = CW_Register;
10914 break;
10915 case 'z':
10916 weight = CW_Constant;
10917 break;
10918 case 'U':
10919 if (parsePredicateConstraint(Constraint: constraint) ||
10920 parseReducedGprConstraint(Constraint: constraint))
10921 weight = CW_Register;
10922 break;
10923 }
10924 return weight;
10925}
10926
10927std::pair<unsigned, const TargetRegisterClass *>
10928AArch64TargetLowering::getRegForInlineAsmConstraint(
10929 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
10930 if (Constraint.size() == 1) {
10931 switch (Constraint[0]) {
10932 case 'r':
10933 if (VT.isScalableVector())
10934 return std::make_pair(x: 0U, y: nullptr);
10935 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
10936 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
10937 if (VT.getFixedSizeInBits() == 64)
10938 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
10939 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
10940 case 'w': {
10941 if (!Subtarget->hasFPARMv8())
10942 break;
10943 if (VT.isScalableVector()) {
10944 if (VT.getVectorElementType() != MVT::i1)
10945 return std::make_pair(0U, &AArch64::ZPRRegClass);
10946 return std::make_pair(x: 0U, y: nullptr);
10947 }
10948 uint64_t VTSize = VT.getFixedSizeInBits();
10949 if (VTSize == 16)
10950 return std::make_pair(0U, &AArch64::FPR16RegClass);
10951 if (VTSize == 32)
10952 return std::make_pair(0U, &AArch64::FPR32RegClass);
10953 if (VTSize == 64)
10954 return std::make_pair(0U, &AArch64::FPR64RegClass);
10955 if (VTSize == 128)
10956 return std::make_pair(0U, &AArch64::FPR128RegClass);
10957 break;
10958 }
10959 // The instructions that this constraint is designed for can
10960 // only take 128-bit registers so just use that regclass.
10961 case 'x':
10962 if (!Subtarget->hasFPARMv8())
10963 break;
10964 if (VT.isScalableVector())
10965 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
10966 if (VT.getSizeInBits() == 128)
10967 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
10968 break;
10969 case 'y':
10970 if (!Subtarget->hasFPARMv8())
10971 break;
10972 if (VT.isScalableVector())
10973 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
10974 break;
10975 }
10976 } else {
10977 if (const auto PC = parsePredicateConstraint(Constraint))
10978 if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT))
10979 return std::make_pair(x: 0U, y&: RegClass);
10980
10981 if (const auto RGC = parseReducedGprConstraint(Constraint))
10982 if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT))
10983 return std::make_pair(x: 0U, y&: RegClass);
10984 }
10985 if (StringRef("{cc}").equals_insensitive(Constraint) ||
10986 parseConstraintCode(Constraint) != AArch64CC::Invalid)
10987 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
10988
10989 if (Constraint == "{za}") {
10990 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
10991 }
10992
10993 if (Constraint == "{zt0}") {
10994 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
10995 }
10996
10997 // Use the default implementation in TargetLowering to convert the register
10998 // constraint into a member of a register class.
10999 std::pair<unsigned, const TargetRegisterClass *> Res;
11000 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11001
11002 // Not found as a standard register?
11003 if (!Res.second) {
11004 unsigned Size = Constraint.size();
11005 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11006 tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11007 int RegNo;
11008 bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo);
11009 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11010 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11011 // By default we'll emit v0-v31 for this unless there's a modifier where
11012 // we'll emit the correct register as well.
11013 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11014 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11015 Res.second = &AArch64::FPR64RegClass;
11016 } else {
11017 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11018 Res.second = &AArch64::FPR128RegClass;
11019 }
11020 }
11021 }
11022 }
11023
11024 if (Res.second && !Subtarget->hasFPARMv8() &&
11025 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11026 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11027 return std::make_pair(x: 0U, y: nullptr);
11028
11029 return Res;
11030}
11031
11032EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11033 llvm::Type *Ty,
11034 bool AllowUnknown) const {
11035 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11036 return EVT(MVT::i64x8);
11037
11038 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11039}
11040
11041/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11042/// vector. If it is invalid, don't add anything to Ops.
11043void AArch64TargetLowering::LowerAsmOperandForConstraint(
11044 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11045 SelectionDAG &DAG) const {
11046 SDValue Result;
11047
11048 // Currently only support length 1 constraints.
11049 if (Constraint.size() != 1)
11050 return;
11051
11052 char ConstraintLetter = Constraint[0];
11053 switch (ConstraintLetter) {
11054 default:
11055 break;
11056
  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
11059 case 'z': {
11060 // 'z' maps to xzr or wzr so it needs an input of 0.
11061 if (!isNullConstant(V: Op))
11062 return;
11063
11064 if (Op.getValueType() == MVT::i64)
11065 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11066 else
11067 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11068 break;
11069 }
11070 case 'S':
11071 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11072 // supported for PIC while "s" isn't, making "s" less useful. We implement
11073 // "S" but not "s".
11074 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s", Ops, DAG);
11075 break;
11076
11077 case 'I':
11078 case 'J':
11079 case 'K':
11080 case 'L':
11081 case 'M':
11082 case 'N':
11083 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
11084 if (!C)
11085 return;
11086
11087 // Grab the value and do some validation.
11088 uint64_t CVal = C->getZExtValue();
11089 switch (ConstraintLetter) {
11090 // The I constraint applies only to simple ADD or SUB immediate operands:
11091 // i.e. 0 to 4095 with optional shift by 12
11092 // The J constraint applies only to ADD or SUB immediates that would be
11093 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11094 // instruction [or vice versa], in other words -1 to -4095 with optional
11095 // left shift by 12.
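    // For example, 4095 and (4095 << 12) both satisfy 'I', while -1 and -4095
    // (each optionally shifted left by 12) satisfy 'J'.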
11096 case 'I':
11097 if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal))
11098 break;
11099 return;
11100 case 'J': {
11101 uint64_t NVal = -C->getSExtValue();
11102 if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) {
11103 CVal = C->getSExtValue();
11104 break;
11105 }
11106 return;
11107 }
11108 // The K and L constraints apply *only* to logical immediates, including
11109 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11110 // been removed and MOV should be used). So these constraints have to
11111 // distinguish between bit patterns that are valid 32-bit or 64-bit
11112 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11113 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11114 // versa.
11115 case 'K':
11116 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11117 break;
11118 return;
11119 case 'L':
11120 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11121 break;
11122 return;
11123 // The M and N constraints are a superset of K and L respectively, for use
11124 // with the MOV (immediate) alias. As well as the logical immediates they
11125 // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11127 // (M) or 64-bit 0x1234000000000000 (N) etc.
11128 // As a note some of this code is liberally stolen from the asm parser.
11129 case 'M': {
11130 if (!isUInt<32>(x: CVal))
11131 return;
11132 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
11133 break;
11134 if ((CVal & 0xFFFF) == CVal)
11135 break;
11136 if ((CVal & 0xFFFF0000ULL) == CVal)
11137 break;
11138 uint64_t NCVal = ~(uint32_t)CVal;
11139 if ((NCVal & 0xFFFFULL) == NCVal)
11140 break;
11141 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11142 break;
11143 return;
11144 }
11145 case 'N': {
11146 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
11147 break;
11148 if ((CVal & 0xFFFFULL) == CVal)
11149 break;
11150 if ((CVal & 0xFFFF0000ULL) == CVal)
11151 break;
11152 if ((CVal & 0xFFFF00000000ULL) == CVal)
11153 break;
11154 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11155 break;
11156 uint64_t NCVal = ~CVal;
11157 if ((NCVal & 0xFFFFULL) == NCVal)
11158 break;
11159 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11160 break;
11161 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11162 break;
11163 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11164 break;
11165 return;
11166 }
11167 default:
11168 return;
11169 }
11170
11171 // All assembler immediates are 64-bit integers.
11172 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11173 break;
11174 }
11175
11176 if (Result.getNode()) {
11177 Ops.push_back(x: Result);
11178 return;
11179 }
11180
11181 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11182}
11183
11184//===----------------------------------------------------------------------===//
11185// AArch64 Advanced SIMD Support
11186//===----------------------------------------------------------------------===//
11187
11188/// WidenVector - Given a value in the V64 register class, produce the
11189/// equivalent value in the V128 register class.
11190static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
11191 EVT VT = V64Reg.getValueType();
11192 unsigned NarrowSize = VT.getVectorNumElements();
11193 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11194 MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize);
11195 SDLoc DL(V64Reg);
11196
11197 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11198 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11199}
11200
11201/// getExtFactor - Determine the adjustment factor for the position when
11202/// generating an "extract from vector registers" instruction.
11203static unsigned getExtFactor(SDValue &V) {
11204 EVT EltType = V.getValueType().getVectorElementType();
11205 return EltType.getSizeInBits() / 8;
11206}
11207
11208// Check if a vector is built from one vector via extracted elements of
11209// another together with an AND mask, ensuring that all elements fit
11210// within range. This can be reconstructed using AND and NEON's TBL1.
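// A rough sketch of the per-element pattern being matched:
//   Op[i] = extract_vector_elt(SourceVec,
//             and(extract_vector_elt(MaskSourceVec, i), AndMask[i]))
// (the AND is optional, and an ANY_EXTEND may sit in between); this is
// rebuilt below as tbl1(SourceVec, and(MaskSourceVec, build_vector(AndMask))).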
11211SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11212 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11213 SDLoc dl(Op);
11214 EVT VT = Op.getValueType();
11215 assert(!VT.isScalableVector() &&
11216 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11217
11218 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11219 // directly to TBL1.
11220 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11221 return SDValue();
11222
11223 unsigned NumElts = VT.getVectorNumElements();
11224 assert((NumElts == 8 || NumElts == 16) &&
11225 "Need to have exactly 8 or 16 elements in vector.");
11226
11227 SDValue SourceVec;
11228 SDValue MaskSourceVec;
11229 SmallVector<SDValue, 16> AndMaskConstants;
11230
11231 for (unsigned i = 0; i < NumElts; ++i) {
11232 SDValue V = Op.getOperand(i);
11233 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11234 return SDValue();
11235
11236 SDValue OperandSourceVec = V.getOperand(i: 0);
11237 if (!SourceVec)
11238 SourceVec = OperandSourceVec;
11239 else if (SourceVec != OperandSourceVec)
11240 return SDValue();
11241
11242 // This only looks at shuffles with elements that are
11243 // a) truncated by a constant AND mask extracted from a mask vector, or
11244 // b) extracted directly from a mask vector.
11245 SDValue MaskSource = V.getOperand(i: 1);
11246 if (MaskSource.getOpcode() == ISD::AND) {
11247 if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1)))
11248 return SDValue();
11249
11250 AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1));
11251 MaskSource = MaskSource->getOperand(Num: 0);
11252 } else if (!AndMaskConstants.empty()) {
11253 // Either all or no operands should have an AND mask.
11254 return SDValue();
11255 }
11256
11257 // An ANY_EXTEND may be inserted between the AND and the source vector
11258 // extraction. We don't care about that, so we can just skip it.
11259 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11260 MaskSource = MaskSource.getOperand(i: 0);
11261
11262 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11263 return SDValue();
11264
11265 SDValue MaskIdx = MaskSource.getOperand(i: 1);
11266 if (!isa<ConstantSDNode>(Val: MaskIdx) ||
11267 !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i))
11268 return SDValue();
11269
11270 // We only apply this if all elements come from the same vector with the
11271 // same vector type.
11272 if (!MaskSourceVec) {
11273 MaskSourceVec = MaskSource->getOperand(Num: 0);
11274 if (MaskSourceVec.getValueType() != VT)
11275 return SDValue();
11276 } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) {
11277 return SDValue();
11278 }
11279 }
11280
11281 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11282 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11283 // insert, we know that the index in the mask must be smaller than the number
11284 // of elements in the source, or we would have an out-of-bounds access.
11285 if (NumElts == 8)
11286 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11287 DAG.getUNDEF(VT));
11288
11289 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11290 if (!AndMaskConstants.empty())
11291 MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: MaskSourceVec,
11292 N2: DAG.getBuildVector(VT, DL: dl, Ops: AndMaskConstants));
11293
11294 return DAG.getNode(
11295 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11296 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11297 MaskSourceVec);
11298}
11299
11300// Gather data to see if the operation can be modelled as a
11301// shuffle in combination with VEXTs.
11302SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11303 SelectionDAG &DAG) const {
11304 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11305 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11306 SDLoc dl(Op);
11307 EVT VT = Op.getValueType();
11308 assert(!VT.isScalableVector() &&
11309 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11310 unsigned NumElts = VT.getVectorNumElements();
11311
11312 struct ShuffleSourceInfo {
11313 SDValue Vec;
11314 unsigned MinElt;
11315 unsigned MaxElt;
11316
11317 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11318 // be compatible with the shuffle we intend to construct. As a result
11319 // ShuffleVec will be some sliding window into the original Vec.
11320 SDValue ShuffleVec;
11321
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
11324 int WindowBase;
11325 int WindowScale;
11326
11327 ShuffleSourceInfo(SDValue Vec)
11328 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11329 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11330
11331 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11332 };
11333
11334 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11335 // node.
11336 SmallVector<ShuffleSourceInfo, 2> Sources;
11337 for (unsigned i = 0; i < NumElts; ++i) {
11338 SDValue V = Op.getOperand(i);
11339 if (V.isUndef())
11340 continue;
11341 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11342 !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) ||
11343 V.getOperand(i: 0).getValueType().isScalableVector()) {
11344 LLVM_DEBUG(
11345 dbgs() << "Reshuffle failed: "
11346 "a shuffle can only come from building a vector from "
11347 "various elements of other fixed-width vectors, provided "
11348 "their indices are constant\n");
11349 return SDValue();
11350 }
11351
11352 // Add this element source to the list if it's not already there.
11353 SDValue SourceVec = V.getOperand(i: 0);
11354 auto Source = find(Range&: Sources, Val: SourceVec);
11355 if (Source == Sources.end())
11356 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
11357
11358 // Update the minimum and maximum lane number seen.
11359 unsigned EltNo = V.getConstantOperandVal(i: 1);
11360 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
11361 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
11362 }
11363
11364 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11365 // better than moving to/from gpr registers for larger vectors.
11366 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11367 // Construct a mask for the tbl. We may need to adjust the index for types
11368 // larger than i8.
11369 SmallVector<unsigned, 16> Mask;
11370 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11371 for (unsigned I = 0; I < NumElts; ++I) {
11372 SDValue V = Op.getOperand(i: I);
11373 if (V.isUndef()) {
11374 for (unsigned OF = 0; OF < OutputFactor; OF++)
11375 Mask.push_back(Elt: -1);
11376 continue;
11377 }
11378 // Set the Mask lanes adjusted for the size of the input and output
11379 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11380 // output element, adjusted in their positions per input and output types.
11381 unsigned Lane = V.getConstantOperandVal(i: 1);
11382 for (unsigned S = 0; S < Sources.size(); S++) {
11383 if (V.getOperand(i: 0) == Sources[S].Vec) {
11384 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11385 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11386 for (unsigned OF = 0; OF < OutputFactor; OF++)
11387 Mask.push_back(Elt: InputBase + OF);
11388 break;
11389 }
11390 }
11391 }
11392
11393 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11394 // v16i8, and the TBLMask
11395 SmallVector<SDValue, 16> TBLOperands;
11396 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11397 ? Intrinsic::aarch64_neon_tbl3
11398 : Intrinsic::aarch64_neon_tbl4,
11399 dl, MVT::i32));
11400 for (unsigned i = 0; i < Sources.size(); i++) {
11401 SDValue Src = Sources[i].Vec;
11402 EVT SrcVT = Src.getValueType();
11403 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11404 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11405 "Expected a legally typed vector");
11406 if (SrcVT.is64BitVector())
11407 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11408 DAG.getUNDEF(MVT::v8i8));
11409 TBLOperands.push_back(Elt: Src);
11410 }
11411
11412 SmallVector<SDValue, 16> TBLMask;
11413 for (unsigned i = 0; i < Mask.size(); i++)
11414 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11415 assert((Mask.size() == 8 || Mask.size() == 16) &&
11416 "Expected a v8i8 or v16i8 Mask");
11417 TBLOperands.push_back(
11418 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11419
11420 SDValue Shuffle =
11421 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11422 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11423 return DAG.getBitcast(VT, V: Shuffle);
11424 }
11425
11426 if (Sources.size() > 2) {
11427 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11428 << "sensible when at most two source vectors are "
11429 << "involved\n");
11430 return SDValue();
11431 }
11432
11433 // Find out the smallest element size among result and two sources, and use
11434 // it as element size to build the shuffle_vector.
11435 EVT SmallestEltTy = VT.getVectorElementType();
11436 for (auto &Source : Sources) {
11437 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11438 if (SrcEltTy.bitsLT(VT: SmallestEltTy)) {
11439 SmallestEltTy = SrcEltTy;
11440 }
11441 }
11442 unsigned ResMultiplier =
11443 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11444 uint64_t VTSize = VT.getFixedSizeInBits();
11445 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11446 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
11447
11448 // If the source vector is too wide or too narrow, we may nevertheless be able
11449 // to construct a compatible shuffle either by concatenating it with UNDEF or
11450 // extracting a suitable range of elements.
11451 for (auto &Src : Sources) {
11452 EVT SrcVT = Src.ShuffleVec.getValueType();
11453
11454 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11455 if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize))
11456 continue;
11457
11458 // This stage of the search produces a source with the same element type as
11459 // the original, but with a total width matching the BUILD_VECTOR output.
11460 EVT EltVT = SrcVT.getVectorElementType();
11461 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11462 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
11463
11464 if (SrcVTSize.getFixedValue() < VTSize) {
11465 assert(2 * SrcVTSize == VTSize);
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle we just concatenate it with UNDEF to reach the full width.
11468 Src.ShuffleVec =
11469 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
11470 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
11471 continue;
11472 }
11473
11474 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11475 LLVM_DEBUG(
11476 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11477 return SDValue();
11478 }
11479
11480 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11481 LLVM_DEBUG(
11482 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11483 return SDValue();
11484 }
11485
11486 if (Src.MinElt >= NumSrcElts) {
11487 // The extraction can just take the second half
11488 Src.ShuffleVec =
11489 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11490 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11491 Src.WindowBase = -NumSrcElts;
11492 } else if (Src.MaxElt < NumSrcElts) {
11493 // The extraction can just take the first half
11494 Src.ShuffleVec =
11495 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11496 DAG.getConstant(0, dl, MVT::i64));
11497 } else {
11498 // An actual VEXT is needed
11499 SDValue VEXTSrc1 =
11500 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11501 DAG.getConstant(0, dl, MVT::i64));
11502 SDValue VEXTSrc2 =
11503 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11504 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11505 unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1);
11506
11507 if (!SrcVT.is64BitVector()) {
11508 LLVM_DEBUG(
11509 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11510 "for SVE vectors.");
11511 return SDValue();
11512 }
11513
11514 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11515 VEXTSrc2,
11516 DAG.getConstant(Imm, dl, MVT::i32));
11517 Src.WindowBase = -Src.MinElt;
11518 }
11519 }
11520
11521 // Another possible incompatibility occurs from the vector element types. We
11522 // can fix this by bitcasting the source vectors to the same type we intend
11523 // for the shuffle.
11524 for (auto &Src : Sources) {
11525 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11526 if (SrcEltTy == SmallestEltTy)
11527 continue;
11528 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11529 if (DAG.getDataLayout().isBigEndian()) {
11530 Src.ShuffleVec =
11531 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
11532 } else {
11533 Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
11534 }
11535 Src.WindowScale =
11536 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11537 Src.WindowBase *= Src.WindowScale;
11538 }
11539
11540 // Final check before we try to actually produce a shuffle.
11541 LLVM_DEBUG(for (auto Src
11542 : Sources)
11543 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11544
  // The stars all align; our next step is to produce the mask for the shuffle.
11546 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11547 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11548 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11549 SDValue Entry = Op.getOperand(i);
11550 if (Entry.isUndef())
11551 continue;
11552
11553 auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0));
11554 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
11555
    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an
    // implicit trunc. So only the low std::min(SrcBits, DestBits) bits
    // actually get defined in this segment.
11559 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
11560 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
11561 b: VT.getScalarSizeInBits());
11562 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11563
11564 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11565 // starting at the appropriate offset.
11566 int *LaneMask = &Mask[i * ResMultiplier];
11567
11568 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11569 ExtractBase += NumElts * (Src - Sources.begin());
11570 for (int j = 0; j < LanesDefined; ++j)
11571 LaneMask[j] = ExtractBase + j;
11572 }
11573
11574 // Final check before we try to produce nonsense...
11575 if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) {
11576 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11577 return SDValue();
11578 }
11579
11580 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
11581 for (unsigned i = 0; i < Sources.size(); ++i)
11582 ShuffleOps[i] = Sources[i].ShuffleVec;
11583
11584 SDValue Shuffle = DAG.getVectorShuffle(VT: ShuffleVT, dl, N1: ShuffleOps[0],
11585 N2: ShuffleOps[1], Mask);
11586 SDValue V;
11587 if (DAG.getDataLayout().isBigEndian()) {
11588 V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Shuffle);
11589 } else {
11590 V = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Shuffle);
11591 }
11592
11593 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11594 dbgs() << "Reshuffle, creating node: "; V.dump(););
11595
11596 return V;
11597}
11598
// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are the same.
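// For example, for <8 x i8> the mask <6, 7, 0, 1, 2, 3, 4, 5> is accepted
// with Imm == 6: the indices increase by one and wrap around the end of the
// single source vector.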
11601static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11602 unsigned NumElts = VT.getVectorNumElements();
11603
11604 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11605 if (M[0] < 0)
11606 return false;
11607
11608 Imm = M[0];
11609
11610 // If this is a VEXT shuffle, the immediate value is the index of the first
11611 // element. The other shuffle indices must be the successive elements after
11612 // the first one.
11613 unsigned ExpectedElt = Imm;
11614 for (unsigned i = 1; i < NumElts; ++i) {
11615 // Increment the expected index. If it wraps around, just follow it
11616 // back to index zero and keep going.
11617 ++ExpectedElt;
11618 if (ExpectedElt == NumElts)
11619 ExpectedElt = 0;
11620
11621 if (M[i] < 0)
11622 continue; // ignore UNDEF indices
11623 if (ExpectedElt != static_cast<unsigned>(M[i]))
11624 return false;
11625 }
11626
11627 return true;
11628}
11629
11630// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11631// v4i32s. This is really a truncate, which we can construct out of (legal)
11632// concats and truncate nodes.
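// For example, a v16i8 build_vector taking lanes 0-3 of four v4i32 values A,
// B, C and D in that order becomes, roughly,
//   concat(trunc(concat(trunc(A), trunc(B))), trunc(concat(trunc(C), trunc(D))))
// where the inner truncates go v4i32 -> v4i16 and the outer ones v8i16 -> v8i8.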
11633static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11634 if (V.getValueType() != MVT::v16i8)
11635 return SDValue();
11636 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11637
11638 for (unsigned X = 0; X < 4; X++) {
11639 // Check the first item in each group is an extract from lane 0 of a v4i32
11640 // or v4i16.
11641 SDValue BaseExt = V.getOperand(i: X * 4);
11642 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11643 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11644 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11645 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11646 BaseExt.getConstantOperandVal(1) != 0)
11647 return SDValue();
11648 SDValue Base = BaseExt.getOperand(i: 0);
11649 // And check the other items are extracts from the same vector.
11650 for (unsigned Y = 1; Y < 4; Y++) {
11651 SDValue Ext = V.getOperand(i: X * 4 + Y);
11652 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11653 Ext.getOperand(i: 0) != Base ||
11654 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
11655 Ext.getConstantOperandVal(i: 1) != Y)
11656 return SDValue();
11657 }
11658 }
11659
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
  // concatenated in pairs to produce two v8i16s. These are both truncated to
  // v8i8 and concatenated together.
11664 SDLoc DL(V);
11665 SDValue Trunc[4] = {
11666 V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0),
11667 V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)};
11668 for (SDValue &V : Trunc)
11669 if (V.getValueType() == MVT::v4i32)
11670 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11671 SDValue Concat0 =
11672 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11673 SDValue Concat1 =
11674 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11675 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11676 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11677 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11678}
11679
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
11684static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11685 unsigned &DupLaneOp) {
11686 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11687 "Only possible block sizes for wide DUP are: 16, 32, 64");
11688
11689 if (BlockSize <= VT.getScalarSizeInBits())
11690 return false;
11691 if (BlockSize % VT.getScalarSizeInBits() != 0)
11692 return false;
11693 if (VT.getSizeInBits() % BlockSize != 0)
11694 return false;
11695
11696 size_t SingleVecNumElements = VT.getVectorNumElements();
11697 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11698 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11699
11700 // We are looking for masks like
11701 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11702 // might be replaced by 'undefined'. BlockIndices will eventually contain
11703 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11704 // for the above examples)
11705 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11706 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11707 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11708 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11709 if (Elt < 0)
11710 continue;
11711 // For now we don't support shuffles that use the second operand
11712 if ((unsigned)Elt >= SingleVecNumElements)
11713 return false;
11714 if (BlockElts[I] < 0)
11715 BlockElts[I] = Elt;
11716 else if (BlockElts[I] != Elt)
11717 return false;
11718 }
11719
11720 // We found a candidate block (possibly with some undefs). It must be a
11721 // sequence of consecutive integers starting with a value divisible by
11722 // NumEltsPerBlock with some values possibly replaced by undef-s.
11723
11724 // Find first non-undef element
11725 auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; });
11726 assert(FirstRealEltIter != BlockElts.end() &&
11727 "Shuffle with all-undefs must have been caught by previous cases, "
11728 "e.g. isSplat()");
11729 if (FirstRealEltIter == BlockElts.end()) {
11730 DupLaneOp = 0;
11731 return true;
11732 }
11733
11734 // Index of FirstRealElt in BlockElts
11735 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11736
11737 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11738 return false;
11739 // BlockElts[0] must have the following value if it isn't undef:
11740 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11741
11742 // Check the first element
11743 if (Elt0 % NumEltsPerBlock != 0)
11744 return false;
11745 // Check that the sequence indeed consists of consecutive integers (modulo
11746 // undefs)
11747 for (size_t I = 0; I < NumEltsPerBlock; I++)
11748 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11749 return false;
11750
11751 DupLaneOp = Elt0 / NumEltsPerBlock;
11752 return true;
11753}
11754
// Check if an EXT instruction can handle the shuffle mask when the vector
// sources of the shuffle are different.
11757static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11758 unsigned &Imm) {
11759 // Look for the first non-undef element.
11760 const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; });
11761
  // Benefit from APInt to handle overflow when calculating the expected element.
11763 unsigned NumElts = VT.getVectorNumElements();
11764 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11765 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11766 // The following shuffle indices must be the successive elements after the
11767 // first real element.
11768 bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) {
11769 return Elt != ExpectedElt++ && Elt != -1;
11770 });
11771 if (FoundWrongElt)
11772 return false;
11773
11774 // The index of an EXT is the first element if it is not UNDEF.
11775 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11776 // value of the first element. E.g.
11777 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11778 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11779 // ExpectedElt is the last mask index plus 1.
11780 Imm = ExpectedElt.getZExtValue();
11781
  // There are two different cases that require reversing the input vectors.
11783 // For example, for vector <4 x i32> we have the following cases,
11784 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11785 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11786 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11787 // to reverse two input vectors.
11788 if (Imm < NumElts)
11789 ReverseEXT = true;
11790 else
11791 Imm -= NumElts;
11792
11793 return true;
11794}
11795
11796/// isREVMask - Check if a vector shuffle corresponds to a REV
11797/// instruction with the specified blocksize. (The order of the elements
11798/// within each block of the vector is reversed.)
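/// For example, for v8i8 and BlockSize == 32 the mask
/// <3, 2, 1, 0, 7, 6, 5, 4> reverses the bytes within each 32-bit block and
/// therefore corresponds to REV32.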
11799static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
11800 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64 ||
11801 BlockSize == 128) &&
11802 "Only possible block sizes for REV are: 16, 32, 64, 128");
11803
11804 unsigned EltSz = VT.getScalarSizeInBits();
11805 unsigned NumElts = VT.getVectorNumElements();
11806 unsigned BlockElts = M[0] + 1;
11807 // If the first shuffle index is UNDEF, be optimistic.
11808 if (M[0] < 0)
11809 BlockElts = BlockSize / EltSz;
11810
11811 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
11812 return false;
11813
11814 for (unsigned i = 0; i < NumElts; ++i) {
11815 if (M[i] < 0)
11816 continue; // ignore UNDEF indices
11817 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
11818 return false;
11819 }
11820
11821 return true;
11822}
11823
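/// isTRNMask - Check whether a shuffle mask is a transpose mask. For example,
/// for v4i32 the mask <0, 4, 2, 6> interleaves the even lanes of the two
/// sources (WhichResult == 0, i.e. TRN1) and <1, 5, 3, 7> the odd lanes
/// (WhichResult == 1, i.e. TRN2).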
11824static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11825 unsigned NumElts = VT.getVectorNumElements();
11826 if (NumElts % 2 != 0)
11827 return false;
11828 WhichResult = (M[0] == 0 ? 0 : 1);
11829 for (unsigned i = 0; i < NumElts; i += 2) {
11830 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11831 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
11832 return false;
11833 }
11834 return true;
11835}
11836
11837/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11838/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11839/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11840static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11841 unsigned NumElts = VT.getVectorNumElements();
11842 if (NumElts % 2 != 0)
11843 return false;
11844 WhichResult = (M[0] == 0 ? 0 : 1);
11845 unsigned Idx = WhichResult * NumElts / 2;
11846 for (unsigned i = 0; i != NumElts; i += 2) {
11847 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11848 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11849 return false;
11850 Idx += 1;
11851 }
11852
11853 return true;
11854}
11855
11856/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11857/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
11859static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11860 unsigned Half = VT.getVectorNumElements() / 2;
11861 WhichResult = (M[0] == 0 ? 0 : 1);
11862 for (unsigned j = 0; j != 2; ++j) {
11863 unsigned Idx = WhichResult;
11864 for (unsigned i = 0; i != Half; ++i) {
11865 int MIdx = M[i + j * Half];
11866 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11867 return false;
11868 Idx += 2;
11869 }
11870 }
11871
11872 return true;
11873}
11874
11875/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11876/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11877/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11878static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11879 unsigned NumElts = VT.getVectorNumElements();
11880 if (NumElts % 2 != 0)
11881 return false;
11882 WhichResult = (M[0] == 0 ? 0 : 1);
11883 for (unsigned i = 0; i < NumElts; i += 2) {
11884 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11885 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11886 return false;
11887 }
11888 return true;
11889}
11890
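/// isINSMask - Check whether a shuffle is, up to one lane, the identity of one
/// of its sources. For example, with 4 input elements the mask <0, 1, 6, 3>
/// matches the LHS everywhere except lane 2, so DstIsLeft == true and
/// Anomaly == 2; the shuffle can then be done as a single lane insert (INS)
/// from the RHS.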
11891static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11892 bool &DstIsLeft, int &Anomaly) {
11893 if (M.size() != static_cast<size_t>(NumInputElements))
11894 return false;
11895
11896 int NumLHSMatch = 0, NumRHSMatch = 0;
11897 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11898
11899 for (int i = 0; i < NumInputElements; ++i) {
11900 if (M[i] == -1) {
11901 ++NumLHSMatch;
11902 ++NumRHSMatch;
11903 continue;
11904 }
11905
11906 if (M[i] == i)
11907 ++NumLHSMatch;
11908 else
11909 LastLHSMismatch = i;
11910
11911 if (M[i] == i + NumInputElements)
11912 ++NumRHSMatch;
11913 else
11914 LastRHSMismatch = i;
11915 }
11916
11917 if (NumLHSMatch == NumInputElements - 1) {
11918 DstIsLeft = true;
11919 Anomaly = LastLHSMismatch;
11920 return true;
11921 } else if (NumRHSMatch == NumInputElements - 1) {
11922 DstIsLeft = false;
11923 Anomaly = LastRHSMismatch;
11924 return true;
11925 }
11926
11927 return false;
11928}
11929
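/// isConcatMask - Check whether a shuffle producing a 128-bit vector amounts
/// to concatenating (the low halves of) its two sources. For example, for
/// v4i32 with SplitLHS == true the mask <0, 1, 4, 5> selects the low half of
/// each 128-bit input, i.e. a CONCAT of the two low 64-bit halves.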
11930static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
11931 if (VT.getSizeInBits() != 128)
11932 return false;
11933
11934 unsigned NumElts = VT.getVectorNumElements();
11935
11936 for (int I = 0, E = NumElts / 2; I != E; I++) {
11937 if (Mask[I] != I)
11938 return false;
11939 }
11940
11941 int Offset = NumElts / 2;
11942 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
11943 if (Mask[I] != I + SplitLHS * Offset)
11944 return false;
11945 }
11946
11947 return true;
11948}
11949
11950static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
11951 SDLoc DL(Op);
11952 EVT VT = Op.getValueType();
11953 SDValue V0 = Op.getOperand(i: 0);
11954 SDValue V1 = Op.getOperand(i: 1);
11955 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
11956
11957 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
11958 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
11959 return SDValue();
11960
11961 bool SplitV0 = V0.getValueSizeInBits() == 128;
11962
11963 if (!isConcatMask(Mask, VT, SplitLHS: SplitV0))
11964 return SDValue();
11965
11966 EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
11967 if (SplitV0) {
11968 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
11969 DAG.getConstant(0, DL, MVT::i64));
11970 }
11971 if (V1.getValueSizeInBits() == 128) {
11972 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
11973 DAG.getConstant(0, DL, MVT::i64));
11974 }
11975 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1);
11976}
11977
11978/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
11979/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect
/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
/// the shuffle.
11983static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
11984 SDValue V2, unsigned PFEntry, SDValue LHS,
11985 SDValue RHS, SelectionDAG &DAG,
11986 const SDLoc &dl) {
11987 unsigned OpNum = (PFEntry >> 26) & 0x0F;
11988 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
11989 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
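  // For reference: a PFEntry packs the operation into bits [29:26] and two
  // 13-bit operand IDs into bits [25:13] and [12:0]; each ID is itself a
  // base-9 encoding of four lane indices, with 8 standing for undef (see
  // getPFIDLane below).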
11990
11991 enum {
11992 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
11993 OP_VREV,
11994 OP_VDUP0,
11995 OP_VDUP1,
11996 OP_VDUP2,
11997 OP_VDUP3,
11998 OP_VEXT1,
11999 OP_VEXT2,
12000 OP_VEXT3,
12001 OP_VUZPL, // VUZP, left result
12002 OP_VUZPR, // VUZP, right result
12003 OP_VZIPL, // VZIP, left result
12004 OP_VZIPR, // VZIP, right result
12005 OP_VTRNL, // VTRN, left result
12006 OP_VTRNR, // VTRN, right result
12007 OP_MOVLANE // Move lane. RHSID is the lane to move into
12008 };
12009
12010 if (OpNum == OP_COPY) {
12011 if (LHSID == (1 * 9 + 2) * 9 + 3)
12012 return LHS;
12013 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12014 return RHS;
12015 }
12016
12017 if (OpNum == OP_MOVLANE) {
12018 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12019 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12020 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12021 Elt = 3 - Elt;
12022 while (Elt > 0) {
12023 ID /= 9;
12024 Elt--;
12025 }
12026 return (ID % 9 == 8) ? -1 : ID % 9;
12027 };
12028
12029 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12030 // get the lane to move from the PFID, which is always from the
12031 // original vectors (V1 or V2).
12032 SDValue OpLHS = GeneratePerfectShuffle(
12033 ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12034 EVT VT = OpLHS.getValueType();
12035 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12036 unsigned ExtLane = 0;
12037 SDValue Input;
12038
12039 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12040 // convert into a higher type.
12041 if (RHSID & 0x4) {
12042 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12043 if (MaskElt == -1)
12044 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12045 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12046 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12047 Input = MaskElt < 2 ? V1 : V2;
12048 if (VT.getScalarSizeInBits() == 16) {
12049 Input = DAG.getBitcast(MVT::v2f32, Input);
12050 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12051 } else {
12052 assert(VT.getScalarSizeInBits() == 32 &&
12053 "Expected 16 or 32 bit shuffle elemements");
12054 Input = DAG.getBitcast(MVT::v2f64, Input);
12055 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12056 }
12057 } else {
12058 int MaskElt = getPFIDLane(ID, RHSID);
12059 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12060 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12061 Input = MaskElt < 4 ? V1 : V2;
12062 // Be careful about creating illegal types. Use f16 instead of i16.
12063 if (VT == MVT::v4i16) {
12064 Input = DAG.getBitcast(MVT::v4f16, Input);
12065 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12066 }
12067 }
12068 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl,
12069 VT: Input.getValueType().getVectorElementType(),
12070 N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL: dl));
12071 SDValue Ins =
12072 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: Input.getValueType(), N1: OpLHS,
12073 N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL: dl));
12074 return DAG.getBitcast(VT, V: Ins);
12075 }
12076
12077 SDValue OpLHS, OpRHS;
12078 OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS,
12079 RHS, DAG, dl);
12080 OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS,
12081 RHS, DAG, dl);
12082 EVT VT = OpLHS.getValueType();
12083
12084 switch (OpNum) {
12085 default:
12086 llvm_unreachable("Unknown shuffle opcode!");
12087 case OP_VREV:
12088 // VREV divides the vector in half and swaps within the half.
12089 if (VT.getVectorElementType() == MVT::i32 ||
12090 VT.getVectorElementType() == MVT::f32)
12091 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: OpLHS);
12092 // vrev <4 x i16> -> REV32
12093 if (VT.getVectorElementType() == MVT::i16 ||
12094 VT.getVectorElementType() == MVT::f16 ||
12095 VT.getVectorElementType() == MVT::bf16)
12096 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT, Operand: OpLHS);
12097 // vrev <4 x i8> -> REV16
12098 assert(VT.getVectorElementType() == MVT::i8);
12099 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT, Operand: OpLHS);
12100 case OP_VDUP0:
12101 case OP_VDUP1:
12102 case OP_VDUP2:
12103 case OP_VDUP3: {
12104 EVT EltTy = VT.getVectorElementType();
12105 unsigned Opcode;
12106 if (EltTy == MVT::i8)
12107 Opcode = AArch64ISD::DUPLANE8;
12108 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12109 Opcode = AArch64ISD::DUPLANE16;
12110 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12111 Opcode = AArch64ISD::DUPLANE32;
12112 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12113 Opcode = AArch64ISD::DUPLANE64;
12114 else
12115 llvm_unreachable("Invalid vector element type?");
12116
12117 if (VT.getSizeInBits() == 64)
12118 OpLHS = WidenVector(V64Reg: OpLHS, DAG);
12119 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12120 return DAG.getNode(Opcode, DL: dl, VT, N1: OpLHS, N2: Lane);
12121 }
12122 case OP_VEXT1:
12123 case OP_VEXT2:
12124 case OP_VEXT3: {
12125 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS);
12126 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12127 DAG.getConstant(Imm, dl, MVT::i32));
12128 }
12129 case OP_VUZPL:
12130 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12131 case OP_VUZPR:
12132 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12133 case OP_VZIPL:
12134 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12135 case OP_VZIPR:
12136 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12137 case OP_VTRNL:
12138 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12139 case OP_VTRNR:
12140 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT, N1: OpLHS, N2: OpRHS);
12141 }
12142}
12143
12144static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12145 SelectionDAG &DAG) {
12146 // Check to see if we can use the TBL instruction.
12147 SDValue V1 = Op.getOperand(i: 0);
12148 SDValue V2 = Op.getOperand(i: 1);
12149 SDLoc DL(Op);
12150
12151 EVT EltVT = Op.getValueType().getVectorElementType();
12152 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12153
12154 bool Swap = false;
12155 if (V1.isUndef() || isZerosVector(N: V1.getNode())) {
12156 std::swap(a&: V1, b&: V2);
12157 Swap = true;
12158 }
12159
12160 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12161 // out of range values with 0s. We do need to make sure that any out-of-range
12162 // values are really out-of-range for a v16i8 vector.
12163 bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode());
12164 MVT IndexVT = MVT::v8i8;
12165 unsigned IndexLen = 8;
12166 if (Op.getValueSizeInBits() == 128) {
12167 IndexVT = MVT::v16i8;
12168 IndexLen = 16;
12169 }
12170
12171 SmallVector<SDValue, 8> TBLMask;
12172 for (int Val : ShuffleMask) {
12173 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12174 unsigned Offset = Byte + Val * BytesPerElt;
12175 if (Swap)
12176 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12177 if (IsUndefOrZero && Offset >= IndexLen)
12178 Offset = 255;
12179 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12180 }
12181 }
12182
12183 SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1);
12184 SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2);
12185
12186 SDValue Shuffle;
12187 if (IsUndefOrZero) {
12188 if (IndexLen == 8)
12189 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12190 Shuffle = DAG.getNode(
12191 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12192 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12193 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12194 } else {
12195 if (IndexLen == 8) {
12196 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12197 Shuffle = DAG.getNode(
12198 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12199 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12200 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12201 } else {
12202 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12203 // cannot currently represent the register constraints on the input
12204 // table registers.
12205 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12206 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12207 // IndexLen));
12208 Shuffle = DAG.getNode(
12209 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12210 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12211 V2Cst,
12212 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12213 }
12214 }
12215 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
12216}
12217
12218static unsigned getDUPLANEOp(EVT EltType) {
12219 if (EltType == MVT::i8)
12220 return AArch64ISD::DUPLANE8;
12221 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12222 return AArch64ISD::DUPLANE16;
12223 if (EltType == MVT::i32 || EltType == MVT::f32)
12224 return AArch64ISD::DUPLANE32;
12225 if (EltType == MVT::i64 || EltType == MVT::f64)
12226 return AArch64ISD::DUPLANE64;
12227
12228 llvm_unreachable("Invalid vector element type?");
12229}
12230
12231static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12232 unsigned Opcode, SelectionDAG &DAG) {
12233 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12234 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12235 // Match: dup (bitcast (extract_subv X, C)), LaneC
12236 if (BitCast.getOpcode() != ISD::BITCAST ||
12237 BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12238 return false;
12239
12240 // The extract index must align in the destination type. That may not
12241 // happen if the bitcast is from narrow to wide type.
12242 SDValue Extract = BitCast.getOperand(i: 0);
12243 unsigned ExtIdx = Extract.getConstantOperandVal(i: 1);
12244 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12245 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12246 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12247 if (ExtIdxInBits % CastedEltBitWidth != 0)
12248 return false;
12249
12250 // Can't handle cases where vector size is not 128-bit
12251 if (!Extract.getOperand(i: 0).getValueType().is128BitVector())
12252 return false;
12253
12254 // Update the lane value by offsetting with the scaled extract index.
12255 LaneC += ExtIdxInBits / CastedEltBitWidth;
12256
12257 // Determine the casted vector type of the wide vector input.
12258 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12259 // Examples:
12260 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12261 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12262 unsigned SrcVecNumElts =
12263 Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth;
12264 CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(),
12265 NumElements: SrcVecNumElts);
12266 return true;
12267 };
12268 MVT CastVT;
12269 if (getScaledOffsetDup(V, Lane, CastVT)) {
12270 V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0));
12271 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12272 V.getOperand(i: 0).getValueType().is128BitVector()) {
12273 // The lane is incremented by the index of the extract.
12274 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12275 Lane += V.getConstantOperandVal(i: 1);
12276 V = V.getOperand(i: 0);
12277 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12278 // The lane is decremented if we are splatting from the 2nd operand.
12279 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12280 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12281 Lane -= Idx * VT.getVectorNumElements() / 2;
12282 V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG);
12283 } else if (VT.getSizeInBits() == 64) {
12284 // Widen the operand to 128-bit register with undef.
12285 V = WidenVector(V64Reg: V, DAG);
12286 }
12287 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12288}
12289
12290 // Return true if we can derive a new, half-length shuffle mask from M: every
12291 // two adjacent mask values must be consecutive (or undef), with the first of
12292 // each pair being an even number.
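// e.g. for v4i32, M = <6,7,0,1> yields NewMask = <3,0>.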
12293static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12294 SmallVectorImpl<int> &NewMask) {
12295 unsigned NumElts = VT.getVectorNumElements();
12296 if (NumElts % 2 != 0)
12297 return false;
12298
12299 NewMask.clear();
12300 for (unsigned i = 0; i < NumElts; i += 2) {
12301 int M0 = M[i];
12302 int M1 = M[i + 1];
12303
12304 // If both elements are undef, new mask is undef too.
12305 if (M0 == -1 && M1 == -1) {
12306 NewMask.push_back(Elt: -1);
12307 continue;
12308 }
12309
12310 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12311 NewMask.push_back(Elt: M1 / 2);
12312 continue;
12313 }
12314
12315 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12316 NewMask.push_back(Elt: M0 / 2);
12317 continue;
12318 }
12319
12320 NewMask.clear();
12321 return false;
12322 }
12323
12324 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12325 return true;
12326}
12327
12328// Try to widen element type to get a new mask value for a better permutation
12329 // sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
12330// UZP1/2, TRN1/2, REV, INS, etc.
12331// For example:
12332// shufflevector <4 x i32> %a, <4 x i32> %b,
12333// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12334// is equivalent to:
12335// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12336// Finally, we can get:
12337// mov v0.d[0], v1.d[1]
12338static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12339 SDLoc DL(Op);
12340 EVT VT = Op.getValueType();
12341 EVT ScalarVT = VT.getVectorElementType();
12342 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12343 SDValue V0 = Op.getOperand(i: 0);
12344 SDValue V1 = Op.getOperand(i: 1);
12345 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
12346
12347   // When combining adjacent elements, e.g. two i16's -> i32 or two i32's -> i64,
12348   // we need to make sure the wider element type is legal. Thus, ElementSize
12349   // should not be larger than 32 bits, and the i1 type is also excluded.
12350 if (ElementSize > 32 || ElementSize == 1)
12351 return SDValue();
12352
12353 SmallVector<int, 8> NewMask;
12354 if (isWideTypeMask(M: Mask, VT, NewMask)) {
12355 MVT NewEltVT = VT.isFloatingPoint()
12356 ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2)
12357 : MVT::getIntegerVT(BitWidth: ElementSize * 2);
12358 MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2);
12359 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) {
12360 V0 = DAG.getBitcast(VT: NewVT, V: V0);
12361 V1 = DAG.getBitcast(VT: NewVT, V: V1);
12362 return DAG.getBitcast(VT,
12363 V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask));
12364 }
12365 }
12366
12367 return SDValue();
12368}
12369
12370// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
12371static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12372 ArrayRef<int> ShuffleMask,
12373 SelectionDAG &DAG) {
12374 SDValue Tbl1 = Op->getOperand(Num: 0);
12375 SDValue Tbl2 = Op->getOperand(Num: 1);
12376 SDLoc dl(Op);
12377 SDValue Tbl2ID =
12378 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12379
12380 EVT VT = Op.getValueType();
12381 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12382 Tbl1->getOperand(Num: 0) != Tbl2ID ||
12383 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12384 Tbl2->getOperand(Num: 0) != Tbl2ID)
12385 return SDValue();
12386
12387 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12388 Tbl2->getValueType(0) != MVT::v16i8)
12389 return SDValue();
12390
12391 SDValue Mask1 = Tbl1->getOperand(Num: 3);
12392 SDValue Mask2 = Tbl2->getOperand(Num: 3);
12393 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
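  // Each tbl2 indexes its own two 16-byte table registers as bytes 0..31. In
  // the combined tbl4 the second pair of table registers occupies bytes 32..63,
  // so entries taken from Tbl2's mask are rebased by adding 32.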
12394 for (unsigned I = 0; I < 16; I++) {
12395 if (ShuffleMask[I] < 16)
12396 TBLMaskParts[I] = Mask1->getOperand(Num: ShuffleMask[I]);
12397 else {
12398 auto *C =
12399 dyn_cast<ConstantSDNode>(Val: Mask2->getOperand(Num: ShuffleMask[I] - 16));
12400 if (!C)
12401 return SDValue();
12402 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12403 }
12404 }
12405
12406 SDValue TBLMask = DAG.getBuildVector(VT, DL: dl, Ops: TBLMaskParts);
12407 SDValue ID =
12408 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12409
12410 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12411 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12412 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12413}
12414
12415// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12416// but we don't have an appropriate instruction,
12417// so custom-lower it as ZIP1-with-zeros.
12418SDValue
12419AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12420 SelectionDAG &DAG) const {
12421 SDLoc dl(Op);
12422 EVT VT = Op.getValueType();
12423 SDValue SrcOp = Op.getOperand(i: 0);
12424 EVT SrcVT = SrcOp.getValueType();
12425 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12426 "Unexpected extension factor.");
12427 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12428 // FIXME: support multi-step zipping?
12429 if (Scale != 2)
12430 return SDValue();
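  // A single ZIP1 with a zero vector interleaves each source element with a
  // zero element; viewed through the bitcast below this is (at least for the
  // little-endian case) a 2x zero-extension of the low half, e.g.
  //   (v4i32 zero_extend_vector_inreg (v8i16 X)) --> bitcast v4i32 (zip1 X, zeroes)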
12431 SDValue Zeros = DAG.getConstant(Val: 0, DL: dl, VT: SrcVT);
12432 return DAG.getBitcast(VT,
12433 V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: SrcVT, N1: SrcOp, N2: Zeros));
12434}
12435
12436SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12437 SelectionDAG &DAG) const {
12438 SDLoc dl(Op);
12439 EVT VT = Op.getValueType();
12440
12441 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
12442
12443 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
12444 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12445
12446 // Convert shuffles that are directly supported on NEON to target-specific
12447 // DAG nodes, instead of keeping them as shuffles and matching them again
12448 // during code selection. This is more efficient and avoids the possibility
12449 // of inconsistencies between legalization and selection.
12450 ArrayRef<int> ShuffleMask = SVN->getMask();
12451
12452 SDValue V1 = Op.getOperand(i: 0);
12453 SDValue V2 = Op.getOperand(i: 1);
12454
12455 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12456 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12457 "Unexpected VECTOR_SHUFFLE mask size!");
12458
12459 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12460 return Res;
12461
12462 if (SVN->isSplat()) {
12463 int Lane = SVN->getSplatIndex();
12464     // If this is an undef splat, generate it via "just" vdup, if possible.
12465 if (Lane == -1)
12466 Lane = 0;
12467
12468 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12469 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: V1.getValueType(),
12470 Operand: V1.getOperand(i: 0));
12471 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12472 // constant. If so, we can just reference the lane's definition directly.
12473 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12474 !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane)))
12475 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: V1.getOperand(i: Lane));
12476
12477 // Otherwise, duplicate from the lane of the input vector.
12478 unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType());
12479 return constructDup(V: V1, Lane, dl, VT, Opcode, DAG);
12480 }
12481
12482 // Check if the mask matches a DUP for a wider element
12483 for (unsigned LaneSize : {64U, 32U, 16U}) {
12484 unsigned Lane = 0;
12485 if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) {
12486 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12487 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12488 : AArch64ISD::DUPLANE16;
12489 // Cast V1 to an integer vector with required lane size
12490 MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize);
12491 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12492 MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount);
12493 V1 = DAG.getBitcast(VT: NewVecTy, V: V1);
12494       // Construct the DUP instruction
12495 V1 = constructDup(V: V1, Lane, dl, VT: NewVecTy, Opcode, DAG);
12496 // Cast back to the original type
12497 return DAG.getBitcast(VT, V: V1);
12498 }
12499 }
12500
12501 if (isREVMask(M: ShuffleMask, VT, BlockSize: 64))
12502 return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12503 if (isREVMask(M: ShuffleMask, VT, BlockSize: 32))
12504 return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12505 if (isREVMask(M: ShuffleMask, VT, BlockSize: 16))
12506 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12507
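  // A whole-vector reversal of 8-bit or 16-bit elements can be done as REV64
  // (reverse within each 64-bit half) followed by EXT #8 to swap the two halves.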
12508 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
12509 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
12510 ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) {
12511 SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: V1);
12512 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12513 DAG.getConstant(8, dl, MVT::i32));
12514 }
12515
12516 bool ReverseEXT = false;
12517 unsigned Imm;
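  // Note that isEXTMask reports the rotation in elements, while the EXT
  // immediate below is a byte count, hence the scaling by getExtFactor (the
  // element size in bytes).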
12518 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) {
12519 if (ReverseEXT)
12520 std::swap(a&: V1, b&: V2);
12521 Imm *= getExtFactor(V&: V1);
12522 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12523 DAG.getConstant(Imm, dl, MVT::i32));
12524 } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) {
12525 Imm *= getExtFactor(V&: V1);
12526 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12527 DAG.getConstant(Imm, dl, MVT::i32));
12528 }
12529
12530 unsigned WhichResult;
12531 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
12532 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12533 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12534 }
12535 if (isUZPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
12536 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12537 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12538 }
12539 if (isTRNMask(M: ShuffleMask, VT, WhichResult)) {
12540 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12541 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2);
12542 }
12543
12544 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12545 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12546 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12547 }
12548 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12549 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12550 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12551 }
12552 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
12553 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12554 return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1);
12555 }
12556
12557 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12558 return Concat;
12559
12560 bool DstIsLeft;
12561 int Anomaly;
12562 int NumInputElements = V1.getValueType().getVectorNumElements();
12563 if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12564 SDValue DstVec = DstIsLeft ? V1 : V2;
12565 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12566
12567 SDValue SrcVec = V1;
12568 int SrcLane = ShuffleMask[Anomaly];
12569 if (SrcLane >= NumInputElements) {
12570 SrcVec = V2;
12571 SrcLane -= VT.getVectorNumElements();
12572 }
12573 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12574
12575 EVT ScalarVT = VT.getVectorElementType();
12576
12577 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12578 ScalarVT = MVT::i32;
12579
12580 return DAG.getNode(
12581 Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: DstVec,
12582 N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV),
12583 N3: DstLaneV);
12584 }
12585
12586 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12587 return NewSD;
12588
12589 // If the shuffle is not directly supported and it has 4 elements, use
12590 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12591 unsigned NumElts = VT.getVectorNumElements();
12592 if (NumElts == 4) {
12593 unsigned PFIndexes[4];
12594 for (unsigned i = 0; i != 4; ++i) {
12595 if (ShuffleMask[i] < 0)
12596 PFIndexes[i] = 8;
12597 else
12598 PFIndexes[i] = ShuffleMask[i];
12599 }
12600
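    // Each PFIndex is a base-9 digit: 0-7 select a source lane and 8 marks an
    // undef lane, matching the encoding used by PerfectShuffleTable.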
12601 // Compute the index in the perfect shuffle table.
12602 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12603 PFIndexes[2] * 9 + PFIndexes[3];
12604 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12605 return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG,
12606 dl);
12607 }
12608
12609 return GenerateTBL(Op, ShuffleMask, DAG);
12610}
12611
12612SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12613 SelectionDAG &DAG) const {
12614 EVT VT = Op.getValueType();
12615
12616 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
12617 return LowerToScalableOp(Op, DAG);
12618
12619 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12620 "Unexpected vector type!");
12621
12622 // We can handle the constant cases during isel.
12623 if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0)))
12624 return Op;
12625
12626 // There isn't a natural way to handle the general i1 case, so we use some
12627 // trickery with whilelo.
12628 SDLoc DL(Op);
12629 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12630 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12631 DAG.getValueType(MVT::i1));
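  // Sign-extending the i1 yields 0 or all-ones. whilelo is an unsigned compare,
  // so whilelo(0, 0) produces an all-false predicate while whilelo(0, -1)
  // (i.e. against UINT64_MAX) produces an all-true predicate.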
12632 SDValue ID =
12633 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12634 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12635 if (VT == MVT::nxv1i1)
12636 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12637 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12638 Zero, SplatVal),
12639 Zero);
12640 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal);
12641}
12642
12643SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12644 SelectionDAG &DAG) const {
12645 SDLoc DL(Op);
12646
12647 EVT VT = Op.getValueType();
12648 if (!isTypeLegal(VT) || !VT.isScalableVector())
12649 return SDValue();
12650
12651 // Current lowering only supports the SVE-ACLE types.
12652 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12653 return SDValue();
12654
12655   // The DUPQ operation is independent of element type so normalise to i64s.
12656 SDValue Idx128 = Op.getOperand(i: 2);
12657
12658 // DUPQ can be used when idx is in range.
12659 auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128);
12660 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12661 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12662 return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI);
12663 }
12664
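  // Otherwise the index is not a constant in range for DUPLANE128, so fall back
  // to an explicit TBL following the ACLE recipe below.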
12665 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12666
12667 // The ACLE says this must produce the same result as:
12668 // svtbl(data, svadd_x(svptrue_b64(),
12669 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12670 // index * 2))
12671 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12672 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12673
12674 // create the vector 0,1,0,1,...
12675 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12676 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12677
12678 // create the vector idx64,idx64+1,idx64,idx64+1,...
12679 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12680 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12681 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12682
12683 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12684 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12685 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL);
12686}
12687
12688
12689static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12690 APInt &UndefBits) {
12691 EVT VT = BVN->getValueType(ResNo: 0);
12692 APInt SplatBits, SplatUndef;
12693 unsigned SplatBitSize;
12694 bool HasAnyUndefs;
12695 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12696 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12697
12698 for (unsigned i = 0; i < NumSplats; ++i) {
12699 CnstBits <<= SplatBitSize;
12700 UndefBits <<= SplatBitSize;
12701 CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits());
12702 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits());
12703 }
12704
12705 return true;
12706 }
12707
12708 return false;
12709}
12710
12711// Try 64-bit splatted SIMD immediate.
12712static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12713 const APInt &Bits) {
12714 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12715 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12716 EVT VT = Op.getValueType();
12717 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12718
12719 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) {
12720 Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value);
12721
12722 SDLoc dl(Op);
12723 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12724 DAG.getConstant(Value, dl, MVT::i32));
12725 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12726 }
12727 }
12728
12729 return SDValue();
12730}
12731
12732// Try 32-bit splatted SIMD immediate.
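// When LHS is provided, the immediate is combined with *LHS through a
// two-operand node (e.g. ORRi/BICi) rather than materialized on its own.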
12733static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12734 const APInt &Bits,
12735 const SDValue *LHS = nullptr) {
12736 EVT VT = Op.getValueType();
12737 if (VT.isFixedLengthVector() &&
12738 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12739 return SDValue();
12740
12741 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12742 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12743 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12744 bool isAdvSIMDModImm = false;
12745 uint64_t Shift;
12746
12747 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) {
12748 Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value);
12749 Shift = 0;
12750 }
12751 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) {
12752 Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value);
12753 Shift = 8;
12754 }
12755 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) {
12756 Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value);
12757 Shift = 16;
12758 }
12759 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) {
12760 Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value);
12761 Shift = 24;
12762 }
12763
12764 if (isAdvSIMDModImm) {
12765 SDLoc dl(Op);
12766 SDValue Mov;
12767
12768 if (LHS)
12769 Mov = DAG.getNode(NewOp, dl, MovTy,
12770 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12771 DAG.getConstant(Value, dl, MVT::i32),
12772 DAG.getConstant(Shift, dl, MVT::i32));
12773 else
12774 Mov = DAG.getNode(NewOp, dl, MovTy,
12775 DAG.getConstant(Value, dl, MVT::i32),
12776 DAG.getConstant(Shift, dl, MVT::i32));
12777
12778 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12779 }
12780 }
12781
12782 return SDValue();
12783}
12784
12785// Try 16-bit splatted SIMD immediate.
12786static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12787 const APInt &Bits,
12788 const SDValue *LHS = nullptr) {
12789 EVT VT = Op.getValueType();
12790 if (VT.isFixedLengthVector() &&
12791 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12792 return SDValue();
12793
12794 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12795 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12796 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12797 bool isAdvSIMDModImm = false;
12798 uint64_t Shift;
12799
12800 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) {
12801 Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value);
12802 Shift = 0;
12803 }
12804 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) {
12805 Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value);
12806 Shift = 8;
12807 }
12808
12809 if (isAdvSIMDModImm) {
12810 SDLoc dl(Op);
12811 SDValue Mov;
12812
12813 if (LHS)
12814 Mov = DAG.getNode(NewOp, dl, MovTy,
12815 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12816 DAG.getConstant(Value, dl, MVT::i32),
12817 DAG.getConstant(Shift, dl, MVT::i32));
12818 else
12819 Mov = DAG.getNode(NewOp, dl, MovTy,
12820 DAG.getConstant(Value, dl, MVT::i32),
12821 DAG.getConstant(Shift, dl, MVT::i32));
12822
12823 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12824 }
12825 }
12826
12827 return SDValue();
12828}
12829
12830// Try 32-bit splatted SIMD immediate with shifted ones.
12831static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12832 SelectionDAG &DAG, const APInt &Bits) {
12833 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12834 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12835 EVT VT = Op.getValueType();
12836 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12837 bool isAdvSIMDModImm = false;
12838 uint64_t Shift;
12839
12840 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) {
12841 Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value);
12842 Shift = 264;
12843 }
12844 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) {
12845 Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value);
12846 Shift = 272;
12847 }
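    // Shift values of 264 and 272 are shifter immediates that should correspond
    // to "MSL #8" and "MSL #16" (MSL shift type encoded as 4, i.e.
    // (4 << 6) | amount).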
12848
12849 if (isAdvSIMDModImm) {
12850 SDLoc dl(Op);
12851 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12852 DAG.getConstant(Value, dl, MVT::i32),
12853 DAG.getConstant(Shift, dl, MVT::i32));
12854 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12855 }
12856 }
12857
12858 return SDValue();
12859}
12860
12861// Try 8-bit splatted SIMD immediate.
12862static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12863 const APInt &Bits) {
12864 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12865 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12866 EVT VT = Op.getValueType();
12867 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12868
12869 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) {
12870 Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value);
12871
12872 SDLoc dl(Op);
12873 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12874 DAG.getConstant(Value, dl, MVT::i32));
12875 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12876 }
12877 }
12878
12879 return SDValue();
12880}
12881
12882// Try FP splatted SIMD immediate.
12883static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12884 const APInt &Bits) {
12885 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
12886 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
12887 EVT VT = Op.getValueType();
12888 bool isWide = (VT.getSizeInBits() == 128);
12889 MVT MovTy;
12890 bool isAdvSIMDModImm = false;
12891
12892 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) {
12893 Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value);
12894 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12895 }
12896 else if (isWide &&
12897 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) {
12898 Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value);
12899 MovTy = MVT::v2f64;
12900 }
12901
12902 if (isAdvSIMDModImm) {
12903 SDLoc dl(Op);
12904 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12905 DAG.getConstant(Value, dl, MVT::i32));
12906 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov);
12907 }
12908 }
12909
12910 return SDValue();
12911}
12912
12913 // Specialized code to quickly find if PotentialBVec is a BuildVector that
12914 // consists of only the same constant int value, returned in the reference
12915 // arg ConstVal.
12916static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12917 uint64_t &ConstVal) {
12918 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec);
12919 if (!Bvec)
12920 return false;
12921 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0));
12922 if (!FirstElt)
12923 return false;
12924 EVT VT = Bvec->getValueType(ResNo: 0);
12925 unsigned NumElts = VT.getVectorNumElements();
12926 for (unsigned i = 1; i < NumElts; ++i)
12927 if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt)
12928 return false;
12929 ConstVal = FirstElt->getZExtValue();
12930 return true;
12931}
12932
12933static bool isAllInactivePredicate(SDValue N) {
12934 // Look through cast.
12935 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
12936 N = N.getOperand(i: 0);
12937
12938 return ISD::isConstantSplatVectorAllZeros(N: N.getNode());
12939}
12940
12941static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
12942 unsigned NumElts = N.getValueType().getVectorMinNumElements();
12943
12944 // Look through cast.
12945 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
12946 N = N.getOperand(i: 0);
12947 // When reinterpreting from a type with fewer elements the "new" elements
12948 // are not active, so bail if they're likely to be used.
12949 if (N.getValueType().getVectorMinNumElements() < NumElts)
12950 return false;
12951 }
12952
12953 if (ISD::isConstantSplatVectorAllOnes(N: N.getNode()))
12954 return true;
12955
12956 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
12957 // or smaller than the implicit element type represented by N.
12958 // NOTE: A larger element count implies a smaller element type.
12959 if (N.getOpcode() == AArch64ISD::PTRUE &&
12960 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
12961 return N.getValueType().getVectorMinNumElements() >= NumElts;
12962
12963 // If we're compiling for a specific vector-length, we can check if the
12964 // pattern's VL equals that of the scalable vector at runtime.
12965 if (N.getOpcode() == AArch64ISD::PTRUE) {
12966 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12967 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
12968 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
12969 if (MaxSVESize && MinSVESize == MaxSVESize) {
12970 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
12971 unsigned PatNumElts =
12972 getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0));
12973 return PatNumElts == (NumElts * VScale);
12974 }
12975 }
12976
12977 return false;
12978}
12979
12980// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
12981// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
12982 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
12983// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
12984// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
12985// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
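// For example, with v8i16 elements:
//   (or (and X, <0x00ff,...>), (AArch64ISD::VSHL Y, 8)) --> (AArch64ISD::VSLI X, Y, 8)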
12986static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
12987 EVT VT = N->getValueType(ResNo: 0);
12988
12989 if (!VT.isVector())
12990 return SDValue();
12991
12992 SDLoc DL(N);
12993
12994 SDValue And;
12995 SDValue Shift;
12996
12997 SDValue FirstOp = N->getOperand(Num: 0);
12998 unsigned FirstOpc = FirstOp.getOpcode();
12999 SDValue SecondOp = N->getOperand(Num: 1);
13000 unsigned SecondOpc = SecondOp.getOpcode();
13001
13002 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13003 // a BICi in order to use an immediate instead of a register.
13004 // Is the other operand an shl or lshr? This will have been turned into:
13005 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13006 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13007 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13008 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13009 SecondOpc == AArch64ISD::SHL_PRED ||
13010 SecondOpc == AArch64ISD::SRL_PRED)) {
13011 And = FirstOp;
13012 Shift = SecondOp;
13013
13014 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13015 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13016 FirstOpc == AArch64ISD::SHL_PRED ||
13017 FirstOpc == AArch64ISD::SRL_PRED)) {
13018 And = SecondOp;
13019 Shift = FirstOp;
13020 } else
13021 return SDValue();
13022
13023 bool IsAnd = And.getOpcode() == ISD::AND;
13024 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13025 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13026 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13027 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13028
13029 // Is the shift amount constant and are all lanes active?
13030 uint64_t C2;
13031 if (ShiftHasPredOp) {
13032 if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0)))
13033 return SDValue();
13034 APInt C;
13035 if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C))
13036 return SDValue();
13037 C2 = C.getZExtValue();
13038 } else if (ConstantSDNode *C2node =
13039 dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
13040 C2 = C2node->getZExtValue();
13041 else
13042 return SDValue();
13043
13044 APInt C1AsAPInt;
13045 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13046 if (IsAnd) {
13047 // Is the and mask vector all constant?
13048 if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt))
13049 return SDValue();
13050 } else {
13051 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13052 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1));
13053 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2));
13054 assert(C1nodeImm && C1nodeShift);
13055 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13056 C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits);
13057 }
13058
13059 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13060 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13061 // how much one can shift elements of a particular size?
13062 if (C2 > ElemSizeInBits)
13063 return SDValue();
13064
13065 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2)
13066 : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2);
13067 if (C1AsAPInt != RequiredC1)
13068 return SDValue();
13069
13070 SDValue X = And.getOperand(i: 0);
13071 SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0);
13072 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13073 : Shift.getOperand(1);
13074
13075 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13076 SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm);
13077
13078 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13079 LLVM_DEBUG(N->dump(&DAG));
13080 LLVM_DEBUG(dbgs() << "into: \n");
13081 LLVM_DEBUG(ResultSLI->dump(&DAG));
13082
13083 ++NumShiftInserts;
13084 return ResultSLI;
13085}
13086
13087SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13090 OverrideNEON: !Subtarget->isNeonAvailable()))
13091 return LowerToScalableOp(Op, DAG);
13092
13093 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13094 if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG))
13095 return Res;
13096
13097 EVT VT = Op.getValueType();
13098 if (VT.isScalableVector())
13099 return Op;
13100
13101 SDValue LHS = Op.getOperand(i: 0);
13102 BuildVectorSDNode *BVN =
13103 dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode());
13104 if (!BVN) {
13105 // OR commutes, so try swapping the operands.
13106 LHS = Op.getOperand(i: 1);
13107 BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode());
13108 }
13109 if (!BVN)
13110 return Op;
13111
13112 APInt DefBits(VT.getSizeInBits(), 0);
13113 APInt UndefBits(VT.getSizeInBits(), 0);
13114 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
13115 SDValue NewOp;
13116
13117 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13118 Bits: DefBits, LHS: &LHS)) ||
13119 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13120 Bits: DefBits, LHS: &LHS)))
13121 return NewOp;
13122
13123 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
13124 Bits: UndefBits, LHS: &LHS)) ||
13125 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
13126 Bits: UndefBits, LHS: &LHS)))
13127 return NewOp;
13128 }
13129
13130 // We can always fall back to a non-immediate OR.
13131 return Op;
13132}
13133
13134// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13135// be truncated to fit element width.
13136static SDValue NormalizeBuildVector(SDValue Op,
13137 SelectionDAG &DAG) {
13138 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13139 SDLoc dl(Op);
13140 EVT VT = Op.getValueType();
13141   EVT EltTy = VT.getVectorElementType();
13142
13143 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13144 return Op;
13145
13146 SmallVector<SDValue, 16> Ops;
13147 for (SDValue Lane : Op->ops()) {
13148 // For integer vectors, type legalization would have promoted the
13149 // operands already. Otherwise, if Op is a floating-point splat
13150 // (with operands cast to integers), then the only possibilities
13151 // are constants and UNDEFs.
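    // e.g. an i8 lane holding the i32 constant 0x1ff is rebuilt as the i32
    // constant 0xff, so only the bits that fit the element type survive.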
13152 if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) {
13153 APInt LowBits(EltTy.getSizeInBits(),
13154 CstLane->getZExtValue());
13155 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13156 } else if (Lane.getNode()->isUndef()) {
13157 Lane = DAG.getUNDEF(MVT::i32);
13158 } else {
13159 assert(Lane.getValueType() == MVT::i32 &&
13160 "Unexpected BUILD_VECTOR operand type");
13161 }
13162 Ops.push_back(Elt: Lane);
13163 }
13164 return DAG.getBuildVector(VT, DL: dl, Ops);
13165}
13166
13167static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13168 const AArch64Subtarget *ST) {
13169 EVT VT = Op.getValueType();
13170 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13171 "Expected a legal NEON vector");
13172
13173 APInt DefBits(VT.getSizeInBits(), 0);
13174 APInt UndefBits(VT.getSizeInBits(), 0);
13175 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13176 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
13177 auto TryMOVIWithBits = [&](APInt DefBits) {
13178 SDValue NewOp;
13179 if ((NewOp =
13180 tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) ||
13181 (NewOp =
13182 tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13183 (NewOp =
13184 tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) ||
13185 (NewOp =
13186 tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
13187 (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) ||
13188 (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits)))
13189 return NewOp;
13190
13191 APInt NotDefBits = ~DefBits;
13192 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG,
13193 Bits: NotDefBits)) ||
13194 (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG,
13195 Bits: NotDefBits)) ||
13196 (NewOp =
13197 tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits)))
13198 return NewOp;
13199 return SDValue();
13200 };
13201 if (SDValue R = TryMOVIWithBits(DefBits))
13202 return R;
13203 if (SDValue R = TryMOVIWithBits(UndefBits))
13204 return R;
13205
13206 // See if a fneg of the constant can be materialized with a MOVI, etc
13207 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13208 // FNegate each sub-element of the constant
13209 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13210 APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1)
13211 .zext(width: VT.getSizeInBits());
13212 APInt NegBits(VT.getSizeInBits(), 0);
13213 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13214 for (unsigned i = 0; i < NumElts; i++)
13215 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13216 NegBits = DefBits ^ NegBits;
13217
13218 // Try to create the new constants with MOVI, and if so generate a fneg
13219 // for it.
13220 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13221 SDLoc DL(Op);
13222 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts);
13223 return DAG.getNode(
13224 Opcode: AArch64ISD::NVCAST, DL, VT,
13225 Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT,
13226 Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp)));
13227 }
13228 return SDValue();
13229 };
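    // e.g. a v2f64 splat of -0.0 (sign bits only) XORs to an all-zero NegBits,
    // which MOVI materializes trivially; the FNEG then restores the original.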
13230 SDValue R;
13231 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13232 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13233 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13234 return R;
13235 }
13236
13237 return SDValue();
13238}
13239
13240SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13241 SelectionDAG &DAG) const {
13242 EVT VT = Op.getValueType();
13243
13244 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
13245 if (auto SeqInfo = cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence()) {
13246 SDLoc DL(Op);
13247 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13248 SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT);
13249 SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second);
13250 SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps);
13251 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Seq);
13252 }
13253
13254 // Revert to common legalisation for all other variants.
13255 return SDValue();
13256 }
13257
13258 // Try to build a simple constant vector.
13259 Op = NormalizeBuildVector(Op, DAG);
13260   // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13261   // abort.
13262 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13263 return SDValue();
13264
13265 // Certain vector constants, used to express things like logical NOT and
13266 // arithmetic NEG, are passed through unmodified. This allows special
13267 // patterns for these operations to match, which will lower these constants
13268 // to whatever is proven necessary.
13269 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
13270 if (BVN->isConstant()) {
13271 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13272 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13273 APInt Val(BitSize,
13274 Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue());
13275 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13276 return Op;
13277 }
13278 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13279 if (Const->isZero() && !Const->isNegative())
13280 return Op;
13281 }
13282
13283 if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget))
13284 return V;
13285
13286 // Scan through the operands to find some interesting properties we can
13287 // exploit:
13288 // 1) If only one value is used, we can use a DUP, or
13289 // 2) if only the low element is not undef, we can just insert that, or
13290 // 3) if only one constant value is used (w/ some non-constant lanes),
13291 // we can splat the constant value into the whole vector then fill
13292 // in the non-constant lanes.
13293 // 4) FIXME: If different constant values are used, but we can intelligently
13294 // select the values we'll be overwriting for the non-constant
13295 // lanes such that we can directly materialize the vector
13296 // some other way (MOVI, e.g.), we can be sneaky.
13297 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13298 SDLoc dl(Op);
13299 unsigned NumElts = VT.getVectorNumElements();
13300 bool isOnlyLowElement = true;
13301 bool usesOnlyOneValue = true;
13302 bool usesOnlyOneConstantValue = true;
13303 bool isConstant = true;
13304 bool AllLanesExtractElt = true;
13305 unsigned NumConstantLanes = 0;
13306 unsigned NumDifferentLanes = 0;
13307 unsigned NumUndefLanes = 0;
13308 SDValue Value;
13309 SDValue ConstantValue;
13310 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13311 unsigned ConsecutiveValCount = 0;
13312 SDValue PrevVal;
13313 for (unsigned i = 0; i < NumElts; ++i) {
13314 SDValue V = Op.getOperand(i);
13315 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13316 AllLanesExtractElt = false;
13317 if (V.isUndef()) {
13318 ++NumUndefLanes;
13319 continue;
13320 }
13321 if (i > 0)
13322 isOnlyLowElement = false;
13323 if (!isIntOrFPConstant(V))
13324 isConstant = false;
13325
13326 if (isIntOrFPConstant(V)) {
13327 ++NumConstantLanes;
13328 if (!ConstantValue.getNode())
13329 ConstantValue = V;
13330 else if (ConstantValue != V)
13331 usesOnlyOneConstantValue = false;
13332 }
13333
13334 if (!Value.getNode())
13335 Value = V;
13336 else if (V != Value) {
13337 usesOnlyOneValue = false;
13338 ++NumDifferentLanes;
13339 }
13340
13341 if (PrevVal != V) {
13342 ConsecutiveValCount = 0;
13343 PrevVal = V;
13344 }
13345
13346     // Keep the different values and their last consecutive counts. For example,
13347 //
13348 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13349 // t24, t24, t24, t24, t24, t24, t24, t24
13350 // t23 = consecutive count 8
13351 // t24 = consecutive count 8
13352 // ------------------------------------------------------------------
13353 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13354 // t24, t24, t24, t24, t24, t24, t24, t24
13355 // t23 = consecutive count 5
13356 // t24 = consecutive count 9
13357 DifferentValueMap[V] = ++ConsecutiveValCount;
13358 }
13359
13360 if (!Value.getNode()) {
13361 LLVM_DEBUG(
13362 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13363 return DAG.getUNDEF(VT);
13364 }
13365
13366 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13367 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13368 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13369 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) {
13370 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13371 "SCALAR_TO_VECTOR node\n");
13372 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Value);
13373 }
13374
13375 if (AllLanesExtractElt) {
13376 SDNode *Vector = nullptr;
13377 bool Even = false;
13378 bool Odd = false;
13379 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13380 // the Odd pattern <1,3,5,...>.
13381 for (unsigned i = 0; i < NumElts; ++i) {
13382 SDValue V = Op.getOperand(i);
13383 const SDNode *N = V.getNode();
13384 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
13385 Even = false;
13386 Odd = false;
13387 break;
13388 }
13389 SDValue N0 = N->getOperand(Num: 0);
13390
13391 // All elements are extracted from the same vector.
13392 if (!Vector) {
13393 Vector = N0.getNode();
13394 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13395 // BUILD_VECTOR.
13396 if (VT.getVectorElementType() !=
13397 N0.getValueType().getVectorElementType())
13398 break;
13399 } else if (Vector != N0.getNode()) {
13400 Odd = false;
13401 Even = false;
13402 break;
13403 }
13404
13405 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13406 // indices <1,3,5,...>.
13407 uint64_t Val = N->getConstantOperandVal(Num: 1);
13408 if (Val == 2 * i) {
13409 Even = true;
13410 continue;
13411 }
13412 if (Val - 1 == 2 * i) {
13413 Odd = true;
13414 continue;
13415 }
13416
13417 // Something does not match: abort.
13418 Odd = false;
13419 Even = false;
13420 break;
13421 }
13422 if (Even || Odd) {
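      // Split the common source vector into its two VT-sized halves; UZP1/UZP2
      // then select the even or odd lanes across the pair.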
13423 SDValue LHS =
13424 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13425 DAG.getConstant(0, dl, MVT::i64));
13426 SDValue RHS =
13427 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13428 DAG.getConstant(NumElts, dl, MVT::i64));
13429
13430 if (Even && !Odd)
13431 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS,
13432 N2: RHS);
13433 if (Odd && !Even)
13434 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS,
13435 N2: RHS);
13436 }
13437 }
13438
13439 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13440 // i32 and try again.
13441 if (usesOnlyOneValue) {
13442 if (!isConstant) {
13443 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13444 Value.getValueType() != VT) {
13445 LLVM_DEBUG(
13446 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13447 return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: Value);
13448 }
13449
13450 // This is actually a DUPLANExx operation, which keeps everything vectory.
13451
13452 SDValue Lane = Value.getOperand(i: 1);
13453 Value = Value.getOperand(i: 0);
13454 if (Value.getValueSizeInBits() == 64) {
13455 LLVM_DEBUG(
13456 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13457 "widening it\n");
13458 Value = WidenVector(V64Reg: Value, DAG);
13459 }
13460
13461 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
13462 return DAG.getNode(Opcode, DL: dl, VT, N1: Value, N2: Lane);
13463 }
13464
13465 if (VT.getVectorElementType().isFloatingPoint()) {
13466 SmallVector<SDValue, 8> Ops;
13467 EVT EltTy = VT.getVectorElementType();
13468 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13469 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13470 LLVM_DEBUG(
13471 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13472 "BITCASTS, and try again\n");
13473 MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits());
13474 for (unsigned i = 0; i < NumElts; ++i)
13475 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: NewType, Operand: Op.getOperand(i)));
13476 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts);
13477 SDValue Val = DAG.getBuildVector(VT: VecVT, DL: dl, Ops);
13478 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13479 Val.dump(););
13480 Val = LowerBUILD_VECTOR(Op: Val, DAG);
13481 if (Val.getNode())
13482 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
13483 }
13484 }
13485
13486 // If we need to insert a small number of different non-constant elements and
13487 // the vector width is sufficiently large, prefer using DUP with the common
13488 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13489 // skip the constant lane handling below.
13490 bool PreferDUPAndInsert =
13491 !isConstant && NumDifferentLanes >= 1 &&
13492 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13493 NumDifferentLanes >= NumConstantLanes;
13494
13495   // If only one constant value was used, and it was used for more than one
13496   // lane, start by splatting that value, then replace the non-constant lanes.
13497   // This is better than the default, which will perform a separate
13498   // initialization for each lane.
13499 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13500 // Firstly, try to materialize the splat constant.
13501 SDValue Val = DAG.getSplatBuildVector(VT, DL: dl, Op: ConstantValue);
13502 unsigned BitSize = VT.getScalarSizeInBits();
13503 APInt ConstantValueAPInt(1, 0);
13504 if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue))
13505 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize);
13506 if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) &&
13507 !ConstantValueAPInt.isAllOnes()) {
13508 Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget);
13509 if (!Val)
13510 // Otherwise, materialize the constant and splat it.
13511 Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: ConstantValue);
13512 }
13513
13514 // Now insert the non-constant lanes.
13515 for (unsigned i = 0; i < NumElts; ++i) {
13516 SDValue V = Op.getOperand(i);
13517 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13518 if (!isIntOrFPConstant(V))
13519 // Note that type legalization likely mucked about with the VT of the
13520 // source operand, so we may have to convert it here before inserting.
13521 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Val, N2: V, N3: LaneIdx);
13522 }
13523 return Val;
13524 }
13525
13526 // This will generate a load from the constant pool.
13527 if (isConstant) {
13528 LLVM_DEBUG(
13529 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13530 "expansion\n");
13531 return SDValue();
13532 }
13533
13534 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13535 // v4i32s. This is really a truncate, which we can construct out of (legal)
13536 // concats and truncate nodes.
13537 if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG))
13538 return M;
13539
13540 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13541 if (NumElts >= 4) {
13542 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13543 return Shuffle;
13544
13545 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13546 return Shuffle;
13547 }
13548
13549 if (PreferDUPAndInsert) {
13550 // First, build a constant vector with the common element.
13551 SmallVector<SDValue, 8> Ops(NumElts, Value);
13552 SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL: dl, Ops), DAG);
13553 // Next, insert the elements that do not match the common value.
13554 for (unsigned I = 0; I < NumElts; ++I)
13555 if (Op.getOperand(I) != Value)
13556 NewVector =
13557 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13558 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13559
13560 return NewVector;
13561 }
13562
13563 // If vector consists of two different values, try to generate two DUPs and
13564 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13565 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13566 SmallVector<SDValue, 2> Vals;
13567 // Check the consecutive count of the value is the half number of vector
13568 // elements. In this case, we can use CONCAT_VECTORS. For example,
13569 //
13570 // canUseVECTOR_CONCAT = true;
13571 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13572 // t24, t24, t24, t24, t24, t24, t24, t24
13573 //
13574 // canUseVECTOR_CONCAT = false;
13575 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13576 // t24, t24, t24, t24, t24, t24, t24, t24
13577 bool canUseVECTOR_CONCAT = true;
13578 for (auto Pair : DifferentValueMap) {
13579 // Check different values have same length which is NumElts / 2.
13580 if (Pair.second != NumElts / 2)
13581 canUseVECTOR_CONCAT = false;
13582 Vals.push_back(Elt: Pair.first);
13583 }
13584
13585 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13586 // CONCAT_VECTORs. For example,
13587 //
13588 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13589 // t24, t24, t24, t24, t24, t24, t24, t24
13590 // ==>
13591 // t26: v8i8 = AArch64ISD::DUP t23
13592 // t28: v8i8 = AArch64ISD::DUP t24
13593 // t29: v16i8 = concat_vectors t26, t28
13594 if (canUseVECTOR_CONCAT) {
13595 EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13596 if (isTypeLegal(VT: SubVT) && SubVT.isVector() &&
13597 SubVT.getVectorNumElements() >= 2) {
13598 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13599 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13600 SDValue DUP1 =
13601 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops1), DAG);
13602 SDValue DUP2 =
13603 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops2), DAG);
13604 SDValue CONCAT_VECTORS =
13605 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: DUP1, N2: DUP2);
13606 return CONCAT_VECTORS;
13607 }
13608 }
13609
13610 // Let's try to generate VECTOR_SHUFFLE. For example,
13611 //
13612 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13613 // ==>
13614 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13615 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13616 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13617 if (NumElts >= 8) {
13618 SmallVector<int, 16> MaskVec;
13619       // Build the mask for VECTOR_SHUFFLE.
13620 SDValue FirstLaneVal = Op.getOperand(i: 0);
13621 for (unsigned i = 0; i < NumElts; ++i) {
13622 SDValue Val = Op.getOperand(i);
13623 if (FirstLaneVal == Val)
13624 MaskVec.push_back(Elt: i);
13625 else
13626 MaskVec.push_back(Elt: i + NumElts);
13627 }
13628
13629 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13630 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13631 SDValue VEC1 = DAG.getBuildVector(VT, DL: dl, Ops: Ops1);
13632 SDValue VEC2 = DAG.getBuildVector(VT, DL: dl, Ops: Ops2);
13633 SDValue VECTOR_SHUFFLE =
13634 DAG.getVectorShuffle(VT, dl, N1: VEC1, N2: VEC2, Mask: MaskVec);
13635 return VECTOR_SHUFFLE;
13636 }
13637 }
13638
13639  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13640  // know the default expansion would otherwise fall back on something even
13641  // worse. For a vector with one or two non-undef values the default is
13642  // scalar_to_vector for the elements followed by a shuffle (provided the
13643  // shuffle is valid for the target); for everything else it is element-by-
13644  // element materialization on the stack followed by a load.
13645 if (!isConstant && !usesOnlyOneValue) {
13646 LLVM_DEBUG(
13647 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13648 "of INSERT_VECTOR_ELT\n");
13649
13650 SDValue Vec = DAG.getUNDEF(VT);
13651 SDValue Op0 = Op.getOperand(i: 0);
13652 unsigned i = 0;
13653
13654 // Use SCALAR_TO_VECTOR for lane zero to
13655 // a) Avoid a RMW dependency on the full vector register, and
13656 // b) Allow the register coalescer to fold away the copy if the
13657 // value is already in an S or D register, and we're forced to emit an
13658 // INSERT_SUBREG that we can't fold anywhere.
13659 //
13660 // We also allow types like i8 and i16 which are illegal scalar but legal
13661 // vector element types. After type-legalization the inserted value is
13662 // extended (i32) and it is safe to cast them to the vector type by ignoring
13663 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13664 if (!Op0.isUndef()) {
13665 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13666 Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Op0);
13667 ++i;
13668 }
13669 LLVM_DEBUG(if (i < NumElts) dbgs()
13670 << "Creating nodes for the other vector elements:\n";);
13671 for (; i < NumElts; ++i) {
13672 SDValue V = Op.getOperand(i);
13673 if (V.isUndef())
13674 continue;
13675 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13676 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Vec, N2: V, N3: LaneIdx);
13677 }
13678 return Vec;
13679 }
13680
13681 LLVM_DEBUG(
13682 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13683 "better alternative\n");
13684 return SDValue();
13685}
13686
13687SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13688 SelectionDAG &DAG) const {
13689 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13690 OverrideNEON: !Subtarget->isNeonAvailable()))
13691 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13692
13693 assert(Op.getValueType().isScalableVector() &&
13694 isTypeLegal(Op.getValueType()) &&
13695 "Expected legal scalable vector type!");
13696
13697 if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) {
13698 unsigned NumOperands = Op->getNumOperands();
13699 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13700 "Unexpected number of operands in CONCAT_VECTORS");
13701
13702 if (NumOperands == 2)
13703 return Op;
13704
13705 // Concat each pair of subvectors and pack into the lower half of the array.
13706 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13707 while (ConcatOps.size() > 1) {
13708 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13709 SDValue V1 = ConcatOps[I];
13710 SDValue V2 = ConcatOps[I + 1];
13711 EVT SubVT = V1.getValueType();
13712 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
13713 ConcatOps[I / 2] =
13714 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2);
13715 }
13716 ConcatOps.resize(N: ConcatOps.size() / 2);
13717 }
13718 return ConcatOps[0];
13719 }
13720
13721 return SDValue();
13722}
13723
13724SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13725 SelectionDAG &DAG) const {
13726 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13727
13728 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
13729 OverrideNEON: !Subtarget->isNeonAvailable()))
13730 return LowerFixedLengthInsertVectorElt(Op, DAG);
13731
13732 EVT VT = Op.getOperand(i: 0).getValueType();
13733
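  // Inserting into a predicate (i1) vector is done on its promoted integer
  // counterpart and the result is truncated back to the predicate type;
  // illustratively, an insert into nxv16i1 is performed on nxv16i8 (the exact
  // promoted type comes from getPromotedVTForPredicate).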
13734 if (VT.getScalarType() == MVT::i1) {
13735 EVT VectorVT = getPromotedVTForPredicate(VT);
13736 SDLoc DL(Op);
13737 SDValue ExtendedVector =
13738 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT);
13739 SDValue ExtendedValue =
13740 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13741 VectorVT.getScalarType().getSizeInBits() < 32
13742 ? MVT::i32
13743 : VectorVT.getScalarType());
13744 ExtendedVector =
13745 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector,
13746 N2: ExtendedValue, N3: Op.getOperand(i: 2));
13747 return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT);
13748 }
13749
13750 // Check for non-constant or out of range lane.
13751 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
13752 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13753 return SDValue();
13754
13755 return Op;
13756}
13757
13758SDValue
13759AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13760 SelectionDAG &DAG) const {
13761 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13762 EVT VT = Op.getOperand(i: 0).getValueType();
13763
13764 if (VT.getScalarType() == MVT::i1) {
13765 // We can't directly extract from an SVE predicate; extend it first.
13766 // (This isn't the only possible lowering, but it's straightforward.)
13767 EVT VectorVT = getPromotedVTForPredicate(VT);
13768 SDLoc DL(Op);
13769 SDValue Extend =
13770 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0));
13771 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13772 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
13773 N1: Extend, N2: Op.getOperand(i: 1));
13774 return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType());
13775 }
13776
13777 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13778 return LowerFixedLengthExtractVectorElt(Op, DAG);
13779
13780 // Check for non-constant or out of range lane.
13781 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
13782 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13783 return SDValue();
13784
13785 // Insertion/extraction are legal for V128 types.
13786 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13787 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13788 VT == MVT::v8f16 || VT == MVT::v8bf16)
13789 return Op;
13790
13791 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13792 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13793 VT != MVT::v4bf16)
13794 return SDValue();
13795
13796  // For V64 types, we perform extraction by expanding the value
13797  // to a V128 type and performing the extraction on that.
13798 SDLoc DL(Op);
13799 SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG);
13800 EVT WideTy = WideVec.getValueType();
13801
13802 EVT ExtrTy = WideTy.getVectorElementType();
13803 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13804 ExtrTy = MVT::i32;
13805
13806 // For extractions, we just return the result directly.
13807 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec,
13808 N2: Op.getOperand(i: 1));
13809}
13810
13811SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13812 SelectionDAG &DAG) const {
13813 assert(Op.getValueType().isFixedLengthVector() &&
13814 "Only cases that extract a fixed length vector are supported!");
13815
13816 EVT InVT = Op.getOperand(i: 0).getValueType();
13817 unsigned Idx = Op.getConstantOperandVal(i: 1);
13818 unsigned Size = Op.getValueSizeInBits();
13819
13820 // If we don't have legal types yet, do nothing
13821 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: InVT))
13822 return SDValue();
13823
13824 if (InVT.isScalableVector()) {
13825 // This will be matched by custom code during ISelDAGToDAG.
13826 if (Idx == 0 && isPackedVectorType(VT: InVT, DAG))
13827 return Op;
13828
13829 return SDValue();
13830 }
13831
13832 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13833 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13834 return Op;
13835
13836 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13837 // that directly.
13838 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13839 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13840 return Op;
13841
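  // If the source is a fixed-length vector that is lowered via SVE, rotate the
  // scalable container with a VECTOR_SPLICE of the vector with itself so the
  // requested subvector starts at lane 0, then convert back to the
  // fixed-length result type.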
13842 if (useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) {
13843 SDLoc DL(Op);
13844
13845 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
13846 SDValue NewInVec =
13847 convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
13848
13849 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13850 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13851 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Splice);
13852 }
13853
13854 return SDValue();
13855}
13856
13857SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13858 SelectionDAG &DAG) const {
13859 assert(Op.getValueType().isScalableVector() &&
13860 "Only expect to lower inserts into scalable vectors!");
13861
13862 EVT InVT = Op.getOperand(i: 1).getValueType();
13863 unsigned Idx = Op.getConstantOperandVal(i: 2);
13864
13865 SDValue Vec0 = Op.getOperand(i: 0);
13866 SDValue Vec1 = Op.getOperand(i: 1);
13867 SDLoc DL(Op);
13868 EVT VT = Op.getValueType();
13869
13870 if (InVT.isScalableVector()) {
13871 if (!isTypeLegal(VT))
13872 return SDValue();
13873
13874 // Break down insert_subvector into simpler parts.
13875 if (VT.getVectorElementType() == MVT::i1) {
13876 unsigned NumElts = VT.getVectorMinNumElements();
13877 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13878
13879 SDValue Lo, Hi;
13880 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
13881 N2: DAG.getVectorIdxConstant(Val: 0, DL));
13882 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
13883 N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL));
13884 if (Idx < (NumElts / 2))
13885 Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1,
13886 N3: DAG.getVectorIdxConstant(Val: Idx, DL));
13887 else
13888 Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1,
13889 N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL));
13890
13891 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi);
13892 }
13893
13894 // Ensure the subvector is half the size of the main vector.
13895 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13896 return SDValue();
13897
13898    // Here narrow and wide refer to the vector element types. After "casting",
13899    // both vectors must have the same bit length, so because the subvector
13900    // has fewer elements, those elements need to be bigger.
13901 EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount());
13902 EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount());
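    // For example, when inserting an nxv2f32 subvector into nxv4f32, the
    // packed types are NarrowVT = nxv4i32 and WideVT = nxv2i64.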
13903
13904 // NOP cast operands to the largest legal vector of the same element count.
13905 if (VT.isFloatingPoint()) {
13906 Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG);
13907 Vec1 = getSVESafeBitCast(VT: WideVT, Op: Vec1, DAG);
13908 } else {
13909 // Legal integer vectors are already their largest so Vec0 is fine as is.
13910 Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1);
13911 }
13912
13913 // To replace the top/bottom half of vector V with vector SubV we widen the
13914 // preserved half of V, concatenate this to SubV (the order depending on the
13915 // half being replaced) and then narrow the result.
13916 SDValue Narrow;
13917 if (Idx == 0) {
13918 SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0);
13919 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0);
13920 } else {
13921 assert(Idx == InVT.getVectorMinNumElements() &&
13922 "Invalid subvector index!");
13923 SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0);
13924 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1);
13925 }
13926
13927 return getSVESafeBitCast(VT, Op: Narrow, DAG);
13928 }
13929
13930 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
13931 // This will be matched by custom code during ISelDAGToDAG.
13932 if (Vec0.isUndef())
13933 return Op;
13934
13935 std::optional<unsigned> PredPattern =
13936 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
13937 auto PredTy = VT.changeVectorElementType(MVT::i1);
13938 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
13939 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1);
13940 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0);
13941 }
13942
13943 return SDValue();
13944}
13945
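// Returns true if Op is a splat of a power of two, optionally negated; e.g. a
// splat of 16 gives SplatVal = 16 with Negated = false, and a splat of -16
// gives SplatVal = 16 with Negated = true.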
13946static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
13947 if (Op.getOpcode() != AArch64ISD::DUP &&
13948 Op.getOpcode() != ISD::SPLAT_VECTOR &&
13949 Op.getOpcode() != ISD::BUILD_VECTOR)
13950 return false;
13951
13952 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
13953 !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal))
13954 return false;
13955
13956 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
13957 !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0)))
13958 return false;
13959
13960 SplatVal = Op->getConstantOperandVal(Num: 0);
13961 if (Op.getValueType().getVectorElementType() != MVT::i64)
13962 SplatVal = (int32_t)SplatVal;
13963
13964 Negated = false;
13965 if (isPowerOf2_64(Value: SplatVal))
13966 return true;
13967
13968 Negated = true;
13969 if (isPowerOf2_64(Value: -SplatVal)) {
13970 SplatVal = -SplatVal;
13971 return true;
13972 }
13973
13974 return false;
13975}
13976
13977SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
13978 EVT VT = Op.getValueType();
13979 SDLoc dl(Op);
13980
13981 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
13982 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
13983
13984 assert(VT.isScalableVector() && "Expected a scalable vector.");
13985
13986 bool Signed = Op.getOpcode() == ISD::SDIV;
13987 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
13988
13989 bool Negated;
13990 uint64_t SplatVal;
13991 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
13992 SDValue Pg = getPredicateForScalableVector(DAG, DL&: dl, VT);
13993 SDValue Res =
13994 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
13995 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
13996 if (Negated)
13997 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: DAG.getConstant(Val: 0, DL: dl, VT), N2: Res);
13998
13999 return Res;
14000 }
14001
14002 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14003 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
14004
14005 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14006 // operations, and truncate the result.
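  // For example, an nxv8i16 division is unpacked into two nxv4i32 halves, each
  // half is divided as a predicated operation, and the results are repacked
  // with UZP1 (nxv16i8 goes through this widening twice).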
14007 EVT WidenedVT;
14008 if (VT == MVT::nxv16i8)
14009 WidenedVT = MVT::nxv8i16;
14010 else if (VT == MVT::nxv8i16)
14011 WidenedVT = MVT::nxv4i32;
14012 else
14013 llvm_unreachable("Unexpected Custom DIV operation");
14014
14015 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14016 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14017 SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14018 SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14019 SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0));
14020 SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1));
14021 SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo);
14022 SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi);
14023 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: ResultLo, N2: ResultHi);
14024}
14025
14026bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14027 // Currently no fixed length shuffles that require SVE are legal.
14028 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14029 return false;
14030
14031 if (VT.getVectorNumElements() == 4 &&
14032 (VT.is128BitVector() || VT.is64BitVector())) {
14033 unsigned Cost = getPerfectShuffleCost(M);
14034 if (Cost <= 1)
14035 return true;
14036 }
14037
14038 bool DummyBool;
14039 int DummyInt;
14040 unsigned DummyUnsigned;
14041
14042 return (ShuffleVectorSDNode::isSplatMask(Mask: &M[0], VT) || isREVMask(M, VT, BlockSize: 64) ||
14043 isREVMask(M, VT, BlockSize: 32) || isREVMask(M, VT, BlockSize: 16) ||
14044 isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) ||
14045 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
14046 isTRNMask(M, VT, WhichResult&: DummyUnsigned) || isUZPMask(M, VT, WhichResultOut&: DummyUnsigned) ||
14047 isZIPMask(M, VT, WhichResultOut&: DummyUnsigned) ||
14048 isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14049 isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14050 isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
14051 isINSMask(M, NumInputElements: VT.getVectorNumElements(), DstIsLeft&: DummyBool, Anomaly&: DummyInt) ||
14052 isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128));
14053}
14054
14055bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14056 EVT VT) const {
14057  // Just delegate to the generic legality check; clear masks aren't special.
14058 return isShuffleMaskLegal(M, VT);
14059}
14060
14061/// getVShiftImm - Check if this is a valid build_vector for the immediate
14062/// operand of a vector shift operation, where all the elements of the
14063/// build_vector must have the same constant integer value.
14064static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14065 // Ignore bit_converts.
14066 while (Op.getOpcode() == ISD::BITCAST)
14067 Op = Op.getOperand(i: 0);
14068 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
14069 APInt SplatBits, SplatUndef;
14070 unsigned SplatBitSize;
14071 bool HasAnyUndefs;
14072 if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize,
14073 HasAnyUndefs, MinSplatBits: ElementBits) ||
14074 SplatBitSize > ElementBits)
14075 return false;
14076 Cnt = SplatBits.getSExtValue();
14077 return true;
14078}
14079
14080/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14081/// operand of a vector shift left operation. That value must be in the range:
14082/// 0 <= Value < ElementBits for a left shift; or
14083/// 0 <= Value <= ElementBits for a long left shift.
14084static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14085 assert(VT.isVector() && "vector shift count is not a vector type");
14086 int64_t ElementBits = VT.getScalarSizeInBits();
14087 if (!getVShiftImm(Op, ElementBits, Cnt))
14088 return false;
14089 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14090}
14091
14092/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14093/// operand of a vector shift right operation. The value must be in the range:
14094/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits / 2 for a narrow right shift.
14095static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14096 assert(VT.isVector() && "vector shift count is not a vector type");
14097 int64_t ElementBits = VT.getScalarSizeInBits();
14098 if (!getVShiftImm(Op, ElementBits, Cnt))
14099 return false;
14100 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14101}
14102
14103SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14104 SelectionDAG &DAG) const {
14105 EVT VT = Op.getValueType();
14106
14107 if (VT.getScalarType() == MVT::i1) {
14108 // Lower i1 truncate to `(x & 1) != 0`.
14109 SDLoc dl(Op);
14110 EVT OpVT = Op.getOperand(i: 0).getValueType();
14111 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: OpVT);
14112 SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: OpVT);
14113 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Op.getOperand(i: 0), N2: One);
14114 return DAG.getSetCC(DL: dl, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE);
14115 }
14116
14117 if (!VT.isVector() || VT.isScalableVector())
14118 return SDValue();
14119
14120 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
14121 OverrideNEON: !Subtarget->isNeonAvailable()))
14122 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14123
14124 return SDValue();
14125}
14126
14127// Check whether we can lower this SRL to a rounding shift instruction. ResVT is
14128// possibly a truncated type; it tells how many bits of the value are to be
14129// used.
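// A minimal sketch of the pattern being matched (X and the constants being
// splats of scalable vectors):
//   srl (add X, splat(1 << (ShiftValue - 1))), splat(ShiftValue)
// where the add supplies the rounding constant for the shift.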
14130static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14131 SelectionDAG &DAG,
14132 unsigned &ShiftValue,
14133 SDValue &RShOperand) {
14134 if (Shift->getOpcode() != ISD::SRL)
14135 return false;
14136
14137 EVT VT = Shift.getValueType();
14138 assert(VT.isScalableVT());
14139
14140 auto ShiftOp1 =
14141 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1)));
14142 if (!ShiftOp1)
14143 return false;
14144
14145 ShiftValue = ShiftOp1->getZExtValue();
14146 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14147 return false;
14148
14149 SDValue Add = Shift->getOperand(Num: 0);
14150 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14151 return false;
14152
14153 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14154 "ResVT must be truncated or same type as the shift.");
14155 // Check if an overflow can lead to incorrect results.
14156 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14157 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14158 return false;
14159
14160 auto AddOp1 =
14161 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1)));
14162 if (!AddOp1)
14163 return false;
14164 uint64_t AddValue = AddOp1->getZExtValue();
14165 if (AddValue != 1ULL << (ShiftValue - 1))
14166 return false;
14167
14168 RShOperand = Add->getOperand(Num: 0);
14169 return true;
14170}
14171
14172SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14173 SelectionDAG &DAG) const {
14174 EVT VT = Op.getValueType();
14175 SDLoc DL(Op);
14176 int64_t Cnt;
14177
14178 if (!Op.getOperand(i: 1).getValueType().isVector())
14179 return Op;
14180 unsigned EltSize = VT.getScalarSizeInBits();
14181
14182 switch (Op.getOpcode()) {
14183 case ISD::SHL:
14184 if (VT.isScalableVector() ||
14185 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14186 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED);
14187
14188 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14189 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14190 DAG.getConstant(Cnt, DL, MVT::i32));
14191 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14192 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14193 MVT::i32),
14194 Op.getOperand(0), Op.getOperand(1));
14195 case ISD::SRA:
14196 case ISD::SRL:
14197 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14198 SDValue RShOperand;
14199 unsigned ShiftValue;
14200 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14201 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14202 getPredicateForVector(DAG, DL, VT), RShOperand,
14203 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14204 }
14205
14206 if (VT.isScalableVector() ||
14207 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
14208 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14209 : AArch64ISD::SRL_PRED;
14210 return LowerToPredicatedOp(Op, DAG, NewOp: Opc);
14211 }
14212
14213 // Right shift immediate
14214 if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) {
14215 unsigned Opc =
14216 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14217 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14218 DAG.getConstant(Cnt, DL, MVT::i32));
14219 }
14220
14221    // Right shift register. Note that there is no shift-right-register
14222    // instruction; the shift-left-register instruction takes a signed
14223    // value, where negative amounts specify a right shift.
14224 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14225 : Intrinsic::aarch64_neon_ushl;
14226 // negate the shift amount
14227 SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
14228 N2: Op.getOperand(i: 1));
14229 SDValue NegShiftLeft =
14230 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14231 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14232 NegShift);
14233 return NegShiftLeft;
14234 }
14235
14236 llvm_unreachable("unexpected shift opcode");
14237}
14238
14239static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14240 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14241 const SDLoc &dl, SelectionDAG &DAG) {
14242 EVT SrcVT = LHS.getValueType();
14243 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14244 "function only supposed to emit natural comparisons");
14245
14246 APInt SplatValue;
14247 APInt SplatUndef;
14248 unsigned SplatBitSize = 0;
14249 bool HasAnyUndefs;
14250
14251 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
14252 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14253 SplatBitSize, HasAnyUndefs);
14254
14255 bool IsZero = IsCnst && SplatValue == 0;
14256 bool IsOne =
14257 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14258 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14259
14260 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14261 switch (CC) {
14262 default:
14263 return SDValue();
14264 case AArch64CC::NE: {
14265 SDValue Fcmeq;
14266 if (IsZero)
14267 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14268 else
14269 Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14270 return DAG.getNOT(DL: dl, Val: Fcmeq, VT);
14271 }
14272 case AArch64CC::EQ:
14273 if (IsZero)
14274 return DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS);
14275 return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14276 case AArch64CC::GE:
14277 if (IsZero)
14278 return DAG.getNode(Opcode: AArch64ISD::FCMGEz, DL: dl, VT, Operand: LHS);
14279 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: LHS, N2: RHS);
14280 case AArch64CC::GT:
14281 if (IsZero)
14282 return DAG.getNode(Opcode: AArch64ISD::FCMGTz, DL: dl, VT, Operand: LHS);
14283 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: LHS, N2: RHS);
14284 case AArch64CC::LE:
14285 if (!NoNans)
14286 return SDValue();
14287      // If we ignore NaNs then we can use the LS implementation.
14288 [[fallthrough]];
14289 case AArch64CC::LS:
14290 if (IsZero)
14291 return DAG.getNode(Opcode: AArch64ISD::FCMLEz, DL: dl, VT, Operand: LHS);
14292 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: RHS, N2: LHS);
14293 case AArch64CC::LT:
14294 if (!NoNans)
14295 return SDValue();
14296      // If we ignore NaNs then we can use the MI implementation.
14297 [[fallthrough]];
14298 case AArch64CC::MI:
14299 if (IsZero)
14300 return DAG.getNode(Opcode: AArch64ISD::FCMLTz, DL: dl, VT, Operand: LHS);
14301 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: RHS, N2: LHS);
14302 }
14303 }
14304
14305 switch (CC) {
14306 default:
14307 return SDValue();
14308 case AArch64CC::NE: {
14309 SDValue Cmeq;
14310 if (IsZero)
14311 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
14312 else
14313 Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14314 return DAG.getNOT(DL: dl, Val: Cmeq, VT);
14315 }
14316 case AArch64CC::EQ:
14317 if (IsZero)
14318 return DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS);
14319 return DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS);
14320 case AArch64CC::GE:
14321 if (IsZero)
14322 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, Operand: LHS);
14323 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: LHS, N2: RHS);
14324 case AArch64CC::GT:
14325 if (IsZero)
14326 return DAG.getNode(Opcode: AArch64ISD::CMGTz, DL: dl, VT, Operand: LHS);
14327 if (IsMinusOne)
14328 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, N1: LHS, N2: RHS);
14329 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: LHS, N2: RHS);
14330 case AArch64CC::LE:
14331 if (IsZero)
14332 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
14333 return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: RHS, N2: LHS);
14334 case AArch64CC::LS:
14335 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: RHS, N2: LHS);
14336 case AArch64CC::LO:
14337 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: RHS, N2: LHS);
14338 case AArch64CC::LT:
14339 if (IsZero)
14340 return DAG.getNode(Opcode: AArch64ISD::CMLTz, DL: dl, VT, Operand: LHS);
14341 if (IsOne)
14342 return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS);
14343 return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: RHS, N2: LHS);
14344 case AArch64CC::HI:
14345 return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: LHS, N2: RHS);
14346 case AArch64CC::HS:
14347 return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: LHS, N2: RHS);
14348 }
14349}
14350
14351SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14352 SelectionDAG &DAG) const {
14353 if (Op.getValueType().isScalableVector())
14354 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO);
14355
14356 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
14357 OverrideNEON: !Subtarget->isNeonAvailable()))
14358 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14359
14360 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
14361 SDValue LHS = Op.getOperand(i: 0);
14362 SDValue RHS = Op.getOperand(i: 1);
14363 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14364 SDLoc dl(Op);
14365
14366 if (LHS.getValueType().getVectorElementType().isInteger()) {
14367 assert(LHS.getValueType() == RHS.getValueType());
14368 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14369 SDValue Cmp =
14370 EmitVectorComparison(LHS, RHS, CC: AArch64CC, NoNans: false, VT: CmpVT, dl, DAG);
14371 return DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
14372 }
14373
14374 // Lower isnan(x) | isnan(never-nan) to x != x.
14375 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14376 if (CC == ISD::SETUO || CC == ISD::SETO) {
14377 bool OneNaN = false;
14378 if (LHS == RHS) {
14379 OneNaN = true;
14380 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
14381 OneNaN = true;
14382 RHS = LHS;
14383 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
14384 OneNaN = true;
14385 LHS = RHS;
14386 }
14387 if (OneNaN) {
14388 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14389 }
14390 }
14391
14392 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14393
14394  // Make v4f16 (only) fcmp operations utilise vector instructions;
14395  // v8f16 support will be a little more complicated.
14396 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14397 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14398 if (LHS.getValueType().getVectorNumElements() == 4) {
14399 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14400 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14401 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14402 DAG.ReplaceAllUsesWith(From: Op, To: NewSetcc);
14403 CmpVT = MVT::v4i32;
14404 } else
14405 return SDValue();
14406 }
14407
14408 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14409 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14410 LHS.getValueType().getVectorElementType() != MVT::f128);
14411
14412 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14413 // clean. Some of them require two branches to implement.
14414 AArch64CC::CondCode CC1, CC2;
14415 bool ShouldInvert;
14416 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
14417
14418 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14419 SDValue Cmp =
14420 EmitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
14421 if (!Cmp.getNode())
14422 return SDValue();
14423
14424 if (CC2 != AArch64CC::AL) {
14425 SDValue Cmp2 =
14426 EmitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, dl, DAG);
14427 if (!Cmp2.getNode())
14428 return SDValue();
14429
14430 Cmp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: Cmp, N2: Cmp2);
14431 }
14432
14433 Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType());
14434
14435 if (ShouldInvert)
14436 Cmp = DAG.getNOT(DL: dl, Val: Cmp, VT: Cmp.getValueType());
14437
14438 return Cmp;
14439}
14440
14441static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14442 SelectionDAG &DAG) {
14443 SDValue VecOp = ScalarOp.getOperand(i: 0);
14444 auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp);
14445 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14446 DAG.getConstant(0, DL, MVT::i64));
14447}
14448
14449static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14450 SDLoc DL, SelectionDAG &DAG) {
14451 unsigned ScalarOpcode;
14452 switch (Opcode) {
14453 case ISD::VECREDUCE_AND:
14454 ScalarOpcode = ISD::AND;
14455 break;
14456 case ISD::VECREDUCE_OR:
14457 ScalarOpcode = ISD::OR;
14458 break;
14459 case ISD::VECREDUCE_XOR:
14460 ScalarOpcode = ISD::XOR;
14461 break;
14462 default:
14463 llvm_unreachable("Expected bitwise vector reduction");
14464 return SDValue();
14465 }
14466
14467 EVT VecVT = Vec.getValueType();
14468 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14469 "Expected power-of-2 length vector");
14470
14471 EVT ElemVT = VecVT.getVectorElementType();
14472
14473 SDValue Result;
14474 unsigned NumElems = VecVT.getVectorNumElements();
14475
14476 // Special case for boolean reductions
14477 if (ElemVT == MVT::i1) {
14478 // Split large vectors into smaller ones
14479 if (NumElems > 16) {
14480 SDValue Lo, Hi;
14481 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
14482 EVT HalfVT = Lo.getValueType();
14483 SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi);
14484 return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG);
14485 }
14486
14487 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14488 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14489 // this element size leads to the best codegen, since e.g. setcc results
14490 // might need to be truncated otherwise.
14491 EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: 64u / NumElems, b: 8u));
14492
14493 // any_ext doesn't work with umin/umax, so only use it for uadd.
14494 unsigned ExtendOp =
14495 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14496 SDValue Extended = DAG.getNode(
14497 Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec);
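    // With true sign-extended to all-ones, AND maps to UMIN, OR to UMAX, and
    // XOR to ADD (only the low bit of the sum matters after truncation).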
14498 switch (ScalarOpcode) {
14499 case ISD::AND:
14500 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended);
14501 break;
14502 case ISD::OR:
14503 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended);
14504 break;
14505 case ISD::XOR:
14506 Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended);
14507 break;
14508 default:
14509 llvm_unreachable("Unexpected Opcode");
14510 }
14511
14512 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14513 } else {
14514 // Iteratively split the vector in half and combine using the bitwise
14515 // operation until it fits in a 64 bit register.
14516 while (VecVT.getSizeInBits() > 64) {
14517 SDValue Lo, Hi;
14518 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
14519 VecVT = Lo.getValueType();
14520 NumElems = VecVT.getVectorNumElements();
14521 Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi);
14522 }
14523
14524 EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits());
14525
14526 // Do the remaining work on a scalar since it allows the code generator to
14527 // combine the shift and bitwise operation into one instruction and since
14528 // integer instructions can have higher throughput than vector instructions.
14529 SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec);
14530
14531 // Iteratively combine the lower and upper halves of the scalar using the
14532 // bitwise operation, halving the relevant region of the scalar in each
14533 // iteration, until the relevant region is just one element of the original
14534 // vector.
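    // For example, a v8i8 reduction becomes an i64 bitcast combined with
    // itself shifted right by 32, then 16, then 8 bits.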
14535 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14536 SDValue ShiftAmount =
14537 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14538 SDValue Shifted =
14539 DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount);
14540 Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted);
14541 }
14542
14543 Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT);
14544 }
14545
14546 return DAG.getAnyExtOrTrunc(Op: Result, DL, VT);
14547}
14548
14549SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14550 SelectionDAG &DAG) const {
14551 SDValue Src = Op.getOperand(i: 0);
14552
14553 // Try to lower fixed length reductions to SVE.
14554 EVT SrcVT = Src.getValueType();
14555 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14556 Op.getOpcode() == ISD::VECREDUCE_AND ||
14557 Op.getOpcode() == ISD::VECREDUCE_OR ||
14558 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14559 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14560 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14561 SrcVT.getVectorElementType() == MVT::i64);
14562 if (SrcVT.isScalableVector() ||
14563 useSVEForFixedLengthVectorVT(
14564 VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14565
14566 if (SrcVT.getVectorElementType() == MVT::i1)
14567 return LowerPredReductionToSVE(ScalarOp: Op, DAG);
14568
14569 switch (Op.getOpcode()) {
14570 case ISD::VECREDUCE_ADD:
14571 return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG);
14572 case ISD::VECREDUCE_AND:
14573 return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG);
14574 case ISD::VECREDUCE_OR:
14575 return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG);
14576 case ISD::VECREDUCE_SMAX:
14577 return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG);
14578 case ISD::VECREDUCE_SMIN:
14579 return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG);
14580 case ISD::VECREDUCE_UMAX:
14581 return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG);
14582 case ISD::VECREDUCE_UMIN:
14583 return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG);
14584 case ISD::VECREDUCE_XOR:
14585 return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG);
14586 case ISD::VECREDUCE_FADD:
14587 return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG);
14588 case ISD::VECREDUCE_FMAX:
14589 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG);
14590 case ISD::VECREDUCE_FMIN:
14591 return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG);
14592 case ISD::VECREDUCE_FMAXIMUM:
14593 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG);
14594 case ISD::VECREDUCE_FMINIMUM:
14595 return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG);
14596 default:
14597 llvm_unreachable("Unhandled fixed length reduction");
14598 }
14599 }
14600
14601 // Lower NEON reductions.
14602 SDLoc dl(Op);
14603 switch (Op.getOpcode()) {
14604 case ISD::VECREDUCE_AND:
14605 case ISD::VECREDUCE_OR:
14606 case ISD::VECREDUCE_XOR:
14607 return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0),
14608 VT: Op.getValueType(), DL: dl, DAG);
14609 case ISD::VECREDUCE_ADD:
14610 return getReductionSDNode(Op: AArch64ISD::UADDV, DL: dl, ScalarOp: Op, DAG);
14611 case ISD::VECREDUCE_SMAX:
14612 return getReductionSDNode(Op: AArch64ISD::SMAXV, DL: dl, ScalarOp: Op, DAG);
14613 case ISD::VECREDUCE_SMIN:
14614 return getReductionSDNode(Op: AArch64ISD::SMINV, DL: dl, ScalarOp: Op, DAG);
14615 case ISD::VECREDUCE_UMAX:
14616 return getReductionSDNode(Op: AArch64ISD::UMAXV, DL: dl, ScalarOp: Op, DAG);
14617 case ISD::VECREDUCE_UMIN:
14618 return getReductionSDNode(Op: AArch64ISD::UMINV, DL: dl, ScalarOp: Op, DAG);
14619 default:
14620 llvm_unreachable("Unhandled reduction");
14621 }
14622}
14623
14624SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14625 SelectionDAG &DAG) const {
14626 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14627 // No point replacing if we don't have the relevant instruction/libcall anyway
14628 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14629 return SDValue();
14630
14631 // LSE has an atomic load-clear instruction, but not a load-and.
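  // Lower `atomicrmw and x, v` as an atomic load-clear of ~v; the XOR with an
  // all-ones constant below computes the bitwise NOT of the RHS.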
14632 SDLoc dl(Op);
14633 MVT VT = Op.getSimpleValueType();
14634 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14635 SDValue RHS = Op.getOperand(i: 2);
14636 AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode());
14637 RHS = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: DAG.getConstant(Val: -1ULL, DL: dl, VT), N2: RHS);
14638 return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl, MemVT: AN->getMemoryVT(),
14639 Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS,
14640 MMO: AN->getMemOperand());
14641}
14642
14643SDValue
14644AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14645 SelectionDAG &DAG) const {
14646
14647 SDLoc dl(Op);
14648 // Get the inputs.
14649 SDNode *Node = Op.getNode();
14650 SDValue Chain = Op.getOperand(i: 0);
14651 SDValue Size = Op.getOperand(i: 1);
14652 MaybeAlign Align =
14653 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
14654 EVT VT = Node->getValueType(ResNo: 0);
14655
14656 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
14657 Kind: "no-stack-arg-probe")) {
14658 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14659 Chain = SP.getValue(R: 1);
14660 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14661 if (Align)
14662 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14663 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14664 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14665 SDValue Ops[2] = {SP, Chain};
14666 return DAG.getMergeValues(Ops, dl);
14667 }
14668
14669 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
14670
14671 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
14672 SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(),
14673 VT: PtrVT, TargetFlags: 0);
14674
14675 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14676 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14677 if (Subtarget->hasCustomCallingConv())
14678 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
14679
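  // The Windows stack-probe helper expects the allocation size in X15 in units
  // of 16 bytes, hence the SRL by 4 before the call and the SHL by 4 after it.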
14680 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14681 DAG.getConstant(4, dl, MVT::i64));
14682 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14683 Chain =
14684 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14685 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14686 DAG.getRegisterMask(Mask), Chain.getValue(1));
14687 // To match the actual intent better, we should read the output from X15 here
14688 // again (instead of potentially spilling it to the stack), but rereading Size
14689 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14690 // here.
14691
14692 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14693 DAG.getConstant(4, dl, MVT::i64));
14694
14695 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14696 Chain = SP.getValue(R: 1);
14697 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14698 if (Align)
14699 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14700 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14701 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14702
14703 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
14704
14705 SDValue Ops[2] = {SP, Chain};
14706 return DAG.getMergeValues(Ops, dl);
14707}
14708
14709SDValue
14710AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14711 SelectionDAG &DAG) const {
14712 // Get the inputs.
14713 SDNode *Node = Op.getNode();
14714 SDValue Chain = Op.getOperand(i: 0);
14715 SDValue Size = Op.getOperand(i: 1);
14716
14717 MaybeAlign Align =
14718 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
14719 SDLoc dl(Op);
14720 EVT VT = Node->getValueType(ResNo: 0);
14721
14722 // Construct the new SP value in a GPR.
14723 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14724 Chain = SP.getValue(R: 1);
14725 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14726 if (Align)
14727 SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0),
14728 N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT));
14729
14730 // Set the real SP to the new value with a probing loop.
14731 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14732 SDValue Ops[2] = {SP, Chain};
14733 return DAG.getMergeValues(Ops, dl);
14734}
14735
14736SDValue
14737AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14738 SelectionDAG &DAG) const {
14739 MachineFunction &MF = DAG.getMachineFunction();
14740
14741 if (Subtarget->isTargetWindows())
14742 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14743 else if (hasInlineStackProbe(MF))
14744 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14745 else
14746 return SDValue();
14747}
14748
14749// When x and y are extended, lower:
14750// avgfloor(x, y) -> (x + y) >> 1
14751// avgceil(x, y) -> (x + y + 1) >> 1
14752
14753// Otherwise, lower to:
14754// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14755// avgceil(x, y) -> (x >> 1) + (y >> 1) + ((x | y) & 1)
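// With SVE2 the operation is lowered directly to the predicated opcode passed
// in via NewOp; the expansion above is only used otherwise.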
14756SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14757 unsigned NewOp) const {
14758 if (Subtarget->hasSVE2())
14759 return LowerToPredicatedOp(Op, DAG, NewOp);
14760
14761 SDLoc dl(Op);
14762 SDValue OpA = Op->getOperand(Num: 0);
14763 SDValue OpB = Op->getOperand(Num: 1);
14764 EVT VT = Op.getValueType();
14765 bool IsCeil =
14766 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14767 bool IsSigned =
14768 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14769 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14770
14771 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14772
14773 auto IsZeroExtended = [&DAG](SDValue &Node) {
14774 KnownBits Known = DAG.computeKnownBits(Op: Node, Depth: 0);
14775 return Known.Zero.isSignBitSet();
14776 };
14777
14778 auto IsSignExtended = [&DAG](SDValue &Node) {
14779 return (DAG.ComputeNumSignBits(Op: Node, Depth: 0) > 1);
14780 };
14781
14782 SDValue ConstantOne = DAG.getConstant(Val: 1, DL: dl, VT);
14783 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14784 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14785 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: OpA, N2: OpB);
14786 if (IsCeil)
14787 Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add, N2: ConstantOne);
14788 return DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: Add, N2: ConstantOne);
14789 }
14790
14791 SDValue ShiftOpA = DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: OpA, N2: ConstantOne);
14792 SDValue ShiftOpB = DAG.getNode(Opcode: ShiftOpc, DL: dl, VT, N1: OpB, N2: ConstantOne);
14793
14794 SDValue tmp = DAG.getNode(Opcode: IsCeil ? ISD::OR : ISD::AND, DL: dl, VT, N1: OpA, N2: OpB);
14795 tmp = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: tmp, N2: ConstantOne);
14796 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: ShiftOpA, N2: ShiftOpB);
14797 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add, N2: tmp);
14798}
14799
14800SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14801 SelectionDAG &DAG) const {
14802 EVT VT = Op.getValueType();
14803 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14804
14805 SDLoc DL(Op);
14806 APInt MulImm = Op.getConstantOperandAPInt(i: 0);
14807 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14808 VT);
14809}
14810
14811/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14812template <unsigned NumVecs>
14813static bool
14814setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14815 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14816 Info.opc = ISD::INTRINSIC_VOID;
14817 // Retrieve EC from first vector argument.
14818 const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType());
14819 ElementCount EC = VT.getVectorElementCount();
14820#ifndef NDEBUG
14821 // Check the assumption that all input vectors are the same type.
14822 for (unsigned I = 0; I < NumVecs; ++I)
14823 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14824 "Invalid type.");
14825#endif
14826 // memVT is `NumVecs * VT`.
14827 Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(),
14828 EC: EC * NumVecs);
14829 Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1);
14830 Info.offset = 0;
14831 Info.align.reset();
14832 Info.flags = MachineMemOperand::MOStore;
14833 return true;
14834}
14835
14836/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14837/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14838/// specified in the intrinsic calls.
14839bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14840 const CallInst &I,
14841 MachineFunction &MF,
14842 unsigned Intrinsic) const {
14843 auto &DL = I.getModule()->getDataLayout();
14844 switch (Intrinsic) {
14845 case Intrinsic::aarch64_sve_st2:
14846 return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I);
14847 case Intrinsic::aarch64_sve_st3:
14848 return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I);
14849 case Intrinsic::aarch64_sve_st4:
14850 return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I);
14851 case Intrinsic::aarch64_neon_ld2:
14852 case Intrinsic::aarch64_neon_ld3:
14853 case Intrinsic::aarch64_neon_ld4:
14854 case Intrinsic::aarch64_neon_ld1x2:
14855 case Intrinsic::aarch64_neon_ld1x3:
14856 case Intrinsic::aarch64_neon_ld1x4: {
14857 Info.opc = ISD::INTRINSIC_W_CHAIN;
14858 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
14859 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14860 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14861 Info.offset = 0;
14862 Info.align.reset();
14863 // volatile loads with NEON intrinsics not supported
14864 Info.flags = MachineMemOperand::MOLoad;
14865 return true;
14866 }
14867 case Intrinsic::aarch64_neon_ld2lane:
14868 case Intrinsic::aarch64_neon_ld3lane:
14869 case Intrinsic::aarch64_neon_ld4lane:
14870 case Intrinsic::aarch64_neon_ld2r:
14871 case Intrinsic::aarch64_neon_ld3r:
14872 case Intrinsic::aarch64_neon_ld4r: {
14873 Info.opc = ISD::INTRINSIC_W_CHAIN;
14874    // The ldN intrinsics return a struct of vectors that all have the same type.
14875 Type *RetTy = I.getType();
14876 auto *StructTy = cast<StructType>(Val: RetTy);
14877 unsigned NumElts = StructTy->getNumElements();
14878 Type *VecTy = StructTy->getElementType(N: 0);
14879 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
14880 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
14881 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14882 Info.offset = 0;
14883 Info.align.reset();
14884 // volatile loads with NEON intrinsics not supported
14885 Info.flags = MachineMemOperand::MOLoad;
14886 return true;
14887 }
14888 case Intrinsic::aarch64_neon_st2:
14889 case Intrinsic::aarch64_neon_st3:
14890 case Intrinsic::aarch64_neon_st4:
14891 case Intrinsic::aarch64_neon_st1x2:
14892 case Intrinsic::aarch64_neon_st1x3:
14893 case Intrinsic::aarch64_neon_st1x4: {
14894 Info.opc = ISD::INTRINSIC_VOID;
14895 unsigned NumElts = 0;
14896 for (const Value *Arg : I.args()) {
14897 Type *ArgTy = Arg->getType();
14898 if (!ArgTy->isVectorTy())
14899 break;
14900 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
14901 }
14902 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14903 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14904 Info.offset = 0;
14905 Info.align.reset();
14906 // volatile stores with NEON intrinsics not supported
14907 Info.flags = MachineMemOperand::MOStore;
14908 return true;
14909 }
14910 case Intrinsic::aarch64_neon_st2lane:
14911 case Intrinsic::aarch64_neon_st3lane:
14912 case Intrinsic::aarch64_neon_st4lane: {
14913 Info.opc = ISD::INTRINSIC_VOID;
14914 unsigned NumElts = 0;
14915    // All of the vector operands have the same type.
14916 Type *VecTy = I.getArgOperand(i: 0)->getType();
14917 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
14918
14919 for (const Value *Arg : I.args()) {
14920 Type *ArgTy = Arg->getType();
14921 if (!ArgTy->isVectorTy())
14922 break;
14923 NumElts += 1;
14924 }
14925
14926 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
14927 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
14928 Info.offset = 0;
14929 Info.align.reset();
14930 // volatile stores with NEON intrinsics not supported
14931 Info.flags = MachineMemOperand::MOStore;
14932 return true;
14933 }
14934 case Intrinsic::aarch64_ldaxr:
14935 case Intrinsic::aarch64_ldxr: {
14936 Type *ValTy = I.getParamElementType(ArgNo: 0);
14937 Info.opc = ISD::INTRINSIC_W_CHAIN;
14938 Info.memVT = MVT::getVT(Ty: ValTy);
14939 Info.ptrVal = I.getArgOperand(i: 0);
14940 Info.offset = 0;
14941 Info.align = DL.getABITypeAlign(Ty: ValTy);
14942 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14943 return true;
14944 }
14945 case Intrinsic::aarch64_stlxr:
14946 case Intrinsic::aarch64_stxr: {
14947 Type *ValTy = I.getParamElementType(ArgNo: 1);
14948 Info.opc = ISD::INTRINSIC_W_CHAIN;
14949 Info.memVT = MVT::getVT(Ty: ValTy);
14950 Info.ptrVal = I.getArgOperand(i: 1);
14951 Info.offset = 0;
14952 Info.align = DL.getABITypeAlign(Ty: ValTy);
14953 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14954 return true;
14955 }
14956 case Intrinsic::aarch64_ldaxp:
14957 case Intrinsic::aarch64_ldxp:
14958 Info.opc = ISD::INTRINSIC_W_CHAIN;
14959 Info.memVT = MVT::i128;
14960 Info.ptrVal = I.getArgOperand(i: 0);
14961 Info.offset = 0;
14962 Info.align = Align(16);
14963 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
14964 return true;
14965 case Intrinsic::aarch64_stlxp:
14966 case Intrinsic::aarch64_stxp:
14967 Info.opc = ISD::INTRINSIC_W_CHAIN;
14968 Info.memVT = MVT::i128;
14969 Info.ptrVal = I.getArgOperand(i: 2);
14970 Info.offset = 0;
14971 Info.align = Align(16);
14972 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
14973 return true;
14974 case Intrinsic::aarch64_sve_ldnt1: {
14975 Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType();
14976 Info.opc = ISD::INTRINSIC_W_CHAIN;
14977 Info.memVT = MVT::getVT(Ty: I.getType());
14978 Info.ptrVal = I.getArgOperand(i: 1);
14979 Info.offset = 0;
14980 Info.align = DL.getABITypeAlign(Ty: ElTy);
14981 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
14982 return true;
14983 }
14984 case Intrinsic::aarch64_sve_stnt1: {
14985 Type *ElTy =
14986 cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType();
14987 Info.opc = ISD::INTRINSIC_W_CHAIN;
14988 Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType());
14989 Info.ptrVal = I.getArgOperand(i: 2);
14990 Info.offset = 0;
14991 Info.align = DL.getABITypeAlign(Ty: ElTy);
14992 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
14993 return true;
14994 }
14995 case Intrinsic::aarch64_mops_memset_tag: {
14996 Value *Dst = I.getArgOperand(i: 0);
14997 Value *Val = I.getArgOperand(i: 1);
14998 Info.opc = ISD::INTRINSIC_W_CHAIN;
14999 Info.memVT = MVT::getVT(Ty: Val->getType());
15000 Info.ptrVal = Dst;
15001 Info.offset = 0;
15002 Info.align = I.getParamAlign(ArgNo: 0).valueOrOne();
15003 Info.flags = MachineMemOperand::MOStore;
15004 // The size of the memory being operated on is unknown at this point
15005 Info.size = MemoryLocation::UnknownSize;
15006 return true;
15007 }
15008 default:
15009 break;
15010 }
15011
15012 return false;
15013}
15014
15015bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15016 ISD::LoadExtType ExtTy,
15017 EVT NewVT) const {
15018 // TODO: This may be worth removing. Check regression tests for diffs.
15019 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15020 return false;
15021
15022 // If we're reducing the load width in order to avoid having to use an extra
15023 // instruction to do the extension, then it's probably a good idea.
15024 if (ExtTy != ISD::NON_EXTLOAD)
15025 return true;
15026 // Don't reduce load width if it would prevent us from combining a shift into
15027 // the offset.
15028 MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load);
15029 assert(Mem);
15030 const SDValue &Base = Mem->getBasePtr();
15031 if (Base.getOpcode() == ISD::ADD &&
15032 Base.getOperand(i: 1).getOpcode() == ISD::SHL &&
15033 Base.getOperand(i: 1).hasOneUse() &&
15034 Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) {
15035 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15036 if (Mem->getMemoryVT().isScalableVector())
15037 return false;
15038 // The shift can be combined if it matches the size of the value being
15039 // loaded (and so reducing the width would make it not match).
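// For example (illustrative): an i64 load whose address is (add x, (shl y, 3))
// can use the [x, y, lsl #3] register-offset form, where the shift equals
// log2 of the access size; narrowing the load to i32 would require lsl #2,
// so the shl could no longer be folded.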
15040 uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1);
15041 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15042 if (ShiftAmount == Log2_32(Value: LoadBytes))
15043 return false;
15044 }
15045 // We have no reason to disallow reducing the load width, so allow it.
15046 return true;
15047}
15048
15049// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15050bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15051 EVT VT = Extend.getValueType();
15052 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15053 SDValue Extract = Extend.getOperand(i: 0);
15054 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15055 Extract = Extract.getOperand(i: 0);
15056 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15057 EVT VecVT = Extract.getOperand(i: 0).getValueType();
15058 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15059 return false;
15060 }
15061 }
15062 return true;
15063}
15064
15065// Truncation from a 64-bit GPR to a 32-bit GPR is free.
15066bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15067 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15068 return false;
15069 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15070 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15071 return NumBits1 > NumBits2;
15072}
15073bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15074 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15075 return false;
15076 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15077 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15078 return NumBits1 > NumBits2;
15079}
15080
15081/// Check if it is profitable to hoist an instruction in then/else to if.
15082/// It is not profitable if I and its user can form an FMA instruction,
15083/// because we prefer FMSUB/FMADD.
15084bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15085 if (I->getOpcode() != Instruction::FMul)
15086 return true;
15087
15088 if (!I->hasOneUse())
15089 return true;
15090
15091 Instruction *User = I->user_back();
15092
15093 if (!(User->getOpcode() == Instruction::FSub ||
15094 User->getOpcode() == Instruction::FAdd))
15095 return true;
15096
15097 const TargetOptions &Options = getTargetMachine().Options;
15098 const Function *F = I->getFunction();
15099 const DataLayout &DL = F->getParent()->getDataLayout();
15100 Type *Ty = User->getOperand(i: 0)->getType();
15101
15102 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
15103 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
15104 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15105 Options.UnsafeFPMath));
15106}
15107
15108// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15109// 64-bit GPR.
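// For example, 'add w8, w9, w10' or 'ldr w8, [x0]' writes zeros to bits
// [63:32] of x8, so a later zext from i32 to i64 needs no extra instruction.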
15110bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15111 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15112 return false;
15113 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15114 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15115 return NumBits1 == 32 && NumBits2 == 64;
15116}
15117bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15118 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15119 return false;
15120 unsigned NumBits1 = VT1.getSizeInBits();
15121 unsigned NumBits2 = VT2.getSizeInBits();
15122 return NumBits1 == 32 && NumBits2 == 64;
15123}
15124
15125bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15126 EVT VT1 = Val.getValueType();
15127 if (isZExtFree(VT1, VT2)) {
15128 return true;
15129 }
15130
15131 if (Val.getOpcode() != ISD::LOAD)
15132 return false;
15133
15134 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15135 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15136 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15137 VT1.getSizeInBits() <= 32);
15138}
15139
15140bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15141 if (isa<FPExtInst>(Val: Ext))
15142 return false;
15143
15144 // Vector types are not free.
15145 if (Ext->getType()->isVectorTy())
15146 return false;
15147
15148 for (const Use &U : Ext->uses()) {
15149 // The extension is free if we can fold it with a left shift in an
15150 // addressing mode or an arithmetic operation: add, sub, and cmp.
15151
15152 // Is there a shift?
15153 const Instruction *Instr = cast<Instruction>(Val: U.getUser());
15154
15155 // Is this a constant shift?
15156 switch (Instr->getOpcode()) {
15157 case Instruction::Shl:
15158 if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1)))
15159 return false;
15160 break;
15161 case Instruction::GetElementPtr: {
15162 gep_type_iterator GTI = gep_type_begin(GEP: Instr);
15163 auto &DL = Ext->getModule()->getDataLayout();
15164 std::advance(i&: GTI, n: U.getOperandNo()-1);
15165 Type *IdxTy = GTI.getIndexedType();
15166 // This extension will end up with a shift because of the scaling factor.
15167 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15168 // Get the shift amount based on the scaling factor:
15169 // log2(sizeof(IdxTy)) - log2(8).
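// For example, an i64 indexed type gives ShiftAmt = log2(64) - 3 = 3, which
// the addressing mode can fold.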
15170 if (IdxTy->isScalableTy())
15171 return false;
15172 uint64_t ShiftAmt =
15173 llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) -
15174 3;
15175 // Is the constant foldable in the shift of the addressing mode?
15176 // I.e., shift amount is between 1 and 4 inclusive.
15177 if (ShiftAmt == 0 || ShiftAmt > 4)
15178 return false;
15179 break;
15180 }
15181 case Instruction::Trunc:
15182 // Check if this is a noop.
15183 // trunc(sext ty1 to ty2) to ty1.
15184 if (Instr->getType() == Ext->getOperand(i: 0)->getType())
15185 continue;
15186 [[fallthrough]];
15187 default:
15188 return false;
15189 }
15190
15191 // At this point we can use the bfm family, so this extension is free
15192 // for that use.
15193 }
15194 return true;
15195}
15196
15197static bool isSplatShuffle(Value *V) {
15198 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
15199 return all_equal(Range: Shuf->getShuffleMask());
15200 return false;
15201}
15202
15203/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15204/// or upper half of the vector elements.
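/// For example (illustrative IR), both operands extract the upper half of
/// <8 x i16> sources, so they satisfy this check:
///   %op1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
///   %op2 = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>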
15205static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15206 bool AllowSplat = false) {
15207 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15208 auto *FullTy = FullV->getType();
15209 auto *HalfTy = HalfV->getType();
15210 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15211 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15212 };
15213
15214 auto extractHalf = [](Value *FullV, Value *HalfV) {
15215 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
15216 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
15217 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15218 };
15219
15220 ArrayRef<int> M1, M2;
15221 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15222 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
15223 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
15224 return false;
15225
15226 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15227 // it is not checked as an extract below.
15228 if (AllowSplat && isSplatShuffle(V: Op1))
15229 S1Op1 = nullptr;
15230 if (AllowSplat && isSplatShuffle(V: Op2))
15231 S2Op1 = nullptr;
15232
15233 // Check that the operands are half as wide as the result and we extract
15234 // half of the elements of the input vectors.
15235 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15236 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15237 return false;
15238
15239 // Check the mask extracts either the lower or upper half of vector
15240 // elements.
15241 int M1Start = 0;
15242 int M2Start = 0;
15243 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
15244 if ((S1Op1 &&
15245 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
15246 (S2Op1 &&
15247 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
15248 return false;
15249
15250 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15251 (M2Start != 0 && M2Start != (NumElements / 2)))
15252 return false;
15253 if (S1Op1 && S2Op1 && M1Start != M2Start)
15254 return false;
15255
15256 return true;
15257}
15258
15259/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15260/// of the vector elements.
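/// For example (illustrative IR), both of these double the element width and
/// would be accepted:
///   %e1 = sext <4 x i16> %a to <4 x i32>
///   %e2 = zext <4 x i16> %b to <4 x i32>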
15261static bool areExtractExts(Value *Ext1, Value *Ext2) {
15262 auto areExtDoubled = [](Instruction *Ext) {
15263 return Ext->getType()->getScalarSizeInBits() ==
15264 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
15265 };
15266
15267 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
15268 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
15269 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
15270 !areExtDoubled(cast<Instruction>(Val: Ext2)))
15271 return false;
15272
15273 return true;
15274}
15275
15276/// Check if Op could be used with vmull_high_p64 intrinsic.
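/// i.e. (illustrative) an 'extractelement <2 x i64> %v, i64 1' that extracts
/// the high lane of a 128-bit vector.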
15277static bool isOperandOfVmullHighP64(Value *Op) {
15278 Value *VectorOperand = nullptr;
15279 ConstantInt *ElementIndex = nullptr;
15280 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
15281 Idx: m_ConstantInt(CI&: ElementIndex))) &&
15282 ElementIndex->getValue() == 1 &&
15283 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
15284 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
15285}
15286
15287/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15288static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15289 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
15290}
15291
15292static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15293 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15294 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
15295 if (!GEP || GEP->getNumOperands() != 2)
15296 return false;
15297
15298 Value *Base = GEP->getOperand(i_nocapture: 0);
15299 Value *Offsets = GEP->getOperand(i_nocapture: 1);
15300
15301 // We only care about scalar_base+vector_offsets.
15302 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15303 return false;
15304
15305 // Sink extends that would allow us to use 32-bit offset vectors.
15306 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
15307 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
15308 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15309 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
15310 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
15311 }
15312
15313 // Sink the GEP.
15314 return true;
15315}
15316
15317/// We want to sink the following cases:
15318/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
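/// For example (illustrative IR):
///   %vs = call i64 @llvm.vscale.i64()
///   %sh = shl i64 %vs, 4
///   %p = getelementptr i8, ptr %base, i64 %sh
/// Here both the vscale call and the shl would be sunk next to the gep.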
15319static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15320 if (match(V: Op, P: m_VScale()))
15321 return true;
15322 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
15323 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
15324 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
15325 return true;
15326 }
15327 return false;
15328}
15329
15330/// Check if sinking \p I's operands to I's basic block is profitable, because
15331/// the operands can be folded into a target instruction, e.g.
15332/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
15333bool AArch64TargetLowering::shouldSinkOperands(
15334 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15335 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
15336 switch (II->getIntrinsicID()) {
15337 case Intrinsic::aarch64_neon_smull:
15338 case Intrinsic::aarch64_neon_umull:
15339 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
15340 /*AllowSplat=*/true)) {
15341 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15342 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15343 return true;
15344 }
15345 [[fallthrough]];
15346
15347 case Intrinsic::fma:
15348 if (isa<VectorType>(Val: I->getType()) &&
15349 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
15350 !Subtarget->hasFullFP16())
15351 return false;
15352 [[fallthrough]];
15353 case Intrinsic::aarch64_neon_sqdmull:
15354 case Intrinsic::aarch64_neon_sqdmulh:
15355 case Intrinsic::aarch64_neon_sqrdmulh:
15356 // Sink splats for index lane variants
15357 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
15358 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15359 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
15360 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15361 return !Ops.empty();
15362 case Intrinsic::aarch64_neon_fmlal:
15363 case Intrinsic::aarch64_neon_fmlal2:
15364 case Intrinsic::aarch64_neon_fmlsl:
15365 case Intrinsic::aarch64_neon_fmlsl2:
15366 // Sink splats for index lane variants
15367 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
15368 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15369 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
15370 Ops.push_back(Elt: &II->getOperandUse(i: 2));
15371 return !Ops.empty();
15372 case Intrinsic::aarch64_sve_ptest_first:
15373 case Intrinsic::aarch64_sve_ptest_last:
15374 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
15375 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15376 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15377 return !Ops.empty();
15378 case Intrinsic::aarch64_sme_write_horiz:
15379 case Intrinsic::aarch64_sme_write_vert:
15380 case Intrinsic::aarch64_sme_writeq_horiz:
15381 case Intrinsic::aarch64_sme_writeq_vert: {
15382 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
15383 if (!Idx || Idx->getOpcode() != Instruction::Add)
15384 return false;
15385 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15386 return true;
15387 }
15388 case Intrinsic::aarch64_sme_read_horiz:
15389 case Intrinsic::aarch64_sme_read_vert:
15390 case Intrinsic::aarch64_sme_readq_horiz:
15391 case Intrinsic::aarch64_sme_readq_vert:
15392 case Intrinsic::aarch64_sme_ld1b_vert:
15393 case Intrinsic::aarch64_sme_ld1h_vert:
15394 case Intrinsic::aarch64_sme_ld1w_vert:
15395 case Intrinsic::aarch64_sme_ld1d_vert:
15396 case Intrinsic::aarch64_sme_ld1q_vert:
15397 case Intrinsic::aarch64_sme_st1b_vert:
15398 case Intrinsic::aarch64_sme_st1h_vert:
15399 case Intrinsic::aarch64_sme_st1w_vert:
15400 case Intrinsic::aarch64_sme_st1d_vert:
15401 case Intrinsic::aarch64_sme_st1q_vert:
15402 case Intrinsic::aarch64_sme_ld1b_horiz:
15403 case Intrinsic::aarch64_sme_ld1h_horiz:
15404 case Intrinsic::aarch64_sme_ld1w_horiz:
15405 case Intrinsic::aarch64_sme_ld1d_horiz:
15406 case Intrinsic::aarch64_sme_ld1q_horiz:
15407 case Intrinsic::aarch64_sme_st1b_horiz:
15408 case Intrinsic::aarch64_sme_st1h_horiz:
15409 case Intrinsic::aarch64_sme_st1w_horiz:
15410 case Intrinsic::aarch64_sme_st1d_horiz:
15411 case Intrinsic::aarch64_sme_st1q_horiz: {
15412 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
15413 if (!Idx || Idx->getOpcode() != Instruction::Add)
15414 return false;
15415 Ops.push_back(Elt: &II->getOperandUse(i: 3));
15416 return true;
15417 }
15418 case Intrinsic::aarch64_neon_pmull:
15419 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
15420 return false;
15421 Ops.push_back(Elt: &II->getOperandUse(i: 0));
15422 Ops.push_back(Elt: &II->getOperandUse(i: 1));
15423 return true;
15424 case Intrinsic::aarch64_neon_pmull64:
15425 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
15426 Op2: II->getArgOperand(i: 1)))
15427 return false;
15428 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
15429 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
15430 return true;
15431 case Intrinsic::masked_gather:
15432 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
15433 return false;
15434 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
15435 return true;
15436 case Intrinsic::masked_scatter:
15437 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
15438 return false;
15439 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
15440 return true;
15441 default:
15442 return false;
15443 }
15444 }
15445
15446 // Sink vscales closer to uses for better isel
15447 switch (I->getOpcode()) {
15448 case Instruction::GetElementPtr:
15449 case Instruction::Add:
15450 case Instruction::Sub:
15451 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15452 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
15453 Ops.push_back(Elt: &I->getOperandUse(i: Op));
15454 return true;
15455 }
15456 }
15457 break;
15458 default:
15459 break;
15460 }
15461
15462 if (!I->getType()->isVectorTy())
15463 return false;
15464
15465 switch (I->getOpcode()) {
15466 case Instruction::Sub:
15467 case Instruction::Add: {
15468 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
15469 return false;
15470
15471 // If the exts' operands extract either the lower or upper elements, we
15472 // can sink them too.
15473 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
15474 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
15475 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
15476 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
15477 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
15478 }
15479
15480 Ops.push_back(Elt: &I->getOperandUse(i: 0));
15481 Ops.push_back(Elt: &I->getOperandUse(i: 1));
15482
15483 return true;
15484 }
15485 case Instruction::Or: {
15486 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15487 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15488 if (Subtarget->hasNEON()) {
15489 Instruction *OtherAnd, *IA, *IB;
15490 Value *MaskValue;
15491 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
15492 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
15493 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
15494 R: m_Instruction(I&: IA)))))) {
15495 if (match(V: OtherAnd,
15496 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
15497 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
15498 ? cast<Instruction>(Val: I->getOperand(i: 1))
15499 : cast<Instruction>(Val: I->getOperand(i: 0));
15500
15501 // Both Ands should be in the same basic block as the Or.
15502 if (I->getParent() != MainAnd->getParent() ||
15503 I->getParent() != OtherAnd->getParent())
15504 return false;
15505
15506 // Non-mask operands of both Ands should also be in the same basic block.
15507 if (I->getParent() != IA->getParent() ||
15508 I->getParent() != IB->getParent())
15509 return false;
15510
15511 Ops.push_back(Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
15512 Ops.push_back(Elt: &I->getOperandUse(i: 0));
15513 Ops.push_back(Elt: &I->getOperandUse(i: 1));
15514
15515 return true;
15516 }
15517 }
15518 }
15519
15520 return false;
15521 }
15522 case Instruction::Mul: {
15523 int NumZExts = 0, NumSExts = 0;
15524 for (auto &Op : I->operands()) {
15525 // Make sure we are not already sinking this operand
15526 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
15527 continue;
15528
15529 if (match(V: &Op, P: m_SExt(Op: m_Value()))) {
15530 NumSExts++;
15531 continue;
15532 } else if (match(V: &Op, P: m_ZExt(Op: m_Value()))) {
15533 NumZExts++;
15534 continue;
15535 }
15536
15537 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
15538
15539 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15540 // operand and the s/zext can help create indexed s/umull. This is
15541 // especially useful for preventing an i64 mul from being scalarized.
15542 if (Shuffle && isSplatShuffle(V: Shuffle) &&
15543 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
15544 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
15545 Ops.push_back(Elt: &Op);
15546 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
15547 NumSExts++;
15548 else
15549 NumZExts++;
15550 continue;
15551 }
15552
15553 if (!Shuffle)
15554 continue;
15555
15556 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
15557 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
15558 if (!Insert)
15559 continue;
15560
15561 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
15562 if (!OperandInstr)
15563 continue;
15564
15565 ConstantInt *ElementConstant =
15566 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
15567 // Check that the insertelement is inserting into element 0
15568 if (!ElementConstant || !ElementConstant->isZero())
15569 continue;
15570
15571 unsigned Opcode = OperandInstr->getOpcode();
15572 if (Opcode == Instruction::SExt)
15573 NumSExts++;
15574 else if (Opcode == Instruction::ZExt)
15575 NumZExts++;
15576 else {
15577 // If we find that the top bits are known 0, then we can sink and allow
15578 // the backend to generate a umull.
15579 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15580 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
15581 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15582 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, DL))
15583 continue;
15584 NumZExts++;
15585 }
15586
15587 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
15588 Ops.push_back(Elt: &Op);
15589 }
15590
15591 // It is only profitable to sink if we found two extends of the same type.
15592 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15593 }
15594 default:
15595 return false;
15596 }
15597 return false;
15598}
15599
15600static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15601 bool IsLittleEndian) {
15602 Value *Op = ZExt->getOperand(i_nocapture: 0);
15603 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
15604 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
15605 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
15606 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15607 return false;
15608
15609 assert(DstWidth % SrcWidth == 0 &&
15610 "TBL lowering is not supported for a ZExt instruction with this "
15611 "source & destination element type.");
15612 unsigned ZExtFactor = DstWidth / SrcWidth;
15613 unsigned NumElts = SrcTy->getNumElements();
15614 IRBuilder<> Builder(ZExt);
15615 SmallVector<int> Mask;
15616 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15617 // vector to replace the original ZExt. This can later be lowered to a set of
15618 // tbl instructions.
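// For example (illustrative), a zext from <4 x i8> to <4 x i32> on a
// little-endian target has ZExtFactor 4 and uses the mask
// <0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4>, where index 4
// (== NumElts) picks the known-zero lane inserted below.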
15619 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15620 if (IsLittleEndian) {
15621 if (i % ZExtFactor == 0)
15622 Mask.push_back(Elt: i / ZExtFactor);
15623 else
15624 Mask.push_back(Elt: NumElts);
15625 } else {
15626 if ((i + 1) % ZExtFactor == 0)
15627 Mask.push_back(Elt: (i - ZExtFactor + 1) / ZExtFactor);
15628 else
15629 Mask.push_back(Elt: NumElts);
15630 }
15631 }
15632
15633 auto *FirstEltZero = Builder.CreateInsertElement(
15634 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0));
15635 Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
15636 Result = Builder.CreateBitCast(V: Result, DestTy: DstTy);
15637 if (DstTy != ZExt->getType())
15638 Result = Builder.CreateZExt(V: Result, DestTy: ZExt->getType());
15639 ZExt->replaceAllUsesWith(V: Result);
15640 ZExt->eraseFromParent();
15641 return true;
15642}
15643
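// Lower a fixed-width vector truncate to <N x i8> elements into one or more
// NEON tbl calls that select the relevant byte of each source lane; see the
// callers below for the patterns this handles.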
15644static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15645 IRBuilder<> Builder(TI);
15646 SmallVector<Value *> Parts;
15647 int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements();
15648 auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType());
15649 auto *DstTy = cast<FixedVectorType>(Val: TI->getType());
15650 assert(SrcTy->getElementType()->isIntegerTy() &&
15651 "Non-integer type source vector element is not supported");
15652 assert(DstTy->getElementType()->isIntegerTy(8) &&
15653 "Unsupported destination vector element type");
15654 unsigned SrcElemTySz =
15655 cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
15656 unsigned DstElemTySz =
15657 cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
15658 assert((SrcElemTySz % DstElemTySz == 0) &&
15659 "Cannot lower truncate to tbl instructions for a source element size "
15660 "that is not divisible by the destination element size");
15661 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15662 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15663 "Unsupported source vector element type size");
15664 Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16);
15665
15666 // Create a mask that selects every n-th byte from the source vector's table
15667 // of bytes to form the truncated destination vector, where 'n' is the
15668 // truncation ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15669 // bytes 0, 8, 16, ..., (Y-1)*8 in the little-endian format.
15670 SmallVector<Constant *, 16> MaskConst;
15671 for (int Itr = 0; Itr < 16; Itr++) {
15672 if (Itr < NumElements)
15673 MaskConst.push_back(Elt: Builder.getInt8(
15674 C: IsLittleEndian ? Itr * TruncFactor
15675 : Itr * TruncFactor + (TruncFactor - 1)));
15676 else
15677 MaskConst.push_back(Elt: Builder.getInt8(C: 255));
15678 }
15679
15680 int MaxTblSz = 128 * 4;
15681 int MaxSrcSz = SrcElemTySz * NumElements;
15682 int ElemsPerTbl =
15683 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15684 assert(ElemsPerTbl <= 16 &&
15685 "Maximum elements selected using TBL instruction cannot exceed 16!");
15686
15687 int ShuffleCount = 128 / SrcElemTySz;
15688 SmallVector<int> ShuffleLanes;
15689 for (int i = 0; i < ShuffleCount; ++i)
15690 ShuffleLanes.push_back(Elt: i);
15691
15692 // Build TBL's table of bytes in 1, 2, 3, or 4 FP/SIMD registers using
15693 // shuffles over the source vector. Once TBL's maximum of 4 FP/SIMD registers
15694 // is saturated, call TBL and save the result for combining later.
15695 SmallVector<Value *> Results;
15696 while (ShuffleLanes.back() < NumElements) {
15697 Parts.push_back(Elt: Builder.CreateBitCast(
15698 V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy));
15699
15700 if (Parts.size() == 4) {
15701 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15702 Intrinsic::aarch64_neon_tbl4, VecTy);
15703 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
15704 Results.push_back(Elt: Builder.CreateCall(F, Parts));
15705 Parts.clear();
15706 }
15707
15708 for (int i = 0; i < ShuffleCount; ++i)
15709 ShuffleLanes[i] += ShuffleCount;
15710 }
15711
15712 assert((Parts.empty() || Results.empty()) &&
15713 "Lowering trunc for vectors requiring different TBL instructions is "
15714 "not supported!");
15715 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15716 // registers
15717 if (!Parts.empty()) {
15718 Intrinsic::ID TblID;
15719 switch (Parts.size()) {
15720 case 1:
15721 TblID = Intrinsic::aarch64_neon_tbl1;
15722 break;
15723 case 2:
15724 TblID = Intrinsic::aarch64_neon_tbl2;
15725 break;
15726 case 3:
15727 TblID = Intrinsic::aarch64_neon_tbl3;
15728 break;
15729 }
15730
15731 auto *F = Intrinsic::getDeclaration(M: TI->getModule(), id: TblID, Tys: VecTy);
15732 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
15733 Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts));
15734 }
15735
15736 // Extract the destination vector from TBL result(s) after combining them
15737 // where applicable. Currently, at most two TBLs are supported.
15738 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15739 "more than 2 tbl instructions!");
15740 Value *FinalResult = Results[0];
15741 if (Results.size() == 1) {
15742 if (ElemsPerTbl < 16) {
15743 SmallVector<int> FinalMask(ElemsPerTbl);
15744 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
15745 FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask);
15746 }
15747 } else {
15748 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15749 if (ElemsPerTbl < 16) {
15750 std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0);
15751 std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16);
15752 } else {
15753 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
15754 }
15755 FinalResult =
15756 Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask);
15757 }
15758
15759 TI->replaceAllUsesWith(V: FinalResult);
15760 TI->eraseFromParent();
15761}
15762
15763bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15764 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15765 // shuffle_vector instructions are serialized when targeting SVE,
15766 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15767 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15768 return false;
15769
15770 // Try to optimize conversions using tbl. This requires materializing constant
15771 // index vectors, which can increase code size and add loads. Skip the
15772 // transform unless the conversion is in a loop block guaranteed to execute
15773 // and we are not optimizing for size.
15774 Function *F = I->getParent()->getParent();
15775 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15776 F->hasOptSize())
15777 return false;
15778
15779 auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType());
15780 auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType());
15781 if (!SrcTy || !DstTy)
15782 return false;
15783
15784 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15785 // lowered to tbl instructions to insert the original i8 elements
15786 // into i8x lanes. This is enabled for cases where it is beneficial.
15787 auto *ZExt = dyn_cast<ZExtInst>(Val: I);
15788 if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
15789 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15790 if (DstWidth % 8 != 0)
15791 return false;
15792
15793 auto *TruncDstType =
15794 cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy));
15795 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15796 // the remaining ZExt folded into the user, don't use tbl lowering.
15797 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15798 if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType,
15799 CCH: TargetTransformInfo::getCastContextHint(I),
15800 CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
15801 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15802 return false;
15803
15804 DstTy = TruncDstType;
15805 }
15806
15807 return createTblShuffleForZExt(ZExt, DstTy, IsLittleEndian: Subtarget->isLittleEndian());
15808 }
15809
15810 auto *UIToFP = dyn_cast<UIToFPInst>(Val: I);
15811 if (UIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
15812 DstTy->getElementType()->isFloatTy()) {
15813 IRBuilder<> Builder(I);
15814 auto *ZExt = cast<ZExtInst>(
15815 Val: Builder.CreateZExt(V: I->getOperand(i: 0), DestTy: VectorType::getInteger(VTy: DstTy)));
15816 auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy);
15817 I->replaceAllUsesWith(V: UI);
15818 I->eraseFromParent();
15819 return createTblShuffleForZExt(ZExt, DstTy: cast<FixedVectorType>(Val: ZExt->getType()),
15820 IsLittleEndian: Subtarget->isLittleEndian());
15821 }
15822
15823 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15824 // followed by a truncate lowered using tbl.4.
15825 auto *FPToUI = dyn_cast<FPToUIInst>(Val: I);
15826 if (FPToUI &&
15827 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15828 SrcTy->getElementType()->isFloatTy() &&
15829 DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
15830 IRBuilder<> Builder(I);
15831 auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0),
15832 DestTy: VectorType::getInteger(VTy: SrcTy));
15833 auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy);
15834 I->replaceAllUsesWith(V: TruncI);
15835 I->eraseFromParent();
15836 createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian());
15837 return true;
15838 }
15839
15840 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15841 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15842 // per lane of the input, which is represented using 1, 2, 3 or 4 128-bit
15843 // table registers.
15844 auto *TI = dyn_cast<TruncInst>(Val: I);
15845 if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
15846 ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) ||
15847 SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) &&
15848 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15849 createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian());
15850 return true;
15851 }
15852
15853 return false;
15854}
15855
15856bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15857 Align &RequiredAligment) const {
15858 if (!LoadedType.isSimple() ||
15859 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15860 return false;
15861 // Cyclone supports unaligned accesses.
15862 RequiredAligment = Align(1);
15863 unsigned NumBits = LoadedType.getSizeInBits();
15864 return NumBits == 32 || NumBits == 64;
15865}
15866
15867/// A helper function for determining the number of interleaved accesses we
15868/// will generate when lowering accesses of the given type.
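/// For example (illustrative), a fixed-length <16 x i32> lowered with 128-bit
/// NEON vectors needs (16 * 32 + 127) / 128 = 4 interleaved accesses.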
15869unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15870 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15871 unsigned VecSize = 128;
15872 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
15873 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15874 if (UseScalable && isa<FixedVectorType>(Val: VecTy))
15875 VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
15876 return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize);
15877}
15878
15879MachineMemOperand::Flags
15880AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15881 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15882 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15883 return MOStridedAccess;
15884 return MachineMemOperand::MONone;
15885}
15886
15887bool AArch64TargetLowering::isLegalInterleavedAccessType(
15888 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15889 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
15890 auto EC = VecTy->getElementCount();
15891 unsigned MinElts = EC.getKnownMinValue();
15892
15893 UseScalable = false;
15894
15895 if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15896 return false;
15897
15898 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15899 return false;
15900
15901 // Ensure that the predicate for this number of elements is available.
15902 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15903 return false;
15904
15905 // Ensure the number of vector elements is greater than 1.
15906 if (MinElts < 2)
15907 return false;
15908
15909 // Ensure the element type is legal.
15910 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15911 return false;
15912
15913 if (EC.isScalable()) {
15914 UseScalable = true;
15915 return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0;
15916 }
15917
15918 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
15919 if (!Subtarget->isNeonAvailable() ||
15920 (Subtarget->useSVEForFixedLengthVectors() &&
15921 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
15922 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
15923 isPowerOf2_32(Value: MinElts) && VecSize > 128)))) {
15924 UseScalable = true;
15925 return true;
15926 }
15927
15928 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
15929 // 128 will be split into multiple interleaved accesses.
15930 return VecSize == 64 || VecSize % 128 == 0;
15931}
15932
15933static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
15934 if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext()))
15935 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
15936
15937 if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext()))
15938 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
15939
15940 if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext()))
15941 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15942
15943 if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext()))
15944 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15945
15946 if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext()))
15947 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
15948
15949 if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext()))
15950 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
15951
15952 if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext()))
15953 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
15954
15955 if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext()))
15956 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16);
15957
15958 llvm_unreachable("Cannot handle input vector type");
15959}
15960
15961static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
15962 bool Scalable, Type *LDVTy,
15963 Type *PtrTy) {
15964 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15965 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
15966 Intrinsic::aarch64_sve_ld3_sret,
15967 Intrinsic::aarch64_sve_ld4_sret};
15968 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
15969 Intrinsic::aarch64_neon_ld3,
15970 Intrinsic::aarch64_neon_ld4};
15971 if (Scalable)
15972 return Intrinsic::getDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy});
15973
15974 return Intrinsic::getDeclaration(M, id: NEONLoads[Factor - 2], Tys: {LDVTy, PtrTy});
15975}
15976
15977static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
15978 bool Scalable, Type *STVTy,
15979 Type *PtrTy) {
15980 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
15981 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
15982 Intrinsic::aarch64_sve_st3,
15983 Intrinsic::aarch64_sve_st4};
15984 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
15985 Intrinsic::aarch64_neon_st3,
15986 Intrinsic::aarch64_neon_st4};
15987 if (Scalable)
15988 return Intrinsic::getDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy});
15989
15990 return Intrinsic::getDeclaration(M, id: NEONStores[Factor - 2], Tys: {STVTy, PtrTy});
15991}
15992
15993/// Lower an interleaved load into a ldN intrinsic.
15994///
15995/// E.g. Lower an interleaved load (Factor = 2):
15996/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
15997/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
15998/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
15999///
16000/// Into:
16001/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16002/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16003/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16004bool AArch64TargetLowering::lowerInterleavedLoad(
16005 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16006 ArrayRef<unsigned> Indices, unsigned Factor) const {
16007 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16008 "Invalid interleave factor");
16009 assert(!Shuffles.empty() && "Empty shufflevector input");
16010 assert(Shuffles.size() == Indices.size() &&
16011 "Unmatched number of shufflevectors and indices");
16012
16013 const DataLayout &DL = LI->getModule()->getDataLayout();
16014
16015 VectorType *VTy = Shuffles[0]->getType();
16016
16017 // Skip if we do not have NEON or if the vector type is not legal. We can
16018 // "legalize" wide vector types into multiple interleaved accesses as long as
16019 // the vector types are divisible by 128.
16020 bool UseScalable;
16021 if (!Subtarget->hasNEON() ||
16022 !isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16023 return false;
16024
16025 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16026
16027 auto *FVTy = cast<FixedVectorType>(Val: VTy);
16028
16029 // A pointer vector cannot be the return type of the ldN intrinsics. We need
16030 // to load integer vectors first and then convert to pointer vectors.
16031 Type *EltTy = FVTy->getElementType();
16032 if (EltTy->isPointerTy())
16033 FVTy =
16034 FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements());
16035
16036 // If we're going to generate more than one load, reset the sub-vector type
16037 // to something legal.
16038 FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(),
16039 NumElts: FVTy->getNumElements() / NumLoads);
16040
16041 auto *LDVTy =
16042 UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy;
16043
16044 IRBuilder<> Builder(LI);
16045
16046 // The base address of the load.
16047 Value *BaseAddr = LI->getPointerOperand();
16048
16049 Type *PtrTy = LI->getPointerOperandType();
16050 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()),
16051 EC: LDVTy->getElementCount());
16052
16053 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
16054 Scalable: UseScalable, LDVTy, PtrTy);
16055
16056 // Holds sub-vectors extracted from the load intrinsic return values. The
16057 // sub-vectors are associated with the shufflevector instructions they will
16058 // replace.
16059 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16060
16061 Value *PTrue = nullptr;
16062 if (UseScalable) {
16063 std::optional<unsigned> PgPattern =
16064 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16065 if (Subtarget->getMinSVEVectorSizeInBits() ==
16066 Subtarget->getMaxSVEVectorSizeInBits() &&
16067 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16068 PgPattern = AArch64SVEPredPattern::all;
16069
16070 auto *PTruePat =
16071 ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern);
16072 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16073 {PTruePat});
16074 }
16075
16076 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16077
16078 // If we're generating more than one load, compute the base address of
16079 // subsequent loads as an offset from the previous.
16080 if (LoadCount > 0)
16081 BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr,
16082 Idx0: FVTy->getNumElements() * Factor);
16083
16084 CallInst *LdN;
16085 if (UseScalable)
16086 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN");
16087 else
16088 LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
16089
16090 // Extract and store the sub-vectors returned by the load intrinsic.
16091 for (unsigned i = 0; i < Shuffles.size(); i++) {
16092 ShuffleVectorInst *SVI = Shuffles[i];
16093 unsigned Index = Indices[i];
16094
16095 Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index);
16096
16097 if (UseScalable)
16098 SubVec = Builder.CreateExtractVector(
16099 DstType: FVTy, SrcVec: SubVec,
16100 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: VTy->getContext()), V: 0));
16101
16102 // Convert the integer vector to a pointer vector if the element type is a pointer.
16103 if (EltTy->isPointerTy())
16104 SubVec = Builder.CreateIntToPtr(
16105 V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(),
16106 NumElts: FVTy->getNumElements()));
16107
16108 SubVecs[SVI].push_back(Elt: SubVec);
16109 }
16110 }
16111
16112 // Replace uses of the shufflevector instructions with the sub-vectors
16113 // returned by the load intrinsic. If a shufflevector instruction is
16114 // associated with more than one sub-vector, those sub-vectors will be
16115 // concatenated into a single wide vector.
16116 for (ShuffleVectorInst *SVI : Shuffles) {
16117 auto &SubVec = SubVecs[SVI];
16118 auto *WideVec =
16119 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
16120 SVI->replaceAllUsesWith(V: WideVec);
16121 }
16122
16123 return true;
16124}
16125
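// Returns true if, within a small window of instructions around \p It, there
// is a store whose address is \p Ptr plus or minus 16 bytes, i.e. a store to
// the adjacent 16-byte chunk. Used below when deciding whether a 64-bit st2
// is better left as a zip plus paired stores.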
16126template <typename Iter>
16127bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16128 int MaxLookupDist = 20;
16129 unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0);
16130 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16131 const Value *PtrA1 =
16132 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
16133
16134 while (++It != End) {
16135 if (It->isDebugOrPseudoInst())
16136 continue;
16137 if (MaxLookupDist-- == 0)
16138 break;
16139 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16140 const Value *PtrB1 =
16141 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16142 DL, OffsetB);
16143 if (PtrA1 == PtrB1 &&
16144 (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth))
16145 .abs() == 16)
16146 return true;
16147 }
16148 }
16149
16150 return false;
16151}
16152
16153/// Lower an interleaved store into a stN intrinsic.
16154///
16155/// E.g. Lower an interleaved store (Factor = 3):
16156/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16157/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16158/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16159///
16160/// Into:
16161/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16162/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16163/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16164/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16165///
16166/// Note that the new shufflevectors will be removed and we'll only generate one
16167/// st3 instruction in CodeGen.
16168///
16169/// Example for a more general valid mask (Factor 3). Lower:
16170/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16171/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16172/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16173///
16174/// Into:
16175/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16176/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16177/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16178/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16179bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16180 ShuffleVectorInst *SVI,
16181 unsigned Factor) const {
16182
16183 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16184 "Invalid interleave factor");
16185
16186 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
16187 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16188
16189 unsigned LaneLen = VecTy->getNumElements() / Factor;
16190 Type *EltTy = VecTy->getElementType();
16191 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
16192
16193 const DataLayout &DL = SI->getModule()->getDataLayout();
16194 bool UseScalable;
16195
16196 // Skip if we do not have NEON or if the vector type is not legal. We can
16197 // "legalize" wide vector types into multiple interleaved accesses as long as
16198 // the vector types are divisible by 128.
16199 if (!Subtarget->hasNEON() ||
16200 !isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
16201 return false;
16202
16203 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
16204
16205 Value *Op0 = SVI->getOperand(i_nocapture: 0);
16206 Value *Op1 = SVI->getOperand(i_nocapture: 1);
16207 IRBuilder<> Builder(SI);
16208
16209 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16210 // vectors to integer vectors.
16211 if (EltTy->isPointerTy()) {
16212 Type *IntTy = DL.getIntPtrType(EltTy);
16213 unsigned NumOpElts =
16214 cast<FixedVectorType>(Val: Op0->getType())->getNumElements();
16215
16216 // Convert to the corresponding integer vector.
16217 auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts);
16218 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
16219 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
16220
16221 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
16222 }
16223
16224 // If we're going to generate more than one store, reset the lane length
16225 // and sub-vector type to something legal.
16226 LaneLen /= NumStores;
16227 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
16228
16229 auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy))
16230 : SubVecTy;
16231
16232 // The base address of the store.
16233 Value *BaseAddr = SI->getPointerOperand();
16234
16235 auto Mask = SVI->getShuffleMask();
16236
16237 // Sanity check: bail out if none of the mask indices are in range.
16238 // If the mask is `poison`, `Mask` may be a vector of -1s.
16239 // If all of them are `poison`, an out-of-bounds read would happen later.
16240 if (llvm::all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) {
16241 return false;
16242 }
16243 // A 64-bit st2 which does not start at element 0 will involve adding extra
16244 // ext elements, making the st2 unprofitable. In addition, if there is a
16245 // nearby store that points to BaseAddr+16 or BaseAddr-16, it can be better
16246 // left as a zip;stp pair, which has higher throughput.
16247 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16248 (Mask[0] != 0 ||
16249 hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr,
16250 DL) ||
16251 hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(),
16252 Ptr: BaseAddr, DL)))
16253 return false;
16254
16255 Type *PtrTy = SI->getPointerOperandType();
16256 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()),
16257 EC: STVTy->getElementCount());
16258
16259 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
16260 Scalable: UseScalable, STVTy, PtrTy);
16261
16262 Value *PTrue = nullptr;
16263 if (UseScalable) {
16264 std::optional<unsigned> PgPattern =
16265 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16266 if (Subtarget->getMinSVEVectorSizeInBits() ==
16267 Subtarget->getMaxSVEVectorSizeInBits() &&
16268 Subtarget->getMinSVEVectorSizeInBits() ==
16269 DL.getTypeSizeInBits(SubVecTy))
16270 PgPattern = AArch64SVEPredPattern::all;
16271
16272 auto *PTruePat =
16273 ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern);
16274 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16275 {PTruePat});
16276 }
16277
16278 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16279
16280 SmallVector<Value *, 5> Ops;
16281
16282 // Split the shufflevector operands into sub vectors for the new stN call.
16283 for (unsigned i = 0; i < Factor; i++) {
16284 Value *Shuffle;
16285 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16286 if (Mask[IdxI] >= 0) {
16287 Shuffle = Builder.CreateShuffleVector(
16288 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0));
16289 } else {
16290 unsigned StartMask = 0;
16291 for (unsigned j = 1; j < LaneLen; j++) {
16292 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16293 if (Mask[IdxJ] >= 0) {
16294 StartMask = Mask[IdxJ] - j;
16295 break;
16296 }
16297 }
16298 // Note: Filling undef gaps with random elements is ok, since
16299 // those elements were being written anyway (with undefs).
16300 // In the case of all undefs we default to using elements from 0.
16301 // Note: StartMask cannot be negative; it is checked in
16302 // isReInterleaveMask.
16303 Shuffle = Builder.CreateShuffleVector(
16304 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0));
16305 }
16306
16307 if (UseScalable)
16308 Shuffle = Builder.CreateInsertVector(
16309 DstType: STVTy, SrcVec: UndefValue::get(T: STVTy), SubVec: Shuffle,
16310 Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: STVTy->getContext()), V: 0));
16311
16312 Ops.push_back(Elt: Shuffle);
16313 }
16314
16315 if (UseScalable)
16316 Ops.push_back(Elt: PTrue);
16317
16318 // If we're generating more than one store, compute the base address of
16319 // subsequent stores as an offset from the previous one.
16320 if (StoreCount > 0)
16321 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
16322 Ptr: BaseAddr, Idx0: LaneLen * Factor);
16323
16324 Ops.push_back(Elt: BaseAddr);
16325 Builder.CreateCall(Callee: StNFunc, Args: Ops);
16326 }
16327 return true;
16328}
16329
16330bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16331 IntrinsicInst *DI, LoadInst *LI) const {
16332 // Only deinterleave2 supported at present.
16333 if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
16334 return false;
16335
16336 // Only a factor of 2 supported at present.
16337 const unsigned Factor = 2;
16338
16339 VectorType *VTy = cast<VectorType>(Val: DI->getType()->getContainedType(i: 0));
16340 const DataLayout &DL = DI->getModule()->getDataLayout();
16341 bool UseScalable;
16342 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16343 return false;
16344
16345 // TODO: Add support for using SVE instructions with fixed types later, using
16346 // the code from lowerInterleavedLoad to obtain the correct container type.
16347 if (UseScalable && !VTy->isScalableTy())
16348 return false;
16349
16350 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16351
16352 VectorType *LdTy =
16353 VectorType::get(ElementType: VTy->getElementType(),
16354 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads));
16355
16356 Type *PtrTy = LI->getPointerOperandType();
16357 Function *LdNFunc = getStructuredLoadFunction(M: DI->getModule(), Factor,
16358 Scalable: UseScalable, LDVTy: LdTy, PtrTy);
16359
16360 IRBuilder<> Builder(LI);
16361
16362 Value *Pred = nullptr;
16363 if (UseScalable)
16364 Pred =
16365 Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue());
16366
16367 Value *BaseAddr = LI->getPointerOperand();
16368 Value *Result;
16369 if (NumLoads > 1) {
16370 Value *Left = PoisonValue::get(T: VTy);
16371 Value *Right = PoisonValue::get(T: VTy);
16372
16373 for (unsigned I = 0; I < NumLoads; ++I) {
16374 Value *Offset = Builder.getInt64(C: I * Factor);
16375
16376 Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset});
16377 Value *LdN = nullptr;
16378 if (UseScalable)
16379 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN");
16380 else
16381 LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN");
16382
16383 Value *Idx =
16384 Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue());
16385 Left = Builder.CreateInsertVector(
16386 DstType: VTy, SrcVec: Left, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 0), Idx);
16387 Right = Builder.CreateInsertVector(
16388 DstType: VTy, SrcVec: Right, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 1), Idx);
16389 }
16390
16391 Result = PoisonValue::get(T: DI->getType());
16392 Result = Builder.CreateInsertValue(Agg: Result, Val: Left, Idxs: 0);
16393 Result = Builder.CreateInsertValue(Agg: Result, Val: Right, Idxs: 1);
16394 } else {
16395 if (UseScalable)
16396 Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN");
16397 else
16398 Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
16399 }
16400
16401 DI->replaceAllUsesWith(V: Result);
16402 return true;
16403}
16404
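// A rough sketch of the matching store-side transformation (intrinsic names
// and exact types are illustrative): the pair
//   %il = call <vscale x 8 x i32>
//       @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a,
//                                                     <vscale x 4 x i32> %b)
//   store <vscale x 8 x i32> %il, ptr %p
// becomes a single structured store along the lines of
//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
//       <vscale x 4 x i32> %b, <vscale x 4 x i1> %pg, ptr %p)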
16405bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16406 IntrinsicInst *II, StoreInst *SI) const {
16407 // Only interleave2 supported at present.
16408 if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
16409 return false;
16410
16411 // Only a factor of 2 supported at present.
16412 const unsigned Factor = 2;
16413
16414 VectorType *VTy = cast<VectorType>(Val: II->getOperand(i_nocapture: 0)->getType());
16415 const DataLayout &DL = II->getModule()->getDataLayout();
16416 bool UseScalable;
16417 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
16418 return false;
16419
16420 // TODO: Add support for using SVE instructions with fixed types later, using
16421 // the code from lowerInterleavedStore to obtain the correct container type.
16422 if (UseScalable && !VTy->isScalableTy())
16423 return false;
16424
16425 unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
16426
16427 VectorType *StTy =
16428 VectorType::get(ElementType: VTy->getElementType(),
16429 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores));
16430
16431 Type *PtrTy = SI->getPointerOperandType();
16432 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
16433 Scalable: UseScalable, STVTy: StTy, PtrTy);
16434
16435 IRBuilder<> Builder(SI);
16436
16437 Value *BaseAddr = SI->getPointerOperand();
16438 Value *Pred = nullptr;
16439
16440 if (UseScalable)
16441 Pred =
16442 Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue());
16443
16444 Value *L = II->getOperand(i_nocapture: 0);
16445 Value *R = II->getOperand(i_nocapture: 1);
16446
16447 for (unsigned I = 0; I < NumStores; ++I) {
16448 Value *Address = BaseAddr;
16449 if (NumStores > 1) {
16450 Value *Offset = Builder.getInt64(C: I * Factor);
16451 Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset});
16452
16453 Value *Idx =
16454 Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue());
16455 L = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 0), Idx);
16456 R = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 1), Idx);
16457 }
16458
16459 if (UseScalable)
16460 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Pred, Address});
16461 else
16462 Builder.CreateCall(Callee: StNFunc, Args: {L, R, Address});
16463 }
16464
16465 return true;
16466}
16467
16468EVT AArch64TargetLowering::getOptimalMemOpType(
16469 const MemOp &Op, const AttributeList &FuncAttributes) const {
16470 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16471 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16472 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one
  // store (with a restrictive addressing mode), so plain i64 stores are
  // preferable.
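  // For example, a 16-byte memset is lowered with plain XZR stores, while an
  // aligned 32-byte (or larger) memset can profitably use q-register stores.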
16476 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16477 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16478 if (Op.isAligned(AlignCheck))
16479 return true;
16480 unsigned Fast;
16481 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
16482 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
16483 Fast;
16484 };
16485
16486 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16487 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16488 return MVT::v16i8;
16489 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16490 return MVT::f128;
16491 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16492 return MVT::i64;
16493 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16494 return MVT::i32;
16495 return MVT::Other;
16496}
16497
16498LLT AArch64TargetLowering::getOptimalMemOpLLT(
16499 const MemOp &Op, const AttributeList &FuncAttributes) const {
16500 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16501 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16502 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
  // it would take one instruction to materialize the v2i64 zero plus one
  // store (with a restrictive addressing mode), so plain i64 stores are
  // preferable.
16506 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16507 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16508 if (Op.isAligned(AlignCheck))
16509 return true;
16510 unsigned Fast;
16511 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
16512 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
16513 Fast;
16514 };
16515
16516 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16517 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16518 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
16519 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16520 return LLT::scalar(SizeInBits: 128);
16521 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16522 return LLT::scalar(SizeInBits: 64);
16523 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16524 return LLT::scalar(SizeInBits: 32);
16525 return LLT();
16526}
16527
16528// 12-bit optionally shifted immediates are legal for adds.
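// For example, 0xfff and 0x555000 are legal (the latter is 0x555 shifted left
// by 12), while 0x555001 is not, since it has non-zero bits in both the low
// and the high 12-bit halves.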
16529bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16530 if (Immed == std::numeric_limits<int64_t>::min()) {
16531 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16532 << ": avoid UB for INT64_MIN\n");
16533 return false;
16534 }
16535 // Same encoding for add/sub, just flip the sign.
16536 Immed = std::abs(i: Immed);
16537 bool IsLegal = ((Immed >> 12) == 0 ||
16538 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16539 LLVM_DEBUG(dbgs() << "Is " << Immed
16540 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16541 return IsLegal;
16542}
16543
16544bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16545 // We will only emit addvl/inc* instructions for SVE2
16546 if (!Subtarget->hasSVE2())
16547 return false;
16548
  // addvl's immediates are in terms of the number of bytes in a register.
  // Since there are 16 bytes in the base supported size (128 bits), we need
  // to divide the immediate by that much to give us a useful immediate to
  // multiply by vscale. We can't have a remainder as a result of this.
16553 if (Imm % 16 == 0)
16554 return isInt<6>(x: Imm / 16);
16555
16556 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16557 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16558 // of addvl as a result, so only take h|w|d into account.
16559 // Dec[h|w|d] will cover subtractions.
16560 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16561 // FIXME: Can we make use of other patterns to cover other immediates?
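  // For example, Imm = 64 can be handled with addvl #4 (64 / 16), while
  // Imm = 40 is not a multiple of 16 but fits inch with a multiplier of 5
  // (40 / 8).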
16562
16563 // inch|dech
16564 if (Imm % 8 == 0)
16565 return std::labs(x: Imm / 8) <= 16;
16566 // incw|decw
16567 if (Imm % 4 == 0)
16568 return std::labs(x: Imm / 4) <= 16;
16569 // incd|decd
16570 if (Imm % 2 == 0)
16571 return std::labs(x: Imm / 2) <= 16;
16572
16573 return false;
16574}
16575
16576// Return false to prevent folding
16577// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16578// if the folding leads to worse code.
16579bool AArch64TargetLowering::isMulAddWithConstProfitable(
16580 SDValue AddNode, SDValue ConstNode) const {
16581 // Let the DAGCombiner decide for vector types and large types.
16582 const EVT VT = AddNode.getValueType();
16583 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16584 return true;
16585
  // The fold is worse if c1 is a legal add immediate while c1*c2 is not and
  // has to be materialized with at least two instructions.
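  // For example, with c1 = 1 and c2 = 0x10001, c1 is a legal add immediate
  // but c1*c2 = 0x10001 is not and needs a MOVZ+MOVK pair to materialize, so
  // the fold is rejected below.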
16588 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
16589 const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode);
16590 const int64_t C1 = C1Node->getSExtValue();
16591 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16592 if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue()))
16593 return true;
16594 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16595 // Adapt to the width of a register.
16596 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16597 AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn);
16598 if (Insn.size() > 1)
16599 return false;
16600
16601 // Default to true and let the DAGCombiner decide.
16602 return true;
16603}
16604
16605// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16606// immediates is the same as for an add or a sub.
16607bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16608 return isLegalAddImmediate(Immed);
16609}
16610
16611/// isLegalAddressingMode - Return true if the addressing mode represented
16612/// by AM is legal for this target, for a load/store of the specified type.
16613bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16614 const AddrMode &AMode, Type *Ty,
16615 unsigned AS, Instruction *I) const {
16616 // AArch64 has five basic addressing modes:
16617 // reg
16618 // reg + 9-bit signed offset
16619 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16620 // reg1 + reg2
16621 // reg + SIZE_IN_BYTES * reg
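  // Illustrative instruction forms for these modes (for a 64-bit load):
  //   ldr  x0, [x1]               // reg
  //   ldur x0, [x1, #-17]         // reg + 9-bit signed offset
  //   ldr  x0, [x1, #4088]        // reg + 8 * 12-bit unsigned offset
  //   ldr  x0, [x1, x2]           // reg1 + reg2
  //   ldr  x0, [x1, x2, lsl #3]   // reg + 8 * reg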
16622
16623 // No global is ever allowed as a base.
16624 if (AMode.BaseGV)
16625 return false;
16626
16627 // No reg+reg+imm addressing.
16628 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16629 return false;
16630
16631 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16632 // `2*ScaledReg` into `BaseReg + ScaledReg`
16633 AddrMode AM = AMode;
16634 if (AM.Scale && !AM.HasBaseReg) {
16635 if (AM.Scale == 1) {
16636 AM.HasBaseReg = true;
16637 AM.Scale = 0;
16638 } else if (AM.Scale == 2) {
16639 AM.HasBaseReg = true;
16640 AM.Scale = 1;
16641 } else {
16642 return false;
16643 }
16644 }
16645
16646 // A base register is required in all addressing modes.
16647 if (!AM.HasBaseReg)
16648 return false;
16649
16650 if (Ty->isScalableTy()) {
16651 if (isa<ScalableVectorType>(Val: Ty)) {
16652 // See if we have a foldable vscale-based offset, for vector types which
16653 // are either legal or smaller than the minimum; more work will be
16654 // required if we need to consider addressing for types which need
16655 // legalization by splitting.
16656 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16657 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16658 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16659 isPowerOf2_64(Value: VecNumBytes))
16660 return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes);
16661
16662 uint64_t VecElemNumBytes =
16663 DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8;
16664 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16665 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16666 }
16667
16668 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16669 }
16670
16671 // No scalable offsets allowed for non-scalable types.
16672 if (AM.ScalableOffset)
16673 return false;
16674
  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16677 uint64_t NumBytes = 0;
16678 if (Ty->isSized()) {
16679 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16680 NumBytes = NumBits / 8;
16681 if (!isPowerOf2_64(Value: NumBits))
16682 NumBytes = 0;
16683 }
16684
16685 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs,
16686 Scale: AM.Scale);
16687}
16688
// Check whether the two offsets belong to the same imm24 range and share the
// same high 12 bits; if so, the common high part can be materialized with a
// single add immediate and rebased out of the individual offsets.
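// For example, offsets 0x1008 and 0x1f40 share the high part 0x1000, which is
// a legal add immediate, so 0x1000 is returned and the accesses can then use
// small immediate offsets from the rebased pointer.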
16691int64_t
16692AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16693 int64_t MaxOffset) const {
16694 int64_t HighPart = MinOffset & ~0xfffULL;
16695 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) {
16696 // Rebase the value to an integer multiple of imm12.
16697 return HighPart;
16698 }
16699
16700 return 0;
16701}
16702
16703bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16704 // Consider splitting large offset of struct or array.
16705 return true;
16706}
16707
16708bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16709 const MachineFunction &MF, EVT VT) const {
16710 VT = VT.getScalarType();
16711
16712 if (!VT.isSimple())
16713 return false;
16714
16715 switch (VT.getSimpleVT().SimpleTy) {
16716 case MVT::f16:
16717 return Subtarget->hasFullFP16();
16718 case MVT::f32:
16719 case MVT::f64:
16720 return true;
16721 default:
16722 break;
16723 }
16724
16725 return false;
16726}
16727
16728bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16729 Type *Ty) const {
16730 switch (Ty->getScalarType()->getTypeID()) {
16731 case Type::FloatTyID:
16732 case Type::DoubleTyID:
16733 return true;
16734 default:
16735 return false;
16736 }
16737}
16738
16739bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16740 EVT VT, CodeGenOptLevel OptLevel) const {
16741 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16742 !useSVEForFixedLengthVectorVT(VT);
16743}
16744
16745const MCPhysReg *
16746AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16747 // LR is a callee-save register, but we must treat it as clobbered by any call
16748 // site. Hence we include LR in the scratch registers, which are in turn added
16749 // as implicit-defs for stackmaps and patchpoints.
16750 static const MCPhysReg ScratchRegs[] = {
16751 AArch64::X16, AArch64::X17, AArch64::LR, 0
16752 };
16753 return ScratchRegs;
16754}
16755
16756ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16757 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16758 return RCRegs;
16759}
16760
16761bool
16762AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16763 CombineLevel Level) const {
16764 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16765 N->getOpcode() == ISD::SRL) &&
16766 "Expected shift op");
16767
16768 SDValue ShiftLHS = N->getOperand(Num: 0);
16769 EVT VT = N->getValueType(ResNo: 0);
16770
16771 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16772 // combine it with shift 'N' to let it be lowered to UBFX except:
16773 // ((x >> C) & mask) << C.
16774 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16775 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16776 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1);
16777 if (isMask_64(Value: TruncMask)) {
16778 SDValue AndLHS = ShiftLHS.getOperand(i: 0);
16779 if (AndLHS.getOpcode() == ISD::SRL) {
16780 if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) {
16781 if (N->getOpcode() == ISD::SHL)
16782 if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)))
16783 return SRLC->getZExtValue() == SHLC->getZExtValue();
16784 return false;
16785 }
16786 }
16787 }
16788 }
16789 return true;
16790}
16791
16792bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16793 const SDNode *N) const {
16794 assert(N->getOpcode() == ISD::XOR &&
16795 (N->getOperand(0).getOpcode() == ISD::SHL ||
16796 N->getOperand(0).getOpcode() == ISD::SRL) &&
16797 "Expected XOR(SHIFT) pattern");
16798
16799 // Only commute if the entire NOT mask is a hidden shifted mask.
16800 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
16801 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
16802 if (XorC && ShiftC) {
16803 unsigned MaskIdx, MaskLen;
16804 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16805 unsigned ShiftAmt = ShiftC->getZExtValue();
16806 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
16807 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
16808 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16809 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16810 }
16811 }
16812
16813 return false;
16814}
16815
16816bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16817 const SDNode *N, CombineLevel Level) const {
16818 assert(((N->getOpcode() == ISD::SHL &&
16819 N->getOperand(0).getOpcode() == ISD::SRL) ||
16820 (N->getOpcode() == ISD::SRL &&
16821 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16822 "Expected shift-shift mask");
16823 // Don't allow multiuse shift folding with the same shift amount.
16824 if (!N->getOperand(Num: 0)->hasOneUse())
16825 return false;
16826
16827 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16828 EVT VT = N->getValueType(ResNo: 0);
16829 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16830 auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
16831 auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
16832 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16833 }
16834
16835 return true;
16836}
16837
16838bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16839 unsigned BinOpcode, EVT VT) const {
16840 return VT.isScalableVector() && isTypeLegal(VT);
16841}
16842
16843bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16844 Type *Ty) const {
16845 assert(Ty->isIntegerTy());
16846
16847 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16848 if (BitSize == 0)
16849 return false;
16850
16851 int64_t Val = Imm.getSExtValue();
16852 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize))
16853 return true;
16854
16855 if ((int64_t)Val < 0)
16856 Val = ~Val;
16857 if (BitSize == 32)
16858 Val &= (1LL << 32) - 1;
16859
16860 unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16;
16861 // MOVZ is free so return true for one or fewer MOVK.
16862 return Shift < 3;
16863}
16864
16865bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16866 unsigned Index) const {
16867 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
16868 return false;
16869
16870 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16871}
16872
16873/// Turn vector tests of the signbit in the form of:
16874/// xor (sra X, elt_size(X)-1), -1
16875/// into:
16876/// cmge X, X, #0
16877static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16878 const AArch64Subtarget *Subtarget) {
16879 EVT VT = N->getValueType(ResNo: 0);
16880 if (!Subtarget->hasNEON() || !VT.isVector())
16881 return SDValue();
16882
16883 // There must be a shift right algebraic before the xor, and the xor must be a
16884 // 'not' operation.
16885 SDValue Shift = N->getOperand(Num: 0);
16886 SDValue Ones = N->getOperand(Num: 1);
16887 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16888 !ISD::isBuildVectorAllOnes(N: Ones.getNode()))
16889 return SDValue();
16890
16891 // The shift should be smearing the sign bit across each vector element.
16892 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
16893 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16894 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16895 return SDValue();
16896
16897 return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: SDLoc(N), VT, Operand: Shift.getOperand(i: 0));
16898}
16899
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
16902//
16903// i32 vecreduce_add(
16904// v16i32 abs(
16905// v16i32 sub(
16906// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16907// =================>
16908// i32 vecreduce_add(
16909// v4i32 UADDLP(
16910// v8i16 add(
16911// v8i16 zext(
16912// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16913// v8i16 zext(
16914// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
16915static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
16916 SelectionDAG &DAG) {
16917 // Assumed i32 vecreduce_add
16918 if (N->getValueType(0) != MVT::i32)
16919 return SDValue();
16920
16921 SDValue VecReduceOp0 = N->getOperand(Num: 0);
16922 unsigned Opcode = VecReduceOp0.getOpcode();
16923 // Assumed v16i32 abs
16924 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
16925 return SDValue();
16926
16927 SDValue ABS = VecReduceOp0;
16928 // Assumed v16i32 sub
16929 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
16930 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
16931 return SDValue();
16932
16933 SDValue SUB = ABS->getOperand(Num: 0);
16934 unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode();
16935 unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode();
16936 // Assumed v16i32 type
16937 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
16938 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
16939 return SDValue();
16940
16941 // Assumed zext or sext
16942 bool IsZExt = false;
16943 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
16944 IsZExt = true;
16945 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
16946 IsZExt = false;
16947 } else
16948 return SDValue();
16949
16950 SDValue EXT0 = SUB->getOperand(Num: 0);
16951 SDValue EXT1 = SUB->getOperand(Num: 1);
16952 // Assumed zext's operand has v16i8 type
16953 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
16954 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
16955 return SDValue();
16956
  // Pattern is detected. Let's convert it to a sequence of nodes.
16958 SDLoc DL(N);
16959
16960 // First, create the node pattern of UABD/SABD.
16961 SDValue UABDHigh8Op0 =
16962 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16963 DAG.getConstant(8, DL, MVT::i64));
16964 SDValue UABDHigh8Op1 =
16965 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16966 DAG.getConstant(8, DL, MVT::i64));
16967 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16968 UABDHigh8Op0, UABDHigh8Op1);
16969 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
16970
16971 // Second, create the node pattern of UABAL.
16972 SDValue UABDLo8Op0 =
16973 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
16974 DAG.getConstant(0, DL, MVT::i64));
16975 SDValue UABDLo8Op1 =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
16977 DAG.getConstant(0, DL, MVT::i64));
16978 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
16979 UABDLo8Op0, UABDLo8Op1);
16980 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
16981 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
16982
16983 // Third, create the node of UADDLP.
16984 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
16985
16986 // Fourth, create the node of VECREDUCE_ADD.
16987 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
16988}
16989
16990// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
16991// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
16992// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
// If we have vectors larger than v16i8 we extract v16i8 vectors,
// follow the same steps above to get DOT instructions, concatenate them,
// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
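// For example (types illustrative), "vecreduce.add(zext v16i8 %a to v16i32)"
// becomes "vecreduce.add(UDOT(zeroinitializer, %a, splat(i8 1)))", and for
// the mul-of-extends form the second extend's operand replaces the all-ones
// splat as the third DOT operand.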
16996static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
16997 const AArch64Subtarget *ST) {
16998 if (!ST->hasDotProd())
16999 return performVecReduceAddCombineWithUADDLP(N, DAG);
17000
17001 SDValue Op0 = N->getOperand(Num: 0);
17002 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17003 Op0.getValueType().getVectorElementType() != MVT::i32)
17004 return SDValue();
17005
17006 unsigned ExtOpcode = Op0.getOpcode();
17007 SDValue A = Op0;
17008 SDValue B;
17009 if (ExtOpcode == ISD::MUL) {
17010 A = Op0.getOperand(i: 0);
17011 B = Op0.getOperand(i: 1);
17012 if (A.getOpcode() != B.getOpcode() ||
17013 A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType())
17014 return SDValue();
17015 ExtOpcode = A.getOpcode();
17016 }
17017 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17018 return SDValue();
17019
17020 EVT Op0VT = A.getOperand(i: 0).getValueType();
17021 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17022 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17023 if (!IsValidElementCount || !IsValidSize)
17024 return SDValue();
17025
17026 SDLoc DL(Op0);
  // For non-MLA reductions B can be set to 1. For MLA we use the operand of
  // the extend as B.
17029 if (!B)
17030 B = DAG.getConstant(Val: 1, DL, VT: Op0VT);
17031 else
17032 B = B.getOperand(i: 0);
17033
17034 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17035 unsigned NumOfVecReduce;
17036 EVT TargetType;
17037 if (IsMultipleOf16) {
17038 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17039 TargetType = MVT::v4i32;
17040 } else {
17041 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17042 TargetType = MVT::v2i32;
17043 }
17044 auto DotOpcode =
17045 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17046 // Handle the case where we need to generate only one Dot operation.
17047 if (NumOfVecReduce == 1) {
17048 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType);
17049 SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros,
17050 N2: A.getOperand(i: 0), N3: B);
17051 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
17052 }
17053 // Generate Dot instructions that are multiple of 16.
17054 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17055 SmallVector<SDValue, 4> SDotVec16;
17056 unsigned I = 0;
17057 for (; I < VecReduce16Num; I += 1) {
17058 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17059 SDValue Op0 =
17060 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17061 DAG.getConstant(I * 16, DL, MVT::i64));
17062 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17063 DAG.getConstant(I * 16, DL, MVT::i64));
17064 SDValue Dot =
17065 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1);
17066 SDotVec16.push_back(Elt: Dot);
17067 }
17068 // Concatenate dot operations.
17069 EVT SDot16EVT =
17070 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17071 SDValue ConcatSDot16 =
17072 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16);
17073 SDValue VecReduceAdd16 =
17074 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16);
17075 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17076 if (VecReduce8Num == 0)
17077 return VecReduceAdd16;
17078
17079 // Generate the remainder Dot operation that is multiple of 8.
17080 SmallVector<SDValue, 4> SDotVec8;
17081 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17082 SDValue Vec8Op0 =
17083 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17084 DAG.getConstant(I * 16, DL, MVT::i64));
17085 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17086 DAG.getConstant(I * 16, DL, MVT::i64));
17087 SDValue Dot =
17088 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1);
  SDValue VecReduceAdd8 =
      DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
                     VecReduceAdd8);
17093}
17094
17095// Given an (integer) vecreduce, we know the order of the inputs does not
17096// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17097// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17098// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
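// For example, with x : v16i8,
//   UADDV(add(zext(v8i8 extract_lo(x)), zext(v8i8 extract_hi(x))))
// becomes UADDV(UADDLP(x)), where UADDLP produces a v8i16.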
17099static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17100 auto DetectAddExtract = [&](SDValue A) {
17101 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17102 // UADDLP(x) if found.
17103 assert(A.getOpcode() == ISD::ADD);
17104 EVT VT = A.getValueType();
17105 SDValue Op0 = A.getOperand(i: 0);
17106 SDValue Op1 = A.getOperand(i: 1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
17108 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17109 Op0.getOpcode() != ISD::SIGN_EXTEND))
17110 return SDValue();
17111 SDValue Ext0 = Op0.getOperand(i: 0);
17112 SDValue Ext1 = Op1.getOperand(i: 0);
17113 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17114 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17115 Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0))
17116 return SDValue();
    // Check that the type is twice the add types, and the extracts are from
    // the upper/lower parts of the same source.
17119 if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() !=
17120 VT.getVectorNumElements() * 2)
17121 return SDValue();
17122 if ((Ext0.getConstantOperandVal(i: 1) != 0 ||
17123 Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) &&
17124 (Ext1.getConstantOperandVal(i: 1) != 0 ||
17125 Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements()))
17126 return SDValue();
17127 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17128 : AArch64ISD::SADDLP;
17129 return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0));
17130 };
17131
17132 if (SDValue R = DetectAddExtract(A))
17133 return R;
17134
17135 if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse())
17136 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG))
17137 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17138 N2: A.getOperand(i: 1));
17139 if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse())
17140 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG))
17141 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
17142 N2: A.getOperand(i: 0));
17143 return SDValue();
17144}
17145
17146// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17147// UADDLV(concat), where the concat represents the 64-bit zext sources.
17148static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17149 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17150 // UADDLV(concat(zext, zext)) if found.
17151 assert(A.getOpcode() == ISD::ADD);
17152 EVT VT = A.getValueType();
17153 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17154 return SDValue();
17155 SDValue Op0 = A.getOperand(i: 0);
17156 SDValue Op1 = A.getOperand(i: 1);
17157 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17158 return SDValue();
17159 SDValue Ext0 = Op0.getOperand(i: 0);
17160 SDValue Ext1 = Op1.getOperand(i: 0);
17161 EVT ExtVT0 = Ext0.getValueType();
17162 EVT ExtVT1 = Ext1.getValueType();
  // Check that the zext source VTs are the same and 64 bits in length.
17164 if (ExtVT0 != ExtVT1 ||
17165 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17166 return SDValue();
17167 // Get VT for concat of zext sources.
17168 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
17169 SDValue Concat =
17170 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1);
17171
17172 switch (VT.getSimpleVT().SimpleTy) {
17173 case MVT::v2i64:
17174 case MVT::v4i32:
17175 return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat);
17176 case MVT::v8i16: {
17177 SDValue Uaddlv =
17178 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17179 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17180 }
17181 default:
17182 llvm_unreachable("Unhandled vector type");
17183 }
17184}
17185
17186static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17187 SDValue A = N->getOperand(Num: 0);
17188 if (A.getOpcode() == ISD::ADD) {
17189 if (SDValue R = performUADDVAddCombine(A, DAG))
17190 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R);
17191 else if (SDValue R = performUADDVZextCombine(A, DAG))
17192 return R;
17193 }
17194 return SDValue();
17195}
17196
17197static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17198 TargetLowering::DAGCombinerInfo &DCI,
17199 const AArch64Subtarget *Subtarget) {
17200 if (DCI.isBeforeLegalizeOps())
17201 return SDValue();
17202
17203 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17204}
17205
17206SDValue
17207AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17208 SelectionDAG &DAG,
17209 SmallVectorImpl<SDNode *> &Created) const {
17210 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17211 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17212 return SDValue(N,0); // Lower SDIV as SDIV
17213
17214 EVT VT = N->getValueType(ResNo: 0);
17215
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17218 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17219 return SDValue(N, 0);
17220
17221 // fold (sdiv X, pow2)
17222 if ((VT != MVT::i32 && VT != MVT::i64) ||
17223 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17224 return SDValue();
17225
17226 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17227}
17228
17229SDValue
17230AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17231 SelectionDAG &DAG,
17232 SmallVectorImpl<SDNode *> &Created) const {
17233 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17234 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
17235 return SDValue(N, 0); // Lower SREM as SREM
17236
17237 EVT VT = N->getValueType(ResNo: 0);
17238
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17241 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17242 return SDValue(N, 0);
17243
17244 // fold (srem X, pow2)
17245 if ((VT != MVT::i32 && VT != MVT::i64) ||
17246 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17247 return SDValue();
17248
17249 unsigned Lg2 = Divisor.countr_zero();
17250 if (Lg2 == 0)
17251 return SDValue();
17252
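  // For srem by 8, the expansion below yields roughly:
  //   negs  w1, w0
  //   and   w0, w0, #7
  //   and   w1, w1, #7
  //   csneg w0, w0, w1, mi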
17253 SDLoc DL(N);
17254 SDValue N0 = N->getOperand(Num: 0);
17255 SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT);
17256 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
17257 SDValue CCVal, CSNeg;
17258 if (Lg2 == 1) {
17259 SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, dl: DL);
17260 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
17261 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp);
17262
17263 Created.push_back(Elt: Cmp.getNode());
17264 Created.push_back(Elt: And.getNode());
17265 } else {
17266 SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC);
17267 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17268
17269 SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0);
17270 SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
17271 SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne);
17272 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal,
17273 N4: Negs.getValue(R: 1));
17274
17275 Created.push_back(Elt: Negs.getNode());
17276 Created.push_back(Elt: AndPos.getNode());
17277 Created.push_back(Elt: AndNeg.getNode());
17278 }
17279
17280 return CSNeg;
17281}
17282
17283static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17284 switch(getIntrinsicID(N: S.getNode())) {
17285 default:
17286 break;
17287 case Intrinsic::aarch64_sve_cntb:
17288 return 8;
17289 case Intrinsic::aarch64_sve_cnth:
17290 return 16;
17291 case Intrinsic::aarch64_sve_cntw:
17292 return 32;
17293 case Intrinsic::aarch64_sve_cntd:
17294 return 64;
17295 }
17296 return {};
17297}
17298
17299/// Calculates what the pre-extend type is, based on the extension
17300/// operation node provided by \p Extend.
17301///
17302/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17303/// pre-extend type is pulled directly from the operand, while other extend
17304/// operations need a bit more inspection to get this information.
17305///
17306/// \param Extend The SDNode from the DAG that represents the extend operation
17307///
17308/// \returns The type representing the \p Extend source type, or \p MVT::Other
17309/// if no valid type can be determined
17310static EVT calculatePreExtendType(SDValue Extend) {
17311 switch (Extend.getOpcode()) {
17312 case ISD::SIGN_EXTEND:
17313 case ISD::ZERO_EXTEND:
17314 return Extend.getOperand(i: 0).getValueType();
17315 case ISD::AssertSext:
17316 case ISD::AssertZext:
17317 case ISD::SIGN_EXTEND_INREG: {
17318 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1));
17319 if (!TypeNode)
17320 return MVT::Other;
17321 return TypeNode->getVT();
17322 }
17323 case ISD::AND: {
17324 ConstantSDNode *Constant =
17325 dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode());
17326 if (!Constant)
17327 return MVT::Other;
17328
17329 uint32_t Mask = Constant->getZExtValue();
17330
17331 if (Mask == UCHAR_MAX)
17332 return MVT::i8;
17333 else if (Mask == USHRT_MAX)
17334 return MVT::i16;
17335 else if (Mask == UINT_MAX)
17336 return MVT::i32;
17337
17338 return MVT::Other;
17339 }
17340 default:
17341 return MVT::Other;
17342 }
17343}
17344
17345/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17346/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17347/// SExt/ZExt rather than the scalar SExt/ZExt
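/// For example (types illustrative), a v8i16 build_vector whose elements are
/// all, roughly, of the form "sext i8 %x to i16" is rewritten as a sext of a
/// v8i8 build_vector of the original i8 values, exposing the vector-level
/// extend to later combines.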
17348static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17349 EVT VT = BV.getValueType();
17350 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17351 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17352 return SDValue();
17353
17354 // Use the first item in the buildvector/shuffle to get the size of the
17355 // extend, and make sure it looks valid.
17356 SDValue Extend = BV->getOperand(Num: 0);
17357 unsigned ExtendOpcode = Extend.getOpcode();
17358 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17359 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17360 ExtendOpcode == ISD::AssertSext;
17361 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17362 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17363 return SDValue();
  // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to
  // ensure calculatePreExtendType will work without issue.
17366 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17367 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17368 return SDValue();
17369
17370 // Restrict valid pre-extend data type
17371 EVT PreExtendType = calculatePreExtendType(Extend);
17372 if (PreExtendType == MVT::Other ||
17373 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17374 return SDValue();
17375
17376 // Make sure all other operands are equally extended
17377 for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) {
17378 if (Op.isUndef())
17379 continue;
17380 unsigned Opc = Op.getOpcode();
17381 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17382 Opc == ISD::AssertSext;
17383 if (OpcIsSExt != IsSExt || calculatePreExtendType(Extend: Op) != PreExtendType)
17384 return SDValue();
17385 }
17386
17387 SDValue NBV;
17388 SDLoc DL(BV);
17389 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17390 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType);
17391 EVT PreExtendLegalType =
17392 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17393 SmallVector<SDValue, 8> NewOps;
17394 for (SDValue Op : BV->ops())
17395 NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType)
17396 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL,
17397 VT: PreExtendLegalType));
17398 NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps);
17399 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17400 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType());
17401 NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0),
17402 N2: BV.getOperand(i: 1).isUndef()
17403 ? DAG.getUNDEF(VT: PreExtendVT)
17404 : BV.getOperand(i: 1).getOperand(i: 0),
17405 Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask());
17406 }
17407 return DAG.getNode(Opcode: IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, Operand: NBV);
17408}
17409
17410/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17411/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17412static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17413 // If the value type isn't a vector, none of the operands are going to be dups
17414 EVT VT = Mul->getValueType(ResNo: 0);
17415 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17416 return SDValue();
17417
17418 SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG);
17419 SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG);
17420
  // Neither operand has been changed; don't make any further changes.
17422 if (!Op0 && !Op1)
17423 return SDValue();
17424
17425 SDLoc DL(Mul);
17426 return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0),
17427 N2: Op1 ? Op1 : Mul->getOperand(Num: 1));
17428}
17429
17430// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17431// Same for other types with equivalent constants.
17432static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17433 EVT VT = N->getValueType(ResNo: 0);
17434 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17435 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17436 return SDValue();
17437 if (N->getOperand(Num: 0).getOpcode() != ISD::AND ||
17438 N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL)
17439 return SDValue();
17440
17441 SDValue And = N->getOperand(Num: 0);
17442 SDValue Srl = And.getOperand(i: 0);
17443
17444 APInt V1, V2, V3;
17445 if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) ||
17446 !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) ||
17447 !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3))
17448 return SDValue();
17449
17450 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17451 if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17452 V3 != (HalfSize - 1))
17453 return SDValue();
17454
17455 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
17456 VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize),
17457 EC: VT.getVectorElementCount() * 2);
17458
17459 SDLoc DL(N);
17460 SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0));
17461 SDValue CM = DAG.getNode(Opcode: AArch64ISD::CMLTz, DL, VT: HalfVT, Operand: In);
17462 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM);
17463}
17464
17465static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
17466 TargetLowering::DAGCombinerInfo &DCI,
17467 const AArch64Subtarget *Subtarget) {
17468
17469 if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG))
17470 return Ext;
17471 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17472 return Ext;
17473
17474 if (DCI.isBeforeLegalizeOps())
17475 return SDValue();
17476
  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y; later, in the
  // MachineCombiner pass, the add+mul will be combined into a madd.
  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17480 SDLoc DL(N);
17481 EVT VT = N->getValueType(ResNo: 0);
17482 SDValue N0 = N->getOperand(Num: 0);
17483 SDValue N1 = N->getOperand(Num: 1);
17484 SDValue MulOper;
17485 unsigned AddSubOpc;
17486
17487 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17488 AddSubOpc = V->getOpcode();
17489 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17490 SDValue Opnd = V->getOperand(Num: 1);
17491 MulOper = V->getOperand(Num: 0);
17492 if (AddSubOpc == ISD::SUB)
17493 std::swap(a&: Opnd, b&: MulOper);
17494 if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd))
17495 return C->isOne();
17496 }
17497 return false;
17498 };
17499
17500 if (IsAddSubWith1(N0)) {
17501 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper);
17502 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal);
17503 }
17504
17505 if (IsAddSubWith1(N1)) {
17506 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper);
17507 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal);
17508 }
17509
17510 // The below optimizations require a constant RHS.
17511 if (!isa<ConstantSDNode>(Val: N1))
17512 return SDValue();
17513
17514 ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1);
17515 const APInt &ConstValue = C->getAPIntValue();
17516
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern
  // match.
17519 if (IsSVECntIntrinsic(S: N0) ||
17520 (N0->getOpcode() == ISD::TRUNCATE &&
17521 (IsSVECntIntrinsic(S: N0->getOperand(Num: 0)))))
17522 if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16))
17523 return SDValue();
17524
17525 // Multiplication of a power of two plus/minus one can be done more
17526 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17527 // future CPUs have a cheaper MADD instruction, this may need to be
17528 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17529 // 64-bit is 5 cycles, so this is always a win.
17530 // More aggressively, some multiplications N0 * C can be lowered to
17531 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17532 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17533 // TODO: lower more cases.
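  // A few concrete examples of the decompositions handled below:
  //   mul x, 6  = (2+1)*2     -> (shl (add (shl x, 1), x), 1)
  //   mul x, 7  = 2^3 - 1     -> (sub (shl x, 3), x)
  //   mul x, 45 = (1+4)*(1+8) -> MV = (add (shl x, 2), x); (add (shl MV, 3), MV)
  // (the last form is only used when the subtarget has ALULSLFast).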
17534
17535 // TrailingZeroes is used to test if the mul can be lowered to
17536 // shift+add+shift.
17537 unsigned TrailingZeroes = ConstValue.countr_zero();
17538 if (TrailingZeroes) {
17539 // Conservatively do not lower to shift+add+shift if the mul might be
17540 // folded into smul or umul.
17541 if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) ||
17542 isZeroExtended(N: N0, DAG)))
17543 return SDValue();
17544 // Conservatively do not lower to shift+add+shift if the mul might be
17545 // folded into madd or msub.
17546 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17547 N->use_begin()->getOpcode() == ISD::SUB))
17548 return SDValue();
17549 }
17550 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17551 // and shift+add+shift.
17552 APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes);
17553 unsigned ShiftAmt;
17554
17555 auto Shl = [&](SDValue N0, unsigned N1) {
17556 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17557 return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS);
17558 };
17559 auto Add = [&](SDValue N0, SDValue N1) {
17560 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1);
17561 };
17562 auto Sub = [&](SDValue N0, SDValue N1) {
17563 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1);
17564 };
17565 auto Negate = [&](SDValue N) {
17566 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
17567 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N);
17568 };
17569
  // Can the const C be decomposed into (1+2^M1)*(1+2^N1)? E.g., C = 45 is
  // equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) because the
  // (2^N - 1) factor can't be executed as a single instruction.
17573 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17574 unsigned BitWidth = C.getBitWidth();
17575 for (unsigned i = 1; i < BitWidth / 2; i++) {
17576 APInt Rem;
17577 APInt X(BitWidth, (1 << i) + 1);
17578 APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem);
17579 APInt NVMinus1 = N - 1;
17580 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17581 M = X;
17582 return true;
17583 }
17584 }
17585 return false;
17586 };
17587
17588 if (ConstValue.isNonNegative()) {
17589 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17590 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17591 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17592 // (mul x, (2^M + 1) * (2^N + 1))
17593 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17594 APInt SCVMinus1 = ShiftedConstValue - 1;
17595 APInt SCVPlus1 = ShiftedConstValue + 1;
17596 APInt CVPlus1 = ConstValue + 1;
17597 APInt CVM, CVN;
17598 if (SCVMinus1.isPowerOf2()) {
17599 ShiftAmt = SCVMinus1.logBase2();
17600 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17601 } else if (CVPlus1.isPowerOf2()) {
17602 ShiftAmt = CVPlus1.logBase2();
17603 return Sub(Shl(N0, ShiftAmt), N0);
17604 } else if (SCVPlus1.isPowerOf2()) {
17605 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17606 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17607 } else if (Subtarget->hasALULSLFast() &&
17608 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17609 APInt CVMMinus1 = CVM - 1;
17610 APInt CVNMinus1 = CVN - 1;
17611 unsigned ShiftM1 = CVMMinus1.logBase2();
17612 unsigned ShiftN1 = CVNMinus1.logBase2();
      // LSLFast implies that shifts of up to 3 places are fast.
17614 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
17615 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17616 return Add(Shl(MVal, ShiftN1), MVal);
17617 }
17618 }
17619 } else {
17620 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17621 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17622 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17623 APInt SCVPlus1 = -ShiftedConstValue + 1;
17624 APInt CVNegPlus1 = -ConstValue + 1;
17625 APInt CVNegMinus1 = -ConstValue - 1;
17626 if (CVNegPlus1.isPowerOf2()) {
17627 ShiftAmt = CVNegPlus1.logBase2();
17628 return Sub(N0, Shl(N0, ShiftAmt));
17629 } else if (CVNegMinus1.isPowerOf2()) {
17630 ShiftAmt = CVNegMinus1.logBase2();
17631 return Negate(Add(Shl(N0, ShiftAmt), N0));
17632 } else if (SCVPlus1.isPowerOf2()) {
17633 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17634 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17635 }
17636 }
17637
17638 return SDValue();
17639}
17640
17641static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17642 SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
17645 //
17646 // The general transformation is:
17647 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17648 // AND(VECTOR_CMP(x,y), constant2)
17649 // constant2 = UNARYOP(constant)
17650
17651 // Early exit if this isn't a vector operation, the operand of the
17652 // unary operation isn't a bitwise AND, or if the sizes of the operations
17653 // aren't the same.
17654 EVT VT = N->getValueType(ResNo: 0);
17655 if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND ||
17656 N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC ||
17657 VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits())
17658 return SDValue();
17659
17660 // Now check that the other operand of the AND is a constant. We could
17661 // make the transformation for non-constant splats as well, but it's unclear
17662 // that would be a benefit as it would not eliminate any operations, just
17663 // perform one more step in scalar code before moving to the vector unit.
17664 if (BuildVectorSDNode *BV =
17665 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) {
17666 // Bail out if the vector isn't a constant.
17667 if (!BV->isConstant())
17668 return SDValue();
17669
17670 // Everything checks out. Build up the new and improved node.
17671 SDLoc DL(N);
17672 EVT IntVT = BV->getValueType(ResNo: 0);
17673 // Create a new constant of the appropriate type for the transformed
17674 // DAG.
17675 SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0));
17676 // The AND node needs bitcasts to/from an integer vector type around it.
17677 SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst);
17678 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT,
17679 N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst);
17680 SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd);
17681 return Res;
17682 }
17683
17684 return SDValue();
17685}
17686
17687static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17688 const AArch64Subtarget *Subtarget) {
17689 // First try to optimize away the conversion when it's conditionally from
17690 // a constant. Vectors only.
17691 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17692 return Res;
17693
17694 EVT VT = N->getValueType(ResNo: 0);
17695 if (VT != MVT::f32 && VT != MVT::f64)
17696 return SDValue();
17697
17698 // Only optimize when the source and destination types have the same width.
17699 if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits())
17700 return SDValue();
17701
  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17705 SDValue N0 = N->getOperand(Num: 0);
17706 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) &&
17707 N0.hasOneUse() &&
17708 // Do not change the width of a volatile load.
17709 !cast<LoadSDNode>(Val&: N0)->isVolatile()) {
17710 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
17711 SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
17712 PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(),
17713 MMOFlags: LN0->getMemOperand()->getFlags());
17714
17715 // Make sure successors of the original load stay after it by updating them
17716 // to use the new Chain.
17717 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1));
17718
17719 unsigned Opcode =
17720 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
17721 return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load);
17722 }
17723
17724 return SDValue();
17725}
17726
17727/// Fold a floating-point multiply by power of two into floating-point to
17728/// fixed-point conversion.
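///
/// For example (an illustrative sketch of the intent, not a guaranteed
/// lowering): (fptosi (fmul v4f32:x, splat(8.0))) can be selected as a single
/// fixed-point conversion, roughly "fcvtzs v0.4s, v0.4s, #3", because
/// multiplying by 2^3 before the conversion is equivalent to converting with
/// three fractional bits.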
17729static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17730 TargetLowering::DAGCombinerInfo &DCI,
17731 const AArch64Subtarget *Subtarget) {
17732 if (!Subtarget->isNeonAvailable())
17733 return SDValue();
17734
17735 if (!N->getValueType(ResNo: 0).isSimple())
17736 return SDValue();
17737
17738 SDValue Op = N->getOperand(Num: 0);
17739 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17740 return SDValue();
17741
17742 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17743 return SDValue();
17744
17745 SDValue ConstVec = Op->getOperand(Num: 1);
17746 if (!isa<BuildVectorSDNode>(Val: ConstVec))
17747 return SDValue();
17748
17749 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17750 uint32_t FloatBits = FloatTy.getSizeInBits();
17751 if (FloatBits != 32 && FloatBits != 64 &&
17752 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17753 return SDValue();
17754
17755 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
17756 uint32_t IntBits = IntTy.getSizeInBits();
17757 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17758 return SDValue();
17759
17760 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17761 if (IntBits > FloatBits)
17762 return SDValue();
17763
17764 BitVector UndefElements;
17765 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
17766 int32_t Bits = IntBits == 64 ? 64 : 32;
17767 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1);
17768 if (C == -1 || C == 0 || C > Bits)
17769 return SDValue();
17770
17771 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17772 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy))
17773 return SDValue();
17774
17775 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17776 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17777 EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
17778 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17779 return SDValue();
17780 }
17781
17782 SDLoc DL(N);
17783 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17784 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17785 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17786 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17787 SDValue FixConv =
17788 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17789 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17790 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17791 // We can handle smaller integers by generating an extra trunc.
17792 if (IntBits < FloatBits)
17793 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv);
17794
17795 return FixConv;
17796}
17797
17798/// Fold a floating-point divide by power of two into fixed-point to
17799/// floating-point conversion.
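///
/// For example (an illustrative sketch of the intent, not a guaranteed
/// lowering): (fdiv (sitofp v2i32:x), splat(4.0)) can be selected as a single
/// fixed-point conversion, roughly "scvtf v0.2s, v0.2s, #2", because
/// converting with two fractional bits is equivalent to converting and then
/// dividing by 2^2.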
17800static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17801 TargetLowering::DAGCombinerInfo &DCI,
17802 const AArch64Subtarget *Subtarget) {
17803 if (!Subtarget->hasNEON())
17804 return SDValue();
17805
17806 SDValue Op = N->getOperand(Num: 0);
17807 unsigned Opc = Op->getOpcode();
17808 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17809 !Op.getOperand(i: 0).getValueType().isSimple() ||
17810 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17811 return SDValue();
17812
17813 SDValue ConstVec = N->getOperand(Num: 1);
17814 if (!isa<BuildVectorSDNode>(Val: ConstVec))
17815 return SDValue();
17816
17817 MVT IntTy = Op.getOperand(i: 0).getSimpleValueType().getVectorElementType();
17818 int32_t IntBits = IntTy.getSizeInBits();
17819 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17820 return SDValue();
17821
17822 MVT FloatTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
17823 int32_t FloatBits = FloatTy.getSizeInBits();
17824 if (FloatBits != 32 && FloatBits != 64)
17825 return SDValue();
17826
17827 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17828 if (IntBits > FloatBits)
17829 return SDValue();
17830
17831 BitVector UndefElements;
17832 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
17833 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: FloatBits + 1);
17834 if (C == -1 || C == 0 || C > FloatBits)
17835 return SDValue();
17836
17837 MVT ResTy;
17838 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17839 switch (NumLanes) {
17840 default:
17841 return SDValue();
17842 case 2:
17843 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17844 break;
17845 case 4:
17846 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17847 break;
17848 }
17849
17850 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17851 return SDValue();
17852
17853 SDLoc DL(N);
17854 SDValue ConvInput = Op.getOperand(i: 0);
17855 bool IsSigned = Opc == ISD::SINT_TO_FP;
17856 if (IntBits < FloatBits)
17857 ConvInput = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
17858 VT: ResTy, Operand: ConvInput);
17859
17860 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
17861 : Intrinsic::aarch64_neon_vcvtfxu2fp;
17862 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17863 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17864 DAG.getConstant(C, DL, MVT::i32));
17865}
17866
17867static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17868 const AArch64TargetLowering &TLI) {
17869 EVT VT = N->getValueType(ResNo: 0);
17870 SelectionDAG &DAG = DCI.DAG;
17871 SDLoc DL(N);
17872 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17873
17874 if (!VT.isVector())
17875 return SDValue();
17876
17877 // The combining code works for NEON, SVE2 and SME.
17878 if (TLI.useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget.isNeonAvailable()) ||
17879 (VT.isScalableVector() && !Subtarget.hasSVE2()))
17880 return SDValue();
17881
17882 SDValue N0 = N->getOperand(Num: 0);
17883 if (N0.getOpcode() != ISD::AND)
17884 return SDValue();
17885
17886 SDValue N1 = N->getOperand(Num: 1);
17887 if (N1.getOpcode() != ISD::AND)
17888 return SDValue();
17889
17890 // InstCombine does (not (neg a)) => (add a -1).
17891 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
17892 // Loop over all combinations of AND operands.
17893 for (int i = 1; i >= 0; --i) {
17894 for (int j = 1; j >= 0; --j) {
17895 SDValue O0 = N0->getOperand(Num: i);
17896 SDValue O1 = N1->getOperand(Num: j);
17897 SDValue Sub, Add, SubSibling, AddSibling;
17898
17899 // Find a SUB and an ADD operand, one from each AND.
17900 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
17901 Sub = O0;
17902 Add = O1;
17903 SubSibling = N0->getOperand(Num: 1 - i);
17904 AddSibling = N1->getOperand(Num: 1 - j);
17905 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
17906 Add = O0;
17907 Sub = O1;
17908 AddSibling = N0->getOperand(Num: 1 - i);
17909 SubSibling = N1->getOperand(Num: 1 - j);
17910 } else
17911 continue;
17912
17913 if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode()))
17914 continue;
17915
17916      // The all-ones splat is always the right-hand operand of the Add.
17917 if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode()))
17918 continue;
17919
17920 if (Sub.getOperand(i: 1) != Add.getOperand(i: 0))
17921 continue;
17922
17923 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling);
17924 }
17925 }
17926
17927 // (or (and a b) (and (not a) c)) => (bsl a b c)
17928 // We only have to look for constant vectors here since the general, variable
17929 // case can be handled in TableGen.
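  //
  // Illustrative example (assumed, for exposition): with complementary splat
  // masks such as
  //   (or (and a, splat(0x00ff00ff)) (and b, splat(0xff00ff00)))
  // the loop below sees that the two constants are bitwise inverses and emits
  //   (AArch64ISD::BSP splat(0x00ff00ff), a, b)
  // i.e. (a & mask) | (b & ~mask).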
17930 unsigned Bits = VT.getScalarSizeInBits();
17931 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
17932 for (int i = 1; i >= 0; --i)
17933 for (int j = 1; j >= 0; --j) {
17934 APInt Val1, Val2;
17935
17936 if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) &&
17937 ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) &&
17938 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
17939 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
17940 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
17941 }
17942 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i));
17943 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j));
17944 if (!BVN0 || !BVN1)
17945 continue;
17946
17947 bool FoundMatch = true;
17948 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
17949 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k));
17950 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k));
17951 if (!CN0 || !CN1 ||
17952 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
17953 FoundMatch = false;
17954 break;
17955 }
17956 }
17957 if (FoundMatch)
17958 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
17959 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
17960 }
17961
17962 return SDValue();
17963}
17964
17965// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
17966// convert to csel(ccmp(.., cc0)), depending on cc1:
17967
17968// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17969// =>
17970// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
17971//
17972// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
17973// =>
17974// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
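//
// Illustrative example (assumed, for exposition only): for C source such as
//   return (x == 0) & (y > 5);
// this enables a fused sequence along the lines of
//   cmp  x0, #0
//   ccmp x1, #5, #<nzcv>, eq
//   cset w0, gt
// instead of materialising both comparison results with cset and ANDing them.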
17975static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
17976 EVT VT = N->getValueType(ResNo: 0);
17977 SDValue CSel0 = N->getOperand(Num: 0);
17978 SDValue CSel1 = N->getOperand(Num: 1);
17979
17980 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
17981 CSel1.getOpcode() != AArch64ISD::CSEL)
17982 return SDValue();
17983
17984 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
17985 return SDValue();
17986
17987 if (!isNullConstant(V: CSel0.getOperand(i: 0)) ||
17988 !isOneConstant(V: CSel0.getOperand(i: 1)) ||
17989 !isNullConstant(V: CSel1.getOperand(i: 0)) ||
17990 !isOneConstant(V: CSel1.getOperand(i: 1)))
17991 return SDValue();
17992
17993 SDValue Cmp0 = CSel0.getOperand(i: 3);
17994 SDValue Cmp1 = CSel1.getOperand(i: 3);
17995 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2);
17996 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2);
17997 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
17998 return SDValue();
17999 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18000 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18001 std::swap(a&: Cmp0, b&: Cmp1);
18002 std::swap(a&: CC0, b&: CC1);
18003 }
18004
18005 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18006 return SDValue();
18007
18008 SDLoc DL(N);
18009 SDValue CCmp, Condition;
18010 unsigned NZCV;
18011
18012 if (N->getOpcode() == ISD::AND) {
18013 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0);
18014 Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC);
18015 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1);
18016 } else {
18017 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
18018 Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC);
18019 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1);
18020 }
18021
18022 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18023
18024 auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1));
18025 if (Op1 && Op1->getAPIntValue().isNegative() &&
18026 Op1->getAPIntValue().sgt(RHS: -32)) {
18027    // CCMP only accepts an immediate in the range [0, 31], so if Op1 is a
18028    // constant in the range [-31, -1] we can instead select CCMN with its
18029    // absolute value and avoid the extra mov needed to materialize Op1.
18030 SDValue AbsOp1 =
18031 DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0));
18032 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1,
18033 N3: NZCVOp, N4: Condition, N5: Cmp0);
18034 } else {
18035 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0),
18036 N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0);
18037 }
18038 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18039 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18040 CCmp);
18041}
18042
18043static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18044 const AArch64Subtarget *Subtarget,
18045 const AArch64TargetLowering &TLI) {
18046 SelectionDAG &DAG = DCI.DAG;
18047 EVT VT = N->getValueType(ResNo: 0);
18048
18049 if (SDValue R = performANDORCSELCombine(N, DAG))
18050 return R;
18051
18052 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18053 return SDValue();
18054
18055 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18056 return Res;
18057
18058 return SDValue();
18059}
18060
18061static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18062 if (!MemVT.getVectorElementType().isSimple())
18063 return false;
18064
18065 uint64_t MaskForTy = 0ull;
18066 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18067 case MVT::i8:
18068 MaskForTy = 0xffull;
18069 break;
18070 case MVT::i16:
18071 MaskForTy = 0xffffull;
18072 break;
18073 case MVT::i32:
18074 MaskForTy = 0xffffffffull;
18075 break;
18076  default:
18077    return false;
18079 }
18080
18081 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18082 if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)))
18083 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18084
18085 return false;
18086}
18087
18088static SDValue performReinterpretCastCombine(SDNode *N) {
18089 SDValue LeafOp = SDValue(N, 0);
18090 SDValue Op = N->getOperand(Num: 0);
18091 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18092 LeafOp.getValueType() != Op.getValueType())
18093 Op = Op->getOperand(Num: 0);
18094 if (LeafOp.getValueType() == Op.getValueType())
18095 return Op;
18096 return SDValue();
18097}
18098
18099static SDValue performSVEAndCombine(SDNode *N,
18100 TargetLowering::DAGCombinerInfo &DCI) {
18101 SelectionDAG &DAG = DCI.DAG;
18102 SDValue Src = N->getOperand(Num: 0);
18103 unsigned Opc = Src->getOpcode();
18104
18105 // Zero/any extend of an unsigned unpack
18106 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18107 SDValue UnpkOp = Src->getOperand(Num: 0);
18108 SDValue Dup = N->getOperand(Num: 1);
18109
18110 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18111 return SDValue();
18112
18113 SDLoc DL(N);
18114 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0));
18115 if (!C)
18116 return SDValue();
18117
18118 uint64_t ExtVal = C->getZExtValue();
18119
18120 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18121 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18122 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18123 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18124 };
18125
18126 // If the mask is fully covered by the unpack, we don't need to push
18127 // a new AND onto the operand
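    //
    // For example (an illustrative case): in
    //   (and (uunpklo nxv16i8:x), (splat_vector 0xFF))
    // the unpack already zero-extends each i8 lane to i16, so the 0xFF mask is
    // redundant and we can return the unpack directly.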
18128 EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType();
18129 if (MaskAndTypeMatch(EltTy))
18130 return Src;
18131
18132 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18133 // to see if the mask is all-ones of size MemTy.
18134 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp);
18135 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18136 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18137 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18138 if (MaskAndTypeMatch(EltTy))
18139 return Src;
18140 }
18141
18142    // Truncate to prevent a DUP with an over-wide constant
18143 APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits());
18144
18145 // Otherwise, make sure we propagate the AND to the operand
18146 // of the unpack
18147 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18148 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18149
18150 SDValue And = DAG.getNode(Opcode: ISD::AND, DL,
18151 VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup);
18152
18153 return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And);
18154 }
18155
18156 if (DCI.isBeforeLegalizeOps())
18157 return SDValue();
18158
18159  // If either operand of the AND is an all-active predicate, the AND is a
18160  // no-op and we can simply return the other operand.
18161 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0)))
18162 return N->getOperand(Num: 1);
18163 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1)))
18164 return N->getOperand(Num: 0);
18165
18166 if (!EnableCombineMGatherIntrinsics)
18167 return SDValue();
18168
18169 SDValue Mask = N->getOperand(Num: 1);
18170
18171 if (!Src.hasOneUse())
18172 return SDValue();
18173
18174 EVT MemVT;
18175
18176 // SVE load instructions perform an implicit zero-extend, which makes them
18177 // perfect candidates for combining.
18178 switch (Opc) {
18179 case AArch64ISD::LD1_MERGE_ZERO:
18180 case AArch64ISD::LDNF1_MERGE_ZERO:
18181 case AArch64ISD::LDFF1_MERGE_ZERO:
18182 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT();
18183 break;
18184 case AArch64ISD::GLD1_MERGE_ZERO:
18185 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
18186 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
18187 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
18188 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
18189 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
18190 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
18191 case AArch64ISD::GLDFF1_MERGE_ZERO:
18192 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
18193 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
18194 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
18195 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
18196 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
18197 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
18198 case AArch64ISD::GLDNT1_MERGE_ZERO:
18199 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT();
18200 break;
18201 default:
18202 return SDValue();
18203 }
18204
18205 if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT))
18206 return Src;
18207
18208 return SDValue();
18209}
18210
18211// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
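//
// Illustrative example (assumed, for exposition only): for code such as
//   (a < b) && (c < d)      // a, b, c, d are floats
// this allows a sequence along the lines of
//   fcmp  s0, s1
//   fccmp s2, s3, #<nzcv>, mi
//   cset  w0, mi
// rather than computing and ANDing two separate boolean results.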
18212static SDValue performANDSETCCCombine(SDNode *N,
18213 TargetLowering::DAGCombinerInfo &DCI) {
18214
18215 // This function performs an optimization on a specific pattern involving
18216 // an AND operation and SETCC (Set Condition Code) node.
18217
18218 SDValue SetCC = N->getOperand(Num: 0);
18219 EVT VT = N->getValueType(ResNo: 0);
18220 SelectionDAG &DAG = DCI.DAG;
18221
18222  // If the current node (N) is used by any SELECT instruction, bail out and
18223  // return an empty SDValue; this avoids applying the optimization where it
18224  // could produce incorrect results.
18225 for (auto U : N->uses())
18226 if (U->getOpcode() == ISD::SELECT)
18227 return SDValue();
18228
18229 // Check if the operand is a SETCC node with floating-point comparison
18230 if (SetCC.getOpcode() == ISD::SETCC &&
18231 SetCC.getOperand(0).getValueType() == MVT::f32) {
18232
18233 SDValue Cmp;
18234 AArch64CC::CondCode CC;
18235
18236 // Check if the DAG is after legalization and if we can emit the conjunction
18237 if (!DCI.isBeforeLegalize() &&
18238 (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) {
18239
18240 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC);
18241
18242 SDLoc DL(N);
18243 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18244 DAG.getConstant(0, DL, VT),
18245 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18246 }
18247 }
18248 return SDValue();
18249}
18250
18251static SDValue performANDCombine(SDNode *N,
18252 TargetLowering::DAGCombinerInfo &DCI) {
18253 SelectionDAG &DAG = DCI.DAG;
18254 SDValue LHS = N->getOperand(Num: 0);
18255 SDValue RHS = N->getOperand(Num: 1);
18256 EVT VT = N->getValueType(ResNo: 0);
18257
18258 if (SDValue R = performANDORCSELCombine(N, DAG))
18259 return R;
18260
18261  if (SDValue R = performANDSETCCCombine(N, DCI))
18262 return R;
18263
18264 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18265 return SDValue();
18266
18267 if (VT.isScalableVector())
18268 return performSVEAndCombine(N, DCI);
18269
18270 // The combining code below works only for NEON vectors. In particular, it
18271 // does not work for SVE when dealing with vectors wider than 128 bits.
18272 if (!VT.is64BitVector() && !VT.is128BitVector())
18273 return SDValue();
18274
18275 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
18276 if (!BVN)
18277 return SDValue();
18278
18279 // AND does not accept an immediate, so check if we can use a BIC immediate
18280 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18281 // pattern in isel, because some immediates may be lowered to the preferred
18282 // (and x, (movi imm)) form, even though an mvni representation also exists.
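  //
  // For example (an illustrative sketch): (and v4i32:x, splat(0xffffff00))
  // only clears the low byte of each lane, so it can be selected as
  // "bic v0.4s, #0xff" rather than materialising the 0xffffff00 mask.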
18283 APInt DefBits(VT.getSizeInBits(), 0);
18284 APInt UndefBits(VT.getSizeInBits(), 0);
18285 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
18286 SDValue NewOp;
18287
18288 // Any bits known to already be 0 need not be cleared again, which can help
18289 // reduce the size of the immediate to one supported by the instruction.
18290 KnownBits Known = DAG.computeKnownBits(Op: LHS);
18291 APInt ZeroSplat(VT.getSizeInBits(), 0);
18292 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18293 ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits())
18294 << (Known.Zero.getBitWidth() * I);
18295
18296 DefBits = ~(DefBits | ZeroSplat);
18297 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18298 Bits: DefBits, LHS: &LHS)) ||
18299 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18300 Bits: DefBits, LHS: &LHS)))
18301 return NewOp;
18302
18303 UndefBits = ~(UndefBits | ZeroSplat);
18304 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18305 Bits: UndefBits, LHS: &LHS)) ||
18306 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
18307 Bits: UndefBits, LHS: &LHS)))
18308 return NewOp;
18309 }
18310
18311 return SDValue();
18312}
18313
18314static SDValue performFADDCombine(SDNode *N,
18315 TargetLowering::DAGCombinerInfo &DCI) {
18316 SelectionDAG &DAG = DCI.DAG;
18317 SDValue LHS = N->getOperand(Num: 0);
18318 SDValue RHS = N->getOperand(Num: 1);
18319 EVT VT = N->getValueType(ResNo: 0);
18320 SDLoc DL(N);
18321
18322 if (!N->getFlags().hasAllowReassociation())
18323 return SDValue();
18324
18325  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
18326 auto ReassocComplex = [&](SDValue A, SDValue B) {
18327 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18328 return SDValue();
18329 unsigned Opc = A.getConstantOperandVal(i: 0);
18330 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18331 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18332 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18333 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18334 return SDValue();
18335 SDValue VCMLA = DAG.getNode(
18336 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0),
18337 N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()),
18338 N3: A.getOperand(i: 2), N4: A.getOperand(i: 3));
18339 VCMLA->setFlags(A->getFlags());
18340 return VCMLA;
18341 };
18342 if (SDValue R = ReassocComplex(LHS, RHS))
18343 return R;
18344 if (SDValue R = ReassocComplex(RHS, LHS))
18345 return R;
18346
18347 return SDValue();
18348}
18349
18350static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18351 switch (Opcode) {
18352 case ISD::STRICT_FADD:
18353 case ISD::FADD:
18354 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18355 case ISD::ADD:
18356 return VT == MVT::i64;
18357 default:
18358 return false;
18359 }
18360}
18361
18362static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18363 AArch64CC::CondCode Cond);
18364
18365static bool isPredicateCCSettingOp(SDValue N) {
18366 if ((N.getOpcode() == ISD::SETCC) ||
18367 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18368 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18369 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18370 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18371 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18372 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18373 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18374 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18375 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18376 // get_active_lane_mask is lowered to a whilelo instruction.
18377 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18378 return true;
18379
18380 return false;
18381}
18382
18383// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18384// ... into: "ptrue p, all" + PTEST
18385static SDValue
18386performFirstTrueTestVectorCombine(SDNode *N,
18387 TargetLowering::DAGCombinerInfo &DCI,
18388 const AArch64Subtarget *Subtarget) {
18389 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18390 // Make sure PTEST can be legalised with illegal types.
18391 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18392 return SDValue();
18393
18394 SDValue N0 = N->getOperand(Num: 0);
18395 EVT VT = N0.getValueType();
18396
18397 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18398 !isNullConstant(N->getOperand(1)))
18399 return SDValue();
18400
18401  // Restrict the DAG combine to only cases where we're extracting from a
18402 // flag-setting operation.
18403 if (!isPredicateCCSettingOp(N: N0))
18404 return SDValue();
18405
18406 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18407 SelectionDAG &DAG = DCI.DAG;
18408 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18409 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE);
18410}
18411
18412// Materialize : Idx = (add (mul vscale, NumEls), -1)
18413// i1 = extract_vector_elt t37, Constant:i64<Idx>
18414// ... into: "ptrue p, all" + PTEST
18415static SDValue
18416performLastTrueTestVectorCombine(SDNode *N,
18417 TargetLowering::DAGCombinerInfo &DCI,
18418 const AArch64Subtarget *Subtarget) {
18419 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18420  // Make sure PTEST can be legalised with illegal types.
18421 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18422 return SDValue();
18423
18424 SDValue N0 = N->getOperand(Num: 0);
18425 EVT OpVT = N0.getValueType();
18426
18427 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18428 return SDValue();
18429
18430 // Idx == (add (mul vscale, NumEls), -1)
18431 SDValue Idx = N->getOperand(Num: 1);
18432 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1)))
18433 return SDValue();
18434
18435 SDValue VS = Idx.getOperand(i: 0);
18436 if (VS.getOpcode() != ISD::VSCALE)
18437 return SDValue();
18438
18439 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18440 if (VS.getConstantOperandVal(i: 0) != NumEls)
18441 return SDValue();
18442
18443 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18444 SelectionDAG &DAG = DCI.DAG;
18445 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18446 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE);
18447}
18448
18449static SDValue
18450performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18451 const AArch64Subtarget *Subtarget) {
18452 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18453 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18454 return Res;
18455 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18456 return Res;
18457
18458 SelectionDAG &DAG = DCI.DAG;
18459 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
18460
18461 EVT VT = N->getValueType(ResNo: 0);
18462 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18463 bool IsStrict = N0->isStrictFPOpcode();
18464
18465 // extract(dup x) -> x
18466 if (N0.getOpcode() == AArch64ISD::DUP)
18467 return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT)
18468 : N0.getOperand(i: 0);
18469
18470 // Rewrite for pairwise fadd pattern
18471 // (f32 (extract_vector_elt
18472 // (fadd (vXf32 Other)
18473 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18474 // ->
18475 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18476 // (extract_vector_elt (vXf32 Other) 1))
18477 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18478 // we can only do this when it's used only by the extract_vector_elt.
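  //
  // The rewritten form matches the pairwise-add patterns, so e.g. summing the
  // two lanes of a v2f32 can become a single "faddp s0, v0.2s" (an
  // illustrative outcome; the exact selection depends on surrounding code).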
18479 if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) &&
18480 (!IsStrict || N0.hasOneUse())) {
18481 SDLoc DL(N0);
18482 SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0);
18483 SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1);
18484
18485 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01);
18486 SDValue Other = N00;
18487
18488 // And handle the commutative case.
18489 if (!Shuffle) {
18490 Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00);
18491 Other = N01;
18492 }
18493
18494 if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 &&
18495 Other == Shuffle->getOperand(Num: 0)) {
18496 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18497 DAG.getConstant(0, DL, MVT::i64));
18498 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18499 DAG.getConstant(1, DL, MVT::i64));
18500 if (!IsStrict)
18501 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2);
18502
18503 // For strict_fadd we need uses of the final extract_vector to be replaced
18504 // with the strict_fadd, but we also need uses of the chain output of the
18505 // original strict_fadd to use the chain output of the new strict_fadd as
18506 // otherwise it may not be deleted.
18507 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18508 {VT, MVT::Other},
18509 {N0->getOperand(0), Extract1, Extract2});
18510 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret);
18511 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1));
18512 return SDValue(N, 0);
18513 }
18514 }
18515
18516 return SDValue();
18517}
18518
18519static SDValue performConcatVectorsCombine(SDNode *N,
18520 TargetLowering::DAGCombinerInfo &DCI,
18521 SelectionDAG &DAG) {
18522 SDLoc dl(N);
18523 EVT VT = N->getValueType(ResNo: 0);
18524 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
18525 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18526
18527 if (VT.isScalableVector())
18528 return SDValue();
18529
18530 // Optimize concat_vectors of truncated vectors, where the intermediate
18531 // type is illegal, to avoid said illegality, e.g.,
18532 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18533 // (v2i16 (truncate (v2i64)))))
18534 // ->
18535 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18536 // (v4i32 (bitcast (v2i64))),
18537 // <0, 2, 4, 6>)))
18538 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18539 // on both input and result type, so we might generate worse code.
18540 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18541 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18542 N1Opc == ISD::TRUNCATE) {
18543 SDValue N00 = N0->getOperand(Num: 0);
18544 SDValue N10 = N1->getOperand(Num: 0);
18545 EVT N00VT = N00.getValueType();
18546
18547 if (N00VT == N10.getValueType() &&
18548 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18549 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18550 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18551 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18552 for (size_t i = 0; i < Mask.size(); ++i)
18553 Mask[i] = i * 2;
18554 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT,
18555 Operand: DAG.getVectorShuffle(
18556 VT: MidVT, dl,
18557 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N00),
18558 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N10), Mask));
18559 }
18560 }
18561
18562 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18563 N->getOperand(0).getValueType() == MVT::v2i16 ||
18564 N->getOperand(0).getValueType() == MVT::v2i8) {
18565 EVT SrcVT = N->getOperand(Num: 0).getValueType();
18566 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18567 // loads to prevent having to go through the v4i8 load legalization that
18568 // needs to extend each element into a larger type.
18569 if (N->getNumOperands() % 2 == 0 &&
18570 all_of(Range: N->op_values(), P: [SrcVT](SDValue V) {
18571 if (V.getValueType() != SrcVT)
18572 return false;
18573 if (V.isUndef())
18574 return true;
18575 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V);
18576 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18577 LD->getExtensionType() == ISD::NON_EXTLOAD;
18578 })) {
18579 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18580 EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands());
18581 SmallVector<SDValue> Ops;
18582
18583 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18584 SDValue V = N->getOperand(Num: i);
18585 if (V.isUndef())
18586 Ops.push_back(Elt: DAG.getUNDEF(VT: FVT));
18587 else {
18588 LoadSDNode *LD = cast<LoadSDNode>(Val&: V);
18589 SDValue NewLoad = DAG.getLoad(VT: FVT, dl, Chain: LD->getChain(),
18590 Ptr: LD->getBasePtr(), MMO: LD->getMemOperand());
18591 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1));
18592 Ops.push_back(Elt: NewLoad);
18593 }
18594 }
18595 return DAG.getBitcast(VT: N->getValueType(ResNo: 0),
18596 V: DAG.getBuildVector(VT: NVT, DL: dl, Ops));
18597 }
18598 }
18599
18600 // Canonicalise concat_vectors to replace concatenations of truncated nots
18601 // with nots of concatenated truncates. This in some cases allows for multiple
18602 // redundant negations to be eliminated.
18603 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18604 // (v4i16 (truncate (not (v4i32)))))
18605 // ->
18606 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18607 // (v4i16 (truncate (v4i32)))))
18608 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18609 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) &&
18610 N->isOnlyUserOf(N: N1.getNode())) {
18611 auto isBitwiseVectorNegate = [](SDValue V) {
18612 return V->getOpcode() == ISD::XOR &&
18613 ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode());
18614 };
18615 SDValue N00 = N0->getOperand(Num: 0);
18616 SDValue N10 = N1->getOperand(Num: 0);
18617 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) &&
18618 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) {
18619 return DAG.getNOT(
18620 DL: dl,
18621 Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT,
18622 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N0.getValueType(),
18623 Operand: N00->getOperand(Num: 0)),
18624 N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N1.getValueType(),
18625 Operand: N10->getOperand(Num: 0))),
18626 VT);
18627 }
18628 }
18629
18630 // Wait till after everything is legalized to try this. That way we have
18631 // legal vector types and such.
18632 if (DCI.isBeforeLegalizeOps())
18633 return SDValue();
18634
18635 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors with a 128-bit
18636  // destination size, combine into an avg of two concats of the source
18637  // vectors, e.g. concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c),
18638 // concat(b, d))
18639 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18640 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
18641 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS) &&
18642 N0->hasOneUse() && N1->hasOneUse()) {
18643 SDValue N00 = N0->getOperand(Num: 0);
18644 SDValue N01 = N0->getOperand(Num: 1);
18645 SDValue N10 = N1->getOperand(Num: 0);
18646 SDValue N11 = N1->getOperand(Num: 1);
18647
18648 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18649 SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N00, N2: N10);
18650 SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N01, N2: N11);
18651 return DAG.getNode(Opcode: N0Opc, DL: dl, VT, N1: Concat0, N2: Concat1);
18652 }
18653 }
18654
18655 auto IsRSHRN = [](SDValue Shr) {
18656 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18657 return false;
18658 SDValue Op = Shr.getOperand(i: 0);
18659 EVT VT = Op.getValueType();
18660 unsigned ShtAmt = Shr.getConstantOperandVal(i: 1);
18661 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18662 return false;
18663
18664 APInt Imm;
18665 if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift)
18666 Imm = APInt(VT.getScalarSizeInBits(),
18667 Op.getOperand(i: 1).getConstantOperandVal(i: 0)
18668 << Op.getOperand(i: 1).getConstantOperandVal(i: 1));
18669 else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP &&
18670 isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0)))
18671 Imm = APInt(VT.getScalarSizeInBits(),
18672 Op.getOperand(i: 1).getConstantOperandVal(i: 0));
18673 else
18674 return false;
18675
18676 if (Imm != 1ULL << (ShtAmt - 1))
18677 return false;
18678 return true;
18679 };
18680
18681 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18682 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18683 ((IsRSHRN(N1) &&
18684 N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) ||
18685 N1.isUndef())) {
18686 SDValue X = N0.getOperand(i: 0).getOperand(i: 0);
18687 SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType())
18688 : N1.getOperand(i: 0).getOperand(i: 0);
18689 EVT BVT =
18690 X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
18691 SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: BVT, N1: X, N2: Y);
18692 SDValue Add = DAG.getNode(
18693 Opcode: ISD::ADD, DL: dl, VT: BVT, N1: CC,
18694 N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL: dl, VT: BVT));
18695 SDValue Shr =
18696 DAG.getNode(Opcode: AArch64ISD::VLSHR, DL: dl, VT: BVT, N1: Add, N2: N0.getOperand(i: 1));
18697 return Shr;
18698 }
18699
18700 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18701 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18702 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) &&
18703 N0.getOperand(i: 1) == N1.getOperand(i: 1)) {
18704 SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 0),
18705 N2: DAG.getUNDEF(VT: N0.getValueType()));
18706 SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 1),
18707 N2: DAG.getUNDEF(VT: N0.getValueType()));
18708 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: E0, N2: E1);
18709 }
18710
18711 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18712 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18713 // canonicalise to that.
18714 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18715 assert(VT.getScalarSizeInBits() == 64);
18716 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18717 DAG.getConstant(0, dl, MVT::i64));
18718 }
18719
18720 // Canonicalise concat_vectors so that the right-hand vector has as few
18721 // bit-casts as possible before its real operation. The primary matching
18722 // destination for these operations will be the narrowing "2" instructions,
18723 // which depend on the operation being performed on this right-hand vector.
18724 // For example,
18725 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18726 // becomes
18727 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18728
18729 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18730 return SDValue();
18731 SDValue RHS = N1->getOperand(Num: 0);
18732 MVT RHSTy = RHS.getValueType().getSimpleVT();
18733 // If the RHS is not a vector, this is not the pattern we're looking for.
18734 if (!RHSTy.isVector())
18735 return SDValue();
18736
18737 LLVM_DEBUG(
18738 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18739
18740 MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(),
18741 NumElements: RHSTy.getVectorNumElements() * 2);
18742 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT,
18743 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ConcatTy,
18744 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: RHSTy, Operand: N0),
18745 N2: RHS));
18746}
18747
18748static SDValue
18749performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18750 SelectionDAG &DAG) {
18751 if (DCI.isBeforeLegalizeOps())
18752 return SDValue();
18753
18754 EVT VT = N->getValueType(ResNo: 0);
18755 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18756 return SDValue();
18757
18758 SDValue V = N->getOperand(Num: 0);
18759
18760 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18761 // blocks this combine because the non-const case requires custom lowering.
18762 //
18763 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18764 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18765 if (isa<ConstantSDNode>(Val: V.getOperand(i: 0)))
18766 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0));
18767
18768 return SDValue();
18769}
18770
18771static SDValue
18772performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18773 SelectionDAG &DAG) {
18774 SDLoc DL(N);
18775 SDValue Vec = N->getOperand(Num: 0);
18776 SDValue SubVec = N->getOperand(Num: 1);
18777 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
18778 EVT VecVT = Vec.getValueType();
18779 EVT SubVT = SubVec.getValueType();
18780
18781 // Only do this for legal fixed vector types.
18782 if (!VecVT.isFixedLengthVector() ||
18783 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
18784 !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
18785 return SDValue();
18786
18787 // Ignore widening patterns.
18788 if (IdxVal == 0 && Vec.isUndef())
18789 return SDValue();
18790
18791 // Subvector must be half the width and an "aligned" insertion.
18792 unsigned NumSubElts = SubVT.getVectorNumElements();
18793 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18794 (IdxVal != 0 && IdxVal != NumSubElts))
18795 return SDValue();
18796
18797 // Fold insert_subvector -> concat_vectors
18798 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18799 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18800 SDValue Lo, Hi;
18801 if (IdxVal == 0) {
18802 Lo = SubVec;
18803 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
18804 N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL));
18805 } else {
18806 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
18807 N2: DAG.getVectorIdxConstant(Val: 0, DL));
18808 Hi = SubVec;
18809 }
18810 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
18811}
18812
18813static SDValue tryCombineFixedPointConvert(SDNode *N,
18814 TargetLowering::DAGCombinerInfo &DCI,
18815 SelectionDAG &DAG) {
18816 // Wait until after everything is legalized to try this. That way we have
18817 // legal vector types and such.
18818 if (DCI.isBeforeLegalizeOps())
18819 return SDValue();
18820 // Transform a scalar conversion of a value from a lane extract into a
18821 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18822 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18823 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18824 //
18825 // The second form interacts better with instruction selection and the
18826 // register allocator to avoid cross-class register copies that aren't
18827 // coalescable due to a lane reference.
18828
18829 // Check the operand and see if it originates from a lane extract.
18830 SDValue Op1 = N->getOperand(Num: 1);
18831 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
18832 return SDValue();
18833
18834 // Yep, no additional predication needed. Perform the transform.
18835 SDValue IID = N->getOperand(Num: 0);
18836 SDValue Shift = N->getOperand(Num: 2);
18837 SDValue Vec = Op1.getOperand(i: 0);
18838 SDValue Lane = Op1.getOperand(i: 1);
18839 EVT ResTy = N->getValueType(ResNo: 0);
18840 EVT VecResTy;
18841 SDLoc DL(N);
18842
18843 // The vector width should be 128 bits by the time we get here, even
18844  // if it started as 64 bits (the extract_vector handling will have widened
18845  // it to 128 bits). Bail out if it is not.
18846 if (Vec.getValueSizeInBits() != 128)
18847 return SDValue();
18848
18849 if (Vec.getValueType() == MVT::v4i32)
18850 VecResTy = MVT::v4f32;
18851 else if (Vec.getValueType() == MVT::v2i64)
18852 VecResTy = MVT::v2f64;
18853 else
18854 return SDValue();
18855
18856 SDValue Convert =
18857 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift);
18858 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane);
18859}
18860
18861// AArch64 high-vector "long" operations are formed by performing the non-high
18862// version on an extract_subvector of each operand which gets the high half:
18863//
18864// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
18865//
18866// However, there are cases which don't have an extract_high explicitly, but
18867// have another operation that can be made compatible with one for free. For
18868// example:
18869//
18870// (dupv64 scalar) --> (extract_high (dup128 scalar))
18871//
18872// This routine does the actual conversion of such DUPs, once outer routines
18873// have determined that everything else is in order.
18874// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
18875// similarly here.
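//
// As an illustrative example (assumed): rewriting the 64-bit DUP this way lets
// a widening multiply such as (smull (extract_high v), (dupv64 s)) be selected
// with the "smull2" form, which reads the high halves of two 128-bit vectors.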
18876static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
18877 MVT VT = N.getSimpleValueType();
18878 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
18879 N.getConstantOperandVal(i: 1) == 0)
18880 N = N.getOperand(i: 0);
18881
18882 switch (N.getOpcode()) {
18883 case AArch64ISD::DUP:
18884 case AArch64ISD::DUPLANE8:
18885 case AArch64ISD::DUPLANE16:
18886 case AArch64ISD::DUPLANE32:
18887 case AArch64ISD::DUPLANE64:
18888 case AArch64ISD::MOVI:
18889 case AArch64ISD::MOVIshift:
18890 case AArch64ISD::MOVIedit:
18891 case AArch64ISD::MOVImsl:
18892 case AArch64ISD::MVNIshift:
18893 case AArch64ISD::MVNImsl:
18894 break;
18895 default:
18896 // FMOV could be supported, but isn't very useful, as it would only occur
18897    // if you passed a bitcast floating-point immediate to an eligible long
18898 // integer op (addl, smull, ...).
18899 return SDValue();
18900 }
18901
18902 if (!VT.is64BitVector())
18903 return SDValue();
18904
18905 SDLoc DL(N);
18906 unsigned NumElems = VT.getVectorNumElements();
18907 if (N.getValueType().is64BitVector()) {
18908 MVT ElementTy = VT.getVectorElementType();
18909 MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2);
18910 N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops());
18911 }
18912
18913 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
18914 DAG.getConstant(NumElems, DL, MVT::i64));
18915}
18916
18917static bool isEssentiallyExtractHighSubvector(SDValue N) {
18918 if (N.getOpcode() == ISD::BITCAST)
18919 N = N.getOperand(i: 0);
18920 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18921 return false;
18922 if (N.getOperand(i: 0).getValueType().isScalableVector())
18923 return false;
18924 return N.getConstantOperandAPInt(i: 1) ==
18925 N.getOperand(i: 0).getValueType().getVectorNumElements() / 2;
18926}
18927
18928/// Helper structure to keep track of ISD::SET_CC operands.
18929struct GenericSetCCInfo {
18930 const SDValue *Opnd0;
18931 const SDValue *Opnd1;
18932 ISD::CondCode CC;
18933};
18934
18935/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
18936struct AArch64SetCCInfo {
18937 const SDValue *Cmp;
18938 AArch64CC::CondCode CC;
18939};
18940
18941/// Helper structure to keep track of SetCC information.
18942union SetCCInfo {
18943 GenericSetCCInfo Generic;
18944 AArch64SetCCInfo AArch64;
18945};
18946
18947/// Helper structure to be able to read SetCC information. If set to
18948/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
18949/// GenericSetCCInfo.
18950struct SetCCInfoAndKind {
18951 SetCCInfo Info;
18952 bool IsAArch64;
18953};
18954
18955/// Check whether or not \p Op is a SET_CC operation, either a generic one
18956/// or an AArch64-lowered one.
18957/// \p SetCCInfo is filled accordingly.
18958/// \post SetCCInfo is meaningful only when this function returns true.
18960/// \return True when Op is a kind of SET_CC operation.
18961static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
18962  // If this is a setcc, this is straightforward.
18963 if (Op.getOpcode() == ISD::SETCC) {
18964 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0);
18965 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1);
18966 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
18967 SetCCInfo.IsAArch64 = false;
18968 return true;
18969 }
18970 // Otherwise, check if this is a matching csel instruction.
18971 // In other words:
18972 // - csel 1, 0, cc
18973 // - csel 0, 1, !cc
18974 if (Op.getOpcode() != AArch64ISD::CSEL)
18975 return false;
18976 // Set the information about the operands.
18977 // TODO: we want the operands of the Cmp not the csel
18978 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3);
18979 SetCCInfo.IsAArch64 = true;
18980 SetCCInfo.Info.AArch64.CC =
18981 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
18982
18983  // Check that the operands match the constraints:
18984 // (1) Both operands must be constants.
18985 // (2) One must be 1 and the other must be 0.
18986 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
18987 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
18988
18989 // Check (1).
18990 if (!TValue || !FValue)
18991 return false;
18992
18993 // Check (2).
18994 if (!TValue->isOne()) {
18995 // Update the comparison when we are interested in !cc.
18996 std::swap(a&: TValue, b&: FValue);
18997 SetCCInfo.Info.AArch64.CC =
18998 AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC);
18999 }
19000 return TValue->isOne() && FValue->isZero();
19001}
19002
19003// Returns true if Op is setcc or zext of setcc.
19004static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19005 if (isSetCC(Op, SetCCInfo&: Info))
19006 return true;
19007 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19008 isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info));
19009}
19010
19011// The folding we want to perform is:
19012// (add x, [zext] (setcc cc ...) )
19013// -->
19014// (csel x, (add x, 1), !cc ...)
19015//
19016// The latter will get matched to a CSINC instruction.
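//
// Illustrative example (assumed, for exposition only): for C source such as
//   return x + (a < b);
// this permits
//   cmp  w1, w2
//   cinc w0, w0, lt
// (cinc is an alias of csinc) rather than a cset followed by an add.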
19017static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19018 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19019 SDValue LHS = Op->getOperand(Num: 0);
19020 SDValue RHS = Op->getOperand(Num: 1);
19021 SetCCInfoAndKind InfoAndKind;
19022
19023 // If both operands are a SET_CC, then we don't want to perform this
19024 // folding and create another csel as this results in more instructions
19025 // (and higher register usage).
19026 if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) &&
19027 isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind))
19028 return SDValue();
19029
19030 // If neither operand is a SET_CC, give up.
19031 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) {
19032 std::swap(a&: LHS, b&: RHS);
19033 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind))
19034 return SDValue();
19035 }
19036
19037  // FIXME: This could be generalized to work for FP comparisons.
19038 EVT CmpVT = InfoAndKind.IsAArch64
19039 ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType()
19040 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19041 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19042 return SDValue();
19043
19044 SDValue CCVal;
19045 SDValue Cmp;
19046 SDLoc dl(Op);
19047 if (InfoAndKind.IsAArch64) {
19048 CCVal = DAG.getConstant(
19049 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19050 MVT::i32);
19051 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19052 } else
19053 Cmp = getAArch64Cmp(
19054 LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1,
19055 CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG,
19056 dl);
19057
19058 EVT VT = Op->getValueType(ResNo: 0);
19059 LHS = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL: dl, VT));
19060 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp);
19061}
19062
19063// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19064static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19065 EVT VT = N->getValueType(ResNo: 0);
19066 // Only scalar integer and vector types.
19067 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19068 return SDValue();
19069
19070 SDValue LHS = N->getOperand(Num: 0);
19071 SDValue RHS = N->getOperand(Num: 1);
19072 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19073 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19074 return SDValue();
19075
19076 auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
19077 auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1));
19078 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19079 return SDValue();
19080
19081 SDValue Op1 = LHS->getOperand(Num: 0);
19082 SDValue Op2 = RHS->getOperand(Num: 0);
19083 EVT OpVT1 = Op1.getValueType();
19084 EVT OpVT2 = Op2.getValueType();
19085 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19086 Op2.getOpcode() != AArch64ISD::UADDV ||
19087 OpVT1.getVectorElementType() != VT)
19088 return SDValue();
19089
19090 SDValue Val1 = Op1.getOperand(i: 0);
19091 SDValue Val2 = Op2.getOperand(i: 0);
19092 EVT ValVT = Val1->getValueType(ResNo: 0);
19093 SDLoc DL(N);
19094 SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2);
19095 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19096 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19097 DAG.getConstant(0, DL, MVT::i64));
19098}
19099
19100/// Perform the scalar expression combine in the form of:
19101/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19102/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
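///
/// For instance (simply instantiating the first form with c == 5):
/// CSEL(5, 1, cc) + b becomes CSINC(b+5, b, cc), i.e. "cc ? b+5 : b+1", which
/// avoids materialising both csel constants in registers.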
19103static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19104 EVT VT = N->getValueType(ResNo: 0);
19105 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19106 return SDValue();
19107
19108 SDValue LHS = N->getOperand(Num: 0);
19109 SDValue RHS = N->getOperand(Num: 1);
19110
19111  // Handle commutativity.
19112 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19113 LHS.getOpcode() != AArch64ISD::CSNEG) {
19114 std::swap(a&: LHS, b&: RHS);
19115 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19116 LHS.getOpcode() != AArch64ISD::CSNEG) {
19117 return SDValue();
19118 }
19119 }
19120
19121 if (!LHS.hasOneUse())
19122 return SDValue();
19123
19124 AArch64CC::CondCode AArch64CC =
19125 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
19126
19127  // The CSEL should include a constant one operand, and the CSNEG should
19128  // include a one or negative-one operand.
19129 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0));
19130 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
19131 if (!CTVal || !CFVal)
19132 return SDValue();
19133
19134 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19135 (CTVal->isOne() || CFVal->isOne())) &&
19136 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19137 (CTVal->isOne() || CFVal->isAllOnes())))
19138 return SDValue();
19139
19140 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19141 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19142 !CFVal->isOne()) {
19143 std::swap(a&: CTVal, b&: CFVal);
19144 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19145 }
19146
19147 SDLoc DL(N);
19148 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19149 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19150 !CFVal->isAllOnes()) {
19151 APInt C = -1 * CFVal->getAPIntValue();
19152 CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT));
19153 CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT));
19154 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
19155 }
19156
19157  // It might be neutral for larger constants, as the immediate needs to be
19158  // materialized in a register.
19159 APInt ADDC = CTVal->getAPIntValue();
19160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19161 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19162 return SDValue();
19163
19164 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19165 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19166 "Unexpected constant value");
19167
19168 SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0));
19169 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19170 SDValue Cmp = LHS.getOperand(i: 3);
19171
19172 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp);
19173}
19174
19175// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19176static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19177 EVT VT = N->getValueType(ResNo: 0);
19178 if (N->getOpcode() != ISD::ADD)
19179 return SDValue();
19180
19181 SDValue Dot = N->getOperand(Num: 0);
19182 SDValue A = N->getOperand(Num: 1);
19183  // Handle commutativity
19184 auto isZeroDot = [](SDValue Dot) {
19185 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19186 Dot.getOpcode() == AArch64ISD::SDOT) &&
19187 isZerosVector(N: Dot.getOperand(i: 0).getNode());
19188 };
19189 if (!isZeroDot(Dot))
19190 std::swap(a&: Dot, b&: A);
19191 if (!isZeroDot(Dot))
19192 return SDValue();
19193
19194 return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1),
19195 N3: Dot.getOperand(i: 2));
19196}
19197
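// Returns true if Op is an integer negation, i.e. (sub 0, x).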
19198static bool isNegatedInteger(SDValue Op) {
19199 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0));
19200}
19201
19202static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19203 SDLoc DL(Op);
19204 EVT VT = Op.getValueType();
19205 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
19206 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op);
19207}
19208
19209// Try to fold
19210//
19211// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19212//
19213// The folding helps csel to be matched with csneg without generating a
19214// redundant neg instruction, which includes the negation of the csel
19215// expansion of an abs node lowered by lowerABS.
19216static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19217 if (!isNegatedInteger(Op: SDValue(N, 0)))
19218 return SDValue();
19219
19220 SDValue CSel = N->getOperand(Num: 1);
19221 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19222 return SDValue();
19223
19224 SDValue N0 = CSel.getOperand(i: 0);
19225 SDValue N1 = CSel.getOperand(i: 1);
19226
19227  // If neither of them is a negation, the fold is not worthwhile, as it
19228  // introduces two additional negations while removing only one.
19229 if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1))
19230 return SDValue();
19231
19232 SDValue N0N = getNegatedInteger(Op: N0, DAG);
19233 SDValue N1N = getNegatedInteger(Op: N1, DAG);
19234
19235 SDLoc DL(N);
19236 EVT VT = CSel.getValueType();
19237 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2),
19238 N4: CSel.getOperand(i: 3));
19239}
19240
19241// The basic add/sub long vector instructions have variants with "2" on the end
19242// which act on the high-half of their inputs. They are normally matched by
19243// patterns like:
19244//
19245// (add (zeroext (extract_high LHS)),
19246// (zeroext (extract_high RHS)))
19247// -> uaddl2 vD, vN, vM
19248//
19249// However, if one of the extracts is something like a duplicate, this
19250// instruction can still be used profitably. This function puts the DAG into a
19251// more appropriate form for those patterns to trigger.
19252static SDValue performAddSubLongCombine(SDNode *N,
19253 TargetLowering::DAGCombinerInfo &DCI) {
19254 SelectionDAG &DAG = DCI.DAG;
19255 if (DCI.isBeforeLegalizeOps())
19256 return SDValue();
19257
19258 MVT VT = N->getSimpleValueType(ResNo: 0);
19259 if (!VT.is128BitVector()) {
19260 if (N->getOpcode() == ISD::ADD)
19261 return performSetccAddFolding(Op: N, DAG);
19262 return SDValue();
19263 }
19264
19265 // Make sure both branches are extended in the same way.
19266 SDValue LHS = N->getOperand(Num: 0);
19267 SDValue RHS = N->getOperand(Num: 1);
19268 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19269 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19270 LHS.getOpcode() != RHS.getOpcode())
19271 return SDValue();
19272
19273 unsigned ExtType = LHS.getOpcode();
19274
19275 // It's not worth doing if at least one of the inputs isn't already an
19276 // extract, but we don't know which it'll be so we have to try both.
19277 if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) {
19278 RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG);
19279 if (!RHS.getNode())
19280 return SDValue();
19281
19282 RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS);
19283 } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) {
19284 LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG);
19285 if (!LHS.getNode())
19286 return SDValue();
19287
19288 LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS);
19289 }
19290
19291 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS);
19292}
19293
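// Returns true if Op is a SUBS node whose value result is unused, i.e. it is
// only used to set the NZCV flags (a compare).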
19294static bool isCMP(SDValue Op) {
19295 return Op.getOpcode() == AArch64ISD::SUBS &&
19296 !Op.getNode()->hasAnyUseOfValue(Value: 0);
19297}
19298
19299// (CSEL 1 0 CC Cond) => CC
19300// (CSEL 0 1 CC Cond) => !CC
19301static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19302 if (Op.getOpcode() != AArch64ISD::CSEL)
19303 return std::nullopt;
19304 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
19305 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19306 return std::nullopt;
19307 SDValue OpLHS = Op.getOperand(i: 0);
19308 SDValue OpRHS = Op.getOperand(i: 1);
19309 if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS))
19310 return CC;
19311 if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS))
19312 return getInvertedCondCode(Code: CC);
19313
19314 return std::nullopt;
19315}
19316
19317// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19318// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19319static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19320 SDValue CmpOp = Op->getOperand(Num: 2);
19321 if (!isCMP(Op: CmpOp))
19322 return SDValue();
19323
19324 if (IsAdd) {
19325 if (!isOneConstant(V: CmpOp.getOperand(i: 1)))
19326 return SDValue();
19327 } else {
19328 if (!isNullConstant(V: CmpOp.getOperand(i: 0)))
19329 return SDValue();
19330 }
19331
19332 SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1);
19333 auto CC = getCSETCondCode(Op: CsetOp);
19334 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19335 return SDValue();
19336
19337 return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(),
19338 N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1),
19339 N3: CsetOp.getOperand(i: 3));
19340}
19341
19342// (ADC x 0 cond) => (CINC x HS cond)
19343static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19344 SDValue LHS = N->getOperand(Num: 0);
19345 SDValue RHS = N->getOperand(Num: 1);
19346 SDValue Cond = N->getOperand(Num: 2);
19347
19348 if (!isNullConstant(V: RHS))
19349 return SDValue();
19350
19351 EVT VT = N->getValueType(ResNo: 0);
19352 SDLoc DL(N);
19353
19354 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19355 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19356 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond);
19357}
19358
19359// Transform vector add(zext i8 to i32, zext i8 to i32)
19360// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19361// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19362// extends.
19363static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19364 EVT VT = N->getValueType(ResNo: 0);
19365 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19366 (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
19367 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) ||
19368 (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
19369 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) ||
19370 N->getOperand(Num: 0).getOperand(i: 0).getValueType() !=
19371 N->getOperand(Num: 1).getOperand(i: 0).getValueType())
19372 return SDValue();
19373
19374 SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0);
19375 SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0);
19376 EVT InVT = N0.getValueType();
19377
19378 EVT S1 = InVT.getScalarType();
19379 EVT S2 = VT.getScalarType();
19380 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19381 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19382 SDLoc DL(N);
19383 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
19384 VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()),
19385 EC: VT.getVectorElementCount());
19386 SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0);
19387 SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1);
19388 SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1);
19389 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: NewOp);
19390 }
19391 return SDValue();
19392}
19393
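// Combine BUILD_VECTOR nodes in a few target-specific ways:
//  - a v4f16/v4bf16 build of fp_rounds of lane-0/1 extracts from v2f64 sources
//    is lowered via FCVTXN node(s), a concat and a single fp_round,
//  - a v2f64 build of fp_extends of adjacent extracts from a v4f16/v4bf16
//    source is lowered via an fp_extend to f32, an extract_subvector and a
//    second fp_extend,
//  - a v2i32 build of two contiguous extracts becomes an extract_subvector of
//    an any-extended vector.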
19394static SDValue performBuildVectorCombine(SDNode *N,
19395 TargetLowering::DAGCombinerInfo &DCI,
19396 SelectionDAG &DAG) {
19397 SDLoc DL(N);
19398 EVT VT = N->getValueType(ResNo: 0);
19399
19400 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19401 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1),
19402 Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3);
19403 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19404 Elt1->getOpcode() == ISD::FP_ROUND &&
19405 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
19406 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
19407 Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) &&
19408 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19409 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19410 // Constant index.
19411 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
19412 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
19413 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
19414 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
19415 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
19416 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
19417 SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
19418 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19419 SDValue HighLanes;
19420 if (Elt2->getOpcode() == ISD::UNDEF &&
19421 Elt3->getOpcode() == ISD::UNDEF) {
19422 HighLanes = DAG.getUNDEF(MVT::v2f32);
19423 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19424 Elt3->getOpcode() == ISD::FP_ROUND &&
19425 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) &&
19426 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) &&
19427 Elt2->getConstantOperandVal(Num: 1) ==
19428 Elt3->getConstantOperandVal(Num: 1) &&
19429 Elt2->getOperand(Num: 0)->getOpcode() ==
19430 ISD::EXTRACT_VECTOR_ELT &&
19431 Elt3->getOperand(Num: 0)->getOpcode() ==
19432 ISD::EXTRACT_VECTOR_ELT &&
19433 // Constant index.
19434 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) &&
19435 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) &&
19436 Elt2->getOperand(Num: 0)->getOperand(Num: 0) ==
19437 Elt3->getOperand(Num: 0)->getOperand(Num: 0) &&
19438 Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
19439 Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
19440 SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0);
19441 HighLanes =
19442 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19443 }
19444 if (HighLanes) {
19445 SDValue DoubleToSingleSticky =
19446 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19447 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19448 DoubleToSingleSticky, HighLanes);
19449 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat,
19450 N2: Elt0->getOperand(Num: 1));
19451 }
19452 }
19453 }
19454 }
19455
19456 if (VT == MVT::v2f64) {
19457 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
19458 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19459 Elt1->getOpcode() == ISD::FP_EXTEND &&
19460 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19461 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19462 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
19463 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
19464 // Constant index.
19465 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
19466 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
19467 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 ==
19468 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) &&
19469 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19470 // ResultType's known minimum vector length.
19471 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) %
19472 VT.getVectorMinNumElements() ==
19473 0) {
19474 SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
19475 if (SrcVec.getValueType() == MVT::v4f16 ||
19476 SrcVec.getValueType() == MVT::v4bf16) {
19477 SDValue HalfToSingle =
19478 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19479 SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1);
19480 SDValue Extract = DAG.getNode(
19481 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
19482 HalfToSingle, SubvectorIdx);
19483 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract);
19484 }
19485 }
19486 }
19487
19488 // A build vector of two extracted elements is equivalent to an
19489 // extract subvector where the inner vector is any-extended to the
19490 // extract_vector_elt VT.
19491 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19492 // (extract_elt_iXX_to_i32 vec Idx+1))
19493 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19494
19495 // For now, only consider the v2i32 case, which arises as a result of
19496 // legalization.
19497 if (VT != MVT::v2i32)
19498 return SDValue();
19499
19500 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
19501 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19502 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19503 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19504 // Constant index.
19505 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
19506 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
19507 // Both EXTRACT_VECTOR_ELT from same vector...
19508 Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) &&
19509 // ... and contiguous. First element's index +1 == second element's index.
19510 Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) &&
19511 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19512 // ResultType's known minimum vector length.
19513 Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) {
19514 SDValue VecToExtend = Elt0->getOperand(Num: 0);
19515 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19516 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT))
19517 return SDValue();
19518
19519 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL);
19520
19521 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend);
19522 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19523 SubvectorIdx);
19524 }
19525
19526 return SDValue();
19527}
19528
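// Push a truncate through a DUP: trunc(dup scalar) -> dup(trunc scalar) for
// 64-bit fixed-length results, truncating an i64 scalar to i32 when needed.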
19529static SDValue performTruncateCombine(SDNode *N,
19530 SelectionDAG &DAG) {
19531 EVT VT = N->getValueType(ResNo: 0);
19532 SDValue N0 = N->getOperand(Num: 0);
19533 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19534 N0.getOpcode() == AArch64ISD::DUP) {
19535 SDValue Op = N0.getOperand(i: 0);
19536 if (VT.getScalarType() == MVT::i32 &&
19537 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19538 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19539 return DAG.getNode(Opcode: N0.getOpcode(), DL: SDLoc(N), VT, Operand: Op);
19540 }
19541
19542 return SDValue();
19543}
19544
19545// Check whether a node is an extend or shift operand.
19546static bool isExtendOrShiftOperand(SDValue N) {
19547 unsigned Opcode = N.getOpcode();
19548 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19549 EVT SrcVT;
19550 if (Opcode == ISD::SIGN_EXTEND_INREG)
19551 SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT();
19552 else
19553 SrcVT = N.getOperand(i: 0).getValueType();
19554
19555 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19556 } else if (Opcode == ISD::AND) {
19557 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
19558 if (!CSD)
19559 return false;
19560 uint64_t AndMask = CSD->getZExtValue();
19561 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19562 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19563 return isa<ConstantSDNode>(Val: N.getOperand(i: 1));
19564 }
19565
19566 return false;
19567}
19568
19569// (N - Y) + Z --> (Z - Y) + N
19570// when N is an extend or shift operand
19571static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19572 SelectionDAG &DAG) {
19573 auto IsOneUseExtend = [](SDValue N) {
19574 return N.hasOneUse() && isExtendOrShiftOperand(N);
19575 };
19576
19577  // DAGCombiner will revert the combination when Z is constant, causing an
19578  // infinite loop, so don't enable the combination when Z is constant.
19579  // If Z is a one-use extend or shift, we also can't do the optimization,
19580  // as it would fall into the same infinite loop.
19581 if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z))
19582 return SDValue();
19583
19584 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19585 return SDValue();
19586
19587 SDValue Shift = SUB.getOperand(i: 0);
19588 if (!IsOneUseExtend(Shift))
19589 return SDValue();
19590
19591 SDLoc DL(N);
19592 EVT VT = N->getValueType(ResNo: 0);
19593
19594 SDValue Y = SUB.getOperand(i: 1);
19595 SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y);
19596 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift);
19597}
19598
19599static SDValue performAddCombineForShiftedOperands(SDNode *N,
19600 SelectionDAG &DAG) {
19601 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19602 // commutative.
19603 if (N->getOpcode() != ISD::ADD)
19604 return SDValue();
19605
19606 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19607 // shifted register is only available for i32 and i64.
19608 EVT VT = N->getValueType(ResNo: 0);
19609 if (VT != MVT::i32 && VT != MVT::i64)
19610 return SDValue();
19611
19612 SDLoc DL(N);
19613 SDValue LHS = N->getOperand(Num: 0);
19614 SDValue RHS = N->getOperand(Num: 1);
19615
19616 if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG))
19617 return Val;
19618 if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG))
19619 return Val;
19620
19621 uint64_t LHSImm = 0, RHSImm = 0;
19622  // If both operands are shifted by an immediate and the shift amount is not
19623  // greater than 4 for one operand, swap LHS and RHS to put the operand with
19624  // the smaller shift amount on the RHS.
19625  //
19626  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19627  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19628  // with LSL (shift > 4). For other processors, this transform is a no-op for
19629  // both performance and correctness.
19630 if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) &&
19631 isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 &&
19632 RHSImm > 4 && LHS.hasOneUse())
19633 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS);
19634
19635 return SDValue();
19636}
19637
19638// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
19639// This combine reassociates it back to allow the creation of more mls instructions.
19640static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19641 if (N->getOpcode() != ISD::SUB)
19642 return SDValue();
19643
19644 SDValue Add = N->getOperand(Num: 1);
19645 SDValue X = N->getOperand(Num: 0);
19646 if (Add.getOpcode() != ISD::ADD)
19647 return SDValue();
19648
19649 if (!Add.hasOneUse())
19650 return SDValue();
19651 if (DAG.isConstantIntBuildVectorOrConstantInt(N: peekThroughBitcasts(V: X)))
19652 return SDValue();
19653
19654 SDValue M1 = Add.getOperand(i: 0);
19655 SDValue M2 = Add.getOperand(i: 1);
19656 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19657 M1.getOpcode() != AArch64ISD::UMULL)
19658 return SDValue();
19659 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19660 M2.getOpcode() != AArch64ISD::UMULL)
19661 return SDValue();
19662
19663 EVT VT = N->getValueType(ResNo: 0);
19664 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1);
19665 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2);
19666}
19667
19668// Combine into mla/mls.
19669// This works on the patterns of:
19670// add v1, (mul v2, v3)
19671// sub v1, (mul v2, v3)
19672// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19673// It will transform the add/sub to a scalable version, so that we can
19674// make use of SVE's MLA/MLS that will be generated for that pattern
19675static SDValue
19676performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19677 SelectionDAG &DAG = DCI.DAG;
19678 // Make sure that the types are legal
19679 if (!DCI.isAfterLegalizeDAG())
19680 return SDValue();
19681 // Before using SVE's features, check first if it's available.
19682 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19683 return SDValue();
19684
19685 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19686 return SDValue();
19687
19688 if (!N->getValueType(ResNo: 0).isFixedLengthVector())
19689 return SDValue();
19690
19691 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19692 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19693 return SDValue();
19694
19695 if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero())
19696 return SDValue();
19697
19698 SDValue MulValue = Op1->getOperand(Num: 0);
19699 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19700 return SDValue();
19701
19702 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19703 return SDValue();
19704
19705 EVT ScalableVT = MulValue.getValueType();
19706 if (!ScalableVT.isScalableVector())
19707 return SDValue();
19708
19709 SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0);
19710 SDValue NewValue =
19711 DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue});
19712 return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue);
19713 };
19714
19715 if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1)))
19716 return res;
19717 else if (N->getOpcode() == ISD::ADD)
19718 return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0));
19719
19720 return SDValue();
19721}
19722
19723// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19724// help, for example, to produce ssra from sshr+add.
19725static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19726 EVT VT = N->getValueType(ResNo: 0);
19727 if (VT != MVT::i64)
19728 return SDValue();
19729 SDValue Op0 = N->getOperand(Num: 0);
19730 SDValue Op1 = N->getOperand(Num: 1);
19731
19732 // At least one of the operands should be an extract, and the other should be
19733 // something that is easy to convert to v1i64 type (in this case a load).
19734 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19735 Op0.getOpcode() != ISD::LOAD)
19736 return SDValue();
19737 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19738 Op1.getOpcode() != ISD::LOAD)
19739 return SDValue();
19740
19741 SDLoc DL(N);
19742 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19743 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19744 Op0 = Op0.getOperand(i: 0);
19745 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19746 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19747 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19748 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19749 Op1 = Op1.getOperand(i: 0);
19750 } else
19751 return SDValue();
19752
19753 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19754 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19755 DAG.getConstant(0, DL, MVT::i64));
19756}
19757
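// Collect into Loads the simple load(s) feeding B: either a single load, the
// loads of a one-use build_vector/concat_vectors, or the loads at the leaves
// of the specific shuffle-of-concats pattern shown below. Returns false if B
// does not match any of these forms.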
19758static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19759 SDValue BV = peekThroughOneUseBitcasts(V: B);
19760 if (!BV->hasOneUse())
19761 return false;
19762 if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) {
19763 if (!Ld || !Ld->isSimple())
19764 return false;
19765 Loads.push_back(Elt: Ld);
19766 return true;
19767 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19768 BV.getOpcode() == ISD::CONCAT_VECTORS) {
19769 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19770 auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op));
19771 if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse())
19772 return false;
19773 Loads.push_back(Elt: Ld);
19774 }
19775 return true;
19776 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19777 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19778 // are lowered. Note that this only comes up because we do not always visit
19779 // operands before uses. After that is fixed this can be removed and in the
19780 // meantime this is fairly specific to the lowering we expect from IR.
19781 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19782 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19783 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19784 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19785 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19786 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19787 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19788 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19789 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19790 if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19791 B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS ||
19792 B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
19793 B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
19794 B.getOperand(i: 1).getNumOperands() != 4)
19795 return false;
19796 auto SV1 = cast<ShuffleVectorSDNode>(Val&: B);
19797 auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0));
19798 int NumElts = B.getValueType().getVectorNumElements();
19799 int NumSubElts = NumElts / 4;
19800 for (int I = 0; I < NumSubElts; I++) {
19801 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19802 if (SV1->getMaskElt(Idx: I) != I ||
19803 SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
19804 SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 ||
19805 SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts)
19806 return false;
19807 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19808 if (SV2->getMaskElt(Idx: I) != I ||
19809 SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
19810 SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts)
19811 return false;
19812 }
19813 auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0));
19814 auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1));
19815 auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0));
19816 auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0));
19817 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19818 !Ld2->isSimple() || !Ld3->isSimple())
19819 return false;
19820 Loads.push_back(Elt: Ld0);
19821 Loads.push_back(Elt: Ld1);
19822 Loads.push_back(Elt: Ld2);
19823 Loads.push_back(Elt: Ld3);
19824 return true;
19825 }
19826 return false;
19827}
19828
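// Returns true if Op0 and Op1 are identical trees of add/sub/extend nodes
// whose leaf loads are the same size and consecutive in memory (each of Op1's
// loads directly follows the corresponding load in Op0). NumSubLoads records
// how many loads make up each leaf.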
19829static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19830 SelectionDAG &DAG,
19831 unsigned &NumSubLoads) {
19832 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19833 return false;
19834
19835 SmallVector<LoadSDNode *> Loads0, Loads1;
19836 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
19837 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
19838 if (NumSubLoads && Loads0.size() != NumSubLoads)
19839 return false;
19840 NumSubLoads = Loads0.size();
19841 return Loads0.size() == Loads1.size() &&
19842 all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) {
19843 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19844 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19845 DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L),
19846 Bytes: Size / 8, Dist: 1);
19847 });
19848 }
19849
19850 if (Op0.getOpcode() != Op1.getOpcode())
19851 return false;
19852
19853 switch (Op0.getOpcode()) {
19854 case ISD::ADD:
19855 case ISD::SUB:
19856 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
19857 DAG, NumSubLoads) &&
19858 areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1),
19859 DAG, NumSubLoads);
19860 case ISD::SIGN_EXTEND:
19861 case ISD::ANY_EXTEND:
19862 case ISD::ZERO_EXTEND:
19863 EVT XVT = Op0.getOperand(i: 0).getValueType();
19864 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
19865 XVT.getScalarSizeInBits() != 32)
19866 return false;
19867 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
19868 DAG, NumSubLoads);
19869 }
19870 return false;
19871}
19872
19873// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
19874// into a single load of twice the size, from which we extract the bottom and
19875// top parts so that the shl can use a shll2 instruction. The two loads in that
19876// example can also be larger trees of instructions, which are identical except
19877// for the leaves, which are all loads offset from the LHS, including
19878// buildvectors of multiple loads. For example, the RHS tree could be
19879// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
19880// Whilst it can be common for the larger loads to replace LDP instructions
19881// (which doesn't gain anything on its own), the larger loads can help create
19882// more efficient code, and in buildvectors prevent the need for ld1 lane
19883// inserts, which can be slower than normal loads.
19884static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
19885 EVT VT = N->getValueType(ResNo: 0);
19886 if (!VT.isFixedLengthVector() ||
19887 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
19888 VT.getScalarSizeInBits() != 64))
19889 return SDValue();
19890
19891 SDValue Other = N->getOperand(Num: 0);
19892 SDValue Shift = N->getOperand(Num: 1);
19893 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
19894 std::swap(a&: Shift, b&: Other);
19895 APInt ShiftAmt;
19896 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
19897 !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt))
19898 return SDValue();
19899
19900 if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) ||
19901 !ISD::isExtOpcode(Opcode: Other.getOpcode()) ||
19902 Shift.getOperand(i: 0).getOperand(i: 0).getValueType() !=
19903 Other.getOperand(i: 0).getValueType() ||
19904 !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse())
19905 return SDValue();
19906
19907 SDValue Op0 = Other.getOperand(i: 0);
19908 SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0);
19909
19910 unsigned NumSubLoads = 0;
19911 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
19912 return SDValue();
19913
19914  // Attempt to rule out some unprofitable cases using heuristics (some working
19915  // around suboptimal code generation), notably if the extend would not be able
19916  // to use ushll2 instructions because the types are not large enough. Otherwise
19917  // zips will need to be created, which can increase the instruction count.
19918 unsigned NumElts = Op0.getValueType().getVectorNumElements();
19919 unsigned NumSubElts = NumElts / NumSubLoads;
19920 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
19921 (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() &&
19922 Op0.getValueType().getSizeInBits() < 128 &&
19923 !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType())))
19924 return SDValue();
19925
19926 // Recreate the tree with the new combined loads.
19927 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
19928 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
19929 EVT DVT =
19930 Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19931
19932 SmallVector<LoadSDNode *> Loads0, Loads1;
19933 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
19934 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
19935 EVT LoadVT = EVT::getVectorVT(
19936 Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(),
19937 NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size());
19938 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19939
19940 SmallVector<SDValue> NewLoads;
19941 for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) {
19942 SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(),
19943 Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(),
19944 Alignment: L0->getOriginalAlign());
19945 DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1));
19946 DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1));
19947 NewLoads.push_back(Elt: Load);
19948 }
19949 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads);
19950 }
19951
19952 SmallVector<SDValue> Ops;
19953 for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values()))
19954 Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG));
19955 return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops);
19956 };
19957 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
19958
19959 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
19960 int Hi = NumSubElts, Lo = 0;
19961 for (unsigned i = 0; i < NumSubLoads; i++) {
19962 for (unsigned j = 0; j < NumSubElts; j++) {
19963 LowMask[i * NumSubElts + j] = Lo++;
19964 HighMask[i * NumSubElts + j] = Hi++;
19965 }
19966 Lo += NumSubElts;
19967 Hi += NumSubElts;
19968 }
19969 SDLoc DL(N);
19970 SDValue Ext0, Ext1;
19971  // Extract the top and bottom lanes, then extend the result. Alternatively,
19972  // extend the result and then extract the lanes if the two operands match, as
19973  // that produces slightly smaller code.
19974 if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) {
19975 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
19976 NewOp, DAG.getConstant(0, DL, MVT::i64));
19977 SDValue SubH =
19978 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
19979 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19980 SDValue Extr0 =
19981 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
19982 SDValue Extr1 =
19983 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
19984 Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0);
19985 Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1);
19986 } else {
19987 EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
19988 SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp);
19989 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19990 DAG.getConstant(0, DL, MVT::i64));
19991 SDValue SubH =
19992 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
19993 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
19994 Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
19995 Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
19996 }
19997 SDValue NShift =
19998 DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1));
19999 return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift);
20000}
20001
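// Try each of the add/sub specific combines above before falling back to the
// generic long-operation combine.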
20002static SDValue performAddSubCombine(SDNode *N,
20003 TargetLowering::DAGCombinerInfo &DCI) {
20004 // Try to change sum of two reductions.
20005 if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG))
20006 return Val;
20007 if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG))
20008 return Val;
20009 if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG))
20010 return Val;
20011 if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG))
20012 return Val;
20013 if (SDValue Val = performVectorAddSubExtCombine(N, DAG&: DCI.DAG))
20014 return Val;
20015 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG))
20016 return Val;
20017 if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG))
20018 return Val;
20019 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20020 return Val;
20021 if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG))
20022 return Val;
20023
20024 if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG))
20025 return Val;
20026
20027 return performAddSubLongCombine(N, DCI);
20028}
20029
20030// Massage DAGs which we can use the high-half "long" operations on into
20031// something isel will recognize better. E.g.
20032//
20033// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20034//     (aarch64_neon_umull (extract_high (v2i64 vec))
20035//                         (extract_high (v2i64 (dup128 scalar))))
20036//
20037static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20038 TargetLowering::DAGCombinerInfo &DCI,
20039 SelectionDAG &DAG) {
20040 if (DCI.isBeforeLegalizeOps())
20041 return SDValue();
20042
20043 SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1);
20044 SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2);
20045 assert(LHS.getValueType().is64BitVector() &&
20046 RHS.getValueType().is64BitVector() &&
20047 "unexpected shape for long operation");
20048
20049 // Either node could be a DUP, but it's not worth doing both of them (you'd
20050 // just as well use the non-high version) so look for a corresponding extract
20051 // operation on the other "wing".
20052 if (isEssentiallyExtractHighSubvector(N: LHS)) {
20053 RHS = tryExtendDUPToExtractHigh(N: RHS, DAG);
20054 if (!RHS.getNode())
20055 return SDValue();
20056 } else if (isEssentiallyExtractHighSubvector(N: RHS)) {
20057 LHS = tryExtendDUPToExtractHigh(N: LHS, DAG);
20058 if (!LHS.getNode())
20059 return SDValue();
20060 } else
20061 return SDValue();
20062
20063 if (IID == Intrinsic::not_intrinsic)
20064 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS);
20065
20066 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20067 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
20068}
20069
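// Fold a vector shift intrinsic with a constant shift amount into the
// corresponding AArch64ISD immediate-shift node, or drop the intrinsic
// entirely when the shift amount is zero (except for sqshlu). i64 scalar
// operands are handled via v1i64.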
20070static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20071 MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType();
20072 unsigned ElemBits = ElemTy.getSizeInBits();
20073
20074 int64_t ShiftAmount;
20075 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) {
20076 APInt SplatValue, SplatUndef;
20077 unsigned SplatBitSize;
20078 bool HasAnyUndefs;
20079 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20080 HasAnyUndefs, MinSplatBits: ElemBits) ||
20081 SplatBitSize != ElemBits)
20082 return SDValue();
20083
20084 ShiftAmount = SplatValue.getSExtValue();
20085 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
20086 ShiftAmount = CVN->getSExtValue();
20087 } else
20088 return SDValue();
20089
20090 // If the shift amount is zero, remove the shift intrinsic.
20091 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20092 return N->getOperand(Num: 1);
20093
20094 unsigned Opcode;
20095 bool IsRightShift;
20096 switch (IID) {
20097 default:
20098 llvm_unreachable("Unknown shift intrinsic");
20099 case Intrinsic::aarch64_neon_sqshl:
20100 Opcode = AArch64ISD::SQSHL_I;
20101 IsRightShift = false;
20102 break;
20103 case Intrinsic::aarch64_neon_uqshl:
20104 Opcode = AArch64ISD::UQSHL_I;
20105 IsRightShift = false;
20106 break;
20107 case Intrinsic::aarch64_neon_srshl:
20108 Opcode = AArch64ISD::SRSHR_I;
20109 IsRightShift = true;
20110 break;
20111 case Intrinsic::aarch64_neon_urshl:
20112 Opcode = AArch64ISD::URSHR_I;
20113 IsRightShift = true;
20114 break;
20115 case Intrinsic::aarch64_neon_sqshlu:
20116 Opcode = AArch64ISD::SQSHLU_I;
20117 IsRightShift = false;
20118 break;
20119 case Intrinsic::aarch64_neon_sshl:
20120 case Intrinsic::aarch64_neon_ushl:
20121    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20122    // left shift for positive shift amounts. For negative shifts we can use
20123    // VASHR/VLSHR as appropriate.
20124 if (ShiftAmount < 0) {
20125 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20126 : AArch64ISD::VLSHR;
20127 ShiftAmount = -ShiftAmount;
20128 } else
20129 Opcode = AArch64ISD::VSHL;
20130 IsRightShift = false;
20131 break;
20132 }
20133
20134 EVT VT = N->getValueType(ResNo: 0);
20135 SDValue Op = N->getOperand(Num: 1);
20136 SDLoc dl(N);
20137 if (VT == MVT::i64) {
20138 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20139 VT = MVT::v1i64;
20140 }
20141
20142 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20143 Op = DAG.getNode(Opcode, dl, VT, Op,
20144 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20145 if (N->getValueType(0) == MVT::i64)
20146 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20147 DAG.getConstant(0, dl, MVT::i64));
20148 return Op;
20149 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20150 Op = DAG.getNode(Opcode, dl, VT, Op,
20151 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20152 if (N->getValueType(0) == MVT::i64)
20153 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20154 DAG.getConstant(0, dl, MVT::i64));
20155 return Op;
20156 }
20157
20158 return SDValue();
20159}
20160
20161// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20162// the intrinsics must be legal and take an i32, this means there's almost
20163// certainly going to be a zext in the DAG which we can eliminate.
20164static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20165 SDValue AndN = N->getOperand(Num: 2);
20166 if (AndN.getOpcode() != ISD::AND)
20167 return SDValue();
20168
20169 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1));
20170 if (!CMask || CMask->getZExtValue() != Mask)
20171 return SDValue();
20172
20173 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20174 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20175}
20176
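// Lower an across-vector-lanes reduction intrinsic to the corresponding
// AArch64ISD node and extract lane 0 of the result.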
20177static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20178 SelectionDAG &DAG) {
20179 SDLoc dl(N);
20180 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20181 DAG.getNode(Opc, dl,
20182 N->getOperand(1).getSimpleValueType(),
20183 N->getOperand(1)),
20184 DAG.getConstant(0, dl, MVT::i64));
20185}
20186
20187static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20188 SDLoc DL(N);
20189 SDValue Op1 = N->getOperand(Num: 1);
20190 SDValue Op2 = N->getOperand(Num: 2);
20191 EVT ScalarTy = Op2.getValueType();
20192 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20193 ScalarTy = MVT::i32;
20194
20195  // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
20196 SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0));
20197 SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2);
20198 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step);
20199 SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1);
20200 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base);
20201}
20202
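// Lower a predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, promoting
// i8/i16 scalars to i32 first.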
20203static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20204 SDLoc dl(N);
20205 SDValue Scalar = N->getOperand(Num: 3);
20206 EVT ScalarTy = Scalar.getValueType();
20207
20208 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20209 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20210
20211 SDValue Passthru = N->getOperand(Num: 1);
20212 SDValue Pred = N->getOperand(Num: 2);
20213 return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL: dl, VT: N->getValueType(ResNo: 0),
20214 N1: Pred, N2: Scalar, N3: Passthru);
20215}
20216
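// Lower an SVE EXT intrinsic by bitcasting the operands to bytes, scaling the
// index by the element size and emitting AArch64ISD::EXT on the byte vectors.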
20217static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20218 SDLoc dl(N);
20219 LLVMContext &Ctx = *DAG.getContext();
20220 EVT VT = N->getValueType(ResNo: 0);
20221
20222 assert(VT.isScalableVector() && "Expected a scalable vector.");
20223
20224 // Current lowering only supports the SVE-ACLE types.
20225 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20226 return SDValue();
20227
20228 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20229 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20230 EVT ByteVT =
20231 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20232
20233  // Convert everything to the domain of EXT (i.e. bytes).
20234 SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 1));
20235 SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 2));
20236 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20237 DAG.getConstant(ElemSize, dl, MVT::i32));
20238
20239 SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2);
20240 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: EXT);
20241}
20242
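// If the comparator of an SVE wide-compare intrinsic is a splatted constant
// that fits the instruction's immediate range, emit a SETCC_MERGE_ZERO against
// a splat of that immediate instead.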
20243static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20244 TargetLowering::DAGCombinerInfo &DCI,
20245 SelectionDAG &DAG) {
20246 if (DCI.isBeforeLegalize())
20247 return SDValue();
20248
20249 SDValue Comparator = N->getOperand(Num: 3);
20250 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20251 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20252 unsigned IID = getIntrinsicID(N);
20253 EVT VT = N->getValueType(ResNo: 0);
20254 EVT CmpVT = N->getOperand(Num: 2).getValueType();
20255 SDValue Pred = N->getOperand(Num: 1);
20256 SDValue Imm;
20257 SDLoc DL(N);
20258
20259 switch (IID) {
20260 default:
20261 llvm_unreachable("Called with wrong intrinsic!");
20262 break;
20263
20264 // Signed comparisons
20265 case Intrinsic::aarch64_sve_cmpeq_wide:
20266 case Intrinsic::aarch64_sve_cmpne_wide:
20267 case Intrinsic::aarch64_sve_cmpge_wide:
20268 case Intrinsic::aarch64_sve_cmpgt_wide:
20269 case Intrinsic::aarch64_sve_cmplt_wide:
20270 case Intrinsic::aarch64_sve_cmple_wide: {
20271 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
20272 int64_t ImmVal = CN->getSExtValue();
20273 if (ImmVal >= -16 && ImmVal <= 15)
20274 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20275 else
20276 return SDValue();
20277 }
20278 break;
20279 }
20280 // Unsigned comparisons
20281 case Intrinsic::aarch64_sve_cmphs_wide:
20282 case Intrinsic::aarch64_sve_cmphi_wide:
20283 case Intrinsic::aarch64_sve_cmplo_wide:
20284 case Intrinsic::aarch64_sve_cmpls_wide: {
20285 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
20286 uint64_t ImmVal = CN->getZExtValue();
20287 if (ImmVal <= 127)
20288 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20289 else
20290 return SDValue();
20291 }
20292 break;
20293 }
20294 }
20295
20296 if (!Imm)
20297 return SDValue();
20298
20299 SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm);
20300 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred,
20301 N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC));
20302 }
20303
20304 return SDValue();
20305}
20306
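// Emit a PTEST of Op under the predicate Pg and materialise the requested
// condition as a zero/one value of type VT using a CSEL on the flags.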
20307static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20308 AArch64CC::CondCode Cond) {
20309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20310
20311 SDLoc DL(Op);
20312 assert(Op.getValueType().isScalableVector() &&
20313 TLI.isTypeLegal(Op.getValueType()) &&
20314 "Expected legal scalable vector type!");
20315 assert(Op.getValueType() == Pg.getValueType() &&
20316 "Expected same type for PTEST operands");
20317
20318 // Ensure target specific opcodes are using legal type.
20319 EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT);
20320 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT);
20321 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT);
20322
20323 // Ensure operands have type nxv16i1.
20324 if (Op.getValueType() != MVT::nxv16i1) {
20325 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
20326 isZeroingInactiveLanes(Op))
20327 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20328 else
20329 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20330 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20331 }
20332
20333 // Set condition code (CC) flags.
20334 SDValue Test = DAG.getNode(
20335 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
20336 DL, MVT::Other, Pg, Op);
20337
20338 // Convert CC to integer based on requested condition.
20339 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20340 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20341 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test);
20342 return DAG.getZExtOrTrunc(Op: Res, DL, VT);
20343}
20344
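// Lower an SVE integer reduction intrinsic to the predicated reduction node
// Opc and extract element 0 of the resulting vector.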
20345static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20346 SelectionDAG &DAG) {
20347 SDLoc DL(N);
20348
20349 SDValue Pred = N->getOperand(Num: 1);
20350 SDValue VecToReduce = N->getOperand(Num: 2);
20351
20352 // NOTE: The integer reduction's result type is not always linked to the
20353 // operand's element type so we construct it from the intrinsic's result type.
20354 EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0));
20355 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
20356
20357 // SVE reductions set the whole vector register with the first element
20358 // containing the reduction result, which we'll now extract.
20359 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20360 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20361 N2: Zero);
20362}
20363
20364static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20365 SelectionDAG &DAG) {
20366 SDLoc DL(N);
20367
20368 SDValue Pred = N->getOperand(Num: 1);
20369 SDValue VecToReduce = N->getOperand(Num: 2);
20370
20371 EVT ReduceVT = VecToReduce.getValueType();
20372 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
20373
20374 // SVE reductions set the whole vector register with the first element
20375 // containing the reduction result, which we'll now extract.
20376 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20377 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20378 N2: Zero);
20379}
20380
20381static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20382 SelectionDAG &DAG) {
20383 SDLoc DL(N);
20384
20385 SDValue Pred = N->getOperand(Num: 1);
20386 SDValue InitVal = N->getOperand(Num: 2);
20387 SDValue VecToReduce = N->getOperand(Num: 3);
20388 EVT ReduceVT = VecToReduce.getValueType();
20389
20390 // Ordered reductions use the first lane of the result vector as the
20391 // reduction's initial value.
20392 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20393 InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT,
20394 N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero);
20395
20396 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce);
20397
20398 // SVE reductions set the whole vector register with the first element
20399 // containing the reduction result, which we'll now extract.
20400 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
20401 N2: Zero);
20402}
20403
20404// If a merged operation has no inactive lanes we can relax it to a predicated
20405// or unpredicated operation, which potentially allows better isel (perhaps
20406// using immediate forms) or relaxing register reuse requirements.
20407static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20408 SelectionDAG &DAG, bool UnpredOp = false,
20409 bool SwapOperands = false) {
20410 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20411 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20412 SDValue Pg = N->getOperand(Num: 1);
20413 SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2);
20414 SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3);
20415
20416 // ISD way to specify an all active predicate.
20417 if (isAllActivePredicate(DAG, N: Pg)) {
20418 if (UnpredOp)
20419 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2);
20420
20421 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2);
20422 }
20423
20424 // FUTURE: SplatVector(true)
20425 return SDValue();
20426}
20427
20428static SDValue performIntrinsicCombine(SDNode *N,
20429 TargetLowering::DAGCombinerInfo &DCI,
20430 const AArch64Subtarget *Subtarget) {
20431 SelectionDAG &DAG = DCI.DAG;
20432 unsigned IID = getIntrinsicID(N);
20433 switch (IID) {
20434 default:
20435 break;
20436 case Intrinsic::get_active_lane_mask: {
20437 SDValue Res = SDValue();
20438 EVT VT = N->getValueType(ResNo: 0);
20439 if (VT.isFixedLengthVector()) {
20440 // We can use the SVE whilelo instruction to lower this intrinsic by
20441 // creating the appropriate sequence of scalable vector operations and
20442 // then extracting a fixed-width subvector from the scalable vector.
20443
20444 SDLoc DL(N);
20445 SDValue ID =
20446 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
20447
20448 EVT WhileVT = EVT::getVectorVT(
20449 *DAG.getContext(), MVT::i1,
20450 ElementCount::getScalable(VT.getVectorNumElements()));
20451
20452 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
20453 EVT PromVT = getPromotedVTForPredicate(VT: WhileVT);
20454
20455 // Get the fixed-width equivalent of PromVT for extraction.
20456 EVT ExtVT =
20457 EVT::getVectorVT(Context&: *DAG.getContext(), VT: PromVT.getVectorElementType(),
20458 EC: VT.getVectorElementCount());
20459
20460 Res = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: WhileVT, N1: ID,
20461 N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2));
20462 Res = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromVT, Operand: Res);
20463 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
20464 DAG.getConstant(0, DL, MVT::i64));
20465 Res = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Res);
20466 }
20467 return Res;
20468 }
20469 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20470 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20471 return tryCombineFixedPointConvert(N, DCI, DAG);
20472 case Intrinsic::aarch64_neon_saddv:
20473 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG);
20474 case Intrinsic::aarch64_neon_uaddv:
20475 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG);
20476 case Intrinsic::aarch64_neon_sminv:
20477 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG);
20478 case Intrinsic::aarch64_neon_uminv:
20479 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG);
20480 case Intrinsic::aarch64_neon_smaxv:
20481 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG);
20482 case Intrinsic::aarch64_neon_umaxv:
20483 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG);
20484 case Intrinsic::aarch64_neon_fmax:
20485 return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20486 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20487 case Intrinsic::aarch64_neon_fmin:
20488 return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20489 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20490 case Intrinsic::aarch64_neon_fmaxnm:
20491 return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20492 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20493 case Intrinsic::aarch64_neon_fminnm:
20494 return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20495 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20496 case Intrinsic::aarch64_neon_smull:
20497 return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20498 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20499 case Intrinsic::aarch64_neon_umull:
20500 return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20501 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20502 case Intrinsic::aarch64_neon_pmull:
20503 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20504 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20505 case Intrinsic::aarch64_neon_sqdmull:
20506 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20507 case Intrinsic::aarch64_neon_sqshl:
20508 case Intrinsic::aarch64_neon_uqshl:
20509 case Intrinsic::aarch64_neon_sqshlu:
20510 case Intrinsic::aarch64_neon_srshl:
20511 case Intrinsic::aarch64_neon_urshl:
20512 case Intrinsic::aarch64_neon_sshl:
20513 case Intrinsic::aarch64_neon_ushl:
20514 return tryCombineShiftImm(IID, N, DAG);
20515 case Intrinsic::aarch64_neon_sabd:
20516 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20517 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20518 case Intrinsic::aarch64_neon_uabd:
20519 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20520 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20521 case Intrinsic::aarch64_crc32b:
20522 case Intrinsic::aarch64_crc32cb:
20523 return tryCombineCRC32(Mask: 0xff, N, DAG);
20524 case Intrinsic::aarch64_crc32h:
20525 case Intrinsic::aarch64_crc32ch:
20526 return tryCombineCRC32(Mask: 0xffff, N, DAG);
20527 case Intrinsic::aarch64_sve_saddv:
20528 // There is no i64 version of SADDV because the sign is irrelevant.
20529 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20530 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
20531 else
20532 return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG);
20533 case Intrinsic::aarch64_sve_uaddv:
20534 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
20535 case Intrinsic::aarch64_sve_smaxv:
20536 return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG);
20537 case Intrinsic::aarch64_sve_umaxv:
20538 return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG);
20539 case Intrinsic::aarch64_sve_sminv:
20540 return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG);
20541 case Intrinsic::aarch64_sve_uminv:
20542 return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG);
20543 case Intrinsic::aarch64_sve_orv:
20544 return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG);
20545 case Intrinsic::aarch64_sve_eorv:
20546 return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG);
20547 case Intrinsic::aarch64_sve_andv:
20548 return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG);
20549 case Intrinsic::aarch64_sve_index:
20550 return LowerSVEIntrinsicIndex(N, DAG);
20551 case Intrinsic::aarch64_sve_dup:
20552 return LowerSVEIntrinsicDUP(N, DAG);
20553 case Intrinsic::aarch64_sve_dup_x:
20554 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20555 Operand: N->getOperand(Num: 1));
20556 case Intrinsic::aarch64_sve_ext:
20557 return LowerSVEIntrinsicEXT(N, DAG);
20558 case Intrinsic::aarch64_sve_mul_u:
20559 return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20560 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20561 case Intrinsic::aarch64_sve_smulh_u:
20562 return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20563 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20564 case Intrinsic::aarch64_sve_umulh_u:
20565 return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20566 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20567 case Intrinsic::aarch64_sve_smin_u:
20568 return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20569 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20570 case Intrinsic::aarch64_sve_umin_u:
20571 return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20572 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20573 case Intrinsic::aarch64_sve_smax_u:
20574 return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20575 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20576 case Intrinsic::aarch64_sve_umax_u:
20577 return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20578 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20579 case Intrinsic::aarch64_sve_lsl_u:
20580 return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20581 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20582 case Intrinsic::aarch64_sve_lsr_u:
20583 return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20584 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20585 case Intrinsic::aarch64_sve_asr_u:
20586 return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20587 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20588 case Intrinsic::aarch64_sve_fadd_u:
20589 return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20590 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20591 case Intrinsic::aarch64_sve_fdiv_u:
20592 return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20593 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20594 case Intrinsic::aarch64_sve_fmax_u:
20595 return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20596 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20597 case Intrinsic::aarch64_sve_fmaxnm_u:
20598 return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20599 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20600 case Intrinsic::aarch64_sve_fmla_u:
20601 return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20602 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4),
20603 N4: N->getOperand(Num: 2));
20604 case Intrinsic::aarch64_sve_fmin_u:
20605 return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20606 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20607 case Intrinsic::aarch64_sve_fminnm_u:
20608 return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20609 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20610 case Intrinsic::aarch64_sve_fmul_u:
20611 return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20612 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20613 case Intrinsic::aarch64_sve_fsub_u:
20614 return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20615 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20616 case Intrinsic::aarch64_sve_add_u:
20617 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20618 N2: N->getOperand(Num: 3));
20619 case Intrinsic::aarch64_sve_sub_u:
20620 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20621 N2: N->getOperand(Num: 3));
20622 case Intrinsic::aarch64_sve_subr:
20623 return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true);
20624 case Intrinsic::aarch64_sve_and_u:
20625 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20626 N2: N->getOperand(Num: 3));
20627 case Intrinsic::aarch64_sve_bic_u:
20628 return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20629 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20630 case Intrinsic::aarch64_sve_eor_u:
20631 return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20632 N2: N->getOperand(Num: 3));
20633 case Intrinsic::aarch64_sve_orr_u:
20634 return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
20635 N2: N->getOperand(Num: 3));
20636 case Intrinsic::aarch64_sve_sabd_u:
20637 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20638 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20639 case Intrinsic::aarch64_sve_uabd_u:
20640 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20641 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20642 case Intrinsic::aarch64_sve_sdiv_u:
20643 return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20644 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20645 case Intrinsic::aarch64_sve_udiv_u:
20646 return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20647 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20648 case Intrinsic::aarch64_sve_sqadd:
20649 return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true);
20650 case Intrinsic::aarch64_sve_sqsub_u:
20651 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20652 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20653 case Intrinsic::aarch64_sve_uqadd:
20654 return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true);
20655 case Intrinsic::aarch64_sve_uqsub_u:
20656 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20657 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
20658 case Intrinsic::aarch64_sve_sqadd_x:
20659 return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20660 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20661 case Intrinsic::aarch64_sve_sqsub_x:
20662 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20663 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20664 case Intrinsic::aarch64_sve_uqadd_x:
20665 return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20666 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20667 case Intrinsic::aarch64_sve_uqsub_x:
20668 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20669 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
20670 case Intrinsic::aarch64_sve_asrd:
20671 return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20672 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20673 case Intrinsic::aarch64_sve_cmphs:
20674 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
20675 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20676 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20677 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE));
20678 break;
20679 case Intrinsic::aarch64_sve_cmphi:
20680 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
20681 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20682 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20683 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT));
20684 break;
20685 case Intrinsic::aarch64_sve_fcmpge:
20686 case Intrinsic::aarch64_sve_cmpge:
20687 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20688 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20689 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE));
20690 break;
20691 case Intrinsic::aarch64_sve_fcmpgt:
20692 case Intrinsic::aarch64_sve_cmpgt:
20693 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20694 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20695 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT));
20696 break;
20697 case Intrinsic::aarch64_sve_fcmpeq:
20698 case Intrinsic::aarch64_sve_cmpeq:
20699 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20700 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20701 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ));
20702 break;
20703 case Intrinsic::aarch64_sve_fcmpne:
20704 case Intrinsic::aarch64_sve_cmpne:
20705 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20706 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20707 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE));
20708 break;
20709 case Intrinsic::aarch64_sve_fcmpuo:
20710 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
20711 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
20712 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO));
20713 break;
20714 case Intrinsic::aarch64_sve_fadda:
20715 return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG);
20716 case Intrinsic::aarch64_sve_faddv:
20717 return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG);
20718 case Intrinsic::aarch64_sve_fmaxnmv:
20719 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG);
20720 case Intrinsic::aarch64_sve_fmaxv:
20721 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG);
20722 case Intrinsic::aarch64_sve_fminnmv:
20723 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG);
20724 case Intrinsic::aarch64_sve_fminv:
20725 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG);
20726 case Intrinsic::aarch64_sve_sel:
20727 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20728 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
20729 case Intrinsic::aarch64_sve_cmpeq_wide:
20730 return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG);
20731 case Intrinsic::aarch64_sve_cmpne_wide:
20732 return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG);
20733 case Intrinsic::aarch64_sve_cmpge_wide:
20734 return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG);
20735 case Intrinsic::aarch64_sve_cmpgt_wide:
20736 return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG);
20737 case Intrinsic::aarch64_sve_cmplt_wide:
20738 return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG);
20739 case Intrinsic::aarch64_sve_cmple_wide:
20740 return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG);
20741 case Intrinsic::aarch64_sve_cmphs_wide:
20742 return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG);
20743 case Intrinsic::aarch64_sve_cmphi_wide:
20744 return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG);
20745 case Intrinsic::aarch64_sve_cmplo_wide:
20746 return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG);
20747 case Intrinsic::aarch64_sve_cmpls_wide:
20748 return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG);
20749 case Intrinsic::aarch64_sve_ptest_any:
20750 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20751 Cond: AArch64CC::ANY_ACTIVE);
20752 case Intrinsic::aarch64_sve_ptest_first:
20753 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20754 Cond: AArch64CC::FIRST_ACTIVE);
20755 case Intrinsic::aarch64_sve_ptest_last:
20756 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
20757 Cond: AArch64CC::LAST_ACTIVE);
20758 }
20759 return SDValue();
20760}
20761
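// Returns true if extending N is expected to be cheap, i.e. it is a (masked)
// load that can be turned into an extending load, or a splat of zeros.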
20762static bool isCheapToExtend(const SDValue &N) {
20763 unsigned OC = N->getOpcode();
20764 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20765 ISD::isConstantSplatVectorAllZeros(N: N.getNode());
20766}
20767
20768static SDValue
20769performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20770 SelectionDAG &DAG) {
20771 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20772 // we can move the sext into the arguments and have the same result. For
20773 // example, if A and B are both loads, we can make those extending loads and
20774 // avoid an extra instruction. This pattern appears often in VLS code
20775 // generation where the inputs to the setcc have a different size to the
20776 // instruction that wants to use the result of the setcc.
20777 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20778 N->getOperand(0)->getOpcode() == ISD::SETCC);
20779 const SDValue SetCC = N->getOperand(Num: 0);
20780
20781 const SDValue CCOp0 = SetCC.getOperand(i: 0);
20782 const SDValue CCOp1 = SetCC.getOperand(i: 1);
20783 if (!CCOp0->getValueType(ResNo: 0).isInteger() ||
20784 !CCOp1->getValueType(ResNo: 0).isInteger())
20785 return SDValue();
20786
20787 ISD::CondCode Code =
20788 cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get();
20789
20790 ISD::NodeType ExtType =
20791 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20792
20793 if (isCheapToExtend(N: SetCC.getOperand(i: 0)) &&
20794 isCheapToExtend(N: SetCC.getOperand(i: 1))) {
20795 const SDValue Ext1 =
20796 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0);
20797 const SDValue Ext2 =
20798 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1);
20799
20800 return DAG.getSetCC(
20801 DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2,
20802 Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get());
20803 }
20804
20805 return SDValue();
20806}
20807
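// Combine vector extend nodes: push a zero_extend of an absolute difference
// towards the extract_high/DUP operands (enabling sabdl2/uabdl2 selection) and
// sink sign extends of fixed-length setcc results into the setcc operands.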
20808static SDValue performExtendCombine(SDNode *N,
20809 TargetLowering::DAGCombinerInfo &DCI,
20810 SelectionDAG &DAG) {
20811 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20812 // we can convert that DUP into another extract_high (of a bigger DUP), which
20813 // helps the backend to decide that an sabdl2 would be useful, saving a real
20814 // extract_high operation.
20815 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20816 (N->getOperand(Num: 0).getOpcode() == ISD::ABDU ||
20817 N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) {
20818 SDNode *ABDNode = N->getOperand(Num: 0).getNode();
20819 SDValue NewABD =
20820 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG);
20821 if (!NewABD.getNode())
20822 return SDValue();
20823
20824 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD);
20825 }
20826
20827 if (N->getValueType(ResNo: 0).isFixedLengthVector() &&
20828 N->getOpcode() == ISD::SIGN_EXTEND &&
20829 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC)
20830 return performSignExtendSetCCCombine(N, DCI, DAG);
20831
20832 return SDValue();
20833}
20834
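// Replace a store of a splatted vector value with NumVecElts scalar stores of
// SplatVal, which the load/store optimizer can later merge into store pairs.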
20835static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
20836 SDValue SplatVal, unsigned NumVecElts) {
20837 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
20838 Align OrigAlignment = St.getAlign();
20839 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
20840
20841 // Create scalar stores. This is at least as good as the code sequence for a
20842 // split unaligned store which is a dup.s, ext.b, and two stores.
20843 // Most of the time the three stores should be replaced by store pair
20844 // instructions (stp).
20845 SDLoc DL(&St);
20846 SDValue BasePtr = St.getBasePtr();
20847 uint64_t BaseOffset = 0;
20848
20849 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
20850 SDValue NewST1 =
20851 DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo,
20852 Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags());
20853
20854  // As this is in ISel, we will not merge this add, which may degrade results.
20855 if (BasePtr->getOpcode() == ISD::ADD &&
20856 isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) {
20857 BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue();
20858 BasePtr = BasePtr->getOperand(Num: 0);
20859 }
20860
20861 unsigned Offset = EltOffset;
20862 while (--NumVecElts) {
20863 Align Alignment = commonAlignment(A: OrigAlignment, Offset);
20864 SDValue OffsetPtr =
20865 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
20866 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
20867 NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr,
20868 PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
20869 MMOFlags: St.getMemOperand()->getFlags());
20870 Offset += EltOffset;
20871 }
20872 return NewST1;
20873}
20874
20875// Returns an SVE type that ContentTy can be trivially sign or zero extended
20876// into.
20877static MVT getSVEContainerType(EVT ContentTy) {
20878 assert(ContentTy.isSimple() && "No SVE containers for extended types");
20879
20880 switch (ContentTy.getSimpleVT().SimpleTy) {
20881 default:
20882 llvm_unreachable("No known SVE container for this MVT type");
20883 case MVT::nxv2i8:
20884 case MVT::nxv2i16:
20885 case MVT::nxv2i32:
20886 case MVT::nxv2i64:
20887 case MVT::nxv2f32:
20888 case MVT::nxv2f64:
20889 return MVT::nxv2i64;
20890 case MVT::nxv4i8:
20891 case MVT::nxv4i16:
20892 case MVT::nxv4i32:
20893 case MVT::nxv4f32:
20894 return MVT::nxv4i32;
20895 case MVT::nxv8i8:
20896 case MVT::nxv8i16:
20897 case MVT::nxv8f16:
20898 case MVT::nxv8bf16:
20899 return MVT::nxv8i16;
20900 case MVT::nxv16i8:
20901 return MVT::nxv16i8;
20902 }
20903}
20904
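// Lower an SVE predicated load intrinsic to the given Opc node, performing the
// load in a legal SVE container type and truncating the result if necessary.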
20905static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
20906 SDLoc DL(N);
20907 EVT VT = N->getValueType(ResNo: 0);
20908
20909 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
20910 return SDValue();
20911
20912 EVT ContainerVT = VT;
20913 if (ContainerVT.isInteger())
20914 ContainerVT = getSVEContainerType(ContentTy: ContainerVT);
20915
20916 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
20917 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
20918 N->getOperand(Num: 2), // Pg
20919 N->getOperand(Num: 3), // Base
20920 DAG.getValueType(VT) };
20921
20922 SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops);
20923 SDValue LoadChain = SDValue(Load.getNode(), 1);
20924
20925 if (ContainerVT.isInteger() && (VT != ContainerVT))
20926 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0));
20927
20928 return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL);
20929}
20930
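// Lower an SVE non-temporal load intrinsic to a masked load, bitcasting
// floating-point results through the equivalent integer type.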
20931static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
20932 SDLoc DL(N);
20933 EVT VT = N->getValueType(ResNo: 0);
20934 EVT PtrTy = N->getOperand(Num: 3).getValueType();
20935
20936 EVT LoadVT = VT;
20937 if (VT.isFloatingPoint())
20938 LoadVT = VT.changeTypeToInteger();
20939
20940 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
20941 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT);
20942 SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(),
20943 Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy),
20944 Mask: MINode->getOperand(Num: 2), Src0: PassThru,
20945 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
20946 AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false);
20947
20948 if (VT.isFloatingPoint()) {
20949 SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) };
20950 return DAG.getMergeValues(Ops, dl: DL);
20951 }
20952
20953 return L;
20954}
20955
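// Lower the SVE ld1rq/ld1ro replicating load intrinsics to the corresponding
// AArch64ISD node, loading as integer and bitcasting for floating-point types.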
20956template <unsigned Opcode>
20957static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
20958 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
20959 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
20960 "Unsupported opcode.");
20961 SDLoc DL(N);
20962 EVT VT = N->getValueType(ResNo: 0);
20963
20964 EVT LoadVT = VT;
20965 if (VT.isFloatingPoint())
20966 LoadVT = VT.changeTypeToInteger();
20967
20968 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)};
20969 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
20970 SDValue LoadChain = SDValue(Load.getNode(), 1);
20971
20972 if (VT.isFloatingPoint())
20973 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0));
20974
20975 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
20976}
20977
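// Lower an SVE predicated store intrinsic to AArch64ISD::ST1_PRED, bitcasting
// or any-extending the data into the legal SVE container type first.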
20978static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
20979 SDLoc DL(N);
20980 SDValue Data = N->getOperand(Num: 2);
20981 EVT DataVT = Data.getValueType();
20982 EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT);
20983 SDValue InputVT = DAG.getValueType(DataVT);
20984
20985 if (DataVT.isFloatingPoint())
20986 InputVT = DAG.getValueType(HwSrcVt);
20987
20988 SDValue SrcNew;
20989 if (Data.getValueType().isFloatingPoint())
20990 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data);
20991 else
20992 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data);
20993
20994 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
20995 SrcNew,
20996 N->getOperand(Num: 4), // Base
20997 N->getOperand(Num: 3), // Pg
20998 InputVT
20999 };
21000
21001 return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops);
21002}
21003
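// Lower an SVE non-temporal store intrinsic to a masked store, bitcasting
// floating-point data to the equivalent integer type.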
21004static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21005 SDLoc DL(N);
21006
21007 SDValue Data = N->getOperand(Num: 2);
21008 EVT DataVT = Data.getValueType();
21009 EVT PtrTy = N->getOperand(Num: 4).getValueType();
21010
21011 if (DataVT.isFloatingPoint())
21012 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data);
21013
21014 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
21015 return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4),
21016 Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3),
21017 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
21018 AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false);
21019}
21020
21021/// Replace a vector store of a splat of zeros with scalar stores of WZR/XZR.
21022/// The load store optimizer pass will merge them into store pairs. This should
21023/// be better than a movi to create the vector zero followed by a vector store
21024/// if the zero constant is not re-used, since one instruction and one register
21025/// live range will be removed.
21026///
21027/// For example, the final generated code should be:
21028///
21029/// stp xzr, xzr, [x0]
21030///
21031/// instead of:
21032///
21033/// movi v0.2d, #0
21034/// str q0, [x0]
21035///
21036static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21037 SDValue StVal = St.getValue();
21038 EVT VT = StVal.getValueType();
21039
21040 // Avoid scalarizing zero splat stores for scalable vectors.
21041 if (VT.isScalableVector())
21042 return SDValue();
21043
21044 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21045 // 2, 3 or 4 i32 elements.
21046 int NumVecElts = VT.getVectorNumElements();
21047 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21048 VT.getVectorElementType().getSizeInBits() == 64) ||
21049 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21050 VT.getVectorElementType().getSizeInBits() == 32)))
21051 return SDValue();
21052
21053 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21054 return SDValue();
21055
21056 // If the zero constant has more than one use then the vector store could be
21057 // better since the constant mov will be amortized and stp q instructions
21058 // should be able to be formed.
21059 if (!StVal.hasOneUse())
21060 return SDValue();
21061
21062 // If the store is truncating then it's going down to i16 or smaller, which
21063 // means it can be implemented in a single store anyway.
21064 if (St.isTruncatingStore())
21065 return SDValue();
21066
21067 // If the immediate offset of the address operand is too large for the stp
21068 // instruction, then bail out.
21069 if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) {
21070 int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1);
21071 if (Offset < -512 || Offset > 504)
21072 return SDValue();
21073 }
21074
21075 for (int I = 0; I < NumVecElts; ++I) {
21076 SDValue EltVal = StVal.getOperand(i: I);
21077 if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal))
21078 return SDValue();
21079 }
21080
21081 // Use a CopyFromReg WZR/XZR here to prevent
21082 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21083 SDLoc DL(&St);
21084 unsigned ZeroReg;
21085 EVT ZeroVT;
21086 if (VT.getVectorElementType().getSizeInBits() == 32) {
21087 ZeroReg = AArch64::WZR;
21088 ZeroVT = MVT::i32;
21089 } else {
21090 ZeroReg = AArch64::XZR;
21091 ZeroVT = MVT::i64;
21092 }
21093 SDValue SplatVal =
21094 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT);
21095 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21096}
21097
21098/// Replace a vector store of a splatted scalar with scalar stores of that
21099/// value. The load store optimizer pass will merge them into store pairs.
21100/// This has better performance than a splat of the scalar followed by a split
21101/// vector store. Even if the stores are not merged, it is four stores vs. a
21102/// dup followed by an ext.b and two stores.
21103static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21104 SDValue StVal = St.getValue();
21105 EVT VT = StVal.getValueType();
21106
21107  // Don't replace floating point stores; they possibly won't be transformed to
21108  // stp because of the store pair suppress pass.
21109 if (VT.isFloatingPoint())
21110 return SDValue();
21111
21112 // We can express a splat as store pair(s) for 2 or 4 elements.
21113 unsigned NumVecElts = VT.getVectorNumElements();
21114 if (NumVecElts != 4 && NumVecElts != 2)
21115 return SDValue();
21116
21117 // If the store is truncating then it's going down to i16 or smaller, which
21118 // means it can be implemented in a single store anyway.
21119 if (St.isTruncatingStore())
21120 return SDValue();
21121
21122 // Check that this is a splat.
21123  // Make sure that each of the relevant vector element locations is inserted
21124  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21125 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21126 SDValue SplatVal;
21127 for (unsigned I = 0; I < NumVecElts; ++I) {
21128 // Check for insert vector elements.
21129 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21130 return SDValue();
21131
21132 // Check that same value is inserted at each vector element.
21133 if (I == 0)
21134 SplatVal = StVal.getOperand(i: 1);
21135 else if (StVal.getOperand(i: 1) != SplatVal)
21136 return SDValue();
21137
21138 // Check insert element index.
21139 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2));
21140 if (!CIndex)
21141 return SDValue();
21142 uint64_t IndexVal = CIndex->getZExtValue();
21143 if (IndexVal >= NumVecElts)
21144 return SDValue();
21145 IndexNotInserted.reset(position: IndexVal);
21146
21147 StVal = StVal.getOperand(i: 0);
21148 }
21149 // Check that all vector element locations were inserted to.
21150 if (IndexNotInserted.any())
21151 return SDValue();
21152
21153 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21154}
21155
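// Split or scalarize fixed-length vector stores where doing so is expected to
// be faster, e.g. zero/scalar splat stores and slow unaligned 128-bit stores.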
21156static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21157 SelectionDAG &DAG,
21158 const AArch64Subtarget *Subtarget) {
21159
21160 StoreSDNode *S = cast<StoreSDNode>(Val: N);
21161 if (S->isVolatile() || S->isIndexed())
21162 return SDValue();
21163
21164 SDValue StVal = S->getValue();
21165 EVT VT = StVal.getValueType();
21166
21167 if (!VT.isFixedLengthVector())
21168 return SDValue();
21169
21170 // If we get a splat of zeros, convert this vector store to a store of
21171 // scalars. They will be merged into store pairs of xzr thereby removing one
21172 // instruction and one register.
21173 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S))
21174 return ReplacedZeroSplat;
21175
21176 // FIXME: The logic for deciding if an unaligned store should be split should
21177 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21178 // a call to that function here.
21179
21180 if (!Subtarget->isMisaligned128StoreSlow())
21181 return SDValue();
21182
21183 // Don't split at -Oz.
21184 if (DAG.getMachineFunction().getFunction().hasMinSize())
21185 return SDValue();
21186
21187 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21188 // those up regresses performance on micro-benchmarks and olden/bh.
21189 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21190 return SDValue();
21191
21192 // Split unaligned 16B stores. They are terrible for performance.
21193 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21194 // extensions can use this to mark that it does not want splitting to happen
21195 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21196 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21197 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21198 S->getAlign() <= Align(2))
21199 return SDValue();
21200
21201 // If we get a splat of a scalar convert this vector store to a store of
21202 // scalars. They will be merged into store pairs thereby removing two
21203 // instructions.
21204 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S))
21205 return ReplacedSplat;
21206
21207 SDLoc DL(S);
21208
21209 // Split VT into two.
21210 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
21211 unsigned NumElts = HalfVT.getVectorNumElements();
21212 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21213 DAG.getConstant(0, DL, MVT::i64));
21214 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21215 DAG.getConstant(NumElts, DL, MVT::i64));
21216 SDValue BasePtr = S->getBasePtr();
21217 SDValue NewST1 =
21218 DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(),
21219 Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags());
21220 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21221 DAG.getConstant(8, DL, MVT::i64));
21222 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr,
21223 PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(),
21224 MMOFlags: S->getMemOperand()->getFlags());
21225}
21226
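// Fold away degenerate AArch64ISD::SPLICE nodes, e.g. when the second vector
// operand is undef.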
21227static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21228  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21229
21230 // splice(pg, op1, undef) -> op1
21231 if (N->getOperand(Num: 2).isUndef())
21232 return N->getOperand(Num: 1);
21233
21234 return SDValue();
21235}
21236
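// Simplify UUNPKLO/UUNPKHI nodes, e.g. folding an unpack of a masked load into
// a masked zero-extending load when the predicate pattern allows it.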
21237static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21238 const AArch64Subtarget *Subtarget) {
21239 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21240 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21241 "Unexpected Opcode!");
21242
21243 // uunpklo/hi undef -> undef
21244 if (N->getOperand(Num: 0).isUndef())
21245 return DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
21246
21247 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21248 // extending load. We can do this even if this is already a masked
21249 // {z,}extload.
21250 if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD &&
21251 N->getOpcode() == AArch64ISD::UUNPKLO) {
21252 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0));
21253 SDValue Mask = MLD->getMask();
21254 SDLoc DL(N);
21255
21256 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21257 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21258 (MLD->getPassThru()->isUndef() ||
21259 isZerosVector(N: MLD->getPassThru().getNode()))) {
21260 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21261 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
21262 EVT VT = N->getValueType(ResNo: 0);
21263
21264 // Ensure we can double the size of the predicate pattern
21265 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
21266 if (NumElts &&
21267 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21268 Mask =
21269 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21270 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT);
21271 SDValue NewLoad = DAG.getMaskedLoad(
21272 VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask,
21273 Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(),
21274 AM: MLD->getAddressingMode(), ISD::ZEXTLOAD);
21275
21276 DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1));
21277
21278 return NewLoad;
21279 }
21280 }
21281 }
21282
21283 return SDValue();
21284}
21285
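// Returns true if N is a UZP1 that behaves as a halving truncate-and-concat of
// legal integer scalable types, e.g. nxv4i32 operands producing nxv8i16.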
21286static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21287 if (N->getOpcode() != AArch64ISD::UZP1)
21288 return false;
21289 SDValue Op0 = N->getOperand(Num: 0);
21290 EVT SrcVT = Op0->getValueType(ResNo: 0);
21291 EVT DstVT = N->getValueType(ResNo: 0);
21292 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21293 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21294 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21295}
21296
21297// Try to combine rounding shifts where the operands come from an extend, and
21298// the result is truncated and combined into one vector.
21299// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21300static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21301 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21302 SDValue Op0 = N->getOperand(Num: 0);
21303 SDValue Op1 = N->getOperand(Num: 1);
21304 EVT ResVT = N->getValueType(ResNo: 0);
21305
21306 unsigned RshOpc = Op0.getOpcode();
21307 if (RshOpc != AArch64ISD::RSHRNB_I)
21308 return SDValue();
21309
21310 // Same op code and imm value?
21311 SDValue ShiftValue = Op0.getOperand(i: 1);
21312 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1))
21313 return SDValue();
21314
21315 // Same unextended operand value?
21316 SDValue Lo = Op0.getOperand(i: 0);
21317 SDValue Hi = Op1.getOperand(i: 0);
21318 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21319 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21320 return SDValue();
21321 SDValue OrigArg = Lo.getOperand(i: 0);
21322 if (OrigArg != Hi.getOperand(i: 0))
21323 return SDValue();
21324
21325 SDLoc DL(N);
21326 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT,
21327 N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg,
21328 N3: ShiftValue);
21329}
21330
21331// Try to simplify:
21332// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21333// t2 = nxv8i16 srl(t1, ShiftValue)
21334// to
21335// t1 = nxv8i16 rshrnb(X, shiftvalue).
21336// rshrnb will zero the top half bits of each element. Therefore, this combine
21337// should only be performed when a following instruction with the rshrnb
21338// as an operand does not care about the top half of each element. For example,
21339// a uzp1 or a truncating store.
21340static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21341 const AArch64Subtarget *Subtarget) {
21342 EVT VT = Srl->getValueType(ResNo: 0);
21343 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21344 return SDValue();
21345
21346 EVT ResVT;
21347 if (VT == MVT::nxv8i16)
21348 ResVT = MVT::nxv16i8;
21349 else if (VT == MVT::nxv4i32)
21350 ResVT = MVT::nxv8i16;
21351 else if (VT == MVT::nxv2i64)
21352 ResVT = MVT::nxv4i32;
21353 else
21354 return SDValue();
21355
21356 SDLoc DL(Srl);
21357 unsigned ShiftValue;
21358 SDValue RShOperand;
21359 if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand))
21360 return SDValue();
21361 SDValue Rshrnb = DAG.getNode(
21362 AArch64ISD::RSHRNB_I, DL, ResVT,
21363 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21364 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Rshrnb);
21365}
21366
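// Simplify UZP1 nodes: fold away redundant unpack/truncate/bitcast operands
// and recognize rounding narrowing-shift patterns.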
21367static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21368 const AArch64Subtarget *Subtarget) {
21369 SDLoc DL(N);
21370 SDValue Op0 = N->getOperand(Num: 0);
21371 SDValue Op1 = N->getOperand(Num: 1);
21372 EVT ResVT = N->getValueType(ResNo: 0);
21373
21374 // uzp1(x, undef) -> concat(truncate(x), undef)
21375 if (Op1.getOpcode() == ISD::UNDEF) {
21376 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21377 switch (ResVT.getSimpleVT().SimpleTy) {
21378 default:
21379 break;
21380 case MVT::v16i8:
21381 BCVT = MVT::v8i16;
21382 HalfVT = MVT::v8i8;
21383 break;
21384 case MVT::v8i16:
21385 BCVT = MVT::v4i32;
21386 HalfVT = MVT::v4i16;
21387 break;
21388 case MVT::v4i32:
21389 BCVT = MVT::v2i64;
21390 HalfVT = MVT::v2i32;
21391 break;
21392 }
21393 if (BCVT != MVT::Other) {
21394 SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0);
21395 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC);
21396 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc,
21397 N2: DAG.getUNDEF(VT: HalfVT));
21398 }
21399 }
21400
21401 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21402 return Urshr;
21403
21404 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op0, DAG, Subtarget))
21405 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1);
21406
21407 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op1, DAG, Subtarget))
21408 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb);
21409
21410 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21411 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21412 if (Op0.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
21413 SDValue X = Op0.getOperand(i: 0).getOperand(i: 0);
21414 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1);
21415 }
21416 }
21417
21418 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21419 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21420 if (Op1.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
21421 SDValue Z = Op1.getOperand(i: 0).getOperand(i: 1);
21422 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z);
21423 }
21424 }
21425
21426  // These optimizations only work on little-endian targets.
21427 if (!DAG.getDataLayout().isLittleEndian())
21428 return SDValue();
21429
21430 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21431 // Example:
21432 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21433 // to
21434 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21435 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21436 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21437 if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) {
21438 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0),
21439 N2: Op1.getOperand(i: 0));
21440 }
21441 }
21442
21443 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21444 return SDValue();
21445
21446 SDValue SourceOp0 = peekThroughBitcasts(V: Op0);
21447 SDValue SourceOp1 = peekThroughBitcasts(V: Op1);
21448
21449 // truncating uzp1(x, y) -> xtn(concat (x, y))
21450 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21451 EVT Op0Ty = SourceOp0.getValueType();
21452 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21453 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21454 SDValue Concat =
21455 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
21456 VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
21457 N1: SourceOp0, N2: SourceOp1);
21458 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat);
21459 }
21460 }
21461
21462 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21463 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21464 SourceOp1.getOpcode() != ISD::TRUNCATE)
21465 return SDValue();
21466 SourceOp0 = SourceOp0.getOperand(i: 0);
21467 SourceOp1 = SourceOp1.getOperand(i: 0);
21468
21469 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21470 !SourceOp0.getValueType().isSimple())
21471 return SDValue();
21472
21473 EVT ResultTy;
21474
21475 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21476 case MVT::v2i64:
21477 ResultTy = MVT::v4i32;
21478 break;
21479 case MVT::v4i32:
21480 ResultTy = MVT::v8i16;
21481 break;
21482 case MVT::v8i16:
21483 ResultTy = MVT::v16i8;
21484 break;
21485 default:
21486 return SDValue();
21487 }
21488
21489 SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0);
21490 SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1);
21491 SDValue UzpResult =
21492 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1);
21493
21494 EVT BitcastResultTy;
21495
21496 switch (ResVT.getSimpleVT().SimpleTy) {
21497 case MVT::v2i32:
21498 BitcastResultTy = MVT::v2i64;
21499 break;
21500 case MVT::v4i16:
21501 BitcastResultTy = MVT::v4i32;
21502 break;
21503 case MVT::v8i8:
21504 BitcastResultTy = MVT::v8i16;
21505 break;
21506 default:
21507 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21508 }
21509
21510 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT,
21511 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult));
21512}
21513
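// Simplify SVE gather load nodes, e.g. folding a sign/zero extension of the
// vector of offsets into the gather itself.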
21514static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21515 unsigned Opc = N->getOpcode();
21516
21517 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21518 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21519 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21520 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21521 "Invalid opcode.");
21522
21523 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21524 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21525 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21526 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21527 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21528 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21529 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21530 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21531
21532 SDLoc DL(N);
21533 SDValue Chain = N->getOperand(Num: 0);
21534 SDValue Pg = N->getOperand(Num: 1);
21535 SDValue Base = N->getOperand(Num: 2);
21536 SDValue Offset = N->getOperand(Num: 3);
21537 SDValue Ty = N->getOperand(Num: 4);
21538
21539 EVT ResVT = N->getValueType(ResNo: 0);
21540
21541 const auto OffsetOpc = Offset.getOpcode();
21542 const bool OffsetIsZExt =
21543 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21544 const bool OffsetIsSExt =
21545 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21546
21547 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21548 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21549 SDValue ExtPg = Offset.getOperand(i: 0);
21550 VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode());
21551 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21552
21553    // If the predicate for the sign- or zero-extended offset is the
21554    // same as the predicate used for this load and the sign-/zero-extension
21555    // was from 32 bits, fold the extension into the gather.
21556 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21557 SDValue UnextendedOffset = Offset.getOperand(i: 1);
21558
21559 unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true);
21560 if (Signed)
21561 NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc);
21562
21563 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21564 {Chain, Pg, Base, UnextendedOffset, Ty});
21565 }
21566 }
21567
21568 return SDValue();
21569}
21570
21571/// Optimize a vector shift instruction and its operand if shifted out
21572/// bits are not used.
21573static SDValue performVectorShiftCombine(SDNode *N,
21574 const AArch64TargetLowering &TLI,
21575 TargetLowering::DAGCombinerInfo &DCI) {
21576 assert(N->getOpcode() == AArch64ISD::VASHR ||
21577 N->getOpcode() == AArch64ISD::VLSHR);
21578
21579 SDValue Op = N->getOperand(Num: 0);
21580 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21581
21582 unsigned ShiftImm = N->getConstantOperandVal(Num: 1);
21583 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21584
21585  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
21586 if (N->getOpcode() == AArch64ISD::VASHR &&
21587 Op.getOpcode() == AArch64ISD::VSHL &&
21588 N->getOperand(Num: 1) == Op.getOperand(i: 1))
21589 if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm)
21590 return Op.getOperand(i: 0);
21591
21592 APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm);
21593 APInt DemandedMask = ~ShiftedOutBits;
21594
21595 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
21596 return SDValue(N, 0);
21597
21598 return SDValue();
21599}
21600
21601static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21602 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21603 // This transform works in partnership with performSetCCPunpkCombine to
21604 // remove unnecessary transfer of predicates into standard registers and back
21605 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21606 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21607 MVT::i1) {
21608 SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0);
21609 auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
21610 SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC,
21611 N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N)));
21612 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk);
21613 }
21614
21615 return SDValue();
21616}
21617
21618/// Target-specific DAG combine function for post-increment LD1 (lane) and
21619/// post-increment LD1R.
21620static SDValue performPostLD1Combine(SDNode *N,
21621 TargetLowering::DAGCombinerInfo &DCI,
21622 bool IsLaneOp) {
21623 if (DCI.isBeforeLegalizeOps())
21624 return SDValue();
21625
21626 SelectionDAG &DAG = DCI.DAG;
21627 EVT VT = N->getValueType(ResNo: 0);
21628
21629 if (!VT.is128BitVector() && !VT.is64BitVector())
21630 return SDValue();
21631
21632 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21633 SDNode *LD = N->getOperand(Num: LoadIdx).getNode();
21634  // If it is not a LOAD, we cannot do this combine.
21635 if (LD->getOpcode() != ISD::LOAD)
21636 return SDValue();
21637
21638 // The vector lane must be a constant in the LD1LANE opcode.
21639 SDValue Lane;
21640 if (IsLaneOp) {
21641 Lane = N->getOperand(Num: 2);
21642 auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane);
21643 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21644 return SDValue();
21645 }
21646
21647 LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD);
21648 EVT MemVT = LoadSDN->getMemoryVT();
21649 // Check if memory operand is the same type as the vector element.
21650 if (MemVT != VT.getVectorElementType())
21651 return SDValue();
21652
21653 // Check if there are other uses. If so, do not combine as it will introduce
21654 // an extra load.
21655 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21656 ++UI) {
21657 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21658 continue;
21659 if (*UI != N)
21660 return SDValue();
21661 }
21662
21663 // If there is one use and it can splat the value, prefer that operation.
21664 // TODO: This could be expanded to more operations if they reliably use the
21665 // index variants.
21666 if (N->hasOneUse()) {
21667 unsigned UseOpc = N->use_begin()->getOpcode();
21668 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21669 return SDValue();
21670 }
21671
21672 SDValue Addr = LD->getOperand(Num: 1);
21673 SDValue Vector = N->getOperand(Num: 0);
21674 // Search for a use of the address operand that is an increment.
21675 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21676 Addr.getNode()->use_end(); UI != UE; ++UI) {
21677 SDNode *User = *UI;
21678 if (User->getOpcode() != ISD::ADD
21679 || UI.getUse().getResNo() != Addr.getResNo())
21680 continue;
21681
21682 // If the increment is a constant, it must match the memory ref size.
21683 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
21684 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
21685 uint32_t IncVal = CInc->getZExtValue();
21686 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21687 if (IncVal != NumBytes)
21688 continue;
21689 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21690 }
21691
21692    // To avoid cycle construction, make sure that neither the load nor the add
21693    // is a predecessor of the other or of the Vector.
21694 SmallPtrSet<const SDNode *, 32> Visited;
21695 SmallVector<const SDNode *, 16> Worklist;
21696 Visited.insert(Ptr: Addr.getNode());
21697 Worklist.push_back(Elt: User);
21698 Worklist.push_back(Elt: LD);
21699 Worklist.push_back(Elt: Vector.getNode());
21700 if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) ||
21701 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
21702 continue;
21703
21704 SmallVector<SDValue, 8> Ops;
21705 Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain
21706 if (IsLaneOp) {
21707 Ops.push_back(Elt: Vector); // The vector to be inserted
21708 Ops.push_back(Elt: Lane); // The lane to be inserted in the vector
21709 }
21710 Ops.push_back(Elt: Addr);
21711 Ops.push_back(Elt: Inc);
21712
21713 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21714 SDVTList SDTys = DAG.getVTList(VTs: Tys);
21715 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21716 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops,
21717 MemVT,
21718 MMO: LoadSDN->getMemOperand());
21719
21720 // Update the uses.
21721 SDValue NewResults[] = {
21722 SDValue(LD, 0), // The result of load
21723 SDValue(UpdN.getNode(), 2) // Chain
21724 };
21725 DCI.CombineTo(N: LD, To: NewResults);
21726 DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21727 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register
21728
21729 break;
21730 }
21731 return SDValue();
21732}
21733
21734/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21735/// address translation.
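///
/// For example (illustrative), when TBI is in effect an explicit tag-clearing
/// mask ahead of a memory access is redundant and can be removed:
///   (load (and x, 0x00ffffffffffffff))  -->  (load x)
/// since only bits [55:0] of the address take part in translation.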
21736static bool performTBISimplification(SDValue Addr,
21737 TargetLowering::DAGCombinerInfo &DCI,
21738 SelectionDAG &DAG) {
21739 APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56);
21740 KnownBits Known;
21741 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21742 !DCI.isBeforeLegalizeOps());
21743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21744 if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) {
21745 DCI.CommitTargetLoweringOpt(TLO);
21746 return true;
21747 }
21748 return false;
21749}
21750
21751static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21752 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21753 "Expected STORE dag node in input!");
21754
21755 if (auto Store = dyn_cast<StoreSDNode>(Val: N)) {
21756 if (!Store->isTruncatingStore() || Store->isIndexed())
21757 return SDValue();
21758 SDValue Ext = Store->getValue();
21759 auto ExtOpCode = Ext.getOpcode();
21760 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21761 ExtOpCode != ISD::ANY_EXTEND)
21762 return SDValue();
21763 SDValue Orig = Ext->getOperand(Num: 0);
21764 if (Store->getMemoryVT() != Orig.getValueType())
21765 return SDValue();
21766 return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig,
21767 Ptr: Store->getBasePtr(), MMO: Store->getMemOperand());
21768 }
21769
21770 return SDValue();
21771}
21772
21773 // A custom combine to lower a load of <3 x i8> to the more efficient sequence
21774 // below:
21775// ldrb wX, [x0, #2]
21776// ldrh wY, [x0]
21777// orr wX, wY, wX, lsl #16
21778// fmov s0, wX
21779//
21780// Note that an alternative sequence with even fewer (although usually more
21781// complex/expensive) instructions would be:
21782// ld1r.4h { v0 }, [x0], #2
21783// ld1.b { v0 }[2], [x0]
21784//
21785// Generating this sequence unfortunately results in noticeably worse codegen
21786// for code that extends the loaded v3i8, due to legalization breaking vector
21787// shuffle detection in a way that is very difficult to work around.
21788// TODO: Revisit once v3i8 legalization has been improved in general.
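//
// At the DAG level the combine emits roughly the following (illustrative):
//   t16 = (zext (load i16 [ptr]))          ; the low two bytes
//   t8  = (zext (load i8  [ptr + 2]))      ; the third byte
//   t32 = (or t16, (shl t8, 16))
//   res = (extract_subvector (bitcast t32 to v4i8), 0)   ; back to v3i8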
21789static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21790 EVT MemVT = LD->getMemoryVT();
21791 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21792 LD->getOriginalAlign() >= 4)
21793 return SDValue();
21794
21795 SDLoc DL(LD);
21796 MachineFunction &MF = DAG.getMachineFunction();
21797 SDValue Chain = LD->getChain();
21798 SDValue BasePtr = LD->getBasePtr();
21799 MachineMemOperand *MMO = LD->getMemOperand();
21800 assert(LD->getOffset().isUndef() && "undef offset expected");
21801
21802 // Load 2 x i8, then 1 x i8.
21803 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21804 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
21805 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21806 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21807 MF.getMachineMemOperand(MMO, 2, 1));
21808
21809 // Extend to i32.
21810 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21811 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21812
21813 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21814 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21815 DAG.getConstant(16, DL, MVT::i32));
21816 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21817 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21818
21819 // Extract v3i8 again.
21820 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21821 DAG.getConstant(0, DL, MVT::i64));
21822 SDValue TokenFactor = DAG.getNode(
21823 ISD::TokenFactor, DL, MVT::Other,
21824 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21825 return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL);
21826}
21827
21828 // Perform TBI simplification if supported by the target, and try to break up
21829 // non-temporal loads of odd sizes larger than 256 bits so that 256-bit LDNP
21830 // (Q-register) load instructions can be selected.
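// For example (illustrative), a non-temporal load of v12i32 (384 bits) is
// rebuilt below as one 256-bit v8i32 load at offset 0 plus a v4i32 load at
// offset 32 bytes; the pieces are concatenated and the original v12i32 is
// extracted from the result, so the wide part can select to an LDNP of two
// Q registers.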
21831static SDValue performLOADCombine(SDNode *N,
21832 TargetLowering::DAGCombinerInfo &DCI,
21833 SelectionDAG &DAG,
21834 const AArch64Subtarget *Subtarget) {
21835 if (Subtarget->supportsAddressTopByteIgnored())
21836 performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG);
21837
21838 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
21839 if (LD->isVolatile() || !Subtarget->isLittleEndian())
21840 return SDValue(N, 0);
21841
21842 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21843 return Res;
21844
21845 if (!LD->isNonTemporal())
21846 return SDValue(N, 0);
21847
21848 EVT MemVT = LD->getMemoryVT();
21849 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
21850 MemVT.getSizeInBits() % 256 == 0 ||
21851 256 % MemVT.getScalarSizeInBits() != 0)
21852 return SDValue(N, 0);
21853
21854 SDLoc DL(LD);
21855 SDValue Chain = LD->getChain();
21856 SDValue BasePtr = LD->getBasePtr();
21857 SDNodeFlags Flags = LD->getFlags();
21858 SmallVector<SDValue, 4> LoadOps;
21859 SmallVector<SDValue, 4> LoadOpsChain;
21860 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
21861 // plus a final vector load of fewer than 256 bits. This way we can utilize the
21862 // 256-bit loads and reduce the number of load instructions generated.
21863 MVT NewVT =
21864 MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(),
21865 NumElements: 256 / MemVT.getVectorElementType().getSizeInBits());
21866 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
21867 // Create all 256-bit loads, starting from offset 0 up to offset (Num256Loads - 1) * 32.
21868 for (unsigned I = 0; I < Num256Loads; I++) {
21869 unsigned PtrOffset = I * 32;
21870 SDValue NewPtr = DAG.getMemBasePlusOffset(
21871 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
21872 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
21873 SDValue NewLoad = DAG.getLoad(
21874 VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset),
21875 Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
21876 LoadOps.push_back(Elt: NewLoad);
21877 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1));
21878 }
21879
21880 // Process the remaining bits of the load operation.
21881 // This is done by creating an UNDEF vector that matches the size of the
21882 // 256-bit loads and inserting the remaining load into it. We extract the
21883 // original load type at the end using an EXTRACT_SUBVECTOR.
21884 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
21885 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
21886 MVT RemainingVT = MVT::getVectorVT(
21887 VT: MemVT.getVectorElementType().getSimpleVT(),
21888 NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
21889 SDValue NewPtr = DAG.getMemBasePlusOffset(
21890 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
21891 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
21892 SDValue RemainingLoad =
21893 DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr,
21894 PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign,
21895 MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
21896 SDValue UndefVector = DAG.getUNDEF(VT: NewVT);
21897 SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL);
21898 SDValue ExtendedRemainingLoad =
21899 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
21900 Ops: {UndefVector, RemainingLoad, InsertIdx});
21901 LoadOps.push_back(Elt: ExtendedRemainingLoad);
21902 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1));
21903 EVT ConcatVT =
21904 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(),
21905 NumElements: LoadOps.size() * NewVT.getVectorNumElements());
21906 SDValue ConcatVectors =
21907 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps);
21908 // Extract the original vector type size.
21909 SDValue ExtractSubVector =
21910 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT,
21911 Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)});
21912 SDValue TokenFactor =
21913 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
21914 return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL);
21915}
21916
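// Starting from a boolean vector value, walk a few nodes up the DAG trying to
// recover the pre-truncation vector type it was computed from (via a SETCC or
// TRUNCATE). Returns INVALID_SIMPLE_VALUE_TYPE if no single consistent source
// type is found within the depth limit.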
21917static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
21918 EVT VecVT = Op.getValueType();
21919 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
21920 "Need boolean vector type.");
21921
21922 if (Depth > 3)
21923 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21924
21925 // We can get the base type from a vector compare or truncate.
21926 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
21927 return Op.getOperand(i: 0).getValueType();
21928
21929 // If an operand is a bool vector, continue looking.
21930 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
21931 for (SDValue Operand : Op->op_values()) {
21932 if (Operand.getValueType() != VecVT)
21933 continue;
21934
21935 EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1);
21936 if (!BaseVT.isSimple())
21937 BaseVT = OperandVT;
21938 else if (OperandVT != BaseVT)
21939 return MVT::INVALID_SIMPLE_VALUE_TYPE;
21940 }
21941
21942 return BaseVT;
21943}
21944
21945// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
21946// iN, we can use a trick that extracts the i^th bit from the i^th element and
21947 // then performs a vector add reduction to get a scalar bitmask. This requires that each
21948// element's bits are either all 1 or all 0.
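// For example (illustrative), for a v4i32 input whose lanes are all-ones or
// all-zeros, the mask built below is <1, 2, 4, 8>; AND-ing and reducing with
// VECREDUCE_ADD gives a 4-bit bitmask, e.g.
//   <-1, 0, -1, -1> & <1, 2, 4, 8> = <1, 0, 4, 8>,  1 + 0 + 4 + 8 = 0b1101.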
21949static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
21950 SDLoc DL(N);
21951 SDValue ComparisonResult(N, 0);
21952 EVT VecVT = ComparisonResult.getValueType();
21953 assert(VecVT.isVector() && "Must be a vector type");
21954
21955 unsigned NumElts = VecVT.getVectorNumElements();
21956 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
21957 return SDValue();
21958
21959 if (VecVT.getVectorElementType() != MVT::i1 &&
21960 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
21961 return SDValue();
21962
21963 // If we can find the original types to work on instead of a vector of i1,
21964 // we can avoid extend/extract conversion instructions.
21965 if (VecVT.getVectorElementType() == MVT::i1) {
21966 VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult);
21967 if (!VecVT.isSimple()) {
21968 unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector
21969 VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts);
21970 }
21971 }
21972 VecVT = VecVT.changeVectorElementTypeToInteger();
21973
21974 // Large vectors don't map directly to this conversion, so to avoid too many
21975 // edge cases, we don't apply it here. The conversion will likely still be
21976 // applied later via multiple smaller vectors, whose results are concatenated.
21977 if (VecVT.getSizeInBits() > 128)
21978 return SDValue();
21979
21980 // Ensure that all elements' bits are either 0s or 1s.
21981 ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT);
21982
21983 SmallVector<SDValue, 16> MaskConstants;
21984 if (VecVT == MVT::v16i8) {
21985 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
21986 // per entry. We split it into two halves, apply the mask, zip the halves to
21987 // create 8x 16-bit values, and then perform the vector reduce.
21988 for (unsigned Half = 0; Half < 2; ++Half) {
21989 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
21990 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
21991 }
21992 }
21993 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
21994 SDValue RepresentativeBits =
21995 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
21996
21997 SDValue UpperRepresentativeBits =
21998 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
21999 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22000 SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT,
22001 N1: RepresentativeBits, N2: UpperRepresentativeBits);
22002 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22003 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22004 }
22005
22006 // All other vector sizes.
22007 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22008 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22009 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22010 }
22011
22012 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
22013 SDValue RepresentativeBits =
22014 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
22015 EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>(
22016 a: NumElts, b: VecVT.getVectorElementType().getSizeInBits()));
22017 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits);
22018}
22019
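// Fold a truncating store of a boolean vector into a scalar store of the
// equivalent bitmask, computed via vectorToScalarBitmask() above.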
22020static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22021 StoreSDNode *Store) {
22022 if (!Store->isTruncatingStore())
22023 return SDValue();
22024
22025 SDLoc DL(Store);
22026 SDValue VecOp = Store->getValue();
22027 EVT VT = VecOp.getValueType();
22028 EVT MemVT = Store->getMemoryVT();
22029
22030 if (!MemVT.isVector() || !VT.isVector() ||
22031 MemVT.getVectorElementType() != MVT::i1)
22032 return SDValue();
22033
22034 // If we are storing a vector that we are currently building, let
22035 // `scalarizeVectorStore()` handle this more efficiently.
22036 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22037 return SDValue();
22038
22039 VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp);
22040 SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG);
22041 if (!VectorBits)
22042 return SDValue();
22043
22044 EVT StoreVT =
22045 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits());
22046 SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT);
22047 return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(),
22048 MMO: Store->getMemOperand());
22049}
22050
22051bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22052 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22053 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22054 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22055}
22056
22057 // Combine a store of (trunc X to <3 x i8>) into a sequence of ST1.b stores.
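// As a rough illustration (assuming i32 source elements, so WideVT below is
// v4i32 and IdxScale is 4): the truncation source is widened to v4i32, bitcast
// to v16i8, and the three live bytes are extracted from lanes 8, 4 and 0 and
// stored individually at offsets 2, 1 and 0, which can select to three ST1.b
// (or STRB) stores.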
22058static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22059 const AArch64Subtarget *Subtarget) {
22060 SDValue Value = ST->getValue();
22061 EVT ValueVT = Value.getValueType();
22062
22063 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22064 Value.getOpcode() != ISD::TRUNCATE ||
22065 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22066 return SDValue();
22067
22068 assert(ST->getOffset().isUndef() && "undef offset expected");
22069 SDLoc DL(ST);
22070 auto WideVT = EVT::getVectorVT(
22071 Context&: *DAG.getContext(),
22072 VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4);
22073 SDValue UndefVector = DAG.getUNDEF(VT: WideVT);
22074 SDValue WideTrunc = DAG.getNode(
22075 Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT,
22076 Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)});
22077 SDValue Cast = DAG.getNode(
22078 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22079 WideTrunc);
22080
22081 MachineFunction &MF = DAG.getMachineFunction();
22082 SDValue Chain = ST->getChain();
22083 MachineMemOperand *MMO = ST->getMemOperand();
22084 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22085 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22086 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22087 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
22088 SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL);
22089 Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
22090
22091 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22092 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22093 TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1);
22094 SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL);
22095 Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1));
22096
22097 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22098 DAG.getConstant(0, DL, MVT::i64));
22099 Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(),
22100 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1));
22101 return Chain;
22102}
22103
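// Combine stores: lower <3 x i8> truncating stores, fold FP_ROUND values into
// truncating stores, split stores where profitable, simplify addresses when
// TBI is available, fold extends away from truncating stores, turn
// boolean-vector truncating stores into scalar bitmask stores, and try the
// RSHRNB combine for halving truncating stores.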
22104static SDValue performSTORECombine(SDNode *N,
22105 TargetLowering::DAGCombinerInfo &DCI,
22106 SelectionDAG &DAG,
22107 const AArch64Subtarget *Subtarget) {
22108 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
22109 SDValue Chain = ST->getChain();
22110 SDValue Value = ST->getValue();
22111 SDValue Ptr = ST->getBasePtr();
22112 EVT ValueVT = Value.getValueType();
22113
22114 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22115 EVT EltVT = VT.getVectorElementType();
22116 return EltVT == MVT::f32 || EltVT == MVT::f64;
22117 };
22118
22119 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22120 return Res;
22121
22122 // If this is an FP_ROUND followed by a store, fold this into a truncating
22123 // store. We can do this even if this is already a truncstore.
22124 // We purposefully don't care about legality of the nodes here as we know
22125 // they can be split down into something legal.
22126 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22127 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22128 Subtarget->useSVEForFixedLengthVectors() &&
22129 ValueVT.isFixedLengthVector() &&
22130 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22131 hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType()))
22132 return DAG.getTruncStore(Chain, dl: SDLoc(N), Val: Value.getOperand(i: 0), Ptr,
22133 SVT: ST->getMemoryVT(), MMO: ST->getMemOperand());
22134
22135 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22136 return Split;
22137
22138 if (Subtarget->supportsAddressTopByteIgnored() &&
22139 performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG))
22140 return SDValue(N, 0);
22141
22142 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22143 return Store;
22144
22145 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST))
22146 return Store;
22147
22148 if (ST->isTruncatingStore()) {
22149 EVT StoreVT = ST->getMemoryVT();
22150 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: StoreVT))
22151 return SDValue();
22152 if (SDValue Rshrnb =
22153 trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) {
22154 return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(),
22155 SVT: StoreVT, MMO: ST->getMemOperand());
22156 }
22157 }
22158
22159 return SDValue();
22160}
22161
22162static SDValue performMSTORECombine(SDNode *N,
22163 TargetLowering::DAGCombinerInfo &DCI,
22164 SelectionDAG &DAG,
22165 const AArch64Subtarget *Subtarget) {
22166 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N);
22167 SDValue Value = MST->getValue();
22168 SDValue Mask = MST->getMask();
22169 SDLoc DL(N);
22170
22171 // If this is a UZP1 followed by a masked store, fold this into a masked
22172 // truncating store. We can do this even if this is already a masked
22173 // truncstore.
22174 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22175 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22176 Value.getValueType().isInteger()) {
22177 Value = Value.getOperand(i: 0);
22178 if (Value.getOpcode() == ISD::BITCAST) {
22179 EVT HalfVT =
22180 Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22181 EVT InVT = Value.getOperand(i: 0).getValueType();
22182
22183 if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) {
22184 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22185 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22186
22187 // Ensure we can double the size of the predicate pattern
22188 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22189 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22190 MinSVESize) {
22191 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22192 PgPattern);
22193 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0),
22194 Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask,
22195 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
22196 AM: MST->getAddressingMode(),
22197 /*IsTruncating=*/true);
22198 }
22199 }
22200 }
22201 }
22202
22203 if (MST->isTruncatingStore()) {
22204 EVT ValueVT = Value->getValueType(ResNo: 0);
22205 EVT MemVT = MST->getMemoryVT();
22206 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT))
22207 return SDValue();
22208 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) {
22209 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(),
22210 Offset: MST->getOffset(), Mask: MST->getMask(),
22211 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
22212 AM: MST->getAddressingMode(), IsTruncating: true);
22213 }
22214 }
22215
22216 return SDValue();
22217}
22218
22219/// \return true if part of the index was folded into the Base.
22220static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22221 SDLoc DL, SelectionDAG &DAG) {
22222 // This function assumes a vector of i64 indices.
22223 EVT IndexVT = Index.getValueType();
22224 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22225 return false;
22226
22227 // Simplify:
22228 // BasePtr = Ptr
22229 // Index = X + splat(Offset)
22230 // ->
22231 // BasePtr = Ptr + Offset * scale.
22232 // Index = X
22233 if (Index.getOpcode() == ISD::ADD) {
22234 if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) {
22235 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22236 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22237 Index = Index.getOperand(i: 0);
22238 return true;
22239 }
22240 }
22241
22242 // Simplify:
22243 // BasePtr = Ptr
22244 // Index = (X + splat(Offset)) << splat(Shift)
22245 // ->
22246 // BasePtr = Ptr + (Offset << Shift) * scale
22247 // Index = X << splat(shift)
22248 if (Index.getOpcode() == ISD::SHL &&
22249 Index.getOperand(i: 0).getOpcode() == ISD::ADD) {
22250 SDValue Add = Index.getOperand(i: 0);
22251 SDValue ShiftOp = Index.getOperand(i: 1);
22252 SDValue OffsetOp = Add.getOperand(i: 1);
22253 if (auto Shift = DAG.getSplatValue(V: ShiftOp))
22254 if (auto Offset = DAG.getSplatValue(V: OffsetOp)) {
22255 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22256 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22257 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22258 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(),
22259 N1: Add.getOperand(i: 0), N2: ShiftOp);
22260 return true;
22261 }
22262 }
22263
22264 return false;
22265}
22266
22267// Analyse the specified address returning true if a more optimal addressing
22268// mode is available. When returning true all parameters are updated to reflect
22269// their recommended values.
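//
// For example (illustrative), a gather or scatter whose index is
//   X + splat(Offset)
// is rewritten so that Offset * Scale is folded into BasePtr and only X is
// left as the index; if the remaining index values are then provably
// representable in 32 bits, the index vector is also shrunk to i32 elements.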
22270static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22271 SDValue &BasePtr, SDValue &Index,
22272 SelectionDAG &DAG) {
22273 // Try to iteratively fold parts of the index into the base pointer to
22274 // simplify the index as much as possible.
22275 bool Changed = false;
22276 while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG))
22277 Changed = true;
22278
22279 // Only consider element types that are pointer sized as smaller types can
22280 // be easily promoted.
22281 EVT IndexVT = Index.getValueType();
22282 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22283 return Changed;
22284
22285 // Can indices be trivially shrunk?
22286 EVT DataVT = N->getOperand(Num: 1).getValueType();
22287 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
22288 // will later be re-extended to 64 bits during legalization.
22289 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22290 return Changed;
22291 if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) {
22292 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22293 Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index);
22294 return true;
22295 }
22296
22297 // Match:
22298 // Index = step(const)
22299 int64_t Stride = 0;
22300 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22301 Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue();
22302 }
22303 // Match:
22304 // Index = step(const) << shift(const)
22305 else if (Index.getOpcode() == ISD::SHL &&
22306 Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) {
22307 SDValue RHS = Index.getOperand(i: 1);
22308 if (auto *Shift =
22309 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) {
22310 int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1);
22311 Stride = Step << Shift->getZExtValue();
22312 }
22313 }
22314
22315 // Return early because no supported pattern is found.
22316 if (Stride == 0)
22317 return Changed;
22318
22319 if (Stride < std::numeric_limits<int32_t>::min() ||
22320 Stride > std::numeric_limits<int32_t>::max())
22321 return Changed;
22322
22323 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22324 unsigned MaxVScale =
22325 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22326 int64_t LastElementOffset =
22327 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22328
22329 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22330 LastElementOffset > std::numeric_limits<int32_t>::max())
22331 return Changed;
22332
22333 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22334 // Stride does not scale explicitly by 'Scale', because it happens in
22335 // the gather/scatter addressing mode.
22336 Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride));
22337 return true;
22338}
22339
22340static SDValue performMaskedGatherScatterCombine(
22341 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22342 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N);
22343 assert(MGS && "Can only combine gather load or scatter store nodes");
22344
22345 if (!DCI.isBeforeLegalize())
22346 return SDValue();
22347
22348 SDLoc DL(MGS);
22349 SDValue Chain = MGS->getChain();
22350 SDValue Scale = MGS->getScale();
22351 SDValue Index = MGS->getIndex();
22352 SDValue Mask = MGS->getMask();
22353 SDValue BasePtr = MGS->getBasePtr();
22354 ISD::MemIndexType IndexType = MGS->getIndexType();
22355
22356 if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG))
22357 return SDValue();
22358
22359 // A more profitable BasePtr/Index pair was found above, so rebuild the gather
22360 // or scatter node with an Index that is more legalisation friendly.
22361 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) {
22362 SDValue PassThru = MGT->getPassThru();
22363 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22364 return DAG.getMaskedGather(
22365 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22366 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22367 }
22368 auto *MSC = cast<MaskedScatterSDNode>(Val: MGS);
22369 SDValue Data = MSC->getValue();
22370 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22371 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22372 Ops, MSC->getMemOperand(), IndexType,
22373 MSC->isTruncatingStore());
22374}
22375
22376/// Target-specific DAG combine function for NEON load/store intrinsics
22377/// to merge base address updates.
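///
/// For example (illustrative), an aarch64.neon.ld2 intrinsic that loads two
/// <4 x i32> values from %p, followed by a separate increment of %p by 32
/// bytes (the size of the data loaded), becomes a single AArch64ISD::LD2post
/// node that also produces the updated pointer, selecting to something like
///   ld2 { v0.4s, v1.4s }, [x0], #32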
22378static SDValue performNEONPostLDSTCombine(SDNode *N,
22379 TargetLowering::DAGCombinerInfo &DCI,
22380 SelectionDAG &DAG) {
22381 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22382 return SDValue();
22383
22384 unsigned AddrOpIdx = N->getNumOperands() - 1;
22385 SDValue Addr = N->getOperand(Num: AddrOpIdx);
22386
22387 // Search for a use of the address operand that is an increment.
22388 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22389 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22390 SDNode *User = *UI;
22391 if (User->getOpcode() != ISD::ADD ||
22392 UI.getUse().getResNo() != Addr.getResNo())
22393 continue;
22394
22395 // Check that the add is independent of the load/store. Otherwise, folding
22396 // it would create a cycle.
22397 SmallPtrSet<const SDNode *, 32> Visited;
22398 SmallVector<const SDNode *, 16> Worklist;
22399 Visited.insert(Ptr: Addr.getNode());
22400 Worklist.push_back(Elt: N);
22401 Worklist.push_back(Elt: User);
22402 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22403 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
22404 continue;
22405
22406 // Find the new opcode for the updating load/store.
22407 bool IsStore = false;
22408 bool IsLaneOp = false;
22409 bool IsDupOp = false;
22410 unsigned NewOpc = 0;
22411 unsigned NumVecs = 0;
22412 unsigned IntNo = N->getConstantOperandVal(Num: 1);
22413 switch (IntNo) {
22414 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22415 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22416 NumVecs = 2; break;
22417 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22418 NumVecs = 3; break;
22419 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22420 NumVecs = 4; break;
22421 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22422 NumVecs = 2; IsStore = true; break;
22423 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22424 NumVecs = 3; IsStore = true; break;
22425 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22426 NumVecs = 4; IsStore = true; break;
22427 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22428 NumVecs = 2; break;
22429 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22430 NumVecs = 3; break;
22431 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22432 NumVecs = 4; break;
22433 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22434 NumVecs = 2; IsStore = true; break;
22435 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22436 NumVecs = 3; IsStore = true; break;
22437 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22438 NumVecs = 4; IsStore = true; break;
22439 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22440 NumVecs = 2; IsDupOp = true; break;
22441 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22442 NumVecs = 3; IsDupOp = true; break;
22443 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22444 NumVecs = 4; IsDupOp = true; break;
22445 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22446 NumVecs = 2; IsLaneOp = true; break;
22447 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22448 NumVecs = 3; IsLaneOp = true; break;
22449 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22450 NumVecs = 4; IsLaneOp = true; break;
22451 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22452 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22453 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22454 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22455 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22456 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22457 }
22458
22459 EVT VecTy;
22460 if (IsStore)
22461 VecTy = N->getOperand(Num: 2).getValueType();
22462 else
22463 VecTy = N->getValueType(ResNo: 0);
22464
22465 // If the increment is a constant, it must match the memory ref size.
22466 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
22467 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
22468 uint32_t IncVal = CInc->getZExtValue();
22469 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22470 if (IsLaneOp || IsDupOp)
22471 NumBytes /= VecTy.getVectorNumElements();
22472 if (IncVal != NumBytes)
22473 continue;
22474 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22475 }
22476 SmallVector<SDValue, 8> Ops;
22477 Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain
22478 // Lane loads and stores take a vector list as input.
22479 if (IsLaneOp || IsStore)
22480 for (unsigned i = 2; i < AddrOpIdx; ++i)
22481 Ops.push_back(Elt: N->getOperand(Num: i));
22482 Ops.push_back(Elt: Addr); // Base register
22483 Ops.push_back(Elt: Inc);
22484
22485 // Return Types.
22486 EVT Tys[6];
22487 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22488 unsigned n;
22489 for (n = 0; n < NumResultVecs; ++n)
22490 Tys[n] = VecTy;
22491 Tys[n++] = MVT::i64; // Type of write back register
22492 Tys[n] = MVT::Other; // Type of the chain
22493 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
22494
22495 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N);
22496 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops,
22497 MemVT: MemInt->getMemoryVT(),
22498 MMO: MemInt->getMemOperand());
22499
22500 // Update the uses.
22501 std::vector<SDValue> NewResults;
22502 for (unsigned i = 0; i < NumResultVecs; ++i) {
22503 NewResults.push_back(x: SDValue(UpdN.getNode(), i));
22504 }
22505 NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1));
22506 DCI.CombineTo(N, To: NewResults);
22507 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
22508
22509 break;
22510 }
22511 return SDValue();
22512}
22513
22514// Checks to see if the value is the prescribed width and returns information
22515// about its extension mode.
22516static
22517bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22518 ExtType = ISD::NON_EXTLOAD;
22519 switch(V.getNode()->getOpcode()) {
22520 default:
22521 return false;
22522 case ISD::LOAD: {
22523 LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode());
22524 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22525 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22526 ExtType = LoadNode->getExtensionType();
22527 return true;
22528 }
22529 return false;
22530 }
22531 case ISD::AssertSext: {
22532 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
22533 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22534 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22535 ExtType = ISD::SEXTLOAD;
22536 return true;
22537 }
22538 return false;
22539 }
22540 case ISD::AssertZext: {
22541 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
22542 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22543 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22544 ExtType = ISD::ZEXTLOAD;
22545 return true;
22546 }
22547 return false;
22548 }
22549 case ISD::Constant:
22550 case ISD::TargetConstant: {
22551 return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) <
22552 1LL << (width - 1);
22553 }
22554 }
22555
22556 return true;
22557}
22558
22559// This function does a whole lot of voodoo to determine if the tests are
22560// equivalent without and with a mask. Essentially what happens is that given a
22561// DAG resembling:
22562//
22563// +-------------+ +-------------+ +-------------+ +-------------+
22564// | Input | | AddConstant | | CompConstant| | CC |
22565// +-------------+ +-------------+ +-------------+ +-------------+
22566// | | | |
22567// V V | +----------+
22568// +-------------+ +----+ | |
22569// | ADD | |0xff| | |
22570// +-------------+ +----+ | |
22571// | | | |
22572// V V | |
22573// +-------------+ | |
22574// | AND | | |
22575// +-------------+ | |
22576// | | |
22577// +-----+ | |
22578// | | |
22579// V V V
22580// +-------------+
22581// | CMP |
22582// +-------------+
22583//
22584// The AND node may be safely removed for some combinations of inputs. In
22585// particular we need to take into account the extension type of the Input,
22586// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22587 // width of the input (this can work for inputs of any width; the above graph is
22588 // specific to 8 bits).
22589//
22590// The specific equations were worked out by generating output tables for each
22591 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22592 // problem was simplified by working with 4 bit inputs, which means we only
22593 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22594 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22595 // patterns present in both extensions (0,7). For every distinct set of
22596 // AddConstant and CompConstant bit patterns we can consider the masked and
22597 // unmasked versions to be equivalent if the result of this function is true for
22598 // all 16 distinct bit patterns for the current extension type of Input (w0).
22599//
22600// sub w8, w0, w1
22601// and w10, w8, #0x0f
22602// cmp w8, w2
22603// cset w9, AArch64CC
22604// cmp w10, w2
22605// cset w11, AArch64CC
22606// cmp w9, w11
22607// cset w0, eq
22608// ret
22609//
22610 // Since the above function shows when the outputs are equivalent, it defines
22611 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22612 // would be expensive to run during compiles. The equations below were written
22613 // in a test harness that confirmed they give outputs equivalent to the above
22614 // function for all inputs, so they can be used instead to determine whether
22615 // the removal is legal.
22616 //
22617 // isEquivalentMaskless() is the test for whether the AND can be removed,
22618 // factored out of the DAG recognition because the DAG can take several forms.
22619
22620static bool isEquivalentMaskless(unsigned CC, unsigned width,
22621 ISD::LoadExtType ExtType, int AddConstant,
22622 int CompConstant) {
22623 // By being careful about our equations and writing them only in terms of
22624 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
22625 // make them generally applicable to all bit widths.
22626 int MaxUInt = (1 << width);
22627
22628 // For the purposes of these comparisons sign extending the type is
22629 // equivalent to zero extending the add and displacing it by half the integer
22630 // width. Provided we are careful and make sure our equations are valid over
22631 // the whole range we can just adjust the input and avoid writing equations
22632 // for sign extended inputs.
22633 if (ExtType == ISD::SEXTLOAD)
22634 AddConstant -= (1 << (width-1));
22635
22636 switch(CC) {
22637 case AArch64CC::LE:
22638 case AArch64CC::GT:
22639 if ((AddConstant == 0) ||
22640 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22641 (AddConstant >= 0 && CompConstant < 0) ||
22642 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22643 return true;
22644 break;
22645 case AArch64CC::LT:
22646 case AArch64CC::GE:
22647 if ((AddConstant == 0) ||
22648 (AddConstant >= 0 && CompConstant <= 0) ||
22649 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22650 return true;
22651 break;
22652 case AArch64CC::HI:
22653 case AArch64CC::LS:
22654 if ((AddConstant >= 0 && CompConstant < 0) ||
22655 (AddConstant <= 0 && CompConstant >= -1 &&
22656 CompConstant < AddConstant + MaxUInt))
22657 return true;
22658 break;
22659 case AArch64CC::PL:
22660 case AArch64CC::MI:
22661 if ((AddConstant == 0) ||
22662 (AddConstant > 0 && CompConstant <= 0) ||
22663 (AddConstant < 0 && CompConstant <= AddConstant))
22664 return true;
22665 break;
22666 case AArch64CC::LO:
22667 case AArch64CC::HS:
22668 if ((AddConstant >= 0 && CompConstant <= 0) ||
22669 (AddConstant <= 0 && CompConstant >= 0 &&
22670 CompConstant <= AddConstant + MaxUInt))
22671 return true;
22672 break;
22673 case AArch64CC::EQ:
22674 case AArch64CC::NE:
22675 if ((AddConstant > 0 && CompConstant < 0) ||
22676 (AddConstant < 0 && CompConstant >= 0 &&
22677 CompConstant < AddConstant + MaxUInt) ||
22678 (AddConstant >= 0 && CompConstant >= 0 &&
22679 CompConstant >= AddConstant) ||
22680 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22681 return true;
22682 break;
22683 case AArch64CC::VS:
22684 case AArch64CC::VC:
22685 case AArch64CC::AL:
22686 case AArch64CC::NV:
22687 return true;
22688 case AArch64CC::Invalid:
22689 break;
22690 }
22691
22692 return false;
22693}
22694
22695 // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
22696 // (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
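// For example (illustrative), with C = 0xff and Mask = 0x0f (so CC is HI):
//   SUBS (AND X, 0xff), 0x0f ; branch/select on HI
// becomes
//   ANDS X, 0xf0             ; branch/select on NE
// since (X & 0xff) >u 0x0f exactly when one of bits [7:4] of X is set.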
22697static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22698 SDNode *AndNode, SelectionDAG &DAG,
22699 unsigned CCIndex, unsigned CmpIndex,
22700 unsigned CC) {
22701 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1));
22702 if (!SubsC)
22703 return SDValue();
22704
22705 APInt SubsAP = SubsC->getAPIntValue();
22706 if (CC == AArch64CC::HI) {
22707 if (!SubsAP.isMask())
22708 return SDValue();
22709 } else if (CC == AArch64CC::LO) {
22710 if (!SubsAP.isPowerOf2())
22711 return SDValue();
22712 } else
22713 return SDValue();
22714
22715 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1));
22716 if (!AndC)
22717 return SDValue();
22718
22719 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22720
22721 SDLoc DL(N);
22722 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22723 SDValue ANDS = DAG.getNode(
22724 Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0),
22725 N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0)));
22726 SDValue AArch64_CC =
22727 DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22728 VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0));
22729
22730 // For now, only performCSELCombine and performBRCONDCombine call this
22731 // function, and both pass 2 for CCIndex and 3 for CmpIndex on nodes with 4
22732 // operands, so the replacement operands are initialized directly to keep the
22733 // code simple. If a caller with different CCIndex/CmpIndex values is added,
22734 // this will need to be rewritten to copy the operands in a loop.
22735 // TODO: Should the number of operands (4) be asserted here as well?
22736 assert((CCIndex == 2 && CmpIndex == 3) &&
22737 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22738 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC,
22739 ANDS.getValue(R: 1)};
22740 return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops);
22741}
22742
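// Given a flag-consuming node N (currently a CSEL or BRCOND) whose condition
// comes from a SUBS of an AND, try to replace the SUBS with an ANDS (see
// performSubsToAndsCombine above) or prove via isEquivalentMaskless() that the
// AND is redundant and drop it.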
22743static
22744SDValue performCONDCombine(SDNode *N,
22745 TargetLowering::DAGCombinerInfo &DCI,
22746 SelectionDAG &DAG, unsigned CCIndex,
22747 unsigned CmpIndex) {
22748 unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue();
22749 SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode();
22750 unsigned CondOpcode = SubsNode->getOpcode();
22751
22752 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0))
22753 return SDValue();
22754
22755 // There is a SUBS feeding this condition. Is it fed by a mask we can
22756 // use?
22757
22758 SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode();
22759 unsigned MaskBits = 0;
22760
22761 if (AndNode->getOpcode() != ISD::AND)
22762 return SDValue();
22763
22764 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22765 CmpIndex, CC))
22766 return Val;
22767
22768 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) {
22769 uint32_t CNV = CN->getZExtValue();
22770 if (CNV == 255)
22771 MaskBits = 8;
22772 else if (CNV == 65535)
22773 MaskBits = 16;
22774 }
22775
22776 if (!MaskBits)
22777 return SDValue();
22778
22779 SDValue AddValue = AndNode->getOperand(Num: 0);
22780
22781 if (AddValue.getOpcode() != ISD::ADD)
22782 return SDValue();
22783
22784 // The basic dag structure is correct, grab the inputs and validate them.
22785
22786 SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0);
22787 SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1);
22788 SDValue SubsInputValue = SubsNode->getOperand(Num: 1);
22789
22790 // The mask is present and the provenance of all the values is a smaller type,
22791 // so let's see if the mask is superfluous.
22792
22793 if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) ||
22794 !isa<ConstantSDNode>(Val: SubsInputValue.getNode()))
22795 return SDValue();
22796
22797 ISD::LoadExtType ExtType;
22798
22799 if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) ||
22800 !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) ||
22801 !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType) )
22802 return SDValue();
22803
22804 if (!isEquivalentMaskless(CC, width: MaskBits, ExtType,
22805 AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(),
22806 CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue()))
22807 return SDValue();
22808
22809 // The AND is not necessary, remove it.
22810
22811 SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0),
22812 VT2: SubsNode->getValueType(ResNo: 1));
22813 SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) };
22814
22815 SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops);
22816 DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode());
22817
22818 return SDValue(N, 0);
22819}
22820
22821// Optimize compare with zero and branch.
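// For example (illustrative), a branch on the flags of a compare against zero
//   (brcond EQ, (SUBS x, 0))  -->  (CBZ x, dest)
//   (brcond NE, (SUBS x, 0))  -->  (CBNZ x, dest)
// provided the subtraction result itself is otherwise unused and x is not a
// shift.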
22822static SDValue performBRCONDCombine(SDNode *N,
22823 TargetLowering::DAGCombinerInfo &DCI,
22824 SelectionDAG &DAG) {
22825 MachineFunction &MF = DAG.getMachineFunction();
22826 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
22827 // will not be produced, as they are conditional branch instructions that do
22828 // not set flags.
22829 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
22830 return SDValue();
22831
22832 if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3))
22833 N = NV.getNode();
22834 SDValue Chain = N->getOperand(Num: 0);
22835 SDValue Dest = N->getOperand(Num: 1);
22836 SDValue CCVal = N->getOperand(Num: 2);
22837 SDValue Cmp = N->getOperand(Num: 3);
22838
22839 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
22840 unsigned CC = CCVal->getAsZExtVal();
22841 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
22842 return SDValue();
22843
22844 unsigned CmpOpc = Cmp.getOpcode();
22845 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
22846 return SDValue();
22847
22848 // Only attempt folding if there is only one use of the flag and no use of the
22849 // value.
22850 if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1))
22851 return SDValue();
22852
22853 SDValue LHS = Cmp.getOperand(i: 0);
22854 SDValue RHS = Cmp.getOperand(i: 1);
22855
22856 assert(LHS.getValueType() == RHS.getValueType() &&
22857 "Expected the value type to be the same for both operands!");
22858 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
22859 return SDValue();
22860
22861 if (isNullConstant(V: LHS))
22862 std::swap(a&: LHS, b&: RHS);
22863
22864 if (!isNullConstant(V: RHS))
22865 return SDValue();
22866
22867 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
22868 LHS.getOpcode() == ISD::SRL)
22869 return SDValue();
22870
22871 // Fold the compare into the branch instruction.
22872 SDValue BR;
22873 if (CC == AArch64CC::EQ)
22874 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22875 else
22876 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
22877
22878 // Do not add new nodes to DAG combiner worklist.
22879 DCI.CombineTo(N, Res: BR, AddTo: false);
22880
22881 return SDValue();
22882}
22883
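// Fold CSEL 0, cttz(X), eq(X, 0) (and the NE variant with swapped operands)
// into (and (cttz X), BitWidth - 1): when X is zero, cttz returns BitWidth,
// whose low bits are all zero, so the AND already produces 0 and the explicit
// compare-and-select is unnecessary.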
22884static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
22885 unsigned CC = N->getConstantOperandVal(Num: 2);
22886 SDValue SUBS = N->getOperand(Num: 3);
22887 SDValue Zero, CTTZ;
22888
22889 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
22890 Zero = N->getOperand(Num: 0);
22891 CTTZ = N->getOperand(Num: 1);
22892 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
22893 Zero = N->getOperand(Num: 1);
22894 CTTZ = N->getOperand(Num: 0);
22895 } else
22896 return SDValue();
22897
22898 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
22899 (CTTZ.getOpcode() == ISD::TRUNCATE &&
22900 CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ))
22901 return SDValue();
22902
22903 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
22904 "Illegal type in CTTZ folding");
22905
22906 if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1)))
22907 return SDValue();
22908
22909 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
22910 ? CTTZ.getOperand(i: 0).getOperand(i: 0)
22911 : CTTZ.getOperand(i: 0);
22912
22913 if (X != SUBS.getOperand(i: 0))
22914 return SDValue();
22915
22916 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
22917 ? CTTZ.getOperand(i: 0).getValueSizeInBits()
22918 : CTTZ.getValueSizeInBits();
22919 SDValue BitWidthMinusOne =
22920 DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType());
22921 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ,
22922 N2: BitWidthMinusOne);
22923}
22924
22925// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
22926// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
22927// Where x and y are constants and x != y
22928
22929// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
22930// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
22931// Where x and y are constants and x != y
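// For example (illustrative), with x = 0 and y = 1:
//   (CSEL l, r, EQ, (CMP (CSEL 0, 1, LT, cond), 0))  -->  (CSEL l, r, LT, cond)
// because the outer compare merely re-tests the condition that selected the
// inner CSEL's value.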
22932static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
22933 SDValue L = Op->getOperand(Num: 0);
22934 SDValue R = Op->getOperand(Num: 1);
22935 AArch64CC::CondCode OpCC =
22936 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
22937
22938 SDValue OpCmp = Op->getOperand(Num: 3);
22939 if (!isCMP(Op: OpCmp))
22940 return SDValue();
22941
22942 SDValue CmpLHS = OpCmp.getOperand(i: 0);
22943 SDValue CmpRHS = OpCmp.getOperand(i: 1);
22944
22945 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
22946 std::swap(a&: CmpLHS, b&: CmpRHS);
22947 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
22948 return SDValue();
22949
22950 SDValue X = CmpLHS->getOperand(Num: 0);
22951 SDValue Y = CmpLHS->getOperand(Num: 1);
22952 if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) {
22953 return SDValue();
22954 }
22955
22956 // If one of the constants is an opaque constant, the x and y SDNodes can still
22957 // be different even though their real values are the same, so compare the
22958 // APInt values here to make sure the code is correct.
22959 ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X);
22960 ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y);
22961 if (CX->getAPIntValue() == CY->getAPIntValue())
22962 return SDValue();
22963
22964 AArch64CC::CondCode CC =
22965 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2));
22966 SDValue Cond = CmpLHS->getOperand(Num: 3);
22967
22968 if (CmpRHS == Y)
22969 CC = AArch64CC::getInvertedCondCode(Code: CC);
22970 else if (CmpRHS != X)
22971 return SDValue();
22972
22973 if (OpCC == AArch64CC::NE)
22974 CC = AArch64CC::getInvertedCondCode(Code: CC);
22975 else if (OpCC != AArch64CC::EQ)
22976 return SDValue();
22977
22978 SDLoc DL(Op);
22979 EVT VT = Op->getValueType(ResNo: 0);
22980
22981 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
22982 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond);
22983}
22984
22985// Optimize CSEL instructions
22986static SDValue performCSELCombine(SDNode *N,
22987 TargetLowering::DAGCombinerInfo &DCI,
22988 SelectionDAG &DAG) {
22989 // CSEL x, x, cc -> x
22990 if (N->getOperand(Num: 0) == N->getOperand(Num: 1))
22991 return N->getOperand(Num: 0);
22992
22993 if (SDValue R = foldCSELOfCSEL(Op: N, DAG))
22994 return R;
22995
22996 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
22997 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
22998 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
22999 return Folded;
23000
23001 return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3);
23002}
23003
23004 // Try to re-use an already extended operand of a vector SetCC feeding an
23005// extended select. Doing so avoids requiring another full extension of the
23006// SET_CC result when lowering the select.
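// For example (illustrative), given
//   %c = setcc ult <8 x i8> %a, splat(C)
//   %s = vselect %c, <8 x i16> %t, <8 x i16> %f
// where a (zext <8 x i8> %a to <8 x i16>) node already exists in the DAG, the
// setcc is rebuilt on the extended operands so its i1 result does not need to
// be widened a second time when the vselect is lowered.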
23007static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23008 EVT Op0MVT = Op->getOperand(Num: 0).getValueType();
23009 if (!Op0MVT.isVector() || Op->use_empty())
23010 return SDValue();
23011
23012 // Make sure that all uses of Op are VSELECTs with matching result types whose
23013 // element type is larger than that of the SetCC operand.
23014 SDNode *FirstUse = *Op->use_begin();
23015 if (FirstUse->getOpcode() != ISD::VSELECT)
23016 return SDValue();
23017 EVT UseMVT = FirstUse->getValueType(ResNo: 0);
23018 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23019 return SDValue();
23020 if (any_of(Range: Op->uses(), P: [&UseMVT](const SDNode *N) {
23021 return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT;
23022 }))
23023 return SDValue();
23024
23025 APInt V;
23026 if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V))
23027 return SDValue();
23028
23029 SDLoc DL(Op);
23030 SDValue Op0ExtV;
23031 SDValue Op1ExtV;
23032 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get();
23033 // Check if the first operand of the SET_CC is already extended. If it is,
23034 // split the SET_CC and re-use the extended version of the operand.
23035 SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23036 Ops: Op->getOperand(Num: 0));
23037 SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
23038 Ops: Op->getOperand(Num: 0));
23039 if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23040 Op0ExtV = SDValue(Op0SExt, 0);
23041 Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23042 } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
23043 Op0ExtV = SDValue(Op0ZExt, 0);
23044 Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
23045 } else
23046 return SDValue();
23047
23048 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23049 Op0ExtV, Op1ExtV, Op->getOperand(2));
23050}
23051
23052static SDValue
23053performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23054 SelectionDAG &DAG) {
23055 SDValue Vec = N->getOperand(Num: 0);
23056 if (DCI.isBeforeLegalize() &&
23057 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23058 Vec.getValueType().isFixedLengthVector() &&
23059 Vec.getValueType().isPow2VectorType()) {
23060 SDLoc DL(N);
23061 return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL,
23062 DAG);
23063 }
23064
23065 return SDValue();
23066}
23067
23068static SDValue performSETCCCombine(SDNode *N,
23069 TargetLowering::DAGCombinerInfo &DCI,
23070 SelectionDAG &DAG) {
23071 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23072 SDValue LHS = N->getOperand(Num: 0);
23073 SDValue RHS = N->getOperand(Num: 1);
23074 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
23075 SDLoc DL(N);
23076 EVT VT = N->getValueType(ResNo: 0);
23077
23078 if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG))
23079 return V;
23080
23081 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23082 if (Cond == ISD::SETNE && isOneConstant(V: RHS) &&
23083 LHS->getOpcode() == AArch64ISD::CSEL &&
23084 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
23085 LHS->hasOneUse()) {
23086 // Invert CSEL's condition.
23087 auto OldCond =
23088 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
23089 auto NewCond = getInvertedCondCode(Code: OldCond);
23090
23091 // csel 0, 1, !cond, X
23092 SDValue CSEL =
23093 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23094 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23095 LHS.getOperand(3));
23096 return DAG.getZExtOrTrunc(Op: CSEL, DL, VT);
23097 }
23098
23099 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
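  // For example, with a 32-bit x and imm == 4 this becomes
  //   setcc (and x, 0xfffffff0), 0, ne
  // which can typically be matched as a single TST with an immediate.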
23100 if (Cond == ISD::SETNE && isNullConstant(V: RHS) &&
23101 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) &&
23102 LHS->getConstantOperandVal(Num: 1) < VT.getScalarSizeInBits() &&
23103 LHS->hasOneUse()) {
23104 EVT TstVT = LHS->getValueType(ResNo: 0);
23105 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23106 // This pattern gets optimized better in emitComparison.
23107 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1);
23108 SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0),
23109 N2: DAG.getConstant(Val: TstImm, DL, VT: TstVT));
23110 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2));
23111 }
23112 }
23113
23114 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23115 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23116 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23117 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23118 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23119 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23120 (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) &&
23121 LHS->getOpcode() == ISD::BITCAST) {
23122 EVT ToVT = LHS->getValueType(ResNo: 0);
23123 EVT FromVT = LHS->getOperand(Num: 0).getValueType();
23124 if (FromVT.isFixedLengthVector() &&
23125 FromVT.getVectorElementType() == MVT::i1) {
23126 bool IsNull = isNullConstant(V: RHS);
23127 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23128 DL, MVT::i1, LHS->getOperand(0));
23129 LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT,
23130 Operand: LHS);
23131 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23132 }
23133 }
23134
23135 // Try to perform the memcmp when the result is tested for [in]equality with 0
23136 if (SDValue V = performOrXorChainCombine(N, DAG))
23137 return V;
23138
23139 return SDValue();
23140}
23141
23142 // Replace a flag-setting operator (e.g. ANDS) with the generic version
23143 // (e.g. AND) if the flag is unused.
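// For example, if the NZCV result of (ANDS x, y) has no users, the node is
// rewritten as a plain (AND x, y); conversely, if an identical (AND x, y)
// already exists elsewhere in the DAG, it is combined to reuse this node's
// value result.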
23144static SDValue performFlagSettingCombine(SDNode *N,
23145 TargetLowering::DAGCombinerInfo &DCI,
23146 unsigned GenericOpcode) {
23147 SDLoc DL(N);
23148 SDValue LHS = N->getOperand(Num: 0);
23149 SDValue RHS = N->getOperand(Num: 1);
23150 EVT VT = N->getValueType(ResNo: 0);
23151
23152 // If the flag result isn't used, convert back to a generic opcode.
23153 if (!N->hasAnyUseOfValue(Value: 1)) {
23154 SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops());
23155 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23156 DL);
23157 }
23158
23159 // Combine identical generic nodes into this node, re-using the result.
23160 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23161 Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS}))
23162 DCI.CombineTo(N: Generic, Res: SDValue(N, 0));
23163
23164 return SDValue();
23165}
23166
23167static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23168 // setcc_merge_zero pred
23169 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23170 // => extract_subvector (inner setcc_merge_zero)
23171 SDValue Pred = N->getOperand(Num: 0);
23172 SDValue LHS = N->getOperand(Num: 1);
23173 SDValue RHS = N->getOperand(Num: 2);
23174 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
23175
23176 if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) ||
23177 LHS->getOpcode() != ISD::SIGN_EXTEND)
23178 return SDValue();
23179
23180 SDValue Extract = LHS->getOperand(Num: 0);
23181 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23182 Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) ||
23183 Extract->getConstantOperandVal(Num: 1) != 0)
23184 return SDValue();
23185
23186 SDValue InnerSetCC = Extract->getOperand(Num: 0);
23187 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23188 return SDValue();
23189
23190 // By this point we've effectively got
23191 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23192 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23193 // can operate on A directly.
23194 SDValue InnerPred = InnerSetCC.getOperand(i: 0);
23195 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23196 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23197 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23198 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23199 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23200 return Extract;
23201
23202 return SDValue();
23203}
23204
23205static SDValue
23206performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23207 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23208 "Unexpected opcode!");
23209
23210 SelectionDAG &DAG = DCI.DAG;
23211 SDValue Pred = N->getOperand(Num: 0);
23212 SDValue LHS = N->getOperand(Num: 1);
23213 SDValue RHS = N->getOperand(Num: 2);
23214 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
23215
23216 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23217 return V;
23218
23219 if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) &&
23220 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23221 LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) {
23222 // setcc_merge_zero(
23223 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23224 // => setcc_merge_zero(pred, ...)
23225 if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23226 LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred)
23227 return LHS->getOperand(Num: 0);
23228
23229 // setcc_merge_zero(
23230 // all_active, extend(nxvNi1 ...), != splat(0))
23231 // -> nxvNi1 ...
23232 if (isAllActivePredicate(DAG, N: Pred))
23233 return LHS->getOperand(Num: 0);
23234
23235 // setcc_merge_zero(
23236 // pred, extend(nxvNi1 ...), != splat(0))
23237 // -> nxvNi1 and(pred, ...)
23238 if (DCI.isAfterLegalizeDAG())
23239 // Do this after legalization to allow more folds on setcc_merge_zero
23240 // to be recognized.
23241 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
23242 N1: LHS->getOperand(Num: 0), N2: Pred);
23243 }
23244
23245 return SDValue();
23246}
23247
23248// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23249// as well as whether the test should be inverted. This code is required to
23250// catch these cases (as opposed to standard dag combines) because
23251// AArch64ISD::TBZ is matched during legalization.
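// A couple of illustrative foldings (bit positions chosen arbitrarily):
//   (tbz (srl x, 3), 2)  -> (tbz x, 5)
//   (tbz (xor x, -1), 2) -> (tbnz x, 2)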
23252static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23253 SelectionDAG &DAG) {
23254
23255 if (!Op->hasOneUse())
23256 return Op;
23257
23258 // We don't handle undef/constant-fold cases below, as they should have
23259 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23260 // etc.)
23261
23262 // (tbz (trunc x), b) -> (tbz x, b)
23263 // This case is just here to enable more of the below cases to be caught.
23264 if (Op->getOpcode() == ISD::TRUNCATE &&
23265 Bit < Op->getValueType(ResNo: 0).getSizeInBits()) {
23266 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23267 }
23268
23269 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23270 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23271 Bit < Op->getOperand(Num: 0).getValueSizeInBits()) {
23272 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23273 }
23274
23275 if (Op->getNumOperands() != 2)
23276 return Op;
23277
23278 auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
23279 if (!C)
23280 return Op;
23281
23282 switch (Op->getOpcode()) {
23283 default:
23284 return Op;
23285
23286 // (tbz (and x, m), b) -> (tbz x, b)
23287 case ISD::AND:
23288 if ((C->getZExtValue() >> Bit) & 1)
23289 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23290 return Op;
23291
23292 // (tbz (shl x, c), b) -> (tbz x, b-c)
23293 case ISD::SHL:
23294 if (C->getZExtValue() <= Bit &&
23295 (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
23296 Bit = Bit - C->getZExtValue();
23297 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23298 }
23299 return Op;
23300
23301 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is >= # bits in x
23302 case ISD::SRA:
23303 Bit = Bit + C->getZExtValue();
23304 if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits())
23305 Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1;
23306 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23307
23308 // (tbz (srl x, c), b) -> (tbz x, b+c)
23309 case ISD::SRL:
23310 if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
23311 Bit = Bit + C->getZExtValue();
23312 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23313 }
23314 return Op;
23315
23316 // (tbz (xor x, -1), b) -> (tbnz x, b)
23317 case ISD::XOR:
23318 if ((C->getZExtValue() >> Bit) & 1)
23319 Invert = !Invert;
23320 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
23321 }
23322}
23323
23324// Optimize test single bit zero/non-zero and branch.
23325static SDValue performTBZCombine(SDNode *N,
23326 TargetLowering::DAGCombinerInfo &DCI,
23327 SelectionDAG &DAG) {
23328 unsigned Bit = N->getConstantOperandVal(Num: 2);
23329 bool Invert = false;
23330 SDValue TestSrc = N->getOperand(Num: 1);
23331 SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG);
23332
23333 if (TestSrc == NewTestSrc)
23334 return SDValue();
23335
23336 unsigned NewOpc = N->getOpcode();
23337 if (Invert) {
23338 if (NewOpc == AArch64ISD::TBZ)
23339 NewOpc = AArch64ISD::TBNZ;
23340 else {
23341 assert(NewOpc == AArch64ISD::TBNZ);
23342 NewOpc = AArch64ISD::TBZ;
23343 }
23344 }
23345
23346 SDLoc DL(N);
23347 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23348 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23349}
23350
23351 // Swap vselect operands where doing so may allow a predicated operation to
23352 // implement the `sel`.
23353//
23354// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23355// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
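// For example, with an FADD as the second operand:
//   (vselect (setcc (cc) ...) (a) (fadd (a) (b)))
//     => (vselect (setcc (!cc) ...) (fadd (a) (b)) (a))
// which can then be matched as a merging predicated FADD that keeps `a` in the
// inactive lanes.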
23356static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23357 auto SelectA = N->getOperand(Num: 1);
23358 auto SelectB = N->getOperand(Num: 2);
23359 auto NTy = N->getValueType(ResNo: 0);
23360
23361 if (!NTy.isScalableVector())
23362 return SDValue();
23363 SDValue SetCC = N->getOperand(Num: 0);
23364 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23365 return SDValue();
23366
23367 switch (SelectB.getOpcode()) {
23368 default:
23369 return SDValue();
23370 case ISD::FMUL:
23371 case ISD::FSUB:
23372 case ISD::FADD:
23373 break;
23374 }
23375 if (SelectA != SelectB.getOperand(i: 0))
23376 return SDValue();
23377
23378 ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get();
23379 ISD::CondCode InverseCC =
23380 ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType());
23381 auto InverseSetCC =
23382 DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0),
23383 RHS: SetCC.getOperand(i: 1), Cond: InverseCC);
23384
23385 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy,
23386 Ops: {InverseSetCC, SelectB, SelectA});
23387}
23388
23389// vselect (v1i1 setcc) ->
23390// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23391// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23392// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23393// such VSELECT.
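// For example:
//   t1: v1i1  = setcc t2 (v1i64), t3, setlt
//   t4: v1i64 = vselect t1, t5, t6
// becomes
//   t1': v1i64 = setcc t2, t3, setlt
//   t4': v1i64 = vselect t1', t5, t6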
23394static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23395 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23396 return SwapResult;
23397
23398 SDValue N0 = N->getOperand(Num: 0);
23399 EVT CCVT = N0.getValueType();
23400
23401 if (isAllActivePredicate(DAG, N: N0))
23402 return N->getOperand(Num: 1);
23403
23404 if (isAllInactivePredicate(N: N0))
23405 return N->getOperand(Num: 2);
23406
23407 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23408 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
23409 // supported types.
23410 SDValue SetCC = N->getOperand(Num: 0);
23411 if (SetCC.getOpcode() == ISD::SETCC &&
23412 SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) {
23413 SDValue CmpLHS = SetCC.getOperand(i: 0);
23414 EVT VT = CmpLHS.getValueType();
23415 SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode();
23416 SDNode *SplatLHS = N->getOperand(Num: 1).getNode();
23417 SDNode *SplatRHS = N->getOperand(Num: 2).getNode();
23418 APInt SplatLHSVal;
23419 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23420 VT.isSimple() &&
23421 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23422 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23423 VT.getSimpleVT().SimpleTy) &&
23424 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23425 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23426 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23427 unsigned NumElts = VT.getVectorNumElements();
23428 SmallVector<SDValue, 8> Ops(
23429 NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N),
23430 VT: VT.getScalarType()));
23431 SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops);
23432
23433 auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val);
23434 auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1));
23435 return Or;
23436 }
23437 }
23438
23439 EVT CmpVT = N0.getOperand(i: 0).getValueType();
23440 if (N0.getOpcode() != ISD::SETCC ||
23441 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
23442 CCVT.getVectorElementType() != MVT::i1 ||
23443 CmpVT.getVectorElementType().isFloatingPoint())
23444 return SDValue();
23445
23446 EVT ResVT = N->getValueType(ResNo: 0);
23447 // Only combine when the result type is of the same size as the compared
23448 // operands.
23449 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23450 return SDValue();
23451
23452 SDValue IfTrue = N->getOperand(Num: 1);
23453 SDValue IfFalse = N->getOperand(Num: 2);
23454 SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(),
23455 LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1),
23456 Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get());
23457 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC,
23458 N2: IfTrue, N3: IfFalse);
23459}
23460
23461/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23462/// the compare-mask instructions rather than going via NZCV, even if LHS and
23463/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23464/// with a vector one followed by a DUP shuffle on the result.
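/// For example, for an "i64 setcc" feeding a "v2i64 select", both scalar
/// operands are placed into v2i64 vectors via SCALAR_TO_VECTOR, compared with
/// a vector SETCC (giving the wanted mask in lane 0), and that lane is then
/// duplicated across the whole mask before feeding the select.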
23465static SDValue performSelectCombine(SDNode *N,
23466 TargetLowering::DAGCombinerInfo &DCI) {
23467 SelectionDAG &DAG = DCI.DAG;
23468 SDValue N0 = N->getOperand(Num: 0);
23469 EVT ResVT = N->getValueType(ResNo: 0);
23470
23471 if (N0.getOpcode() != ISD::SETCC)
23472 return SDValue();
23473
23474 if (ResVT.isScalableVT())
23475 return SDValue();
23476
23477 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23478 // scalar SetCCResultType. We also don't expect vectors, because we assume
23479 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23480 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23481 "Scalar-SETCC feeding SELECT has unexpected result type!");
23482
23483 // If NumMaskElts == 0, the comparison is larger than the select result. The
23484 // largest real NEON comparison is 64 bits per lane, which means the result is
23485 // at most 32 bits and an illegal vector. Just bail out for now.
23486 EVT SrcVT = N0.getOperand(i: 0).getValueType();
23487
23488 // Don't try to do this optimization when the setcc itself has i1 operands.
23489 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23490 // ruled out to prevent the creation of setccs that need to be scalarized.
23491 if (SrcVT == MVT::i1 ||
23492 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23493 return SDValue();
23494
23495 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23496 if (!ResVT.isVector() || NumMaskElts == 0)
23497 return SDValue();
23498
23499 SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts);
23500 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23501
23502 // Also bail out if the vector CCVT isn't the same size as ResVT.
23503 // This can happen if the SETCC operand size doesn't divide the ResVT size
23504 // (e.g., f64 vs v3f32).
23505 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23506 return SDValue();
23507
23508 // Make sure we didn't create illegal types, if we're not supposed to.
23509 assert(DCI.isBeforeLegalize() ||
23510 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23511
23512 // First perform a vector comparison, where lane 0 is the one we're interested
23513 // in.
23514 SDLoc DL(N0);
23515 SDValue LHS =
23516 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0));
23517 SDValue RHS =
23518 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1));
23519 SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2));
23520
23521 // Now duplicate the comparison mask we want across all other lanes.
23522 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23523 SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask);
23524 Mask = DAG.getNode(Opcode: ISD::BITCAST, DL,
23525 VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask);
23526
23527 return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2));
23528}
23529
23530static SDValue performDUPCombine(SDNode *N,
23531 TargetLowering::DAGCombinerInfo &DCI) {
23532 EVT VT = N->getValueType(ResNo: 0);
23533 SDLoc DL(N);
23534 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23535 // 128-bit vector version.
23536 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23537 EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
23538 SmallVector<SDValue> Ops(N->ops());
23539 if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(),
23540 VTList: DCI.DAG.getVTList(VT: LVT), Ops)) {
23541 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23542 DCI.DAG.getConstant(0, DL, MVT::i64));
23543 }
23544 }
23545
23546 if (N->getOpcode() == AArch64ISD::DUP) {
23547 if (DCI.isAfterLegalizeDAG()) {
23548 // If the scalar DUP's operand is an extract_vector_elt, try to combine them
23549 // into a DUPLANE. For example,
23550 //
23551 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23552 // t18: v4i32 = AArch64ISD::DUP t21
23553 // ==>
23554 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23555 SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
23556 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23557 if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) {
23558 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
23559 return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0),
23560 N2: EXTRACT_VEC_ELT.getOperand(i: 1));
23561 }
23562 }
23563 }
23564
23565 return performPostLD1Combine(N, DCI, IsLaneOp: false);
23566 }
23567
23568 return SDValue();
23569}
23570
23571/// Get rid of unnecessary NVCASTs (that don't change the type).
23572static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23573 if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType())
23574 return N->getOperand(Num: 0);
23575 if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST)
23576 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
23577 Operand: N->getOperand(Num: 0).getOperand(i: 0));
23578
23579 return SDValue();
23580}
23581
23582// If all users of the globaladdr are of the form (globaladdr + constant), find
23583// the smallest constant, fold it into the globaladdr's offset and rewrite the
23584// globaladdr as (globaladdr + constant) - constant.
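// For example, if the only uses are (globaladdr + 8) and (globaladdr + 12)
// and the checks below pass, the node is rewritten as ((globaladdr + 8) - 8);
// the +8 folds into the relocation, and the remaining adds simplify to +0 and
// +4.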
23585static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23586 const AArch64Subtarget *Subtarget,
23587 const TargetMachine &TM) {
23588 auto *GN = cast<GlobalAddressSDNode>(Val: N);
23589 if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) !=
23590 AArch64II::MO_NO_FLAG)
23591 return SDValue();
23592
23593 uint64_t MinOffset = -1ull;
23594 for (SDNode *N : GN->uses()) {
23595 if (N->getOpcode() != ISD::ADD)
23596 return SDValue();
23597 auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0));
23598 if (!C)
23599 C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
23600 if (!C)
23601 return SDValue();
23602 MinOffset = std::min(a: MinOffset, b: C->getZExtValue());
23603 }
23604 uint64_t Offset = MinOffset + GN->getOffset();
23605
23606 // Require that the new offset is larger than the existing one. Otherwise, we
23607 // can end up oscillating between two possible DAGs, for example,
23608 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23609 if (Offset <= uint64_t(GN->getOffset()))
23610 return SDValue();
23611
23612 // Check whether folding this offset is legal. It must not go out of bounds of
23613 // the referenced object to avoid violating the code model, and must be
23614 // smaller than 2^20 because this is the largest offset expressible in all
23615 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23616 // stores an immediate signed 21-bit offset.)
23617 //
23618 // This check also prevents us from folding negative offsets, which will end
23619 // up being treated in the same way as large positive ones. They could also
23620 // cause code model violations, and aren't really common enough to matter.
23621 if (Offset >= (1 << 20))
23622 return SDValue();
23623
23624 const GlobalValue *GV = GN->getGlobal();
23625 Type *T = GV->getValueType();
23626 if (!T->isSized() ||
23627 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(Ty: T))
23628 return SDValue();
23629
23630 SDLoc DL(GN);
23631 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23632 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23633 DAG.getConstant(MinOffset, DL, MVT::i64));
23634}
23635
23636static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23637 const AArch64Subtarget *Subtarget) {
23638 SDValue BR = N->getOperand(Num: 0);
23639 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23640 !BR.getValueType().isScalarInteger())
23641 return SDValue();
23642
23643 SDLoc DL(N);
23644 return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0));
23645}
23646
23647 // Turns the vector of indices into a vector of byte offsets by scaling Offset
23648// by (BitWidth / 8).
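// For example, for 32-bit elements the indices are shifted left by
// Log2_32(32 / 8) == 2, i.e. each index is multiplied by 4.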
23649static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23650 SDLoc DL, unsigned BitWidth) {
23651 assert(Offset.getValueType().isScalableVector() &&
23652 "This method is only for scalable vectors of offsets");
23653
23654 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23655 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23656
23657 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23658}
23659
23660/// Check if the value of \p OffsetInBytes can be used as an immediate for
23661/// the gather load/prefetch and scatter store instructions with vector base and
23662/// immediate offset addressing mode:
23663///
23664/// [<Zn>.[S|D]{, #<imm>}]
23665///
23666/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
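/// For example, with 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124 (i.e. 31 * 4).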
23667inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23668 unsigned ScalarSizeInBytes) {
23669 // The immediate is not a multiple of the scalar size.
23670 if (OffsetInBytes % ScalarSizeInBytes)
23671 return false;
23672
23673 // The immediate is out of range.
23674 if (OffsetInBytes / ScalarSizeInBytes > 31)
23675 return false;
23676
23677 return true;
23678}
23679
23680/// Check if the value of \p Offset represents a valid immediate for the SVE
23681 /// gather load/prefetch and scatter store instructions with vector base and
23682/// immediate offset addressing mode:
23683///
23684/// [<Zn>.[S|D]{, #<imm>}]
23685///
23686/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23687static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23688 unsigned ScalarSizeInBytes) {
23689 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode());
23690 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23691 OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes);
23692}
23693
23694static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23695 unsigned Opcode,
23696 bool OnlyPackedOffsets = true) {
23697 const SDValue Src = N->getOperand(Num: 2);
23698 const EVT SrcVT = Src->getValueType(ResNo: 0);
23699 assert(SrcVT.isScalableVector() &&
23700 "Scatter stores are only possible for SVE vectors");
23701
23702 SDLoc DL(N);
23703 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23704
23705 // Make sure that the source data will fit into an SVE register
23706 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23707 return SDValue();
23708
23709 // For FPs, ACLE only supports _packed_ single and double precision types.
23710 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23711 if (SrcElVT.isFloatingPoint())
23712 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23713 ((Opcode != AArch64ISD::SST1Q_PRED &&
23714 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23715 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23716 return SDValue();
23717
23718 // Depending on the addressing mode, this is either a pointer or a vector of
23719 // pointers (that fits into one register)
23720 SDValue Base = N->getOperand(Num: 4);
23721 // Depending on the addressing mode, this is either a single offset or a
23722 // vector of offsets (that fits into one register)
23723 SDValue Offset = N->getOperand(Num: 5);
23724
23725 // For "scalar + vector of indices", just scale the indices. This only
23726 // applies to non-temporal scatters because there's no instruction that takes
23727 // indices.
23728 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23729 Offset =
23730 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
23731 Opcode = AArch64ISD::SSTNT1_PRED;
23732 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23733 Offset =
23734 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
23735 Opcode = AArch64ISD::SST1Q_PRED;
23736 }
23737
23738 // In the case of non-temporal scatter stores there's only one SVE instruction
23739 // per data-size: "vector + scalar", i.e.
23740 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
23741 // Since we do have intrinsics that allow the arguments to be in a different
23742 // order, we may need to swap them to match the spec.
23743 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23744 Offset.getValueType().isVector())
23745 std::swap(a&: Base, b&: Offset);
23746
23747 // SST1_IMM requires that the offset is an immediate that is:
23748 // * a multiple of #SizeInBytes,
23749 // * in the range [0, 31 x #SizeInBytes],
23750 // where #SizeInBytes is the size in bytes of the stored items. For
23751 // immediates outside that range and non-immediate scalar offsets use SST1 or
23752 // SST1_UXTW instead.
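  // For example, a scatter of 32-bit elements with an offset of 160 exceeds
  // 31 * 4 == 124, so it cannot use SST1_IMM and is rewritten here with the
  // base and offset swapped.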
23753 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23754 if (!isValidImmForSVEVecImmAddrMode(Offset,
23755 ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) {
23756 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23757 Opcode = AArch64ISD::SST1_UXTW_PRED;
23758 else
23759 Opcode = AArch64ISD::SST1_PRED;
23760
23761 std::swap(a&: Base, b&: Offset);
23762 }
23763 }
23764
23765 auto &TLI = DAG.getTargetLoweringInfo();
23766 if (!TLI.isTypeLegal(VT: Base.getValueType()))
23767 return SDValue();
23768
23769 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23770 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
23771 // nxv2i64. Legalize accordingly.
23772 if (!OnlyPackedOffsets &&
23773 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23774 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23775
23776 if (!TLI.isTypeLegal(VT: Offset.getValueType()))
23777 return SDValue();
23778
23779 // Source value type that is representable in hardware
23780 EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT);
23781
23782 // Keep the original type of the input data to store - this is needed to be
23783 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23784 // FP values we want the integer equivalent, so just use HwSrcVt.
23785 SDValue InputVT = DAG.getValueType(SrcVT);
23786 if (SrcVT.isFloatingPoint())
23787 InputVT = DAG.getValueType(HwSrcVt);
23788
23789 SDVTList VTs = DAG.getVTList(MVT::Other);
23790 SDValue SrcNew;
23791
23792 if (Src.getValueType().isFloatingPoint())
23793 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src);
23794 else
23795 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src);
23796
23797 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
23798 SrcNew,
23799 N->getOperand(Num: 3), // Pg
23800 Base,
23801 Offset,
23802 InputVT};
23803
23804 return DAG.getNode(Opcode, DL, VTList: VTs, Ops);
23805}
23806
23807static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
23808 unsigned Opcode,
23809 bool OnlyPackedOffsets = true) {
23810 const EVT RetVT = N->getValueType(ResNo: 0);
23811 assert(RetVT.isScalableVector() &&
23812 "Gather loads are only possible for SVE vectors");
23813
23814 SDLoc DL(N);
23815
23816 // Make sure that the loaded data will fit into an SVE register
23817 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
23818 return SDValue();
23819
23820 // Depending on the addressing mode, this is either a pointer or a vector of
23821 // pointers (that fits into one register)
23822 SDValue Base = N->getOperand(Num: 3);
23823 // Depending on the addressing mode, this is either a single offset or a
23824 // vector of offsets (that fits into one register)
23825 SDValue Offset = N->getOperand(Num: 4);
23826
23827 // For "scalar + vector of indices", scale the indices to obtain unscaled
23828 // offsets. This applies to non-temporal and quadword gathers, which do not
23829 // have an addressing mode with scaled offset.
23830 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
23831 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23832 BitWidth: RetVT.getScalarSizeInBits());
23833 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
23834 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
23835 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
23836 BitWidth: RetVT.getScalarSizeInBits());
23837 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
23838 }
23839
23840 // In the case of non-temporal gather loads and quadword gather loads there's
23841 // only one addressing mode: "vector + scalar", e.g.
23842 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23843 // Since we do have intrinsics that allow the arguments to be in a different
23844 // order, we may need to swap them to match the spec.
23845 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
23846 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
23847 Offset.getValueType().isVector())
23848 std::swap(a&: Base, b&: Offset);
23849
23850 // GLD{FF}1_IMM requires that the offset is an immediate that is:
23851 // * a multiple of #SizeInBytes,
23852 // * in the range [0, 31 x #SizeInBytes],
23853 // where #SizeInBytes is the size in bytes of the loaded items. For
23854 // immediates outside that range and non-immediate scalar offsets use
23855 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
23856 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
23857 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
23858 if (!isValidImmForSVEVecImmAddrMode(Offset,
23859 ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) {
23860 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23861 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23862 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
23863 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
23864 else
23865 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
23866 ? AArch64ISD::GLD1_MERGE_ZERO
23867 : AArch64ISD::GLDFF1_MERGE_ZERO;
23868
23869 std::swap(a&: Base, b&: Offset);
23870 }
23871 }
23872
23873 auto &TLI = DAG.getTargetLoweringInfo();
23874 if (!TLI.isTypeLegal(VT: Base.getValueType()))
23875 return SDValue();
23876
23877 // Some gather load variants allow unpacked offsets, but only as nxv2i32
23878 // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
23879 // nxv2i64. Legalize accordingly.
23880 if (!OnlyPackedOffsets &&
23881 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23882 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23883
23884 // Return value type that is representable in hardware
23885 EVT HwRetVt = getSVEContainerType(ContentTy: RetVT);
23886
23887 // Keep the original output value type around - this is needed to be able to
23888 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
23889 // values we want the integer equivalent, so just use HwRetVT.
23890 SDValue OutVT = DAG.getValueType(RetVT);
23891 if (RetVT.isFloatingPoint())
23892 OutVT = DAG.getValueType(HwRetVt);
23893
23894 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
23895 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
23896 N->getOperand(Num: 2), // Pg
23897 Base, Offset, OutVT};
23898
23899 SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops);
23900 SDValue LoadChain = SDValue(Load.getNode(), 1);
23901
23902 if (RetVT.isInteger() && (RetVT != HwRetVt))
23903 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0));
23904
23905 // If the original return value was FP, bitcast accordingly. Doing it here
23906 // means that we can avoid adding TableGen patterns for FPs.
23907 if (RetVT.isFloatingPoint())
23908 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0));
23909
23910 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
23911}
23912
23913static SDValue
23914performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23915 SelectionDAG &DAG) {
23916 SDLoc DL(N);
23917 SDValue Src = N->getOperand(Num: 0);
23918 unsigned Opc = Src->getOpcode();
23919
23920 // Sign extend of an unsigned unpack -> signed unpack
23921 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
23922
23923 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
23924 : AArch64ISD::SUNPKLO;
23925
23926 // Push the sign extend to the operand of the unpack
23927 // This is necessary where, for example, the operand of the unpack
23928 // is another unpack:
23929 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
23930 // ->
23931 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
23932 // ->
23933 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
23934 SDValue ExtOp = Src->getOperand(Num: 0);
23935 auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
23936 EVT EltTy = VT.getVectorElementType();
23937 (void)EltTy;
23938
23939 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
23940 "Sign extending from an invalid type");
23941
23942 EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
23943
23944 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(),
23945 N1: ExtOp, N2: DAG.getValueType(ExtVT));
23946
23947 return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext);
23948 }
23949
23950 if (DCI.isBeforeLegalizeOps())
23951 return SDValue();
23952
23953 if (!EnableCombineMGatherIntrinsics)
23954 return SDValue();
23955
23956 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
23957 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
23958 unsigned NewOpc;
23959 unsigned MemVTOpNum = 4;
23960 switch (Opc) {
23961 case AArch64ISD::LD1_MERGE_ZERO:
23962 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
23963 MemVTOpNum = 3;
23964 break;
23965 case AArch64ISD::LDNF1_MERGE_ZERO:
23966 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
23967 MemVTOpNum = 3;
23968 break;
23969 case AArch64ISD::LDFF1_MERGE_ZERO:
23970 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
23971 MemVTOpNum = 3;
23972 break;
23973 case AArch64ISD::GLD1_MERGE_ZERO:
23974 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
23975 break;
23976 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
23977 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23978 break;
23979 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
23980 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
23981 break;
23982 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
23983 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
23984 break;
23985 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
23986 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
23987 break;
23988 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
23989 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
23990 break;
23991 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
23992 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
23993 break;
23994 case AArch64ISD::GLDFF1_MERGE_ZERO:
23995 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
23996 break;
23997 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
23998 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
23999 break;
24000 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
24001 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
24002 break;
24003 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
24004 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
24005 break;
24006 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
24007 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
24008 break;
24009 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
24010 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
24011 break;
24012 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
24013 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
24014 break;
24015 case AArch64ISD::GLDNT1_MERGE_ZERO:
24016 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
24017 break;
24018 default:
24019 return SDValue();
24020 }
24021
24022 EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
24023 EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT();
24024
24025 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24026 return SDValue();
24027
24028 EVT DstVT = N->getValueType(ResNo: 0);
24029 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24030
24031 SmallVector<SDValue, 5> Ops;
24032 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24033 Ops.push_back(Elt: Src->getOperand(Num: I));
24034
24035 SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops);
24036 DCI.CombineTo(N, Res: ExtLoad);
24037 DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1));
24038
24039 // Return N so it doesn't get rechecked
24040 return SDValue(N, 0);
24041}
24042
24043/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24044/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24045/// != nxv2i32) do not need legalization.
24046static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
24047 const unsigned OffsetPos = 4;
24048 SDValue Offset = N->getOperand(Num: OffsetPos);
24049
24050 // Not an unpacked vector, bail out.
24051 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24052 return SDValue();
24053
24054 // Extend the unpacked offset vector to 64-bit lanes.
24055 SDLoc DL(N);
24056 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24057 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24058 // Replace the offset operand with the 64-bit one.
24059 Ops[OffsetPos] = Offset;
24060
24061 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24062}
24063
24064/// Combines a node carrying the intrinsic
24065/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24066/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24067/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24068 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
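/// For example, for a prfb gather (1-byte elements) a scalar offset of 40 is
/// outside the valid immediate range [0, 31], so the base and offset operands
/// are swapped and the node is remapped to the uxtw-index form.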
24069static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
24070 unsigned ScalarSizeInBytes) {
24071 const unsigned ImmPos = 4, OffsetPos = 3;
24072 // No need to combine the node if the immediate is valid...
24073 if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes))
24074 return SDValue();
24075
24076 // ...otherwise swap the offset base with the offset...
24077 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24078 std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]);
24079 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24080 // `aarch64_sve_prfb_gather_uxtw_index`.
24081 SDLoc DL(N);
24082 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24083 MVT::i64);
24084
24085 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24086}
24087
24088 // Return true if the vector operation can guarantee that only the first lane of its
24089// result contains data, with all bits in other lanes set to zero.
24090static bool isLanes1toNKnownZero(SDValue Op) {
24091 switch (Op.getOpcode()) {
24092 default:
24093 return false;
24094 case AArch64ISD::ANDV_PRED:
24095 case AArch64ISD::EORV_PRED:
24096 case AArch64ISD::FADDA_PRED:
24097 case AArch64ISD::FADDV_PRED:
24098 case AArch64ISD::FMAXNMV_PRED:
24099 case AArch64ISD::FMAXV_PRED:
24100 case AArch64ISD::FMINNMV_PRED:
24101 case AArch64ISD::FMINV_PRED:
24102 case AArch64ISD::ORV_PRED:
24103 case AArch64ISD::SADDV_PRED:
24104 case AArch64ISD::SMAXV_PRED:
24105 case AArch64ISD::SMINV_PRED:
24106 case AArch64ISD::UADDV_PRED:
24107 case AArch64ISD::UMAXV_PRED:
24108 case AArch64ISD::UMINV_PRED:
24109 return true;
24110 }
24111}
24112
24113static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24114 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24115 SDValue InsertVec = N->getOperand(Num: 0);
24116 SDValue InsertElt = N->getOperand(Num: 1);
24117 SDValue InsertIdx = N->getOperand(Num: 2);
24118
24119 // We only care about inserts into the first element...
24120 if (!isNullConstant(V: InsertIdx))
24121 return SDValue();
24122 // ...of a zero'd vector...
24123 if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode()))
24124 return SDValue();
24125 // ...where the inserted data was previously extracted...
24126 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24127 return SDValue();
24128
24129 SDValue ExtractVec = InsertElt.getOperand(i: 0);
24130 SDValue ExtractIdx = InsertElt.getOperand(i: 1);
24131
24132 // ...from the first element of a vector.
24133 if (!isNullConstant(V: ExtractIdx))
24134 return SDValue();
24135
24136 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24137
24138 // Ensure there's no type conversion going on.
24139 if (N->getValueType(ResNo: 0) != ExtractVec.getValueType())
24140 return SDValue();
24141
24142 if (!isLanes1toNKnownZero(Op: ExtractVec))
24143 return SDValue();
24144
24145 // The explicit zeroing is redundant.
24146 return ExtractVec;
24147}
24148
24149static SDValue
24150performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24151 if (SDValue Res = removeRedundantInsertVectorElt(N))
24152 return Res;
24153
24154 return performPostLD1Combine(N, DCI, IsLaneOp: true);
24155}
24156
24157static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
24158 EVT Ty = N->getValueType(ResNo: 0);
24159 if (Ty.isInteger())
24160 return SDValue();
24161
24162 EVT IntTy = Ty.changeVectorElementTypeToInteger();
24163 EVT ExtIntTy = getPackedSVEVectorVT(EC: IntTy.getVectorElementCount());
24164 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
24165 IntTy.getVectorElementType().getScalarSizeInBits())
24166 return SDValue();
24167
24168 SDLoc DL(N);
24169 SDValue LHS = DAG.getAnyExtOrTrunc(Op: DAG.getBitcast(VT: IntTy, V: N->getOperand(Num: 0)),
24170 DL, VT: ExtIntTy);
24171 SDValue RHS = DAG.getAnyExtOrTrunc(Op: DAG.getBitcast(VT: IntTy, V: N->getOperand(Num: 1)),
24172 DL, VT: ExtIntTy);
24173 SDValue Idx = N->getOperand(Num: 2);
24174 SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: ExtIntTy, N1: LHS, N2: RHS, N3: Idx);
24175 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: Splice, DL, VT: IntTy);
24176 return DAG.getBitcast(VT: Ty, V: Trunc);
24177}
24178
24179static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24180 TargetLowering::DAGCombinerInfo &DCI,
24181 const AArch64Subtarget *Subtarget) {
24182 SDValue N0 = N->getOperand(Num: 0);
24183 EVT VT = N->getValueType(ResNo: 0);
24184
24185 // If our only use is an fp_round, don't fold; let fp_round(fpext) fold instead.
24186 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24187 return SDValue();
24188
24189 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24190 EVT EltVT = VT.getVectorElementType();
24191 return EltVT == MVT::f32 || EltVT == MVT::f64;
24192 };
24193
24194 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24195 // We purposefully don't care about legality of the nodes here as we know
24196 // they can be split down into something legal.
24197 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) &&
24198 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24199 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24200 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24201 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
24202 SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT,
24203 Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
24204 MemVT: N0.getValueType(), MMO: LN0->getMemOperand());
24205 DCI.CombineTo(N, Res: ExtLoad);
24206 DCI.CombineTo(
24207 N: N0.getNode(),
24208 Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad,
24209 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)),
24210 Res1: ExtLoad.getValue(R: 1));
24211 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24212 }
24213
24214 return SDValue();
24215}
24216
24217static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
24218 const AArch64Subtarget *Subtarget) {
24219 EVT VT = N->getValueType(ResNo: 0);
24220
24221 // Don't expand for NEON, SVE2 or SME
24222 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24223 return SDValue();
24224
24225 SDLoc DL(N);
24226
24227 SDValue Mask = N->getOperand(Num: 0);
24228 SDValue In1 = N->getOperand(Num: 1);
24229 SDValue In2 = N->getOperand(Num: 2);
24230
24231 SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT);
24232 SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1);
24233 SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2);
24234 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv);
24235}
24236
24237static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
24238 EVT VT = N->getValueType(ResNo: 0);
24239
24240 SDValue Insert = N->getOperand(Num: 0);
24241 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24242 return SDValue();
24243
24244 if (!Insert.getOperand(i: 0).isUndef())
24245 return SDValue();
24246
24247 uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2);
24248 uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1);
24249 if (IdxInsert != 0 || IdxDupLane != 0)
24250 return SDValue();
24251
24252 SDValue Bitcast = Insert.getOperand(i: 1);
24253 if (Bitcast.getOpcode() != ISD::BITCAST)
24254 return SDValue();
24255
24256 SDValue Subvec = Bitcast.getOperand(i: 0);
24257 EVT SubvecVT = Subvec.getValueType();
24258 if (!SubvecVT.is128BitVector())
24259 return SDValue();
24260 EVT NewSubvecVT =
24261 getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType());
24262
24263 SDLoc DL(N);
24264 SDValue NewInsert =
24265 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT,
24266 N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2));
24267 SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT,
24268 N1: NewInsert, N2: N->getOperand(Num: 1));
24269 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128);
24270}
24271
24272// Try to combine mull with uzp1.
24273static SDValue tryCombineMULLWithUZP1(SDNode *N,
24274 TargetLowering::DAGCombinerInfo &DCI,
24275 SelectionDAG &DAG) {
24276 if (DCI.isBeforeLegalizeOps())
24277 return SDValue();
24278
24279 SDValue LHS = N->getOperand(Num: 0);
24280 SDValue RHS = N->getOperand(Num: 1);
24281
24282 SDValue ExtractHigh;
24283 SDValue ExtractLow;
24284 SDValue TruncHigh;
24285 SDValue TruncLow;
24286 SDLoc DL(N);
24287
24288 // Check the operands are trunc and extract_high.
24289 if (isEssentiallyExtractHighSubvector(N: LHS) &&
24290 RHS.getOpcode() == ISD::TRUNCATE) {
24291 TruncHigh = RHS;
24292 if (LHS.getOpcode() == ISD::BITCAST)
24293 ExtractHigh = LHS.getOperand(i: 0);
24294 else
24295 ExtractHigh = LHS;
24296 } else if (isEssentiallyExtractHighSubvector(N: RHS) &&
24297 LHS.getOpcode() == ISD::TRUNCATE) {
24298 TruncHigh = LHS;
24299 if (LHS.getOpcode() == ISD::BITCAST)
24300 ExtractHigh = RHS.getOperand(i: 0);
24301 else
24302 ExtractHigh = RHS;
24303 } else
24304 return SDValue();
24305
24306 // If the truncate's operand is a DUP or a splat value, do not combine the op
24307 // with uzp1.
24308 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24309 SDValue TruncHighOp = TruncHigh.getOperand(i: 0);
24310 EVT TruncHighOpVT = TruncHighOp.getValueType();
24311 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24312 DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false))
24313 return SDValue();
24314
24315 // Check that there is another extract_high with the same source vector.
24316 // For example,
24317 //
24318 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24319 // t12: v4i16 = truncate t11
24320 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24321 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24322 // t16: v4i16 = truncate t15
24323 // t30: v4i32 = AArch64ISD::SMULL t23, t16
24324 //
24325 // This dagcombine assumes the two extract_high nodes use the same source
24326 // vector in order to detect the pair of MULLs. If they use different source
24327 // vectors, this code will not work.
24328 bool HasFoundMULLow = true;
24329 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
24330 if (ExtractHighSrcVec->use_size() != 2)
24331 HasFoundMULLow = false;
24332
24333 // Find ExtractLow.
24334 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24335 if (User == ExtractHigh.getNode())
24336 continue;
24337
24338 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24339 !isNullConstant(V: User->getOperand(Num: 1))) {
24340 HasFoundMULLow = false;
24341 break;
24342 }
24343
24344 ExtractLow.setNode(User);
24345 }
24346
24347 if (!ExtractLow || !ExtractLow->hasOneUse())
24348 HasFoundMULLow = false;
24349
24350 // Check ExtractLow's user.
24351 if (HasFoundMULLow) {
24352 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24353 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24354 HasFoundMULLow = false;
24355 } else {
24356 if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) {
24357 if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE)
24358 TruncLow = ExtractLowUser->getOperand(Num: 1);
24359 else
24360 HasFoundMULLow = false;
24361 } else {
24362 if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE)
24363 TruncLow = ExtractLowUser->getOperand(Num: 0);
24364 else
24365 HasFoundMULLow = false;
24366 }
24367 }
24368 }
24369
24370 // If the truncate's operand is a DUP or a splat value, do not combine the op
24371 // with uzp1.
24372 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
24373 EVT TruncHighVT = TruncHigh.getValueType();
24374 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
24375 SDValue TruncLowOp =
24376 HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT);
24377 EVT TruncLowOpVT = TruncLowOp.getValueType();
24378 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24379 DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false)))
24380 return SDValue();
24381
24382 // Create uzp1, extract_high and extract_low.
24383 if (TruncHighOpVT != UZP1VT)
24384 TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp);
24385 if (TruncLowOpVT != UZP1VT)
24386 TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp);
24387
24388 SDValue UZP1 =
24389 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp);
24390 SDValue HighIdxCst =
24391 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24392 SDValue NewTruncHigh =
24393 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst);
24394 DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh);
24395
24396 if (HasFoundMULLow) {
24397 EVT TruncLowVT = TruncLow.getValueType();
24398 SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT,
24399 N1: UZP1, N2: ExtractLow.getOperand(i: 1));
24400 DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow);
24401 }
24402
24403 return SDValue(N, 0);
24404}
24405
24406static SDValue performMULLCombine(SDNode *N,
24407 TargetLowering::DAGCombinerInfo &DCI,
24408 SelectionDAG &DAG) {
24409 if (SDValue Val =
24410 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG))
24411 return Val;
24412
24413 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24414 return Val;
24415
24416 return SDValue();
24417}
24418
24419static SDValue
24420performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24421 SelectionDAG &DAG) {
24422 // Perform the transform below:
24423 //
24424 // t34: v4i32 = AArch64ISD::UADDLV t2
24425 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24426 // t7: i64 = zero_extend t35
24427 // t20: v1i64 = scalar_to_vector t7
24428 // ==>
24429 // t34: v4i32 = AArch64ISD::UADDLV t2
24430 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24431 // t40: v1i64 = AArch64ISD::NVCAST t39
24432 if (DCI.isBeforeLegalizeOps())
24433 return SDValue();
24434
24435 EVT VT = N->getValueType(ResNo: 0);
24436 if (VT != MVT::v1i64)
24437 return SDValue();
24438
24439 SDValue ZEXT = N->getOperand(Num: 0);
24440 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24441 return SDValue();
24442
24443 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
24444 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24445 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24446 return SDValue();
24447
24448 if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1)))
24449 return SDValue();
24450
24451 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0);
24452 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24453 UADDLV.getValueType() != MVT::v4i32 ||
24454 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24455 return SDValue();
24456
24457 // Generate the new sequence using AArch64ISD::NVCAST.
24458 SDLoc DL(N);
24459 SDValue EXTRACT_SUBVEC =
24460 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24461 DAG.getConstant(0, DL, MVT::i64));
24462 SDValue NVCAST =
24463 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24464
24465 return NVCAST;
24466}
24467
24468SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24469 DAGCombinerInfo &DCI) const {
24470 SelectionDAG &DAG = DCI.DAG;
24471 switch (N->getOpcode()) {
24472 default:
24473 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24474 break;
24475 case ISD::VECREDUCE_AND:
24476 case ISD::VECREDUCE_OR:
24477 case ISD::VECREDUCE_XOR:
24478 return performVecReduceBitwiseCombine(N, DCI, DAG);
24479 case ISD::ADD:
24480 case ISD::SUB:
24481 return performAddSubCombine(N, DCI);
24482 case ISD::BUILD_VECTOR:
24483 return performBuildVectorCombine(N, DCI, DAG);
24484 case ISD::TRUNCATE:
24485 return performTruncateCombine(N, DAG);
24486 case AArch64ISD::ANDS:
24487 return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND);
24488 case AArch64ISD::ADC:
24489 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
24490 return R;
24491 return foldADCToCINC(N, DAG);
24492 case AArch64ISD::SBC:
24493 return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false);
24494 case AArch64ISD::ADCS:
24495 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
24496 return R;
24497 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC);
24498 case AArch64ISD::SBCS:
24499 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false))
24500 return R;
24501 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC);
24502 case AArch64ISD::BICi: {
24503 APInt DemandedBits =
24504 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits());
24505 APInt DemandedElts =
24506 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements());
24507
24508 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
24509 Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24510 return SDValue();
24511
24512 break;
24513 }
24514 case ISD::XOR:
24515 return performXorCombine(N, DAG, DCI, Subtarget);
24516 case ISD::MUL:
24517 return performMulCombine(N, DAG, DCI, Subtarget);
24518 case ISD::SINT_TO_FP:
24519 case ISD::UINT_TO_FP:
24520 return performIntToFpCombine(N, DAG, Subtarget);
24521 case ISD::FP_TO_SINT:
24522 case ISD::FP_TO_UINT:
24523 case ISD::FP_TO_SINT_SAT:
24524 case ISD::FP_TO_UINT_SAT:
24525 return performFpToIntCombine(N, DAG, DCI, Subtarget);
24526 case ISD::FDIV:
24527 return performFDivCombine(N, DAG, DCI, Subtarget);
24528 case ISD::OR:
24529 return performORCombine(N, DCI, Subtarget, TLI: *this);
24530 case ISD::AND:
24531 return performANDCombine(N, DCI);
24532 case ISD::FADD:
24533 return performFADDCombine(N, DCI);
24534 case ISD::INTRINSIC_WO_CHAIN:
24535 return performIntrinsicCombine(N, DCI, Subtarget);
24536 case ISD::ANY_EXTEND:
24537 case ISD::ZERO_EXTEND:
24538 case ISD::SIGN_EXTEND:
24539 return performExtendCombine(N, DCI, DAG);
24540 case ISD::SIGN_EXTEND_INREG:
24541 return performSignExtendInRegCombine(N, DCI, DAG);
24542 case ISD::CONCAT_VECTORS:
24543 return performConcatVectorsCombine(N, DCI, DAG);
24544 case ISD::EXTRACT_SUBVECTOR:
24545 return performExtractSubvectorCombine(N, DCI, DAG);
24546 case ISD::INSERT_SUBVECTOR:
24547 return performInsertSubvectorCombine(N, DCI, DAG);
24548 case ISD::SELECT:
24549 return performSelectCombine(N, DCI);
24550 case ISD::VSELECT:
24551 return performVSelectCombine(N, DAG&: DCI.DAG);
24552 case ISD::SETCC:
24553 return performSETCCCombine(N, DCI, DAG);
24554 case ISD::LOAD:
24555 return performLOADCombine(N, DCI, DAG, Subtarget);
24556 case ISD::STORE:
24557 return performSTORECombine(N, DCI, DAG, Subtarget);
24558 case ISD::MSTORE:
24559 return performMSTORECombine(N, DCI, DAG, Subtarget);
24560 case ISD::MGATHER:
24561 case ISD::MSCATTER:
24562 return performMaskedGatherScatterCombine(N, DCI, DAG);
24563 case ISD::VECTOR_SPLICE:
24564 return performSVESpliceCombine(N, DAG);
24565 case ISD::FP_EXTEND:
24566 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24567 case AArch64ISD::BRCOND:
24568 return performBRCONDCombine(N, DCI, DAG);
24569 case AArch64ISD::TBNZ:
24570 case AArch64ISD::TBZ:
24571 return performTBZCombine(N, DCI, DAG);
24572 case AArch64ISD::CSEL:
24573 return performCSELCombine(N, DCI, DAG);
24574 case AArch64ISD::DUP:
24575 case AArch64ISD::DUPLANE8:
24576 case AArch64ISD::DUPLANE16:
24577 case AArch64ISD::DUPLANE32:
24578 case AArch64ISD::DUPLANE64:
24579 return performDUPCombine(N, DCI);
24580 case AArch64ISD::DUPLANE128:
24581 return performDupLane128Combine(N, DAG);
24582 case AArch64ISD::NVCAST:
24583 return performNVCASTCombine(N, DAG);
24584 case AArch64ISD::SPLICE:
24585 return performSpliceCombine(N, DAG);
24586 case AArch64ISD::UUNPKLO:
24587 case AArch64ISD::UUNPKHI:
24588 return performUnpackCombine(N, DAG, Subtarget);
24589 case AArch64ISD::UZP1:
24590 return performUzpCombine(N, DAG, Subtarget);
24591 case AArch64ISD::SETCC_MERGE_ZERO:
24592 return performSetccMergeZeroCombine(N, DCI);
24593 case AArch64ISD::REINTERPRET_CAST:
24594 return performReinterpretCastCombine(N);
24595 case AArch64ISD::GLD1_MERGE_ZERO:
24596 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
24597 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
24598 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
24599 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
24600 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
24601 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
24602 case AArch64ISD::GLD1S_MERGE_ZERO:
24603 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
24604 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
24605 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
24606 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
24607 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
24608 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
24609 return performGLD1Combine(N, DAG);
24610 case AArch64ISD::VASHR:
24611 case AArch64ISD::VLSHR:
24612 return performVectorShiftCombine(N, TLI: *this, DCI);
24613 case AArch64ISD::SUNPKLO:
24614 return performSunpkloCombine(N, DAG);
24615 case AArch64ISD::BSP:
24616 return performBSPExpandForSVE(N, DAG, Subtarget);
24617 case ISD::INSERT_VECTOR_ELT:
24618 return performInsertVectorEltCombine(N, DCI);
24619 case ISD::EXTRACT_VECTOR_ELT:
24620 return performExtractVectorEltCombine(N, DCI, Subtarget);
24621 case ISD::VECREDUCE_ADD:
24622 return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget);
24623 case AArch64ISD::UADDV:
24624 return performUADDVCombine(N, DAG);
24625 case AArch64ISD::SMULL:
24626 case AArch64ISD::UMULL:
24627 case AArch64ISD::PMULL:
24628 return performMULLCombine(N, DCI, DAG);
24629 case ISD::INTRINSIC_VOID:
24630 case ISD::INTRINSIC_W_CHAIN:
24631 switch (N->getConstantOperandVal(Num: 1)) {
24632 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24633 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/);
24634 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24635 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/);
24636 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24637 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/);
24638 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24639 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/);
24640 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24641 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24642 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24643 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24644 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24645 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24646 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24647 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24648 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
24649 case Intrinsic::aarch64_neon_ld2:
24650 case Intrinsic::aarch64_neon_ld3:
24651 case Intrinsic::aarch64_neon_ld4:
24652 case Intrinsic::aarch64_neon_ld1x2:
24653 case Intrinsic::aarch64_neon_ld1x3:
24654 case Intrinsic::aarch64_neon_ld1x4:
24655 case Intrinsic::aarch64_neon_ld2lane:
24656 case Intrinsic::aarch64_neon_ld3lane:
24657 case Intrinsic::aarch64_neon_ld4lane:
24658 case Intrinsic::aarch64_neon_ld2r:
24659 case Intrinsic::aarch64_neon_ld3r:
24660 case Intrinsic::aarch64_neon_ld4r:
24661 case Intrinsic::aarch64_neon_st2:
24662 case Intrinsic::aarch64_neon_st3:
24663 case Intrinsic::aarch64_neon_st4:
24664 case Intrinsic::aarch64_neon_st1x2:
24665 case Intrinsic::aarch64_neon_st1x3:
24666 case Intrinsic::aarch64_neon_st1x4:
24667 case Intrinsic::aarch64_neon_st2lane:
24668 case Intrinsic::aarch64_neon_st3lane:
24669 case Intrinsic::aarch64_neon_st4lane:
24670 return performNEONPostLDSTCombine(N, DCI, DAG);
24671 case Intrinsic::aarch64_sve_ldnt1:
24672 return performLDNT1Combine(N, DAG);
24673 case Intrinsic::aarch64_sve_ld1rq:
24674 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24675 case Intrinsic::aarch64_sve_ld1ro:
24676 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24677 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24678 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24679 case Intrinsic::aarch64_sve_ldnt1_gather:
24680 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24681 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24682 return performGatherLoadCombine(N, DAG,
24683 Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
24684 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24685 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
24686 case Intrinsic::aarch64_sve_ld1:
24687 return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO);
24688 case Intrinsic::aarch64_sve_ldnf1:
24689 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO);
24690 case Intrinsic::aarch64_sve_ldff1:
24691 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO);
24692 case Intrinsic::aarch64_sve_st1:
24693 return performST1Combine(N, DAG);
24694 case Intrinsic::aarch64_sve_stnt1:
24695 return performSTNT1Combine(N, DAG);
24696 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24697 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24698 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24699 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24700 case Intrinsic::aarch64_sve_stnt1_scatter:
24701 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
24702 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24703 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED);
24704 case Intrinsic::aarch64_sve_ld1_gather:
24705 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO);
24706 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24707 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24708 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO);
24709 case Intrinsic::aarch64_sve_ld1q_gather_index:
24710 return performGatherLoadCombine(N, DAG,
24711 Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
24712 case Intrinsic::aarch64_sve_ld1_gather_index:
24713 return performGatherLoadCombine(N, DAG,
24714 Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO);
24715 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24716 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO,
24717 /*OnlyPackedOffsets=*/false);
24718 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24719 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO,
24720 /*OnlyPackedOffsets=*/false);
24721 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24722 return performGatherLoadCombine(N, DAG,
24723 Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
24724 /*OnlyPackedOffsets=*/false);
24725 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24726 return performGatherLoadCombine(N, DAG,
24727 Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
24728 /*OnlyPackedOffsets=*/false);
24729 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24730 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO);
24731 case Intrinsic::aarch64_sve_ldff1_gather:
24732 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO);
24733 case Intrinsic::aarch64_sve_ldff1_gather_index:
24734 return performGatherLoadCombine(N, DAG,
24735 Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
24736 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24737 return performGatherLoadCombine(N, DAG,
24738 Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
24739 /*OnlyPackedOffsets=*/false);
24740 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24741 return performGatherLoadCombine(N, DAG,
24742 Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
24743 /*OnlyPackedOffsets=*/false);
24744 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24745 return performGatherLoadCombine(N, DAG,
24746 Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
24747 /*OnlyPackedOffsets=*/false);
24748 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24749 return performGatherLoadCombine(N, DAG,
24750 Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
24751 /*OnlyPackedOffsets=*/false);
24752 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24753 return performGatherLoadCombine(N, DAG,
24754 Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
24755 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24756 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24757 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED);
24758 case Intrinsic::aarch64_sve_st1q_scatter_index:
24759 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED);
24760 case Intrinsic::aarch64_sve_st1_scatter:
24761 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED);
24762 case Intrinsic::aarch64_sve_st1_scatter_index:
24763 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED);
24764 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24765 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED,
24766 /*OnlyPackedOffsets=*/false);
24767 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24768 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED,
24769 /*OnlyPackedOffsets=*/false);
24770 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24771 return performScatterStoreCombine(N, DAG,
24772 Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED,
24773 /*OnlyPackedOffsets=*/false);
24774 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24775 return performScatterStoreCombine(N, DAG,
24776 Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED,
24777 /*OnlyPackedOffsets=*/false);
24778 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24779 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED);
24780 case Intrinsic::aarch64_rndr:
24781 case Intrinsic::aarch64_rndrrs: {
24782 unsigned IntrinsicID = N->getConstantOperandVal(Num: 1);
24783 auto Register =
24784 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24785 : AArch64SysReg::RNDRRS);
24786 SDLoc DL(N);
24787 SDValue A = DAG.getNode(
24788 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24789 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24790 SDValue B = DAG.getNode(
24791 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24792 DAG.getConstant(0, DL, MVT::i32),
24793 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24794 return DAG.getMergeValues(
24795 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24796 }
24797 case Intrinsic::aarch64_sme_ldr_zt:
24798 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24799 DAG.getVTList(MVT::Other), N->getOperand(0),
24800 N->getOperand(2), N->getOperand(3));
24801 case Intrinsic::aarch64_sme_str_zt:
24802 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24803 DAG.getVTList(MVT::Other), N->getOperand(0),
24804 N->getOperand(2), N->getOperand(3));
24805 default:
24806 break;
24807 }
24808 break;
24809 case ISD::GlobalAddress:
24810 return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine());
24811 case ISD::CTLZ:
24812 return performCTLZCombine(N, DAG, Subtarget);
24813 case ISD::SCALAR_TO_VECTOR:
24814 return performScalarToVectorCombine(N, DCI, DAG);
24815 }
24816 return SDValue();
24817}
24818
24819 // Check if the return value is used only as a return value, as otherwise
24820// we can't perform a tail-call. In particular, we need to check for
24821// target ISD nodes that are returns and any other "odd" constructs
24822// that the generic analysis code won't necessarily catch.
24823bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24824 SDValue &Chain) const {
24825 if (N->getNumValues() != 1)
24826 return false;
24827 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
24828 return false;
24829
24830 SDValue TCChain = Chain;
24831 SDNode *Copy = *N->use_begin();
24832 if (Copy->getOpcode() == ISD::CopyToReg) {
24833 // If the copy has a glue operand, we conservatively assume it isn't safe to
24834 // perform a tail call.
24835 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
24836 MVT::Glue)
24837 return false;
24838 TCChain = Copy->getOperand(Num: 0);
24839 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
24840 return false;
24841
24842 bool HasRet = false;
24843 for (SDNode *Node : Copy->uses()) {
24844 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
24845 return false;
24846 HasRet = true;
24847 }
24848
24849 if (!HasRet)
24850 return false;
24851
24852 Chain = TCChain;
24853 return true;
24854}
24855
24856 // Return whether an instruction can potentially be optimized to a tail
24857 // call. This will cause the optimizers to attempt to move or duplicate
24858// return instructions to help enable tail call optimizations for this
24859// instruction.
24860bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
24861 return CI->isTailCall();
24862}
24863
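// GlobalISel hook: return true if the given constant offset is legal for a
// pre- or post-indexed addressing mode, i.e. it is non-zero and fits a signed
// 9-bit immediate.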
24864bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
24865 Register Offset, bool IsPre,
24866 MachineRegisterInfo &MRI) const {
24867 auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI);
24868 if (!CstOffset || CstOffset->isZero())
24869 return false;
24870
24871 // All of the indexed addressing mode instructions take a signed 9 bit
24872 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
24873 // encodes the sign/indexing direction.
24874 return isInt<9>(x: CstOffset->getSExtValue());
24875}
24876
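// Try to split (add/sub Base, Cst) into the Base and Offset operands of an
// indexed load/store. Bails out when the only non-chain user of the value is a
// scalable-vector splat, where a replicating load (ld1r*) is preferable.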
24877bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
24878 SDValue &Base,
24879 SDValue &Offset,
24880 SelectionDAG &DAG) const {
24881 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
24882 return false;
24883
24884 // Non-null if there is exactly one user of the loaded value (ignoring chain).
24885 SDNode *ValOnlyUser = nullptr;
24886 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
24887 ++UI) {
24888 if (UI.getUse().getResNo() == 1)
24889 continue; // Ignore chain.
24890 if (ValOnlyUser == nullptr)
24891 ValOnlyUser = *UI;
24892 else {
24893 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
24894 break;
24895 }
24896 }
24897
24898 auto IsUndefOrZero = [](SDValue V) {
24899 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
24900 };
24901
24902 // If the only user of the value is a scalable vector splat, it is
24903 // preferable to do a replicating load (ld1r*).
24904 if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() &&
24905 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
24906 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
24907 IsUndefOrZero(ValOnlyUser->getOperand(Num: 2)))))
24908 return false;
24909
24910 Base = Op->getOperand(Num: 0);
24911 // All of the indexed addressing mode instructions take a signed
24912 // 9 bit immediate offset.
24913 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) {
24914 int64_t RHSC = RHS->getSExtValue();
24915 if (Op->getOpcode() == ISD::SUB)
24916 RHSC = -(uint64_t)RHSC;
24917 if (!isInt<9>(x: RHSC))
24918 return false;
24919 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
24920 // when dealing with subtraction.
24921 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0));
24922 return true;
24923 }
24924 return false;
24925}
24926
24927bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
24928 SDValue &Offset,
24929 ISD::MemIndexedMode &AM,
24930 SelectionDAG &DAG) const {
24931 EVT VT;
24932 SDValue Ptr;
24933 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
24934 VT = LD->getMemoryVT();
24935 Ptr = LD->getBasePtr();
24936 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
24937 VT = ST->getMemoryVT();
24938 Ptr = ST->getBasePtr();
24939 } else
24940 return false;
24941
24942 if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG))
24943 return false;
24944 AM = ISD::PRE_INC;
24945 return true;
24946}
24947
24948bool AArch64TargetLowering::getPostIndexedAddressParts(
24949 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
24950 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
24951 EVT VT;
24952 SDValue Ptr;
24953 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
24954 VT = LD->getMemoryVT();
24955 Ptr = LD->getBasePtr();
24956 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
24957 VT = ST->getMemoryVT();
24958 Ptr = ST->getBasePtr();
24959 } else
24960 return false;
24961
24962 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
24963 return false;
24964 // Post-indexing updates the base, so it's not a valid transform
24965 // if that's not the same as the load's pointer.
24966 if (Ptr != Base)
24967 return false;
24968 AM = ISD::POST_INC;
24969 return true;
24970}
24971
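// Replace a bitcast from a vector of i1 to a scalar integer by materialising
// the predicate as a bitmask and zero-extending/truncating it to the result
// type.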
24972static void replaceBoolVectorBitcast(SDNode *N,
24973 SmallVectorImpl<SDValue> &Results,
24974 SelectionDAG &DAG) {
24975 SDLoc DL(N);
24976 SDValue Op = N->getOperand(Num: 0);
24977 EVT VT = N->getValueType(ResNo: 0);
24978 [[maybe_unused]] EVT SrcVT = Op.getValueType();
24979 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
24980 "Must be bool vector.");
24981
24982 // Special handling for Clang's __builtin_convertvector. For vectors with <8
24983 // elements, it adds a vector concatenation with undef(s). If we encounter
24984 // this here, we can skip the concat.
24985 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) {
24986 bool AllUndef = true;
24987 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
24988 AllUndef &= Op.getOperand(i: I).isUndef();
24989
24990 if (AllUndef)
24991 Op = Op.getOperand(i: 0);
24992 }
24993
24994 SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG);
24995 if (VectorBits)
24996 Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT));
24997}
24998
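// Expand a bitcast whose result type is not legal (e.g. v2i16) by inserting
// the scalar into lane 0 of a wider vector (ExtendVT), bitcasting that to
// CastVT and extracting the subvector that holds the original bits.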
24999static void CustomNonLegalBITCASTResults(SDNode *N,
25000 SmallVectorImpl<SDValue> &Results,
25001 SelectionDAG &DAG, EVT ExtendVT,
25002 EVT CastVT) {
25003 SDLoc DL(N);
25004 SDValue Op = N->getOperand(Num: 0);
25005 EVT VT = N->getValueType(ResNo: 0);
25006
25007 // Use SCALAR_TO_VECTOR for lane zero
25008 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op);
25009 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec);
25010 SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL);
25011 Results.push_back(
25012 Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero));
25013}
25014
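// Custom result legalisation for BITCAST: handles small illegal integer-vector
// results, illegal scalable fp->int casts, bool-vector sources and
// f16/bf16 -> i16 casts.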
25015void AArch64TargetLowering::ReplaceBITCASTResults(
25016 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25017 SDLoc DL(N);
25018 SDValue Op = N->getOperand(Num: 0);
25019 EVT VT = N->getValueType(ResNo: 0);
25020 EVT SrcVT = Op.getValueType();
25021
25022 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25023 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25024 return;
25025 }
25026
25027 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25028 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25029 return;
25030 }
25031
25032 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25033 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25034 return;
25035 }
25036
25037 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) {
25038 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25039 "Expected fp->int bitcast!");
25040
25041 // Bitcasting between unpacked vector types of different element counts is
25042 // not a NOP because the live elements are laid out differently.
25043 // 01234567
25044 // e.g. nxv2i32 = XX??XX??
25045 // nxv4f16 = X?X?X?X?
25046 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25047 return;
25048
25049 SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG);
25050 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult));
25051 return;
25052 }
25053
25054 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25055 !VT.isVector())
25056 return replaceBoolVectorBitcast(N, Results, DAG);
25057
25058 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25059 return;
25060
25061 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25062 DAG.getUNDEF(MVT::i32), Op);
25063 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25064 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25065}
25066
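// Replace a 256-bit ADD/FADD where one operand is the other operand shuffled
// with mask <1,0,3,2,...> by an ADDP of the two 128-bit halves, then shuffle
// the pairwise sums back into the original lane order.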
25067static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25068 SelectionDAG &DAG,
25069 const AArch64Subtarget *Subtarget) {
25070 EVT VT = N->getValueType(ResNo: 0);
25071 if (!VT.is256BitVector() ||
25072 (VT.getScalarType().isFloatingPoint() &&
25073 !N->getFlags().hasAllowReassociation()) ||
25074 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25075 VT.getScalarType() == MVT::bf16)
25076 return;
25077
25078 SDValue X = N->getOperand(Num: 0);
25079 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
25080 if (!Shuf) {
25081 Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0));
25082 X = N->getOperand(Num: 1);
25083 if (!Shuf)
25084 return;
25085 }
25086
25087 if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef())
25088 return;
25089
25090 // Check the mask is 1,0,3,2,5,4,...
25091 ArrayRef<int> Mask = Shuf->getMask();
25092 for (int I = 0, E = Mask.size(); I < E; I++)
25093 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25094 return;
25095
25096 SDLoc DL(N);
25097 auto LoHi = DAG.SplitVector(N: X, DL);
25098 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25099 SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(),
25100 N1: LoHi.first, N2: LoHi.second);
25101
25102 // Shuffle the elements back into order.
25103 SmallVector<int> NMask;
25104 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25105 NMask.push_back(Elt: I);
25106 NMask.push_back(Elt: I);
25107 }
25108 Results.push_back(
25109 Elt: DAG.getVectorShuffle(VT, dl: DL,
25110 N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp,
25111 N2: DAG.getUNDEF(VT: LoHi.first.getValueType())),
25112 N2: DAG.getUNDEF(VT), Mask: NMask));
25113}
25114
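// Legalize a reduction over an illegally wide vector by splitting the operand
// in half, combining the halves with InterOp, and finishing with the
// across-vector AcrossOp node.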
25115static void ReplaceReductionResults(SDNode *N,
25116 SmallVectorImpl<SDValue> &Results,
25117 SelectionDAG &DAG, unsigned InterOp,
25118 unsigned AcrossOp) {
25119 EVT LoVT, HiVT;
25120 SDValue Lo, Hi;
25121 SDLoc dl(N);
25122 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0));
25123 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
25124 SDValue InterVal = DAG.getNode(Opcode: InterOp, DL: dl, VT: LoVT, N1: Lo, N2: Hi);
25125 SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL: dl, VT: LoVT, Operand: InterVal);
25126 Results.push_back(Elt: SplitVal);
25127}
25128
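// Custom result legalisation for EXTRACT_SUBVECTOR of scalable integer
// vectors: an extract of the low or high half is lowered to UUNPKLO/UUNPKHI on
// the widened element type followed by a truncate.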
25129void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25130 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25131 SDValue In = N->getOperand(Num: 0);
25132 EVT InVT = In.getValueType();
25133
25134 // Common code will handle these just fine.
25135 if (!InVT.isScalableVector() || !InVT.isInteger())
25136 return;
25137
25138 SDLoc DL(N);
25139 EVT VT = N->getValueType(ResNo: 0);
25140
25141 // The following checks bail if this is not a halving operation.
25142
25143 ElementCount ResEC = VT.getVectorElementCount();
25144
25145 if (InVT.getVectorElementCount() != (ResEC * 2))
25146 return;
25147
25148 auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
25149 if (!CIndex)
25150 return;
25151
25152 unsigned Index = CIndex->getZExtValue();
25153 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25154 return;
25155
25156 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25157 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
25158
25159 SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0));
25160 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half));
25161}
25162
25163// Create an even/odd pair of X registers holding integer value V.
25164static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25165 SDLoc dl(V.getNode());
25166 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25167 if (DAG.getDataLayout().isBigEndian())
25168 std::swap(VLo, VHi);
25169 SDValue RegClass =
25170 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25171 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25172 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25173 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25174 return SDValue(
25175 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25176}
25177
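// Expand a 128-bit ATOMIC_CMP_SWAP either to a CASP instruction operating on
// an X-register pair (when LSE or outline atomics are available) or to one of
// the CMP_SWAP_128* pseudo instructions otherwise.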
25178static void ReplaceCMP_SWAP_128Results(SDNode *N,
25179 SmallVectorImpl<SDValue> &Results,
25180 SelectionDAG &DAG,
25181 const AArch64Subtarget *Subtarget) {
25182 assert(N->getValueType(0) == MVT::i128 &&
25183 "AtomicCmpSwap on types less than 128 should be legal");
25184
25185 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
25186 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25187 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25188 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25189 SDValue Ops[] = {
25190 createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value
25191 createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value
25192 N->getOperand(Num: 1), // Ptr
25193 N->getOperand(Num: 0), // Chain in
25194 };
25195
25196 unsigned Opcode;
25197 switch (MemOp->getMergedOrdering()) {
25198 case AtomicOrdering::Monotonic:
25199 Opcode = AArch64::CASPX;
25200 break;
25201 case AtomicOrdering::Acquire:
25202 Opcode = AArch64::CASPAX;
25203 break;
25204 case AtomicOrdering::Release:
25205 Opcode = AArch64::CASPLX;
25206 break;
25207 case AtomicOrdering::AcquireRelease:
25208 case AtomicOrdering::SequentiallyConsistent:
25209 Opcode = AArch64::CASPALX;
25210 break;
25211 default:
25212 llvm_unreachable("Unexpected ordering!");
25213 }
25214
25215 MachineSDNode *CmpSwap = DAG.getMachineNode(
25216 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25217 DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp});
25218
25219 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25220 if (DAG.getDataLayout().isBigEndian())
25221 std::swap(a&: SubReg1, b&: SubReg2);
25222 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25223 SDValue(CmpSwap, 0));
25224 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25225 SDValue(CmpSwap, 0));
25226 Results.push_back(
25227 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25228 Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out
25229 return;
25230 }
25231
25232 unsigned Opcode;
25233 switch (MemOp->getMergedOrdering()) {
25234 case AtomicOrdering::Monotonic:
25235 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25236 break;
25237 case AtomicOrdering::Acquire:
25238 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25239 break;
25240 case AtomicOrdering::Release:
25241 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25242 break;
25243 case AtomicOrdering::AcquireRelease:
25244 case AtomicOrdering::SequentiallyConsistent:
25245 Opcode = AArch64::CMP_SWAP_128;
25246 break;
25247 default:
25248 llvm_unreachable("Unexpected ordering!");
25249 }
25250
25251 SDLoc DL(N);
25252 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25253 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25254 SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second,
25255 New.first, New.second, N->getOperand(Num: 0)};
25256 SDNode *CmpSwap = DAG.getMachineNode(
25257 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25258 Ops);
25259 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
25260
25261 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25262 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25263 Results.push_back(Elt: SDValue(CmpSwap, 3));
25264}
25265
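// Map a 128-bit atomicrmw ISD opcode and memory ordering onto the matching
// LSE128 instruction (the LDCLRP*, LDSETP* or SWPP* variants).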
25266static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25267 AtomicOrdering Ordering) {
25268 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25269 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25270 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25271 // ATOMIC_LOAD_CLR at any point.
25272 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25273 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25274 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25275 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25276
25277 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25278 // The operand will need to be XORed in a separate step.
25279 switch (Ordering) {
25280 case AtomicOrdering::Monotonic:
25281 return AArch64::LDCLRP;
25282 break;
25283 case AtomicOrdering::Acquire:
25284 return AArch64::LDCLRPA;
25285 break;
25286 case AtomicOrdering::Release:
25287 return AArch64::LDCLRPL;
25288 break;
25289 case AtomicOrdering::AcquireRelease:
25290 case AtomicOrdering::SequentiallyConsistent:
25291 return AArch64::LDCLRPAL;
25292 break;
25293 default:
25294 llvm_unreachable("Unexpected ordering!");
25295 }
25296 }
25297
25298 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25299 switch (Ordering) {
25300 case AtomicOrdering::Monotonic:
25301 return AArch64::LDSETP;
25302 break;
25303 case AtomicOrdering::Acquire:
25304 return AArch64::LDSETPA;
25305 break;
25306 case AtomicOrdering::Release:
25307 return AArch64::LDSETPL;
25308 break;
25309 case AtomicOrdering::AcquireRelease:
25310 case AtomicOrdering::SequentiallyConsistent:
25311 return AArch64::LDSETPAL;
25312 break;
25313 default:
25314 llvm_unreachable("Unexpected ordering!");
25315 }
25316 }
25317
25318 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25319 switch (Ordering) {
25320 case AtomicOrdering::Monotonic:
25321 return AArch64::SWPP;
25322 break;
25323 case AtomicOrdering::Acquire:
25324 return AArch64::SWPPA;
25325 break;
25326 case AtomicOrdering::Release:
25327 return AArch64::SWPPL;
25328 break;
25329 case AtomicOrdering::AcquireRelease:
25330 case AtomicOrdering::SequentiallyConsistent:
25331 return AArch64::SWPPAL;
25332 break;
25333 default:
25334 llvm_unreachable("Unexpected ordering!");
25335 }
25336 }
25337
25338 llvm_unreachable("Unexpected ISDOpcode!");
25339}
25340
25341static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25342 SmallVectorImpl<SDValue> &Results,
25343 SelectionDAG &DAG,
25344 const AArch64Subtarget *Subtarget) {
25345 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25346 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25347 // rather than the CASP instructions, because CASP has register classes for
25348 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25349 // to present them as single operands. LSE128 instructions use the GPR64
25350 // register class (because the pair does not have to be sequential), like
25351 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25352
25353 assert(N->getValueType(0) == MVT::i128 &&
25354 "AtomicLoadXXX on types less than 128 should be legal");
25355
25356 if (!Subtarget->hasLSE128())
25357 return;
25358
25359 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
25360 const SDValue &Chain = N->getOperand(Num: 0);
25361 const SDValue &Ptr = N->getOperand(Num: 1);
25362 const SDValue &Val128 = N->getOperand(Num: 2);
25363 std::pair<SDValue, SDValue> Val2x64 =
25364 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25365
25366 const unsigned ISDOpcode = N->getOpcode();
25367 const unsigned MachineOpcode =
25368 getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering());
25369
25370 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25371 SDLoc dl(Val128);
25372 Val2x64.first =
25373 DAG.getNode(ISD::XOR, dl, MVT::i64,
25374 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25375 Val2x64.second =
25376 DAG.getNode(ISD::XOR, dl, MVT::i64,
25377 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25378 }
25379
25380 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25381 if (DAG.getDataLayout().isBigEndian())
25382 std::swap(a&: Ops[0], b&: Ops[1]);
25383
25384 MachineSDNode *AtomicInst =
25385 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25386 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25387
25388 DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp});
25389
25390 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25391 if (DAG.getDataLayout().isBigEndian())
25392 std::swap(a&: Lo, b&: Hi);
25393
25394 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25395 Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out
25396}
25397
25398void AArch64TargetLowering::ReplaceNodeResults(
25399 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25400 switch (N->getOpcode()) {
25401 default:
25402 llvm_unreachable("Don't know how to custom expand this");
25403 case ISD::BITCAST:
25404 ReplaceBITCASTResults(N, Results, DAG);
25405 return;
25406 case ISD::VECREDUCE_ADD:
25407 case ISD::VECREDUCE_SMAX:
25408 case ISD::VECREDUCE_SMIN:
25409 case ISD::VECREDUCE_UMAX:
25410 case ISD::VECREDUCE_UMIN:
25411 Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG));
25412 return;
25413 case ISD::ADD:
25414 case ISD::FADD:
25415 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25416 return;
25417
25418 case ISD::CTPOP:
25419 case ISD::PARITY:
25420 if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG))
25421 Results.push_back(Elt: Result);
25422 return;
25423 case AArch64ISD::SADDV:
25424 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV);
25425 return;
25426 case AArch64ISD::UADDV:
25427 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV);
25428 return;
25429 case AArch64ISD::SMINV:
25430 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV);
25431 return;
25432 case AArch64ISD::UMINV:
25433 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV);
25434 return;
25435 case AArch64ISD::SMAXV:
25436 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV);
25437 return;
25438 case AArch64ISD::UMAXV:
25439 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV);
25440 return;
25441 case ISD::MULHS:
25442 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
25443 Results.push_back(
25444 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED));
25445 return;
25446 case ISD::MULHU:
25447 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
25448 Results.push_back(
25449 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED));
25450 return;
25451 case ISD::FP_TO_UINT:
25452 case ISD::FP_TO_SINT:
25453 case ISD::STRICT_FP_TO_SINT:
25454 case ISD::STRICT_FP_TO_UINT:
25455 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25456 // Let normal code take care of it by not adding anything to Results.
25457 return;
25458 case ISD::ATOMIC_CMP_SWAP:
25459 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25460 return;
25461 case ISD::ATOMIC_LOAD_CLR:
25462 assert(N->getValueType(0) != MVT::i128 &&
25463 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25464 break;
25465 case ISD::ATOMIC_LOAD_AND:
25466 case ISD::ATOMIC_LOAD_OR:
25467 case ISD::ATOMIC_SWAP: {
25468 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25469 "Expected 128-bit atomicrmw.");
25470 // These need custom type legalisation so we go directly to instruction.
25471 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25472 return;
25473 }
25474 case ISD::ATOMIC_LOAD:
25475 case ISD::LOAD: {
25476 MemSDNode *LoadNode = cast<MemSDNode>(Val: N);
25477 EVT MemVT = LoadNode->getMemoryVT();
25478 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
25479 // targets.
25480 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25481 MemVT.getSizeInBits() == 256u &&
25482 (MemVT.getScalarSizeInBits() == 8u ||
25483 MemVT.getScalarSizeInBits() == 16u ||
25484 MemVT.getScalarSizeInBits() == 32u ||
25485 MemVT.getScalarSizeInBits() == 64u)) {
25486
25487 SDValue Result = DAG.getMemIntrinsicNode(
25488 AArch64ISD::LDNP, SDLoc(N),
25489 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25490 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25491 MVT::Other}),
25492 {LoadNode->getChain(), LoadNode->getBasePtr()},
25493 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25494
25495 SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT,
25496 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
25497 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
25498 return;
25499 }
25500
25501 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25502 LoadNode->getMemoryVT() != MVT::i128) {
25503 // Loads that are neither volatile nor atomic (or are not i128) are handled
25504 // by common code and optimized later in AArch64's load/store optimizer.
25505 return;
25506 }
25507
25508 if (SDValue(N, 0).getValueType() == MVT::i128) {
25509 auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode);
25510 bool isLoadAcquire =
25511 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
25512 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25513
25514 if (isLoadAcquire)
25515 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25516
25517 SDValue Result = DAG.getMemIntrinsicNode(
25518 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25519 {LoadNode->getChain(), LoadNode->getBasePtr()},
25520 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25521
25522 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25523
25524 SDValue Pair =
25525 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25526 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25527 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
25528 }
25529 return;
25530 }
25531 case ISD::EXTRACT_SUBVECTOR:
25532 ReplaceExtractSubVectorResults(N, Results, DAG);
25533 return;
25534 case ISD::INSERT_SUBVECTOR:
25535 case ISD::CONCAT_VECTORS:
25536 // Custom lowering has been requested for INSERT_SUBVECTOR and
25537 // CONCAT_VECTORS -- but delegate to common code for result type
25538 // legalisation
25539 return;
25540 case ISD::INTRINSIC_WO_CHAIN: {
25541 EVT VT = N->getValueType(ResNo: 0);
25542 assert((VT == MVT::i8 || VT == MVT::i16) &&
25543 "custom lowering for unexpected type");
25544
25545 Intrinsic::ID IntID =
25546 static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0));
25547 switch (IntID) {
25548 default:
25549 return;
25550 case Intrinsic::aarch64_sve_clasta_n: {
25551 SDLoc DL(N);
25552 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25553 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25554 N->getOperand(1), Op2, N->getOperand(3));
25555 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25556 return;
25557 }
25558 case Intrinsic::aarch64_sve_clastb_n: {
25559 SDLoc DL(N);
25560 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25561 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25562 N->getOperand(1), Op2, N->getOperand(3));
25563 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25564 return;
25565 }
25566 case Intrinsic::aarch64_sve_lasta: {
25567 SDLoc DL(N);
25568 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25569 N->getOperand(1), N->getOperand(2));
25570 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25571 return;
25572 }
25573 case Intrinsic::aarch64_sve_lastb: {
25574 SDLoc DL(N);
25575 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25576 N->getOperand(1), N->getOperand(2));
25577 Results.push_back(Elt: DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25578 return;
25579 }
25580 }
25581 }
25582 case ISD::READ_REGISTER: {
25583 SDLoc DL(N);
25584 assert(N->getValueType(0) == MVT::i128 &&
25585 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25586 SDValue Chain = N->getOperand(Num: 0);
25587 SDValue SysRegName = N->getOperand(Num: 1);
25588
25589 SDValue Result = DAG.getNode(
25590 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25591 Chain, SysRegName);
25592
25593 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25594 // of the 128-bit System Register value.
25595 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25596 Result.getValue(0), Result.getValue(1));
25597 Results.push_back(Elt: Pair);
25598 Results.push_back(Elt: Result.getValue(R: 2)); // Chain
25599 return;
25600 }
25601 }
25602}
25603
25604bool AArch64TargetLowering::useLoadStackGuardNode() const {
25605 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25606 return TargetLowering::useLoadStackGuardNode();
25607 return true;
25608}
25609
25610unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25611 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25612 // reciprocal if there are three or more FDIVs.
25613 return 3;
25614}
25615
25616TargetLoweringBase::LegalizeTypeAction
25617AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25618 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and v1f32
25619 // to v8i8, v4i16, v2i32 and v2f32 instead of promoting them.
25620 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25621 VT == MVT::v1f32)
25622 return TypeWidenVector;
25623
25624 return TargetLoweringBase::getPreferredVectorAction(VT);
25625}
25626
25627// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25628// provided the address is 16-byte aligned.
25629bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25630 if (!Subtarget->hasLSE2())
25631 return false;
25632
25633 if (auto LI = dyn_cast<LoadInst>(Val: I))
25634 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25635 LI->getAlign() >= Align(16);
25636
25637 if (auto SI = dyn_cast<StoreInst>(Val: I))
25638 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25639 SI->getAlign() >= Align(16);
25640
25641 return false;
25642}
25643
25644bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25645 if (!Subtarget->hasLSE128())
25646 return false;
25647
25648 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25649 // will clobber the two registers.
25650 if (const auto *SI = dyn_cast<StoreInst>(Val: I))
25651 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25652 SI->getAlign() >= Align(16) &&
25653 (SI->getOrdering() == AtomicOrdering::Release ||
25654 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25655
25656 if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I))
25657 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25658 RMW->getAlign() >= Align(16) &&
25659 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25660 RMW->getOperation() == AtomicRMWInst::And ||
25661 RMW->getOperation() == AtomicRMWInst::Or);
25662
25663 return false;
25664}
25665
25666bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25667 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25668 return false;
25669
25670 if (auto LI = dyn_cast<LoadInst>(Val: I))
25671 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25672 LI->getAlign() >= Align(16) &&
25673 LI->getOrdering() == AtomicOrdering::Acquire;
25674
25675 if (auto SI = dyn_cast<StoreInst>(Val: I))
25676 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25677 SI->getAlign() >= Align(16) &&
25678 SI->getOrdering() == AtomicOrdering::Release;
25679
25680 return false;
25681}
25682
25683bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25684 const Instruction *I) const {
25685 if (isOpSuitableForRCPC3(I))
25686 return false;
25687 if (isOpSuitableForLSE128(I))
25688 return false;
25689 if (isOpSuitableForLDPSTP(I))
25690 return true;
25691 return false;
25692}
25693
25694bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
25695 const Instruction *I) const {
25696 // Store-Release instructions only provide seq_cst guarantees when paired with
25697 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25698 // implement seq_cst loads and stores, so we need additional explicit fences
25699 // after memory writes.
25700 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25701 return false;
25702
25703 switch (I->getOpcode()) {
25704 default:
25705 return false;
25706 case Instruction::AtomicCmpXchg:
25707 return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() ==
25708 AtomicOrdering::SequentiallyConsistent;
25709 case Instruction::AtomicRMW:
25710 return cast<AtomicRMWInst>(Val: I)->getOrdering() ==
25711 AtomicOrdering::SequentiallyConsistent;
25712 case Instruction::Store:
25713 return cast<StoreInst>(Val: I)->getOrdering() ==
25714 AtomicOrdering::SequentiallyConsistent;
25715 }
25716}
25717
25718 // Loads and stores less than 128 bits are already atomic; ones above that
25719// are doomed anyway, so defer to the default libcall and blame the OS when
25720// things go wrong.
25721TargetLoweringBase::AtomicExpansionKind
25722AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
25723 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25724 if (Size != 128)
25725 return AtomicExpansionKind::None;
25726 if (isOpSuitableForRCPC3(I: SI))
25727 return AtomicExpansionKind::None;
25728 if (isOpSuitableForLSE128(I: SI))
25729 return AtomicExpansionKind::Expand;
25730 if (isOpSuitableForLDPSTP(I: SI))
25731 return AtomicExpansionKind::None;
25732 return AtomicExpansionKind::Expand;
25733}
25734
25735 // Loads and stores less than 128 bits are already atomic; ones above that
25736// are doomed anyway, so defer to the default libcall and blame the OS when
25737// things go wrong.
25738TargetLowering::AtomicExpansionKind
25739AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
25740 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25741
25742 if (Size != 128)
25743 return AtomicExpansionKind::None;
25744 if (isOpSuitableForRCPC3(I: LI))
25745 return AtomicExpansionKind::None;
25746 // No LSE128 loads
25747 if (isOpSuitableForLDPSTP(I: LI))
25748 return AtomicExpansionKind::None;
25749
25750 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25751 // implement atomicrmw without spilling. If the target address is also on the
25752 // stack and close enough to the spill slot, this can lead to a situation
25753 // where the monitor always gets cleared and the atomic operation can never
25754 // succeed. So at -O0 lower this operation to a CAS loop.
25755 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25756 return AtomicExpansionKind::CmpXChg;
25757
25758 // Using CAS for an atomic load has a better chance of succeeding under high
25759 // contention situations. So use it if available.
25760 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25761 : AtomicExpansionKind::LLSC;
25762}
25763
25764// The "default" for integer RMW operations is to expand to an LL/SC loop.
25765// However, with the LSE instructions (or outline-atomics mode, which provides
25766 // library routines in place of the LSE instructions), we can directly emit many
25767// operations instead.
25768//
25769// Floating-point operations are always emitted to a cmpxchg loop, because they
25770// may trigger a trap which aborts an LLSC sequence.
25771TargetLowering::AtomicExpansionKind
25772AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
25773 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25774 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25775
25776 if (AI->isFloatingPointOperation())
25777 return AtomicExpansionKind::CmpXChg;
25778
25779 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25780 (AI->getOperation() == AtomicRMWInst::Xchg ||
25781 AI->getOperation() == AtomicRMWInst::Or ||
25782 AI->getOperation() == AtomicRMWInst::And);
25783 if (CanUseLSE128)
25784 return AtomicExpansionKind::None;
25785
25786 // Nand is not supported in LSE.
25787 // Leave 128 bits to LLSC or CmpXChg.
25788 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25789 if (Subtarget->hasLSE())
25790 return AtomicExpansionKind::None;
25791 if (Subtarget->outlineAtomics()) {
25792 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
25793 // Don't outline them unless
25794 // (1) high level <atomic> support approved:
25795 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25796 // (2) low level libgcc and compiler-rt support implemented by:
25797 // min/max outline atomics helpers
25798 if (AI->getOperation() != AtomicRMWInst::Min &&
25799 AI->getOperation() != AtomicRMWInst::Max &&
25800 AI->getOperation() != AtomicRMWInst::UMin &&
25801 AI->getOperation() != AtomicRMWInst::UMax) {
25802 return AtomicExpansionKind::None;
25803 }
25804 }
25805 }
25806
25807 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25808 // implement atomicrmw without spilling. If the target address is also on the
25809 // stack and close enough to the spill slot, this can lead to a situation
25810 // where the monitor always gets cleared and the atomic operation can never
25811 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
25812 // we have a single CAS instruction that can replace the loop.
25813 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
25814 Subtarget->hasLSE())
25815 return AtomicExpansionKind::CmpXChg;
25816
25817 return AtomicExpansionKind::LLSC;
25818}
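
// Rough sketch of the resulting expansions (illustrative, not exhaustive; the
// exact choice also depends on ordering and subtarget features):
//   atomicrmw add ptr %p, i32 1 monotonic
//     with +lse:    left intact (None) and later selected to a single LDADD.
//     without LSE:  expanded by AtomicExpandPass to an LL/SC loop built from
//                   LDXR/ADD/STXR, or to a CAS loop at -O0.
//   atomicrmw fadd ptr %p, float 1.0 seq_cst
//     always expanded to a cmpxchg loop, per the rationale above.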
25819
25820TargetLowering::AtomicExpansionKind
25821AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
25822 AtomicCmpXchgInst *AI) const {
25823 // If subtarget has LSE, leave cmpxchg intact for codegen.
25824 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
25825 return AtomicExpansionKind::None;
25826 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25827 // implement cmpxchg without spilling. If the address being exchanged is also
25828 // on the stack and close enough to the spill slot, this can lead to a
25829 // situation where the monitor always gets cleared and the atomic operation
25830 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
25831 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25832 return AtomicExpansionKind::None;
25833
25834 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
25835 // it.
25836 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
25837 if (Size > 64)
25838 return AtomicExpansionKind::None;
25839
25840 return AtomicExpansionKind::LLSC;
25841}
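
// For example (illustrative): a 32-bit "cmpxchg ptr %p, i32 %old, i32 %new
// acq_rel monotonic" is left intact when LSE or outline-atomics is available
// and becomes a CAS instruction or library call; otherwise, above -O0,
// AtomicExpandPass rewrites it into an explicit load-exclusive / compare /
// store-exclusive loop using the emitLoadLinked and emitStoreConditional
// hooks below.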
25842
25843Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
25844 Type *ValueTy, Value *Addr,
25845 AtomicOrdering Ord) const {
25846 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25847 bool IsAcquire = isAcquireOrStronger(AO: Ord);
25848
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
25852 if (ValueTy->getPrimitiveSizeInBits() == 128) {
25853 Intrinsic::ID Int =
25854 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
25855 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int);
25856
25857 Value *LoHi = Builder.CreateCall(Callee: Ldxr, Args: Addr, Name: "lohi");
25858
25859 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
25860 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
25861 Lo = Builder.CreateZExt(V: Lo, DestTy: ValueTy, Name: "lo64");
25862 Hi = Builder.CreateZExt(V: Hi, DestTy: ValueTy, Name: "hi64");
25863 return Builder.CreateOr(
25864 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValueTy, V: 64)), Name: "val64");
25865 }
25866
25867 Type *Tys[] = { Addr->getType() };
25868 Intrinsic::ID Int =
25869 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
25870 Function *Ldxr = Intrinsic::getDeclaration(M, id: Int, Tys);
25871
25872 const DataLayout &DL = M->getDataLayout();
25873 IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy));
25874 CallInst *CI = Builder.CreateCall(Callee: Ldxr, Args: Addr);
25875 CI->addParamAttr(
25876 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
25877 Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy);
25878
25879 return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy);
25880}
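
// Illustrative IR produced for a 128-bit acquire load-linked (sketch; value
// names roughly mirror the builder names used above):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo = extractvalue { i64, i64 } %lohi, 0
//   %hi = extractvalue { i64, i64 } %lohi, 1
//   %lo64 = zext i64 %lo to i128
//   %hi64 = zext i64 %hi to i128
//   %shifted = shl i128 %hi64, 64
//   %val64 = or i128 %lo64, %shifted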
25881
25882void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
25883 IRBuilderBase &Builder) const {
25884 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25885 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
25886}
25887
25888Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
25889 Value *Val, Value *Addr,
25890 AtomicOrdering Ord) const {
25891 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
25892 bool IsRelease = isReleaseOrStronger(AO: Ord);
25893
25894 // Since the intrinsics must have legal type, the i128 intrinsics take two
25895 // parameters: "i64, i64". We must marshal Val into the appropriate form
25896 // before the call.
25897 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
25898 Intrinsic::ID Int =
25899 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
25900 Function *Stxr = Intrinsic::getDeclaration(M, id: Int);
25901 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
25902
25903 Value *Lo = Builder.CreateTrunc(V: Val, DestTy: Int64Ty, Name: "lo");
25904 Value *Hi = Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Val, RHS: 64), DestTy: Int64Ty, Name: "hi");
25905 return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr});
25906 }
25907
25908 Intrinsic::ID Int =
25909 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
25910 Type *Tys[] = { Addr->getType() };
25911 Function *Stxr = Intrinsic::getDeclaration(M, id: Int, Tys);
25912
25913 const DataLayout &DL = M->getDataLayout();
25914 IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType()));
25915 Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy);
25916
25917 CallInst *CI = Builder.CreateCall(
25918 Callee: Stxr, Args: {Builder.CreateZExtOrBitCast(
25919 V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)),
25920 Addr});
25921 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
25922 Attribute::ElementType, Val->getType()));
25923 return CI;
25924}
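
// Illustrative IR for the 128-bit release case (sketch; names are arbitrary):
//   %lo = trunc i128 %val to i64
//   %shifted = lshr i128 %val, 64
//   %hi = trunc i128 %shifted to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// A non-zero %status indicates the store-exclusive failed and the enclosing
// LL/SC loop must retry.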
25925
25926bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
25927 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
25928 const DataLayout &DL) const {
25929 if (!Ty->isArrayTy()) {
25930 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
25931 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
25932 }
25933
25934 // All non aggregate members of the type must have the same type
25935 SmallVector<EVT> ValueVTs;
25936 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs);
25937 return all_equal(Range&: ValueVTs);
25938}
25939
25940bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
25941 EVT) const {
25942 return false;
25943}
25944
static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ThreadPointerFunc =
      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
  return IRB.CreatePointerCast(
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
                             Offset),
      IRB.getPtrTy(0));
}
25954
25955Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
25956 // Android provides a fixed TLS slot for the stack cookie. See the definition
25957 // of TLS_SLOT_STACK_GUARD in
25958 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
25959 if (Subtarget->isTargetAndroid())
25960 return UseTlsOffset(IRB, Offset: 0x28);
25961
25962 // Fuchsia is similar.
25963 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
25964 if (Subtarget->isTargetFuchsia())
25965 return UseTlsOffset(IRB, Offset: -0x10);
25966
25967 return TargetLowering::getIRStackGuard(IRB);
25968}
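
// For example, on Android the helper above yields IR roughly equivalent to
// (sketch):
//   %tp = call ptr @llvm.thread.pointer()
//   %guard_slot = getelementptr i8, ptr %tp, i32 40  ; TLS_SLOT_STACK_GUARD
// which the stack-protector code then loads the cookie from.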
25969
25970void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
25971 // MSVC CRT provides functionalities for stack protection.
25972 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
25973 // MSVC CRT has a global variable holding security cookie.
25974 M.getOrInsertGlobal(Name: "__security_cookie",
25975 Ty: PointerType::getUnqual(C&: M.getContext()));
25976
25977 // MSVC CRT has a function to validate security cookie.
25978 FunctionCallee SecurityCheckCookie =
25979 M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(),
25980 RetTy: Type::getVoidTy(C&: M.getContext()),
25981 Args: PointerType::getUnqual(C&: M.getContext()));
25982 if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
25983 F->setCallingConv(CallingConv::Win64);
25984 F->addParamAttr(0, Attribute::AttrKind::InReg);
25985 }
25986 return;
25987 }
25988 TargetLowering::insertSSPDeclarations(M);
25989}
25990
25991Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
25992 // MSVC CRT has a global variable holding security cookie.
25993 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25994 return M.getGlobalVariable(Name: "__security_cookie");
25995 return TargetLowering::getSDagStackGuard(M);
25996}
25997
25998Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
25999 // MSVC CRT has a function to validate security cookie.
26000 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26001 return M.getFunction(Name: Subtarget->getSecurityCheckCookieName());
26002 return TargetLowering::getSSPStackGuardCheck(M);
26003}
26004
26005Value *
26006AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26007 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26008 // definition of TLS_SLOT_SAFESTACK in
26009 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26010 if (Subtarget->isTargetAndroid())
26011 return UseTlsOffset(IRB, Offset: 0x48);
26012
26013 // Fuchsia is similar.
26014 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26015 if (Subtarget->isTargetFuchsia())
26016 return UseTlsOffset(IRB, Offset: -0x8);
26017
26018 return TargetLowering::getSafeStackPointerLocation(IRB);
26019}
26020
26021bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26022 const Instruction &AndI) const {
  // Only sink 'and' mask to cmp use block if it is masking a single bit, since
  // this likely allows the and/cmp/br to be folded into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
26028 ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
26029 if (!Mask)
26030 return false;
26031 return Mask->getValue().isPowerOf2();
26032}
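
// For example (illustrative), the IR sequence
//   %a = and i32 %x, 8
//   %c = icmp eq i32 %a, 0
//   br i1 %c, label %t, label %f
// can be selected to a single "tbz w0, #3, ..." because the mask tests exactly
// one bit; a non-power-of-two mask would typically need a separate AND
// followed by a cbz/cbnz.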
26033
26034bool AArch64TargetLowering::
26035 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26036 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26037 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26038 SelectionDAG &DAG) const {
26039 // Does baseline recommend not to perform the fold by default?
26040 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26041 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26042 return false;
26043 // Else, if this is a vector shift, prefer 'shl'.
26044 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26045}
26046
26047TargetLowering::ShiftLegalizationStrategy
26048AArch64TargetLowering::preferredShiftLegalizationStrategy(
26049 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26050 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26051 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26052 return ShiftLegalizationStrategy::LowerToLibcall;
26053 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26054 ExpansionFactor);
26055}
26056
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}
26062
26063void AArch64TargetLowering::insertCopiesSplitCSR(
26064 MachineBasicBlock *Entry,
26065 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26066 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26067 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
26068 if (!IStart)
26069 return;
26070
26071 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26072 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26073 MachineBasicBlock::iterator MBBI = Entry->begin();
26074 for (const MCPhysReg *I = IStart; *I; ++I) {
26075 const TargetRegisterClass *RC = nullptr;
26076 if (AArch64::GPR64RegClass.contains(*I))
26077 RC = &AArch64::GPR64RegClass;
26078 else if (AArch64::FPR64RegClass.contains(*I))
26079 RC = &AArch64::FPR64RegClass;
26080 else
26081 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26082
26083 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
26084 // Create copy from CSR to a virtual register.
26085 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26086 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26087 // nounwind. If we want to generalize this later, we may need to emit
26088 // CFI pseudo-instructions.
26089 assert(Entry->getParent()->getFunction().hasFnAttribute(
26090 Attribute::NoUnwind) &&
26091 "Function should be nounwind in insertCopiesSplitCSR!");
26092 Entry->addLiveIn(PhysReg: *I);
26093 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
26094 .addReg(RegNo: *I);
26095
26096 // Insert the copy-back instructions right before the terminator.
26097 for (auto *Exit : Exits)
26098 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
26099 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
26100 .addReg(RegNo: NewVR);
26101 }
26102}
26103
26104bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26105 // Integer division on AArch64 is expensive. However, when aggressively
26106 // optimizing for code size, we prefer to use a div instruction, as it is
26107 // usually smaller than the alternative sequence.
26108 // The exception to this is vector division. Since AArch64 doesn't have vector
26109 // integer division, leaving the division as-is is a loss even in terms of
26110 // size, because it will have to be scalarized, while the alternative code
26111 // sequence can be performed in vector form.
26112 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26113 return OptSize && !VT.isVector();
26114}
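
// For example, at minimum size a scalar "udiv i32 %x, 10" is kept as a single
// UDIV instruction, whereas it would otherwise be rewritten into the larger
// multiply-by-magic-constant and shift sequence. Vector divisions are still
// expanded because scalarized UDIV/SDIV would be both bigger and slower.
// (Illustrative summary of the policy above.)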
26115
26116bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26117 // We want inc-of-add for scalars and sub-of-not for vectors.
26118 return VT.isScalarInteger();
26119}
26120
26121bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26122 EVT VT) const {
26123 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26124 // legalize.
26125 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26126 return false;
26127 if (FPVT == MVT::v8bf16)
26128 return false;
26129 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26130}
26131
26132MachineInstr *
26133AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26134 MachineBasicBlock::instr_iterator &MBBI,
26135 const TargetInstrInfo *TII) const {
26136 assert(MBBI->isCall() && MBBI->getCFIType() &&
26137 "Invalid call instruction for a KCFI check");
26138
26139 switch (MBBI->getOpcode()) {
26140 case AArch64::BLR:
26141 case AArch64::BLRNoIP:
26142 case AArch64::TCRETURNri:
26143 case AArch64::TCRETURNrix16x17:
26144 case AArch64::TCRETURNrix17:
26145 case AArch64::TCRETURNrinotx16:
26146 break;
26147 default:
26148 llvm_unreachable("Unexpected CFI call opcode");
26149 }
26150
26151 MachineOperand &Target = MBBI->getOperand(i: 0);
26152 assert(Target.isReg() && "Invalid target operand for an indirect call");
26153 Target.setIsRenamable(false);
26154
26155 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26156 .addReg(Target.getReg())
26157 .addImm(MBBI->getCFIType())
26158 .getInstr();
26159}
26160
26161bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26162 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26163}
26164
26165unsigned
26166AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26167 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26168 return getPointerTy(DL).getSizeInBits();
26169
26170 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26171}
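
// Background for the sizes above (AAPCS64): va_list is the struct
//   { void *__stack; void *__gr_top; void *__vr_top;
//     int __gr_offs; int __vr_offs; }
// i.e. three pointers plus two 32-bit offsets, while Darwin and Windows use a
// simple "char *" va_list that is just one pointer wide.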
26172
26173void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26174 MachineFrameInfo &MFI = MF.getFrameInfo();
26175 // If we have any vulnerable SVE stack objects then the stack protector
26176 // needs to be placed at the top of the SVE stack area, as the SVE locals
26177 // are placed above the other locals, so we allocate it as if it were a
26178 // scalable vector.
26179 // FIXME: It may be worthwhile having a specific interface for this rather
26180 // than doing it here in finalizeLowering.
26181 if (MFI.hasStackProtectorIndex()) {
26182 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26183 if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector &&
26184 MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) {
26185 MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(),
26186 ID: TargetStackID::ScalableVector);
26187 MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16));
26188 break;
26189 }
26190 }
26191 }
26192 MFI.computeMaxCallFrameSize(MF);
26193 TargetLoweringBase::finalizeLowering(MF);
26194}
26195
26196// Unlike X86, we let frame lowering assign offsets to all catch objects.
26197bool AArch64TargetLowering::needsFixedCatchObjects() const {
26198 return false;
26199}
26200
26201bool AArch64TargetLowering::shouldLocalize(
26202 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26203 auto &MF = *MI.getMF();
26204 auto &MRI = MF.getRegInfo();
26205 auto maxUses = [](unsigned RematCost) {
26206 // A cost of 1 means remats are basically free.
26207 if (RematCost == 1)
26208 return std::numeric_limits<unsigned>::max();
26209 if (RematCost == 2)
26210 return 2U;
26211
26212 // Remat is too expensive, only sink if there's one user.
26213 if (RematCost > 2)
26214 return 1U;
26215 llvm_unreachable("Unexpected remat cost");
26216 };
26217
26218 unsigned Opc = MI.getOpcode();
26219 switch (Opc) {
26220 case TargetOpcode::G_GLOBAL_VALUE: {
26221 // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
26223 // another call sequence.
26224 const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal();
26225 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26226 return false;
26227 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26228 }
26229 case TargetOpcode::G_FCONSTANT:
26230 case TargetOpcode::G_CONSTANT: {
26231 const ConstantInt *CI;
26232 unsigned AdditionalCost = 0;
26233
26234 if (Opc == TargetOpcode::G_CONSTANT)
26235 CI = MI.getOperand(i: 1).getCImm();
26236 else {
26237 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
26238 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26239 // materialized as integers.
26240 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26241 break;
26242 auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
26243 bool OptForSize =
26244 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
26245 if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()),
26246 OptForSize))
26247 return true; // Constant should be cheap.
26248 CI =
26249 ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt());
26250 // FP materialization also costs an extra move, from gpr to fpr.
26251 AdditionalCost = 1;
26252 }
26253 APInt Imm = CI->getValue();
26254 InstructionCost Cost = TTI->getIntImmCost(
26255 Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize);
26256 assert(Cost.isValid() && "Expected a valid imm cost");
26257
26258 unsigned RematCost = *Cost.getValue();
26259 RematCost += AdditionalCost;
26260 Register Reg = MI.getOperand(i: 0).getReg();
26261 unsigned MaxUses = maxUses(RematCost);
26262 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26263 if (MaxUses == std::numeric_limits<unsigned>::max())
26264 --MaxUses;
26265 return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses);
26266 }
26267 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26268 // localizable.
26269 case AArch64::ADRP:
26270 case AArch64::G_ADD_LOW:
26271 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26272 case TargetOpcode::G_PTR_ADD:
26273 return true;
26274 default:
26275 break;
26276 }
26277 return TargetLoweringBase::shouldLocalize(MI, TTI);
26278}
26279
26280bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26281 if (Inst.getType()->isScalableTy())
26282 return true;
26283
26284 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26285 if (Inst.getOperand(i)->getType()->isScalableTy())
26286 return true;
26287
26288 if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) {
26289 if (AI->getAllocatedType()->isScalableTy())
26290 return true;
26291 }
26292
26293 // Checks to allow the use of SME instructions
26294 if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) {
26295 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26296 auto CalleeAttrs = SMEAttrs(*Base);
26297 if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) ||
26298 CallerAttrs.requiresLazySave(Callee: CalleeAttrs) ||
26299 CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs))
26300 return true;
26301 }
26302 return false;
26303}
26304
26305// Return the largest legal scalable vector type that matches VT's element type.
26306static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
26307 assert(VT.isFixedLengthVector() &&
26308 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26309 "Expected legal fixed length vector!");
26310 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26311 default:
26312 llvm_unreachable("unexpected element type for SVE container");
26313 case MVT::i8:
26314 return EVT(MVT::nxv16i8);
26315 case MVT::i16:
26316 return EVT(MVT::nxv8i16);
26317 case MVT::i32:
26318 return EVT(MVT::nxv4i32);
26319 case MVT::i64:
26320 return EVT(MVT::nxv2i64);
26321 case MVT::bf16:
26322 return EVT(MVT::nxv8bf16);
26323 case MVT::f16:
26324 return EVT(MVT::nxv8f16);
26325 case MVT::f32:
26326 return EVT(MVT::nxv4f32);
26327 case MVT::f64:
26328 return EVT(MVT::nxv2f64);
26329 }
26330}
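
// For example, a legal fixed-length v4i32 or v8i32 maps to the packed
// container nxv4i32 and v8f16 maps to nxv8f16: only the element type matters
// here; the original fixed element count is enforced separately via
// predication.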
26331
26332// Return a PTRUE with active lanes corresponding to the extent of VT.
26333static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
26334 EVT VT) {
26335 assert(VT.isFixedLengthVector() &&
26336 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26337 "Expected legal fixed length vector!");
26338
26339 std::optional<unsigned> PgPattern =
26340 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
26341 assert(PgPattern && "Unexpected element count for SVE predicate");
26342
26343 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26344 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26345 // variants of instructions when available.
26346 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26347 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26348 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26349 if (MaxSVESize && MinSVESize == MaxSVESize &&
26350 MaxSVESize == VT.getSizeInBits())
26351 PgPattern = AArch64SVEPredPattern::all;
26352
26353 MVT MaskVT;
26354 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26355 default:
26356 llvm_unreachable("unexpected element type for SVE predicate");
26357 case MVT::i8:
26358 MaskVT = MVT::nxv16i1;
26359 break;
26360 case MVT::i16:
26361 case MVT::f16:
26362 case MVT::bf16:
26363 MaskVT = MVT::nxv8i1;
26364 break;
26365 case MVT::i32:
26366 case MVT::f32:
26367 MaskVT = MVT::nxv4i1;
26368 break;
26369 case MVT::i64:
26370 case MVT::f64:
26371 MaskVT = MVT::nxv2i1;
26372 break;
26373 }
26374
26375 return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern);
26376}
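
// For example, a fixed-length v4i32 yields a PTRUE nxv4i1 with the VL4
// pattern (exactly four active lanes). When the subtarget's minimum and
// maximum SVE vector lengths are equal and match VT's size, the "all" pattern
// is used instead so that unpredicated instruction forms can be selected.
// (Illustrative.)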
26377
26378static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
26379 EVT VT) {
26380 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26381 "Expected legal scalable vector!");
26382 auto PredTy = VT.changeVectorElementType(MVT::i1);
26383 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26384}
26385
26386static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
26387 if (VT.isFixedLengthVector())
26388 return getPredicateForFixedLengthVector(DAG, DL, VT);
26389
26390 return getPredicateForScalableVector(DAG, DL, VT);
26391}
26392
// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isScalableVector() &&
         "Expected to convert into a scalable vector!");
  assert(V.getValueType().isFixedLengthVector() &&
         "Expected a fixed length vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}

// Shrink V so it's just big enough to maintain a VT's worth of data.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isFixedLengthVector() &&
         "Expected to convert into a fixed length vector!");
  assert(V.getValueType().isScalableVector() &&
         "Expected a scalable vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}
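
// Typical usage of the two helpers above (sketch): a fixed-length v4i32 value
// is widened into an nxv4i32 container via INSERT_SUBVECTOR at index 0, the
// operation is performed on the scalable type under a suitable predicate, and
// the result is narrowed back with EXTRACT_SUBVECTOR at index 0. Lanes beyond
// the original fixed width are undefined and must be masked or ignored by the
// caller.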
26414
26415// Convert all fixed length vector loads larger than NEON to masked_loads.
26416SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26417 SDValue Op, SelectionDAG &DAG) const {
26418 auto Load = cast<LoadSDNode>(Val&: Op);
26419
26420 SDLoc DL(Op);
26421 EVT VT = Op.getValueType();
26422 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26423 EVT LoadVT = ContainerVT;
26424 EVT MemVT = Load->getMemoryVT();
26425
26426 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26427
26428 if (VT.isFloatingPoint()) {
26429 LoadVT = ContainerVT.changeTypeToInteger();
26430 MemVT = MemVT.changeTypeToInteger();
26431 }
26432
26433 SDValue NewLoad = DAG.getMaskedLoad(
26434 VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg,
26435 Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(),
26436 AM: Load->getAddressingMode(), Load->getExtensionType());
26437
26438 SDValue Result = NewLoad;
26439 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26440 EVT ExtendVT = ContainerVT.changeVectorElementType(
26441 EltVT: Load->getMemoryVT().getVectorElementType());
26442
26443 Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG);
26444 Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
26445 N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT));
26446 } else if (VT.isFloatingPoint()) {
26447 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result);
26448 }
26449
26450 Result = convertFromScalableVector(DAG, VT, V: Result);
26451 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
26452 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
26453}
26454
26455static SDValue convertFixedMaskToScalableVector(SDValue Mask,
26456 SelectionDAG &DAG) {
26457 SDLoc DL(Mask);
26458 EVT InVT = Mask.getValueType();
26459 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26460
26461 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
26462
26463 if (ISD::isBuildVectorAllOnes(N: Mask.getNode()))
26464 return Pg;
26465
26466 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask);
26467 auto Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
26468
26469 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(),
26470 Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: ISD::SETNE)});
26471}
26472
// Lower fixed length vector masked loads larger than NEON to SVE masked loads.
26474SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26475 SDValue Op, SelectionDAG &DAG) const {
26476 auto Load = cast<MaskedLoadSDNode>(Val&: Op);
26477
26478 SDLoc DL(Op);
26479 EVT VT = Op.getValueType();
26480 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26481
26482 SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as the
  // load's type then we have to extend the mask type.
26485 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26486 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26487 "Incorrect mask type");
26488 Mask = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Mask);
26489 }
26490 Mask = convertFixedMaskToScalableVector(Mask, DAG);
26491
26492 SDValue PassThru;
26493 bool IsPassThruZeroOrUndef = false;
26494
26495 if (Load->getPassThru()->isUndef()) {
26496 PassThru = DAG.getUNDEF(VT: ContainerVT);
26497 IsPassThruZeroOrUndef = true;
26498 } else {
26499 if (ContainerVT.isInteger())
26500 PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
26501 else
26502 PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT);
26503 if (isZerosVector(N: Load->getPassThru().getNode()))
26504 IsPassThruZeroOrUndef = true;
26505 }
26506
26507 SDValue NewLoad = DAG.getMaskedLoad(
26508 VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(),
26509 Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(),
26510 AM: Load->getAddressingMode(), Load->getExtensionType());
26511
26512 SDValue Result = NewLoad;
26513 if (!IsPassThruZeroOrUndef) {
26514 SDValue OldPassThru =
26515 convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru());
26516 Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru);
26517 }
26518
26519 Result = convertFromScalableVector(DAG, VT, V: Result);
26520 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
26521 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
26522}
26523
26524// Convert all fixed length vector stores larger than NEON to masked_stores.
26525SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26526 SDValue Op, SelectionDAG &DAG) const {
26527 auto Store = cast<StoreSDNode>(Val&: Op);
26528
26529 SDLoc DL(Op);
26530 EVT VT = Store->getValue().getValueType();
26531 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26532 EVT MemVT = Store->getMemoryVT();
26533
26534 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26535 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
26536
26537 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26538 EVT TruncVT = ContainerVT.changeVectorElementType(
26539 EltVT: Store->getMemoryVT().getVectorElementType());
26540 MemVT = MemVT.changeTypeToInteger();
26541 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26542 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26543 DAG.getUNDEF(TruncVT));
26544 NewValue =
26545 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
26546 } else if (VT.isFloatingPoint()) {
26547 MemVT = MemVT.changeTypeToInteger();
26548 NewValue =
26549 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
26550 }
26551
26552 return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue,
26553 Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT,
26554 MMO: Store->getMemOperand(), AM: Store->getAddressingMode(),
26555 IsTruncating: Store->isTruncatingStore());
26556}
26557
26558SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26559 SDValue Op, SelectionDAG &DAG) const {
26560 auto *Store = cast<MaskedStoreSDNode>(Val&: Op);
26561
26562 SDLoc DL(Op);
26563 EVT VT = Store->getValue().getValueType();
26564 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26565
26566 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
26567 SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG);
26568
26569 return DAG.getMaskedStore(
26570 Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(),
26571 Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(),
26572 AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore());
26573}
26574
26575SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26576 SDValue Op, SelectionDAG &DAG) const {
26577 SDLoc dl(Op);
26578 EVT VT = Op.getValueType();
26579 EVT EltVT = VT.getVectorElementType();
26580
26581 bool Signed = Op.getOpcode() == ISD::SDIV;
26582 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26583
26584 bool Negated;
26585 uint64_t SplatVal;
26586 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
26587 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26588 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
26589 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26590
26591 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL&: dl, VT);
26592 SDValue Res =
26593 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2);
26594 if (Negated)
26595 Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: ContainerVT,
26596 N1: DAG.getConstant(Val: 0, DL: dl, VT: ContainerVT), N2: Res);
26597
26598 return convertFromScalableVector(DAG, VT, V: Res);
26599 }
26600
26601 // Scalable vector i32/i64 DIV is supported.
26602 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26603 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
26604
26605 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26606 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
26607 EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext());
26608 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26609
26610 // If the wider type is legal: extend, op, and truncate.
26611 EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
26612 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) {
26613 SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 0));
26614 SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 1));
26615 SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WideVT, N1: Op0, N2: Op1);
26616 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Div);
26617 }
26618
26619 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26620 &ExtendOpcode](SDValue Op) {
26621 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26622 SDValue IdxHalf =
26623 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26624 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxZero);
26625 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxHalf);
26626 return std::pair<SDValue, SDValue>(
26627 {DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Lo),
26628 DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Hi)});
26629 };
26630
26631 // If wider type is not legal: split, extend, op, trunc and concat.
26632 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0));
26633 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1));
26634 SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt);
26635 SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt);
26636 SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Lo);
26637 SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Hi);
26638 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, Ops: {LoTrunc, HiTrunc});
26639}
26640
26641SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26642 SDValue Op, SelectionDAG &DAG) const {
26643 EVT VT = Op.getValueType();
26644 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26645
26646 SDLoc DL(Op);
26647 SDValue Val = Op.getOperand(i: 0);
26648 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
26649 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
26650
26651 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26652 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26653
26654 // Repeatedly unpack Val until the result is of the desired element type.
26655 switch (ContainerVT.getSimpleVT().SimpleTy) {
26656 default:
26657 llvm_unreachable("unimplemented container type");
26658 case MVT::nxv16i8:
26659 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26660 if (VT.getVectorElementType() == MVT::i16)
26661 break;
26662 [[fallthrough]];
26663 case MVT::nxv8i16:
26664 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26665 if (VT.getVectorElementType() == MVT::i32)
26666 break;
26667 [[fallthrough]];
26668 case MVT::nxv4i32:
26669 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26670 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26671 break;
26672 }
26673
26674 return convertFromScalableVector(DAG, VT, V: Val);
26675}
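
// For example, sign-extending a fixed-length v8i8 to v8i32 (sketch): the
// source is placed in an nxv16i8 container and repeatedly unpacked,
//   nxv16i8 --SUNPKLO--> nxv8i16 --SUNPKLO--> nxv4i32,
// stopping once the element type matches the destination's element type.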
26676
26677SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26678 SDValue Op, SelectionDAG &DAG) const {
26679 EVT VT = Op.getValueType();
26680 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26681
26682 SDLoc DL(Op);
26683 SDValue Val = Op.getOperand(i: 0);
26684 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
26685 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
26686
26687 // Repeatedly truncate Val until the result is of the desired element type.
26688 switch (ContainerVT.getSimpleVT().SimpleTy) {
26689 default:
26690 llvm_unreachable("unimplemented container type");
26691 case MVT::nxv2i64:
26692 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26693 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26694 if (VT.getVectorElementType() == MVT::i32)
26695 break;
26696 [[fallthrough]];
26697 case MVT::nxv4i32:
26698 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26699 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26700 if (VT.getVectorElementType() == MVT::i16)
26701 break;
26702 [[fallthrough]];
26703 case MVT::nxv8i16:
26704 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26705 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26706 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26707 break;
26708 }
26709
26710 return convertFromScalableVector(DAG, VT, V: Val);
26711}
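
// For example, truncating a fixed-length vector held in an nxv2i64 container
// down to i16 elements (sketch): each step bitcasts to the next narrower
// integer container and applies UZP1 with the value as both operands to keep
// the even (low) halves,
//   nxv2i64 -> nxv4i32 -> nxv8i16,
// stopping once the element type matches the destination.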
26712
26713SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26714 SDValue Op, SelectionDAG &DAG) const {
26715 EVT VT = Op.getValueType();
26716 EVT InVT = Op.getOperand(i: 0).getValueType();
26717 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26718
26719 SDLoc DL(Op);
26720 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26721 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
26722
26723 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1));
26724}
26725
26726SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26727 SDValue Op, SelectionDAG &DAG) const {
26728 EVT VT = Op.getValueType();
26729 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26730
26731 SDLoc DL(Op);
26732 EVT InVT = Op.getOperand(i: 0).getValueType();
26733 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26734 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
26735
26736 auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0,
26737 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
26738
26739 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26740}
26741
26742// Convert vector operation 'Op' to an equivalent predicated operation whereby
26743// the original operation's type is used to construct a suitable predicate.
26744// NOTE: The results for inactive lanes are undefined.
26745SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26746 SelectionDAG &DAG,
26747 unsigned NewOp) const {
26748 EVT VT = Op.getValueType();
26749 SDLoc DL(Op);
26750 auto Pg = getPredicateForVector(DAG, DL, VT);
26751
26752 if (VT.isFixedLengthVector()) {
26753 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26754 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26755
26756 // Create list of operands by converting existing ones to scalable types.
26757 SmallVector<SDValue, 4> Operands = {Pg};
26758 for (const SDValue &V : Op->op_values()) {
26759 if (isa<CondCodeSDNode>(Val: V)) {
26760 Operands.push_back(Elt: V);
26761 continue;
26762 }
26763
26764 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) {
26765 EVT VTArg = VTNode->getVT().getVectorElementType();
26766 EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg);
26767 Operands.push_back(Elt: DAG.getValueType(NewVTArg));
26768 continue;
26769 }
26770
26771 assert(isTypeLegal(V.getValueType()) &&
26772 "Expected only legal fixed-width types");
26773 Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
26774 }
26775
26776 if (isMergePassthruOpcode(Opc: NewOp))
26777 Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT));
26778
26779 auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands);
26780 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26781 }
26782
26783 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26784
26785 SmallVector<SDValue, 4> Operands = {Pg};
26786 for (const SDValue &V : Op->op_values()) {
26787 assert((!V.getValueType().isVector() ||
26788 V.getValueType().isScalableVector()) &&
26789 "Only scalable vectors are supported!");
26790 Operands.push_back(Elt: V);
26791 }
26792
26793 if (isMergePassthruOpcode(Opc: NewOp))
26794 Operands.push_back(Elt: DAG.getUNDEF(VT));
26795
26796 return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags());
26797}
26798
26799// If a fixed length vector operation has no side effects when applied to
26800// undefined elements, we can safely use scalable vectors to perform the same
26801// operation without needing to worry about predication.
26802SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26803 SelectionDAG &DAG) const {
26804 EVT VT = Op.getValueType();
26805 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
26806 "Only expected to lower fixed length vector operation!");
26807 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26808
26809 // Create list of operands by converting existing ones to scalable types.
26810 SmallVector<SDValue, 4> Ops;
26811 for (const SDValue &V : Op->op_values()) {
26812 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
26813
26814 // Pass through non-vector operands.
26815 if (!V.getValueType().isVector()) {
26816 Ops.push_back(Elt: V);
26817 continue;
26818 }
26819
26820 // "cast" fixed length vector to a scalable vector.
26821 assert(V.getValueType().isFixedLengthVector() &&
26822 isTypeLegal(V.getValueType()) &&
26823 "Only fixed length vectors are supported!");
26824 Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
26825 }
26826
26827 auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops);
26828 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26829}
26830
26831SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
26832 SelectionDAG &DAG) const {
26833 SDLoc DL(ScalarOp);
26834 SDValue AccOp = ScalarOp.getOperand(i: 0);
26835 SDValue VecOp = ScalarOp.getOperand(i: 1);
26836 EVT SrcVT = VecOp.getValueType();
26837 EVT ResVT = SrcVT.getVectorElementType();
26838
26839 EVT ContainerVT = SrcVT;
26840 if (SrcVT.isFixedLengthVector()) {
26841 ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
26842 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
26843 }
26844
26845 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
26846 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26847
26848 // Convert operands to Scalable.
26849 AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
26850 N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero);
26851
26852 // Perform reduction.
26853 SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT,
26854 N1: Pg, N2: AccOp, N3: VecOp);
26855
26856 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero);
26857}
26858
26859SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
26860 SelectionDAG &DAG) const {
26861 SDLoc DL(ReduceOp);
26862 SDValue Op = ReduceOp.getOperand(i: 0);
26863 EVT OpVT = Op.getValueType();
26864 EVT VT = ReduceOp.getValueType();
26865
26866 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
26867 return SDValue();
26868
26869 SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT);
26870
26871 switch (ReduceOp.getOpcode()) {
26872 default:
26873 return SDValue();
26874 case ISD::VECREDUCE_OR:
26875 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
26876 // The predicate can be 'Op' because
26877 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
26878 return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE);
26879 else
26880 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE);
26881 case ISD::VECREDUCE_AND: {
26882 Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg);
26883 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE);
26884 }
26885 case ISD::VECREDUCE_XOR: {
26886 SDValue ID =
26887 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
26888 if (OpVT == MVT::nxv1i1) {
26889 // Emulate a CNTP on .Q using .D and a different governing predicate.
26890 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
26891 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
26892 }
26893 SDValue Cntp =
26894 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
26895 return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT);
26896 }
26897 }
26898
26899 return SDValue();
26900}
26901
26902SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
26903 SDValue ScalarOp,
26904 SelectionDAG &DAG) const {
26905 SDLoc DL(ScalarOp);
26906 SDValue VecOp = ScalarOp.getOperand(i: 0);
26907 EVT SrcVT = VecOp.getValueType();
26908
26909 if (useSVEForFixedLengthVectorVT(
26910 VT: SrcVT,
26911 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
26912 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
26913 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
26914 }
26915
26916 // UADDV always returns an i64 result.
26917 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
26918 SrcVT.getVectorElementType();
26919 EVT RdxVT = SrcVT;
26920 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
26921 RdxVT = getPackedSVEVectorVT(VT: ResVT);
26922
26923 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
26924 SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp);
26925 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
26926 Rdx, DAG.getConstant(0, DL, MVT::i64));
26927
26928 // The VEC_REDUCE nodes expect an element size result.
26929 if (ResVT != ScalarOp.getValueType())
26930 Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType());
26931
26932 return Res;
26933}
26934
26935SDValue
26936AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
26937 SelectionDAG &DAG) const {
26938 EVT VT = Op.getValueType();
26939 SDLoc DL(Op);
26940
26941 EVT InVT = Op.getOperand(i: 1).getValueType();
26942 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26943 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1));
26944 SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2));
26945
  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
26948 EVT MaskVT = Op.getOperand(i: 0).getValueType();
26949 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT);
26950 auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0));
26951 Mask = DAG.getNode(ISD::TRUNCATE, DL,
26952 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
26953
26954 auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT,
26955 N1: Mask, N2: Op1, N3: Op2);
26956
26957 return convertFromScalableVector(DAG, VT, V: ScalableRes);
26958}
26959
26960SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
26961 SDValue Op, SelectionDAG &DAG) const {
26962 SDLoc DL(Op);
26963 EVT InVT = Op.getOperand(i: 0).getValueType();
26964 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
26965
26966 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
26967 "Only expected to lower fixed length vector operation!");
26968 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
26969 "Expected integer result of the same bit length as the inputs!");
26970
26971 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
26972 auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1));
26973 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
26974
26975 EVT CmpVT = Pg.getValueType();
26976 auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT,
26977 Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)});
26978
26979 EVT PromoteVT = ContainerVT.changeTypeToInteger();
26980 auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT);
26981 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote);
26982}
26983
26984SDValue
26985AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
26986 SelectionDAG &DAG) const {
26987 SDLoc DL(Op);
26988 auto SrcOp = Op.getOperand(i: 0);
26989 EVT VT = Op.getValueType();
26990 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
26991 EVT ContainerSrcVT =
26992 getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType());
26993
26994 SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp);
26995 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp);
26996 return convertFromScalableVector(DAG, VT, V: Op);
26997}
26998
26999SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27000 SDValue Op, SelectionDAG &DAG) const {
27001 SDLoc DL(Op);
27002 unsigned NumOperands = Op->getNumOperands();
27003
27004 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27005 "Unexpected number of operands in CONCAT_VECTORS");
27006
27007 auto SrcOp1 = Op.getOperand(i: 0);
27008 auto SrcOp2 = Op.getOperand(i: 1);
27009 EVT VT = Op.getValueType();
27010 EVT SrcVT = SrcOp1.getValueType();
27011
27012 if (NumOperands > 2) {
27013 SmallVector<SDValue, 4> Ops;
27014 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
27015 for (unsigned I = 0; I < NumOperands; I += 2)
27016 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT,
27017 N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1)));
27018
27019 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops);
27020 }
27021
27022 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27023
27024 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27025 SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1);
27026 SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2);
27027
27028 Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2);
27029
27030 return convertFromScalableVector(DAG, VT, V: Op);
27031}
27032
27033SDValue
27034AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27035 SelectionDAG &DAG) const {
27036 EVT VT = Op.getValueType();
27037 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27038
27039 SDLoc DL(Op);
27040 SDValue Val = Op.getOperand(i: 0);
27041 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27042 EVT SrcVT = Val.getValueType();
27043 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27044 EVT ExtendVT = ContainerVT.changeVectorElementType(
27045 EltVT: SrcVT.getVectorElementType());
27046
27047 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
27048 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27049
27050 Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val);
27051 Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG);
27052 Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
27053 N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT));
27054
27055 return convertFromScalableVector(DAG, VT, V: Val);
27056}
27057
27058SDValue
27059AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27060 SelectionDAG &DAG) const {
27061 EVT VT = Op.getValueType();
27062 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27063
27064 SDLoc DL(Op);
27065 SDValue Val = Op.getOperand(i: 0);
27066 EVT SrcVT = Val.getValueType();
27067 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27068 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27069 EltVT: VT.getVectorElementType());
27070 SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT);
27071
27072 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27073 Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val,
27074 N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT));
27075 Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG);
27076 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
27077
27078 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27079 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27080}
27081
27082SDValue
27083AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27084 SelectionDAG &DAG) const {
27085 EVT VT = Op.getValueType();
27086 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27087
27088 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27089 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27090 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27091
27092 SDLoc DL(Op);
27093 SDValue Val = Op.getOperand(i: 0);
27094 EVT SrcVT = Val.getValueType();
27095 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27096 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27097
27098 if (VT.bitsGE(VT: SrcVT)) {
27099 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27100
27101 Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27102 VT: VT.changeTypeToInteger(), Operand: Val);
27103
27104 // Safe to use a larger than specified operand because by promoting the
27105 // value nothing has changed from an arithmetic point of view.
27106 Val =
27107 convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val);
27108 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
27109 N3: DAG.getUNDEF(VT: ContainerDstVT));
27110 return convertFromScalableVector(DAG, VT, V: Val);
27111 } else {
27112 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27113 EltVT: ContainerDstVT.getVectorElementType());
27114 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27115
27116 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27117 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
27118 Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG);
27119 Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val);
27120
27121 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
27122 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
27123 }
27124}
27125
27126SDValue
27127AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27128 SelectionDAG &DAG) const {
27129 SDLoc DL(Op);
27130 EVT OpVT = Op.getValueType();
27131 assert(OpVT.isScalableVector() &&
27132 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27133 SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27134 N2: Op.getOperand(i: 1));
27135 SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27136 N2: Op.getOperand(i: 1));
27137 return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL);
27138}
27139
27140SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27141 SelectionDAG &DAG) const {
27142 SDLoc DL(Op);
27143 EVT OpVT = Op.getValueType();
27144 assert(OpVT.isScalableVector() &&
27145 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27146
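  // e.g. ZIP1(<A0,A1,A2,A3>, <B0,B1,B2,B3>) = <A0,B0,A1,B1> and
  // ZIP2(<A0,A1,A2,A3>, <B0,B1,B2,B3>) = <A2,B2,A3,B3>.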
27147 SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27148 N2: Op.getOperand(i: 1));
27149 SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
27150 N2: Op.getOperand(i: 1));
27151 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL);
27152}
27153
27154SDValue
27155AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27156 SelectionDAG &DAG) const {
27157 EVT VT = Op.getValueType();
27158 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27159
27160 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27161 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27162 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27163
27164 SDLoc DL(Op);
27165 SDValue Val = Op.getOperand(i: 0);
27166 EVT SrcVT = Val.getValueType();
27167 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27168 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
27169
27170 if (VT.bitsGT(VT: SrcVT)) {
27171 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27172 EltVT: ContainerSrcVT.getVectorElementType());
27173 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27174
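    // Reinterpret the FP input as integers of the source width, any-extend
    // each element into a destination-sized lane, then view the widened value
    // as an unpacked FP vector so the conversion produces the wide integer
    // result directly.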
27175 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
27176 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val);
27177
27178 Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val);
27179 Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG);
27180 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
27181 N3: DAG.getUNDEF(VT: ContainerDstVT));
27182 return convertFromScalableVector(DAG, VT, V: Val);
27183 } else {
27184 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27185 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
27186
27187 // Safe to use a larger than specified result since an fp_to_int where the
27188 // result doesn't fit into the destination is undefined.
27189 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
27190 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
27191 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
27192
27193 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val);
27194 }
27195}
27196
27197static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27198 ArrayRef<int> ShuffleMask, EVT VT,
27199 EVT ContainerVT, SelectionDAG &DAG) {
27200 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27201 SDLoc DL(Op);
27202 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27203 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27204 bool IsSingleOp =
27205 ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size());
27206
27207 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27208 MinSVESize = 128;
27209
27210 // Ignore the two-operand case if there is no SVE2 or if not all index
27211 // values can be represented.
27212 if (!IsSingleOp && !Subtarget.hasSVE2())
27213 return SDValue();
27214
27215 EVT VTOp1 = Op.getOperand(i: 0).getValueType();
27216 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27217 unsigned IndexLen = MinSVESize / BitsPerElt;
27218 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27219 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27220 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27221 EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen);
27222 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27223 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27224 "Incorrectly legalised shuffle operation");
27225
27226 SmallVector<SDValue, 8> TBLMask;
27227 // If MinSVESize is not equal to MaxSVESize then we need to know which
27228 // TBL mask element needs adjustment.
27229 SmallVector<SDValue, 8> AddRuntimeVLMask;
27230
27231 // Bail out for 8-bit element types, because with a 2048-bit SVE register
27232 // size, 8 bits are only sufficient to index into the first source vector.
27233 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27234 return SDValue();
27235
27236 for (int Index : ShuffleMask) {
27237 // Handle poison index values.
27238 if (Index < 0)
27239 Index = 0;
27240 // If the mask refers to elements in the second operand, then we have to
27241 // offset the index by the number of elements in a vector. If this number
27242 // is not known at compile-time, we need to maintain a mask with 'VL' values
27243 // to add at runtime.
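    // e.g. with v4i32 operands and a 256-bit minimum SVE size (IndexLen == 8),
    // mask index 5 refers to element 1 of the second operand: it becomes
    // 5 + (8 - 4) == 9 when the register size is known exactly, or 1 plus one
    // runtime VL otherwise.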
27244 if ((unsigned)Index >= ElementsPerVectorReg) {
27245 if (MinMaxEqual) {
27246 Index += IndexLen - ElementsPerVectorReg;
27247 } else {
27248 Index = Index - ElementsPerVectorReg;
27249 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27250 }
27251 } else if (!MinMaxEqual)
27252 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27253 // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
27254 // 255, this might point to the last element in the second operand
27255 // of the shufflevector, so reject this transform.
27256 if ((unsigned)Index >= MaxOffset)
27257 return SDValue();
27258 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27259 }
27260
27261 // Choosing an out-of-range index leads to the lane being zeroed, whereas an
27262 // index of zero would instead duplicate the first lane into the padding
27263 // elements. For i8 elements an out-of-range index could still be valid
27264 // for a 2048-bit vector register size.
27265 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27266 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27267 if (!MinMaxEqual)
27268 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27269 }
27270
27271 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType);
27272 SDValue VecMask =
27273 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
27274 SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask);
27275
27276 SDValue Shuffle;
27277 if (IsSingleOp)
27278 Shuffle =
27279 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27280 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27281 Op1, SVEMask);
27282 else if (Subtarget.hasSVE2()) {
27283 if (!MinMaxEqual) {
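      // TBL indices that refer to the second operand were recorded relative to
      // the start of that operand; add the runtime element count
      // (vscale * MinNumElts) to exactly those lanes, selected by the 0/1
      // entries of AddRuntimeVLMask.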
27284 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27285 SDValue VScale = (BitsPerElt == 64)
27286 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27287 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27288 SDValue VecMask =
27289 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
27290 SDValue MulByMask = DAG.getNode(
27291 Opcode: ISD::MUL, DL, VT: MaskType,
27292 N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale),
27293 N2: DAG.getBuildVector(VT: MaskType, DL,
27294 Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27295 SDValue UpdatedVecMask =
27296 DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask);
27297 SVEMask = convertToScalableVector(
27298 DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask);
27299 }
27300 Shuffle =
27301 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27302 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27303 Op1, Op2, SVEMask);
27304 }
27305 Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle);
27306 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
27307}
27308
27309SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27310 SDValue Op, SelectionDAG &DAG) const {
27311 EVT VT = Op.getValueType();
27312 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27313
27314 auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
27315 auto ShuffleMask = SVN->getMask();
27316
27317 SDLoc DL(Op);
27318 SDValue Op1 = Op.getOperand(i: 0);
27319 SDValue Op2 = Op.getOperand(i: 1);
27320
27321 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27322 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1);
27323 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2);
27324
27325 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27326 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27327 return MVT::i32;
27328 return ScalarTy;
27329 };
27330
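  // Lower splat shuffles by extracting the splatted lane as a scalar and
  // re-broadcasting it with SPLAT_VECTOR.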
27331 if (SVN->isSplat()) {
27332 unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex());
27333 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27334 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27335 DAG.getConstant(Lane, DL, MVT::i64));
27336 Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl);
27337 return convertFromScalableVector(DAG, VT, V: Op);
27338 }
27339
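  // An EXT-style shuffle whose split point is the last element produces the
  // final element of one operand followed by the leading elements of the
  // other; INSR implements this by shifting the second operand up one lane
  // and inserting the extracted scalar at lane 0.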
27340 bool ReverseEXT = false;
27341 unsigned Imm;
27342 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) &&
27343 Imm == VT.getVectorNumElements() - 1) {
27344 if (ReverseEXT)
27345 std::swap(a&: Op1, b&: Op2);
27346 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27347 SDValue Scalar = DAG.getNode(
27348 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27349 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27350 Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar);
27351 return convertFromScalableVector(DAG, VT, V: Op);
27352 }
27353
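  // A shuffle that reverses the order of elements within each LaneSize-bit
  // block maps onto REVB/REVH/REVW: bitcast to a vector of LaneSize-bit
  // elements and reverse the sub-elements inside each one.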
27354 for (unsigned LaneSize : {64U, 32U, 16U}) {
27355 if (isREVMask(M: ShuffleMask, VT, BlockSize: LaneSize)) {
27356 EVT NewVT =
27357 getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LaneSize));
27358 unsigned RevOp;
27359 unsigned EltSz = VT.getScalarSizeInBits();
27360 if (EltSz == 8)
27361 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27362 else if (EltSz == 16)
27363 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27364 else
27365 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27366
27367 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
27368 Op = LowerToPredicatedOp(Op, DAG, NewOp: RevOp);
27369 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
27370 return convertFromScalableVector(DAG, VT, V: Op);
27371 }
27372 }
27373
27374 if (Subtarget->hasSVE2p1() && VT.getScalarSizeInBits() == 64 &&
27375 isREVMask(M: ShuffleMask, VT, BlockSize: 128)) {
27376 if (!VT.isFloatingPoint())
27377 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
27378
27379 EVT NewVT = getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: 64));
27380 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1);
27381 Op = LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU);
27382 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op);
27383 return convertFromScalableVector(DAG, VT, V: Op);
27384 }
27385
27386 unsigned WhichResult;
27387 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult) && WhichResult == 0)
27388 return convertFromScalableVector(
27389 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27390
27391 if (isTRNMask(M: ShuffleMask, VT, WhichResult)) {
27392 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27393 return convertFromScalableVector(
27394 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27395 }
27396
27397 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0)
27398 return convertFromScalableVector(
27399 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27400
27401 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
27402 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27403 return convertFromScalableVector(
27404 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27405 }
27406
27407 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
27408 // represents the same logical operation as performed by a ZIP instruction. In
27409 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27410 // equivalent to an AArch64 instruction. There's the extra component of
27411 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27412 // only operated on 64/128bit vector types that have a direct mapping to a
27413 // target register and so an exact mapping is implied.
27414 // However, when using SVE for fixed length vectors, most legal vector types
27415 // are actually sub-vectors of a larger SVE register. When mapping
27416 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27417 // how the mask's indices translate. Specifically, when the mapping requires
27418 // an exact meaning for a specific vector index (e.g. Index X is the last
27419 // vector element in the register) then such mappings are often only safe when
27420 // the exact SVE register size is known. The main exception to this is when
27421 // indices are logically relative to the first element of either
27422 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27423 // when converting from fixed-length to scalable vector types (i.e. the start
27424 // of a fixed length vector is always the start of a scalable vector).
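  // For example, a mask selecting the last element of a fixed-length vector
  // only maps to the last element of the underlying SVE register when the
  // register is exactly VT.getSizeInBits() wide; with a larger register that
  // element sits somewhere in the middle of the register.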
27425 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27426 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27427 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27428 if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) &&
27429 Op2.isUndef()) {
27430 Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1);
27431 return convertFromScalableVector(DAG, VT, V: Op);
27432 }
27433
27434 if (isZIPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult) && WhichResult != 0)
27435 return convertFromScalableVector(
27436 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27437
27438 if (isUZPMask(M: ShuffleMask, VT, WhichResultOut&: WhichResult)) {
27439 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27440 return convertFromScalableVector(
27441 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
27442 }
27443
27444 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0)
27445 return convertFromScalableVector(
27446 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27447
27448 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
27449 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27450 return convertFromScalableVector(
27451 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
27452 }
27453 }
27454
27455 // Avoid producing a TBL instruction if we don't know the minimum SVE register
27456 // size, unless NEON is not available and we can assume the minimum SVE
27457 // register size is 128 bits.
27458 if (MinSVESize || !Subtarget->isNeonAvailable())
27459 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27460 DAG);
27461
27462 return SDValue();
27463}
27464
27465SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27466 SelectionDAG &DAG) const {
27467 SDLoc DL(Op);
27468 EVT InVT = Op.getValueType();
27469
27470 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27471 InVT.isScalableVector() && isTypeLegal(InVT) &&
27472 "Only expect to cast between legal scalable vector types!");
27473 assert(VT.getVectorElementType() != MVT::i1 &&
27474 InVT.getVectorElementType() != MVT::i1 &&
27475 "For predicate bitcasts, use getSVEPredicateBitCast");
27476
27477 if (InVT == VT)
27478 return Op;
27479
27480 EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType());
27481 EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
27482
27483 // Safe bitcasting between unpacked vector types of different element counts
27484 // is currently unsupported because the following is missing the necessary
27485 // work to ensure the result's elements live where they're supposed to within
27486 // an SVE register.
27487 // 01234567
27488 // e.g. nxv2i32 = XX??XX??
27489 // nxv4f16 = X?X?X?X?
27490 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
27491 VT == PackedVT || InVT == PackedInVT) &&
27492 "Unexpected bitcast!");
27493
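  // For example, an nxv2f32 -> nxv2i64 cast packs the input first:
  //   nxv2f32 --REINTERPRET_CAST--> nxv4f32 --BITCAST--> nxv2i64
  // with no unpacking step because nxv2i64 is already packed.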
27494 // Pack input if required.
27495 if (InVT != PackedInVT)
27496 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op);
27497
27498 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
27499
27500 // Unpack result if required.
27501 if (VT != PackedVT)
27502 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
27503
27504 return Op;
27505}
27506
27507bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
27508 SDValue N) const {
27509 return ::isAllActivePredicate(DAG, N);
27510}
27511
27512EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
27513 return ::getPromotedVTForPredicate(VT);
27514}
27515
27516bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27517 SDValue Op, const APInt &OriginalDemandedBits,
27518 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27519 unsigned Depth) const {
27520
27521 unsigned Opc = Op.getOpcode();
27522 switch (Opc) {
27523 case AArch64ISD::VSHL: {
27524 // Match (VSHL (VLSHR Val X) X)
27525 SDValue ShiftL = Op;
27526 SDValue ShiftR = Op->getOperand(Num: 0);
27527 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27528 return false;
27529
27530 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27531 return false;
27532
27533 unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1);
27534 unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1);
27535
27536 // Other cases can be handled as well, but this is not
27537 // implemented.
27538 if (ShiftRBits != ShiftLBits)
27539 return false;
27540
27541 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27542 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27543
27544 APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits);
27545 APInt UnusedBits = ~OriginalDemandedBits;
27546
27547 if ((ZeroBits & UnusedBits) != ZeroBits)
27548 return false;
27549
27550 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27551 // used - simplify to just Val.
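    // For example, with 32-bit elements and X == 8 the shift pair clears the
    // low 8 bits; if none of those bits are demanded, Val already provides all
    // the demanded bits.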
27552 return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0));
27553 }
27554 case AArch64ISD::BICi: {
27555 // Fold BICi if all destination bits are already known to be zero.
27556 SDValue Op0 = Op.getOperand(i: 0);
27557 KnownBits KnownOp0 =
27558 TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1);
27559 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
27560 uint64_t BitsToClear = Op->getConstantOperandVal(Num: 1)
27561 << Op->getConstantOperandVal(Num: 2);
27562 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27563 if (APInt(Known.getBitWidth(), BitsToClear)
27564 .isSubsetOf(RHS: AlreadyZeroedBitsToClear))
27565 return TLO.CombineTo(O: Op, N: Op0);
27566
27567 Known = KnownOp0 &
27568 KnownBits::makeConstant(C: APInt(Known.getBitWidth(), ~BitsToClear));
27569
27570 return false;
27571 }
27572 case ISD::INTRINSIC_WO_CHAIN: {
27573 if (auto ElementSize = IsSVECntIntrinsic(S: Op)) {
27574 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27575 if (!MaxSVEVectorSizeInBits)
27576 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27577 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27578 // The SVE count intrinsics don't support the multiplier immediate so we
27579 // don't have to account for that here. The value returned may be slightly
27580 // over the true required bits, as this is based on the "ALL" pattern. The
27581 // other patterns are also exposed by these intrinsics, but they all
27582 // return a value that's strictly less than "ALL".
27583 unsigned RequiredBits = llvm::bit_width(Value: MaxElements);
27584 unsigned BitWidth = Known.Zero.getBitWidth();
27585 if (RequiredBits < BitWidth)
27586 Known.Zero.setHighBits(BitWidth - RequiredBits);
27587 return false;
27588 }
27589 }
27590 }
27591
27592 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27593 Op, DemandedBits: OriginalDemandedBits, DemandedElts: OriginalDemandedElts, Known, TLO, Depth);
27594}
27595
27596bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27597 return Op.getOpcode() == AArch64ISD::DUP ||
27598 Op.getOpcode() == AArch64ISD::MOVI ||
27599 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27600 Op.getOperand(i: 0).getOpcode() == AArch64ISD::DUP) ||
27601 TargetLowering::isTargetCanonicalConstantNode(Op);
27602}
27603
27604bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27605 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27606 Subtarget->hasComplxNum();
27607}
27608
27609bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27610 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27611 auto *VTy = dyn_cast<VectorType>(Val: Ty);
27612 if (!VTy)
27613 return false;
27614
27615 // If the vector is scalable, SVE is enabled, implying support for complex
27616 // numbers. Otherwise, we need to ensure complex number support is available.
27617 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27618 return false;
27619
27620 auto *ScalarTy = VTy->getScalarType();
27621 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27622
27623 // We can only process vectors that have a bit size of 128 or higher (with an
27624 // additional 64 bits for Neon). Additionally, these vectors must have a
27625 // power-of-2 size, as we later split them into the smallest supported size
27626 // and merge them back together after applying the complex operation.
27627 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27628 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27629 !llvm::isPowerOf2_32(Value: VTyWidth))
27630 return false;
27631
27632 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27633 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27634 return 8 <= ScalarWidth && ScalarWidth <= 64;
27635 }
27636
27637 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27638 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27639}
27640
27641Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27642 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27643 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27644 Value *Accumulator) const {
27645 VectorType *Ty = cast<VectorType>(Val: InputA->getType());
27646 bool IsScalable = Ty->isScalableTy();
27647 bool IsInt = Ty->getElementType()->isIntegerTy();
27648
27649 unsigned TyWidth =
27650 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27651
27652 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27653 "Vector type must be either 64 or a power of 2 that is at least 128");
27654
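  // Vectors wider than 128 bits are split in half and handled recursively,
  // e.g. a 256-bit <8 x float> becomes two <4 x float> halves whose partial
  // results are re-inserted into a 256-bit result.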
27655 if (TyWidth > 128) {
27656 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27657 auto *HalfTy = VectorType::getHalfElementsVectorType(VTy: Ty);
27658 auto *LowerSplitA = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: 0));
27659 auto *LowerSplitB = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: 0));
27660 auto *UpperSplitA =
27661 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: Stride));
27662 auto *UpperSplitB =
27663 B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: Stride));
27664 Value *LowerSplitAcc = nullptr;
27665 Value *UpperSplitAcc = nullptr;
27666 if (Accumulator) {
27667 LowerSplitAcc = B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: 0));
27668 UpperSplitAcc =
27669 B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: Stride));
27670 }
27671 auto *LowerSplitInt = createComplexDeinterleavingIR(
27672 B, OperationType, Rotation, InputA: LowerSplitA, InputB: LowerSplitB, Accumulator: LowerSplitAcc);
27673 auto *UpperSplitInt = createComplexDeinterleavingIR(
27674 B, OperationType, Rotation, InputA: UpperSplitA, InputB: UpperSplitB, Accumulator: UpperSplitAcc);
27675
27676 auto *Result = B.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: LowerSplitInt,
27677 Idx: B.getInt64(C: 0));
27678 return B.CreateInsertVector(DstType: Ty, SrcVec: Result, SubVec: UpperSplitInt, Idx: B.getInt64(C: Stride));
27679 }
27680
27681 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27682 if (Accumulator == nullptr)
27683 Accumulator = Constant::getNullValue(Ty);
27684
27685 if (IsScalable) {
27686 if (IsInt)
27687 return B.CreateIntrinsic(
27688 Intrinsic::aarch64_sve_cmla_x, Ty,
27689 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27690
27691 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
27692 return B.CreateIntrinsic(
27693 Intrinsic::aarch64_sve_fcmla, Ty,
27694 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27695 }
27696
27697 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27698 Intrinsic::aarch64_neon_vcmla_rot90,
27699 Intrinsic::aarch64_neon_vcmla_rot180,
27700 Intrinsic::aarch64_neon_vcmla_rot270};
27701
27702
27703 return B.CreateIntrinsic(ID: IdMap[(int)Rotation], Types: Ty,
27704 Args: {Accumulator, InputA, InputB});
27705 }
27706
27707 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27708 if (IsScalable) {
27709 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27710 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27711 if (IsInt)
27712 return B.CreateIntrinsic(
27713 Intrinsic::aarch64_sve_cadd_x, Ty,
27714 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27715
27716 auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount());
27717 return B.CreateIntrinsic(
27718 Intrinsic::aarch64_sve_fcadd, Ty,
27719 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27720 }
27721 return nullptr;
27722 }
27723
27724 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27725 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27726 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27727 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27728 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27729
27730 if (IntId == Intrinsic::not_intrinsic)
27731 return nullptr;
27732
27733 return B.CreateIntrinsic(ID: IntId, Types: Ty, Args: {InputA, InputB});
27734 }
27735
27736 return nullptr;
27737}
27738
27739bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27740 unsigned Opc = N->getOpcode();
27741 if (ISD::isExtOpcode(Opcode: Opc)) {
27742 if (any_of(Range: N->uses(),
27743 P: [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27744 return false;
27745 }
27746 return true;
27747}
27748
27749unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27750 return Subtarget->getMinimumJumpTableEntries();
27751}
27752
27753MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27754 CallingConv::ID CC,
27755 EVT VT) const {
27756 bool NonUnitFixedLengthVector =
27757 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27758 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27759 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27760
27761 EVT VT1;
27762 MVT RegisterVT;
27763 unsigned NumIntermediates;
27764 getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1, NumIntermediates,
27765 RegisterVT);
27766 return RegisterVT;
27767}
27768
27769unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27770 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27771 bool NonUnitFixedLengthVector =
27772 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27773 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27774 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27775
27776 EVT VT1;
27777 MVT VT2;
27778 unsigned NumIntermediates;
27779 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1,
27780 NumIntermediates, RegisterVT&: VT2);
27781}
27782
27783unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27784 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27785 unsigned &NumIntermediates, MVT &RegisterVT) const {
27786 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27787 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27788 if (!RegisterVT.isFixedLengthVector() ||
27789 RegisterVT.getFixedSizeInBits() <= 128)
27790 return NumRegs;
27791
27792 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27793 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27794 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27795
27796 // A size mismatch here implies either type promotion or widening and would
27797 // have resulted in scalarisation if larger vectors had not been available.
27798 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27799 EVT EltTy = VT.getVectorElementType();
27800 EVT NewVT = EVT::getVectorVT(Context, VT: EltTy, EC: ElementCount::getFixed(MinVal: 1));
27801 if (!isTypeLegal(VT: NewVT))
27802 NewVT = EltTy;
27803
27804 IntermediateVT = NewVT;
27805 NumIntermediates = VT.getVectorNumElements();
27806 RegisterVT = getRegisterType(Context, VT: NewVT);
27807 return NumIntermediates;
27808 }
27809
27810 // SVE VLS support does not introduce a new ABI so we should use NEON sized
27811 // types for vector arguments and returns.
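  // For example, with 512-bit SVE registers a legal v8i64 (RegisterVT v8i64)
  // is instead broken down into four v2i64 parts, matching the NEON-based ABI.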
27812
27813 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
27814 NumIntermediates *= NumSubRegs;
27815 NumRegs *= NumSubRegs;
27816
27817 switch (RegisterVT.getVectorElementType().SimpleTy) {
27818 default:
27819 llvm_unreachable("unexpected element type for vector");
27820 case MVT::i8:
27821 IntermediateVT = RegisterVT = MVT::v16i8;
27822 break;
27823 case MVT::i16:
27824 IntermediateVT = RegisterVT = MVT::v8i16;
27825 break;
27826 case MVT::i32:
27827 IntermediateVT = RegisterVT = MVT::v4i32;
27828 break;
27829 case MVT::i64:
27830 IntermediateVT = RegisterVT = MVT::v2i64;
27831 break;
27832 case MVT::f16:
27833 IntermediateVT = RegisterVT = MVT::v8f16;
27834 break;
27835 case MVT::f32:
27836 IntermediateVT = RegisterVT = MVT::v4f32;
27837 break;
27838 case MVT::f64:
27839 IntermediateVT = RegisterVT = MVT::v2f64;
27840 break;
27841 case MVT::bf16:
27842 IntermediateVT = RegisterVT = MVT::v8bf16;
27843 break;
27844 }
27845
27846 return NumRegs;
27847}
27848
27849bool AArch64TargetLowering::hasInlineStackProbe(
27850 const MachineFunction &MF) const {
27851 return !Subtarget->isTargetWindows() &&
27852 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
27853}
27854
27855#ifndef NDEBUG
27856void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
27857 switch (N->getOpcode()) {
27858 default:
27859 break;
27860 case AArch64ISD::SUNPKLO:
27861 case AArch64ISD::SUNPKHI:
27862 case AArch64ISD::UUNPKLO:
27863 case AArch64ISD::UUNPKHI: {
27864 assert(N->getNumValues() == 1 && "Expected one result!");
27865 assert(N->getNumOperands() == 1 && "Expected one operand!");
27866 EVT VT = N->getValueType(ResNo: 0);
27867 EVT OpVT = N->getOperand(Num: 0).getValueType();
27868 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
27869 VT.isInteger() && "Expected integer vectors!");
27870 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
27871 "Expected vectors of equal size!");
27872 // TODO: Enable assert once bogus creations have been fixed.
27873 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
27874 // "Expected result vector with half the lanes of its input!");
27875 break;
27876 }
27877 case AArch64ISD::TRN1:
27878 case AArch64ISD::TRN2:
27879 case AArch64ISD::UZP1:
27880 case AArch64ISD::UZP2:
27881 case AArch64ISD::ZIP1:
27882 case AArch64ISD::ZIP2: {
27883 assert(N->getNumValues() == 1 && "Expected one result!");
27884 assert(N->getNumOperands() == 2 && "Expected two operands!");
27885 EVT VT = N->getValueType(ResNo: 0);
27886 EVT Op0VT = N->getOperand(Num: 0).getValueType();
27887 EVT Op1VT = N->getOperand(Num: 1).getValueType();
27888 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
27889 "Expected vectors!");
27890 // TODO: Enable assert once bogus creations have been fixed.
27891 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
27892 break;
27893 }
27894 }
27895}
27896#endif
27897
