//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that ARM uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "ARMISelLowering.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
#include "ARMRegisterInfo.h"
#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetTransformInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "Utils/ARMBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "arm-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
          "Number of constants with their storage promoted into constant pools");

static cl::opt<bool>
ARMInterworking("arm-interworking", cl::Hidden,
                cl::desc("Enable / disable ARM interworking (for debugging only)"),
                cl::init(true));

static cl::opt<bool> EnableConstpoolPromotion(
    "arm-promote-constant", cl::Hidden,
    cl::desc("Enable / disable promotion of unnamed_addr constants into "
             "constant pools"),
    cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
    "arm-promote-constant-max-size", cl::Hidden,
    cl::desc("Maximum size of constant to promote into a constant pool"),
    cl::init(64));
static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
    "arm-promote-constant-max-total", cl::Hidden,
    cl::desc("Maximum size of ALL constants to promote into a constant pool"),
    cl::init(128));

cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
    cl::desc("Maximum interleave factor for MVE VLDn to generate."),
    cl::init(2));

// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
  ARM::R0, ARM::R1, ARM::R2, ARM::R3
};

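// Set up the operation actions that are common to every NEON vector type,
// promoting loads and stores of VT to PromotedLdStVT when the two types
// differ.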
void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
  if (VT != PromotedLdStVT) {
    setOperationAction(ISD::LOAD, VT, Promote);
    AddPromotedToType(ISD::LOAD, VT, PromotedLdStVT);

    setOperationAction(ISD::STORE, VT, Promote);
    AddPromotedToType(ISD::STORE, VT, PromotedLdStVT);
  }

  MVT ElemTy = VT.getVectorElementType();
  if (ElemTy != MVT::f64)
    setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  if (ElemTy == MVT::i32) {
    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  } else {
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
  }
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
  if (VT.isInteger()) {
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
  }

  // Neon does not support vector divide/remainder operations.
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::FDIV, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);
  setOperationAction(ISD::SDIVREM, VT, Expand);
  setOperationAction(ISD::UDIVREM, VT, Expand);

  if (!VT.isFloatingPoint() &&
      VT != MVT::v2i64 && VT != MVT::v1i64)
    for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);
  if (!VT.isFloatingPoint())
    for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
      setOperationAction(Opcode, VT, Legal);
}

void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPRRegClass);
  addTypeForNEON(VT, MVT::f64);
}

void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &ARM::DPairRegClass);
  addTypeForNEON(VT, MVT::v2f64);
}

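// Mark every operation on VT as Expand, then re-mark the few operations that
// remain available on any register-sized type (bitcast, load, store, undef)
// as Legal.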
void ARMTargetLowering::setAllExpand(MVT VT) {
  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
    setOperationAction(Opc, VT, Expand);

  // We support these really simple operations even on types where all
  // the actual arithmetic has to be broken down into simpler
  // operations or turned into library calls.
  setOperationAction(ISD::BITCAST, VT, Legal);
  setOperationAction(ISD::LOAD, VT, Legal);
  setOperationAction(ISD::STORE, VT, Legal);
  setOperationAction(ISD::UNDEF, VT, Legal);
}

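// Apply Action to each of the extending-load flavours (any-, zero- and
// sign-extend) that load a To value from memory and produce a From value.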
void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
                                       LegalizeAction Action) {
  setLoadExtAction(ISD::EXTLOAD, From, To, Action);
  setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
  setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
}

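// Register the 128-bit MVE vector types and configure their operation
// legality. HasMVEFP selects between the integer-only MVE configuration and
// MVE with floating-point support.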
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
  const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };

  for (auto VT : IntTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::BITREVERSE, VT, Legal);
    setOperationAction(ISD::BSWAP, VT, Legal);
    setOperationAction(ISD::SADDSAT, VT, Legal);
    setOperationAction(ISD::UADDSAT, VT, Legal);
    setOperationAction(ISD::SSUBSAT, VT, Legal);
    setOperationAction(ISD::USUBSAT, VT, Legal);
    setOperationAction(ISD::ABDS, VT, Legal);
    setOperationAction(ISD::ABDU, VT, Legal);
    setOperationAction(ISD::AVGFLOORS, VT, Legal);
    setOperationAction(ISD::AVGFLOORU, VT, Legal);
    setOperationAction(ISD::AVGCEILS, VT, Legal);
    setOperationAction(ISD::AVGCEILU, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Vector reductions
    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
    setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    } else {
      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
    }

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
  for (auto VT : FloatTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    if (!HasMVEFP)
      setAllExpand(VT);

    // These are legal or custom whether we have MVE.fp or not
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(),
                       Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }

    if (HasMVEFP) {
      setOperationAction(ISD::FMINNUM, VT, Legal);
      setOperationAction(ISD::FMAXNUM, VT, Legal);
      setOperationAction(ISD::FROUND, VT, Legal);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);

      // No native support for these.
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FEXP10, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
    }
  }

  // Custom Expand smaller than legal vector reductions to prevent false zero
  // items being added.
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);

  // We 'support' these types up to bitcast/load/store level, regardless of
  // MVE integer-only / float support. The integer-only configuration only
  // inhibits FP data processing on the FP vector types.
  const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
  for (auto VT : LongTypes) {
    addRegisterClass(VT, &ARM::MQPRRegClass);
    setAllExpand(VT);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Legal);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);

  // We can do bitwise operations on v2i64 vectors
  setOperationAction(ISD::AND, MVT::v2i64, Legal);
  setOperationAction(ISD::OR, MVT::v2i64, Legal);
  setOperationAction(ISD::XOR, MVT::v2i64, Legal);

  // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
  addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
  addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);

  // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal);

  // Some truncating stores are legal too.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);

  // Pre and Post inc on these are legal, given the correct extends
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
      setIndexedMaskedLoadAction(im, VT, Legal);
      setIndexedMaskedStoreAction(im, VT, Legal);
    }
  }

  // Predicate types
  const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
  for (auto VT : pTypes) {
    addRegisterClass(VT, &ARM::VCCRRegClass);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);

    if (!HasMVEFP) {
      setOperationAction(ISD::SINT_TO_FP, VT, Expand);
      setOperationAction(ISD::UINT_TO_FP, VT, Expand);
      setOperationAction(ISD::FP_TO_SINT, VT, Expand);
      setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    }
  }
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand);
  setOperationAction(ISD::AND, MVT::v2i1, Expand);
  setOperationAction(ISD::OR, MVT::v2i1, Expand);
  setOperationAction(ISD::XOR, MVT::v2i1, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
}

ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                     const ARMSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  RegInfo = Subtarget->getRegisterInfo();
  Itins = Subtarget->getInstrItineraryData();

  setBooleanContents(ZeroOrOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
      !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
    bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
    for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
      setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                            IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                       : CallingConv::ARM_AAPCS);
  }

  if (Subtarget->isTargetMachO()) {
    // Uses VFP for Thumb libfuncs if available.
    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
        const ISD::CondCode Cond;
      } LibraryCalls[] = {
        // Single-precision floating-point arithmetic.
        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },

        // Double-precision floating-point arithmetic.
        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },

        // Single-precision comparisons.
        { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
        { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
        { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
        { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
        { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
        { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
        { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },

        // Double-precision comparisons.
        { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
        { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
        { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
        { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
        { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
        { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
        { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },

        // Floating-point to integer conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },

        // Conversions between floating types.
        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
        { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },

        // Integer to floating-point conversions.
        // i64 conversions are done via library routines even when generating VFP
        // instructions, so use the same ones.
        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
        // e.g., __floatunsidf vs. __floatunssidfvfp.
        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
      };

      for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
  setLibcallName(RTLIB::MUL_I128, nullptr);
  setLibcallName(RTLIB::MULO_I64, nullptr);
  setLibcallName(RTLIB::MULO_I128, nullptr);

  // RTLIB
  if (Subtarget->isAAPCS_ABI() &&
      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
       Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
      const ISD::CondCode Cond;
    } LibraryCalls[] = {
      // Double-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 2
      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Double-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 3
      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Single-precision floating-point arithmetic helper functions
      // RTABI chapter 4.1.2, Table 4
      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Single-precision floating-point comparison helper functions
      // RTABI chapter 4.1.2, Table 5
      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
      { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },

      // Floating-point to integer conversions.
      // RTABI chapter 4.1.2, Table 6
      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Conversions between floating types.
      // RTABI chapter 4.1.2, Table 7
      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer to floating-point conversions.
      // RTABI chapter 4.1.2, Table 8
      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Long long helper functions
      // RTABI chapter 4.2, Table 9
      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },

      // Integer division functions
      // RTABI chapter 4.3.1
      { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
      if (LC.Cond != ISD::SETCC_INVALID)
        setCmpLibcallCC(LC.Op, LC.Cond);
    }

    // EABI dependent RTLIB
    if (TM.Options.EABIVersion == EABI::EABI4 ||
        TM.Options.EABIVersion == EABI::EABI5) {
      static const struct {
        const RTLIB::Libcall Op;
        const char *const Name;
        const CallingConv::ID CC;
        const ISD::CondCode Cond;
      } MemOpsLibraryCalls[] = {
        // Memory operations
        // RTABI chapter 4.3.4
        { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };

      for (const auto &LC : MemOpsLibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
    }
  }

  if (Subtarget->isTargetWindows()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  // Use divmod compiler-rt calls for iOS 5.0 and later.
  if (Subtarget->isTargetMachO() &&
      !(Subtarget->isTargetIOS() &&
        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
  }

  // The half <-> float conversion functions are always soft-float on
  // non-watchos platforms, but are needed for some targets which use a
  // hard-float calling convention by default.
  if (!Subtarget->isTargetWatchABI()) {
    if (Subtarget->isAAPCS_ABI()) {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
    } else {
      setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
      setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
    }
  }

  // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
  // a __gnu_ prefix (which is the default).
  if (Subtarget->isTargetAEABI()) {
    static const struct {
      const RTLIB::Libcall Op;
      const char * const Name;
      const CallingConv::ID CC;
    } LibraryCalls[] = {
      { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
      { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
    };

    for (const auto &LC : LibraryCalls) {
      setLibcallName(LC.Op, LC.Name);
      setLibcallCallingConv(LC.Op, LC.CC);
    }
  }

  if (Subtarget->isThumb1Only())
    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
  else
    addRegisterClass(MVT::i32, &ARM::GPRRegClass);

  if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
      Subtarget->hasFPRegs()) {
    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
    addRegisterClass(MVT::f64, &ARM::DPRRegClass);

    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);

    if (!Subtarget->hasVFP2Base())
      setAllExpand(MVT::f32);
    if (!Subtarget->hasFP64())
      setAllExpand(MVT::f64);
  }

  if (Subtarget->hasFullFP16()) {
    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);

    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
  }

  if (Subtarget->hasBF16()) {
    addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
    setAllExpand(MVT::bf16);
    if (!Subtarget->hasFullFP16())
      setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
      setTruncStoreAction(VT, InnerVT, Expand);
      addAllExtLoads(VT, InnerVT, Expand);
    }

    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
  }

  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
  setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);

  if (Subtarget->hasMVEIntegerOps())
    addMVEVectorTypes(Subtarget->hasMVEFloatOps());

  // Combine low-overhead loop intrinsics so that we can lower i1 types.
  if (Subtarget->hasLOB()) {
    setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
  }

  if (Subtarget->hasNEON()) {
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);

    if (Subtarget->hasFullFP16()) {
      addQRTypeForNEON(MVT::v8f16);
      addDRTypeForNEON(MVT::v4f16);
    }

    if (Subtarget->hasBF16()) {
      addQRTypeForNEON(MVT::v8bf16);
      addDRTypeForNEON(MVT::v4bf16);
    }
  }

  if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
    // none of Neon, MVE or VFP supports any arithmetic operations on it.
    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
    // FIXME: Code duplication: FDIV and FREM are expanded always, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
    // FIXME: Create unittest.
    // In other words, find a way to make "copysign" appear in the DAG with
    // vector operands.
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
    // FIXME: Code duplication: SETCC has custom operation action, see
    // ARMTargetLowering::addTypeForNEON method for details.
    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
    // FIXME: Create unittest for FNEG and for FABS.
    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
    setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  }

  if (Subtarget->hasNEON()) {
    // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
    // natively supported for v4f32.
    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
    setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);

    // Mark v2f32 intrinsics.
    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
    setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);

    // Neon does not support some operations on v1i64 and v2i64 types.
    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
    // Custom handling for some quad-vector types to detect VMULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    // Custom handling for some vector types to avoid expensive expansions
    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
    // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
    // a destination type that is wider than the source, nor does it have a
    // FP_TO_[SU]INT instruction with a narrower destination than source.
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);

    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);

    // NEON does not have single instruction CTPOP for vectors with element
    // types wider than 8-bits. However, custom lowering can leverage the
    // v8i8/v16i8 vcnt instruction.
    setOperationAction(ISD::CTPOP, MVT::v2i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v1i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

    // NEON does not have single instruction CTTZ for vectors.
    setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);

    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);

    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
    }

    // NEON only has FMA instructions as of VFP4.
    if (!Subtarget->hasVFP4Base()) {
      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
    }

    setTargetDAGCombine({ISD::SHL, ISD::SRL, ISD::SRA, ISD::FP_TO_SINT,
                         ISD::FP_TO_UINT, ISD::FDIV, ISD::LOAD});

    // It is legal to extload from v4i8 to v4i16 or v4i32.
    for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                   MVT::v2i32}) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
        setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
        setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
      }
    }

    for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                    MVT::v4i32}) {
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
  }

  if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine(
        {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE, ISD::INSERT_SUBVECTOR,
         ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
         ISD::SIGN_EXTEND_INREG, ISD::STORE, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,
         ISD::ANY_EXTEND, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN,
         ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
  }
  if (Subtarget->hasMVEIntegerOps()) {
    setTargetDAGCombine({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
                         ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
                         ISD::SETCC});
  }
  if (Subtarget->hasMVEFloatOps()) {
    setTargetDAGCombine(ISD::FADD);
  }

  if (!Subtarget->hasFP64()) {
    // When targeting a floating-point unit with only single-precision
    // operations, f64 is legal for the few double-precision instructions
    // which are present. However, no double-precision operations other than
    // moves, loads and stores are provided by the hardware.
    setOperationAction(ISD::FADD, MVT::f64, Expand);
    setOperationAction(ISD::FSUB, MVT::f64, Expand);
    setOperationAction(ISD::FMUL, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FDIV, MVT::f64, Expand);
    setOperationAction(ISD::FREM, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FGETSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FNEG, MVT::f64, Expand);
    setOperationAction(ISD::FABS, MVT::f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FPOW, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FEXP10, MVT::f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
    setOperationAction(ISD::FRINT, MVT::f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  }

  if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    }
  }

  if (!Subtarget->hasFP16()) {
    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
  }

  computeRegisterProperties(Subtarget->getRegisterInfo());

  // ARM does not have floating-point extending loads.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
  }

  // ... or truncating stores
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  // ARM does not have i1 sign extending load.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // ARM supports all 4 flavors of integer indexed load / store.
  if (!Subtarget->isThumb1Only()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, MVT::i1, Legal);
      setIndexedLoadAction(im, MVT::i8, Legal);
      setIndexedLoadAction(im, MVT::i16, Legal);
      setIndexedLoadAction(im, MVT::i32, Legal);
      setIndexedStoreAction(im, MVT::i1, Legal);
      setIndexedStoreAction(im, MVT::i8, Legal);
      setIndexedStoreAction(im, MVT::i16, Legal);
      setIndexedStoreAction(im, MVT::i32, Legal);
    }
  } else {
    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
  }

  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);

  setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
  if (Subtarget->hasDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::SADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i8, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i8, Custom);
    setOperationAction(ISD::UADDSAT, MVT::i16, Custom);
    setOperationAction(ISD::USUBSAT, MVT::i16, Custom);
  }
  if (Subtarget->hasBaseDSP()) {
    setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
    setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
  }

  // i64 operation support.
  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  }
  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
      || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
    setOperationAction(ISD::MULHS, MVT::i32, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL, MVT::i64, Custom);
  setOperationAction(ISD::SRA, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);

  // MVE lowers 64 bit shifts to lsll and lsrl
  // assuming that ISD::SRL and SRA of i64 are already marked custom
  if (Subtarget->hasMVEIntegerOps())
    setOperationAction(ISD::SHL, MVT::i64, Custom);

  // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
  if (Subtarget->isThumb1Only()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
  }

  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // ARM does not have ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
  }

  // @llvm.readcyclecounter requires the Performance Monitors extension.
  // Default to the 0 expansion on unsupported platforms.
  // FIXME: Technically there are older ARM CPUs that have
  // implementation-specific ways of obtaining this information.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  // Only ARMv6 has BSWAP.
  if (!Subtarget->hasV6Ops())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                        : Subtarget->hasDivideInARMMode();
  if (!hasDivide) {
    // These are expanded into libcalls if the CPU doesn't have a hardware
    // divider.
1221 setOperationAction(ISD::SDIV, MVT::i32, LibCall);
1222 setOperationAction(ISD::UDIV, MVT::i32, LibCall);
1223 }
1224
1225 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1226 setOperationAction(ISD::SDIV, MVT::i32, Custom);
1227 setOperationAction(ISD::UDIV, MVT::i32, Custom);
1228
1229 setOperationAction(ISD::SDIV, MVT::i64, Custom);
1230 setOperationAction(ISD::UDIV, MVT::i64, Custom);
1231 }
1232
1233 setOperationAction(ISD::SREM, MVT::i32, Expand);
1234 setOperationAction(ISD::UREM, MVT::i32, Expand);
1235
1236 // Register based DivRem for AEABI (RTABI 4.2)
1237 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1238 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1239 Subtarget->isTargetWindows()) {
1240 setOperationAction(ISD::SREM, MVT::i64, Custom);
1241 setOperationAction(ISD::UREM, MVT::i64, Custom);
1242 HasStandaloneRem = false;
1243
1244 if (Subtarget->isTargetWindows()) {
1245 const struct {
1246 const RTLIB::Libcall Op;
1247 const char * const Name;
1248 const CallingConv::ID CC;
1249 } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1259 };
1260
1261 for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
1264 }
1265 } else {
1266 const struct {
1267 const RTLIB::Libcall Op;
1268 const char * const Name;
1269 const CallingConv::ID CC;
1270 } LibraryCalls[] = {
        { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
        { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },

        { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
        { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1280 };
1281
1282 for (const auto &LC : LibraryCalls) {
        setLibcallName(LC.Op, LC.Name);
        setLibcallCallingConv(LC.Op, LC.CC);
1285 }
1286 }
1287
1288 setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1289 setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1290 setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1291 setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1292 } else {
1293 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1294 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1295 }
1296
1297 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1298 // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
1301 }
1302
1303 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
1304 setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
1305 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1306 setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1307
1308 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1309 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1310
1311 // Use the default implementation.
1312 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1313 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1314 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1315 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1316 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1317 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1318
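  // Windows requires stack probing for large dynamic allocations, so
  // DYNAMIC_STACKALLOC is custom-lowered there to a __chkstk sequence (see
  // ARMISD::WIN__CHKSTK); all other targets use the default expansion.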
1319 if (Subtarget->isTargetWindows())
1320 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1321 else
1322 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1323
1324 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1325 // the default expansion.
1326 InsertFencesForAtomic = false;
1327 if (Subtarget->hasAnyDataBarrier() &&
1328 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1329 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1330 // to ldrex/strex loops already.
1331 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1332 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1333 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1334
1335 // On v8, we have particularly efficient implementations of atomic fences
1336 // if they can be combined with nearby atomic loads and stores.
1337 if (!Subtarget->hasAcquireRelease() ||
1338 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1339 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1340 InsertFencesForAtomic = true;
1341 }
1342 } else {
1343 // If there's anything we can use as a barrier, go through custom lowering
1344 // for ATOMIC_FENCE.
    // If the target has a DMB instruction in Thumb mode, fences can be inserted.
1346 if (Subtarget->hasDataBarrier())
1347 InsertFencesForAtomic = true;
1348
1349 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1350 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1351
    // Mark them all as LibCall, which will force libcall lowering.
1353 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1354 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1355 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1356 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1357 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1358 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1359 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1360 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1361 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1362 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1363 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1364 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1365 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1366 // Unordered/Monotonic case.
1367 if (!InsertFencesForAtomic) {
1368 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1369 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1370 }
1371 }
1372
1373 // Compute supported atomic widths.
1374 if (Subtarget->isTargetLinux() ||
1375 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1376 // For targets where __sync_* routines are reliably available, we use them
1377 // if necessary.
1378 //
1379 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1380 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1381 //
1382 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1383 // such targets should provide __sync_* routines, which use the ARM mode
1384 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1385 // encoding; see ARMISD::MEMBARRIER_MCR.)
1386 setMaxAtomicSizeInBitsSupported(64);
1387 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1388 Subtarget->hasForced32BitAtomics()) {
1389 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1390 setMaxAtomicSizeInBitsSupported(32);
1391 } else {
1392 // We can't assume anything about other targets; just use libatomic
1393 // routines.
1394 setMaxAtomicSizeInBitsSupported(0);
1395 }
1396
1397 setMaxDivRemBitWidthSupported(64);
1398
1399 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1400
1401 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1402 if (!Subtarget->hasV6Ops()) {
1403 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1404 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
1405 }
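  // There is no instruction that sign-extends directly from i1, so that case
  // always uses the generic expansion (typically a shift-left /
  // arithmetic-shift-right pair).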
1406 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1407
1408 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1409 !Subtarget->isThumb1Only()) {
    // Turn f64->i64 into VMOVRRD and i64->f64 into VMOVDRR,
    // iff the target supports VFP2.
1412 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1413 setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
1414 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1415 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1416 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1417 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1418 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1419 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1420 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1421 }
1422
1423 // We want to custom lower some of our intrinsics.
1424 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1425 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1426 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1427 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1428 if (Subtarget->useSjLjEH())
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1430
1431 setOperationAction(ISD::SETCC, MVT::i32, Expand);
1432 setOperationAction(ISD::SETCC, MVT::f32, Expand);
1433 setOperationAction(ISD::SETCC, MVT::f64, Expand);
1434 setOperationAction(ISD::SELECT, MVT::i32, Custom);
1435 setOperationAction(ISD::SELECT, MVT::f32, Custom);
1436 setOperationAction(ISD::SELECT, MVT::f64, Custom);
1437 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1438 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1439 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1440 if (Subtarget->hasFullFP16()) {
1441 setOperationAction(ISD::SETCC, MVT::f16, Expand);
1442 setOperationAction(ISD::SELECT, MVT::f16, Custom);
1443 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1444 }
1445
1446 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1447
1448 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1449 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1450 if (Subtarget->hasFullFP16())
1451 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1452 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1453 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1454 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1455
1456 // We don't support sin/cos/fmod/copysign/pow
1457 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1458 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1459 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1460 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1461 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1462 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1463 setOperationAction(ISD::FREM, MVT::f64, Expand);
1464 setOperationAction(ISD::FREM, MVT::f32, Expand);
1465 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1466 !Subtarget->isThumb1Only()) {
1467 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1468 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1469 }
1470 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1471 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1472
1473 if (!Subtarget->hasVFP4Base()) {
1474 setOperationAction(ISD::FMA, MVT::f64, Expand);
1475 setOperationAction(ISD::FMA, MVT::f32, Expand);
1476 }
1477
1478 // Various VFP goodness
1479 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1480 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1481 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1482 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1483 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1484 }
1485
1486 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1487 if (!Subtarget->hasFP16()) {
1488 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1489 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1490 }
1491
1492 // Strict floating-point comparisons need custom lowering.
1493 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1494 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1495 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
1496 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
1497 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
1498 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
1499 }
1500
1501 // Use __sincos_stret if available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1504 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1505 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1506 }
1507
1508 // FP-ARMv8 implements a lot of rounding-like FP operations.
1509 if (Subtarget->hasFPARMv8Base()) {
1510 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1511 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1512 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1513 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1514 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1515 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1516 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1517 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1518 if (Subtarget->hasNEON()) {
1519 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1520 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1521 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1522 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1523 }
1524
1525 if (Subtarget->hasFP64()) {
1526 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1527 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1528 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1529 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1530 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1531 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1532 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1533 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1534 }
1535 }
1536
  // FP16 operations often need to be promoted to f32 in order to call library
  // functions.
1538 if (Subtarget->hasFullFP16()) {
1539 setOperationAction(ISD::FREM, MVT::f16, Promote);
1540 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
1541 setOperationAction(ISD::FSIN, MVT::f16, Promote);
1542 setOperationAction(ISD::FCOS, MVT::f16, Promote);
1543 setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1544 setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1545 setOperationAction(ISD::FPOW, MVT::f16, Promote);
1546 setOperationAction(ISD::FEXP, MVT::f16, Promote);
1547 setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1548 setOperationAction(ISD::FEXP10, MVT::f16, Promote);
1549 setOperationAction(ISD::FLOG, MVT::f16, Promote);
1550 setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1551 setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1552
1553 setOperationAction(ISD::FROUND, MVT::f16, Legal);
1554 }
1555
1556 if (Subtarget->hasNEON()) {
1557 // vmin and vmax aren't available in a scalar form, so we can use
1558 // a NEON instruction with an undef lane instead. This has a performance
1559 // penalty on some cores, so we don't do this unless we have been
1560 // asked to by the core tuning model.
1561 if (Subtarget->useNEONForSinglePrecisionFP()) {
1562 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1563 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1564 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1565 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1566 }
1567 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1568 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1569 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1570 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1571
1572 if (Subtarget->hasFullFP16()) {
1573 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1574 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1575 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1576 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1577
1578 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1579 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1580 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1581 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1582 }
1583 }
1584
1585 // We have target-specific dag combine patterns for the following nodes:
1586 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1587 setTargetDAGCombine(
1588 {ISD::ADD, ISD::SUB, ISD::MUL, ISD::AND, ISD::OR, ISD::XOR});
1589
1590 if (Subtarget->hasMVEIntegerOps())
1591 setTargetDAGCombine(ISD::VSELECT);
1592
1593 if (Subtarget->hasV6Ops())
1594 setTargetDAGCombine(ISD::SRL);
1595 if (Subtarget->isThumb1Only())
1596 setTargetDAGCombine(ISD::SHL);
1597 // Attempt to lower smin/smax to ssat/usat
1598 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1599 Subtarget->isThumb2()) {
1600 setTargetDAGCombine({ISD::SMIN, ISD::SMAX});
1601 }
1602
1603 setStackPointerRegisterToSaveRestore(ARM::SP);
1604
1605 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1606 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1607 setSchedulingPreference(Sched::RegPressure);
1608 else
1609 setSchedulingPreference(Sched::Hybrid);
1610
1611 //// temporary - rewrite interface to use type
1612 MaxStoresPerMemset = 8;
1613 MaxStoresPerMemsetOptSize = 4;
1614 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1615 MaxStoresPerMemcpyOptSize = 2;
1616 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1617 MaxStoresPerMemmoveOptSize = 2;
1618
1619 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1620 // are at least 4 bytes aligned.
1621 setMinStackArgumentAlignment(Align(4));
1622
1623 // Prefer likely predicted branches to selects on out-of-order cores.
1624 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1625
1626 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1627 setPrefFunctionAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1628
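  // Thumb code only needs 2-byte alignment (Thumb instructions are 2 or 4
  // bytes), while ARM instructions are always 4 bytes.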
1629 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1630
1631 if (Subtarget->isThumb() || Subtarget->isThumb2())
1632 setTargetDAGCombine(ISD::ABS);
1633}
1634
1635bool ARMTargetLowering::useSoftFloat() const {
1636 return Subtarget->useSoftFloat();
1637}
1638
1639// FIXME: It might make sense to define the representative register class as the
1640// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1642// SPR's representative would be DPR_VFP2. This should work well if register
1643// pressure tracking were modified such that a register use would increment the
// pressure of the register class's representative and all of its super
1645// classes' representatives transitively. We have not implemented this because
1646// of the difficulty prior to coalescing of modeling operand register classes
1647// due to the common occurrence of cross class copies and subregister insertions
1648// and extractions.
1649std::pair<const TargetRegisterClass *, uint8_t>
1650ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1651 MVT VT) const {
1652 const TargetRegisterClass *RRC = nullptr;
1653 uint8_t Cost = 1;
1654 switch (VT.SimpleTy) {
1655 default:
1656 return TargetLowering::findRepresentativeClass(TRI, VT);
1657 // Use DPR as representative register class for all floating point
  // and vector types. Since there are 32 SPR registers and 32 DPR registers,
  // the cost is 1 for both f32 and f64.
1660 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1661 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1662 RRC = &ARM::DPRRegClass;
1663 // When NEON is used for SP, only half of the register file is available
1664 // because operations that define both SP and DP results will be constrained
1665 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1666 // coalescing by double-counting the SP regs. See the FIXME above.
1667 if (Subtarget->useNEONForSinglePrecisionFP())
1668 Cost = 2;
1669 break;
1670 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1671 case MVT::v4f32: case MVT::v2f64:
1672 RRC = &ARM::DPRRegClass;
1673 Cost = 2;
1674 break;
1675 case MVT::v4i64:
1676 RRC = &ARM::DPRRegClass;
1677 Cost = 4;
1678 break;
1679 case MVT::v8i64:
1680 RRC = &ARM::DPRRegClass;
1681 Cost = 8;
1682 break;
1683 }
  return std::make_pair(RRC, Cost);
1685}
1686
1687const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1688#define MAKE_CASE(V) \
1689 case V: \
1690 return #V;
1691 switch ((ARMISD::NodeType)Opcode) {
1692 case ARMISD::FIRST_NUMBER:
1693 break;
1694 MAKE_CASE(ARMISD::Wrapper)
1695 MAKE_CASE(ARMISD::WrapperPIC)
1696 MAKE_CASE(ARMISD::WrapperJT)
1697 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL)
1698 MAKE_CASE(ARMISD::CALL)
1699 MAKE_CASE(ARMISD::CALL_PRED)
1700 MAKE_CASE(ARMISD::CALL_NOLINK)
1701 MAKE_CASE(ARMISD::tSECALL)
1702 MAKE_CASE(ARMISD::t2CALL_BTI)
1703 MAKE_CASE(ARMISD::BRCOND)
1704 MAKE_CASE(ARMISD::BR_JT)
1705 MAKE_CASE(ARMISD::BR2_JT)
1706 MAKE_CASE(ARMISD::RET_GLUE)
1707 MAKE_CASE(ARMISD::SERET_GLUE)
1708 MAKE_CASE(ARMISD::INTRET_GLUE)
1709 MAKE_CASE(ARMISD::PIC_ADD)
1710 MAKE_CASE(ARMISD::CMP)
1711 MAKE_CASE(ARMISD::CMN)
1712 MAKE_CASE(ARMISD::CMPZ)
1713 MAKE_CASE(ARMISD::CMPFP)
1714 MAKE_CASE(ARMISD::CMPFPE)
1715 MAKE_CASE(ARMISD::CMPFPw0)
1716 MAKE_CASE(ARMISD::CMPFPEw0)
1717 MAKE_CASE(ARMISD::BCC_i64)
1718 MAKE_CASE(ARMISD::FMSTAT)
1719 MAKE_CASE(ARMISD::CMOV)
1720 MAKE_CASE(ARMISD::SUBS)
1721 MAKE_CASE(ARMISD::SSAT)
1722 MAKE_CASE(ARMISD::USAT)
1723 MAKE_CASE(ARMISD::ASRL)
1724 MAKE_CASE(ARMISD::LSRL)
1725 MAKE_CASE(ARMISD::LSLL)
1726 MAKE_CASE(ARMISD::SRL_GLUE)
1727 MAKE_CASE(ARMISD::SRA_GLUE)
1728 MAKE_CASE(ARMISD::RRX)
1729 MAKE_CASE(ARMISD::ADDC)
1730 MAKE_CASE(ARMISD::ADDE)
1731 MAKE_CASE(ARMISD::SUBC)
1732 MAKE_CASE(ARMISD::SUBE)
1733 MAKE_CASE(ARMISD::LSLS)
1734 MAKE_CASE(ARMISD::VMOVRRD)
1735 MAKE_CASE(ARMISD::VMOVDRR)
1736 MAKE_CASE(ARMISD::VMOVhr)
1737 MAKE_CASE(ARMISD::VMOVrh)
1738 MAKE_CASE(ARMISD::VMOVSR)
1739 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP)
1740 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP)
1741 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH)
1742 MAKE_CASE(ARMISD::TC_RETURN)
1743 MAKE_CASE(ARMISD::THREAD_POINTER)
1744 MAKE_CASE(ARMISD::DYN_ALLOC)
1745 MAKE_CASE(ARMISD::MEMBARRIER_MCR)
1746 MAKE_CASE(ARMISD::PRELOAD)
1747 MAKE_CASE(ARMISD::LDRD)
1748 MAKE_CASE(ARMISD::STRD)
1749 MAKE_CASE(ARMISD::WIN__CHKSTK)
1750 MAKE_CASE(ARMISD::WIN__DBZCHK)
1751 MAKE_CASE(ARMISD::PREDICATE_CAST)
1752 MAKE_CASE(ARMISD::VECTOR_REG_CAST)
1753 MAKE_CASE(ARMISD::MVESEXT)
1754 MAKE_CASE(ARMISD::MVEZEXT)
1755 MAKE_CASE(ARMISD::MVETRUNC)
1756 MAKE_CASE(ARMISD::VCMP)
1757 MAKE_CASE(ARMISD::VCMPZ)
1758 MAKE_CASE(ARMISD::VTST)
1759 MAKE_CASE(ARMISD::VSHLs)
1760 MAKE_CASE(ARMISD::VSHLu)
1761 MAKE_CASE(ARMISD::VSHLIMM)
1762 MAKE_CASE(ARMISD::VSHRsIMM)
1763 MAKE_CASE(ARMISD::VSHRuIMM)
1764 MAKE_CASE(ARMISD::VRSHRsIMM)
1765 MAKE_CASE(ARMISD::VRSHRuIMM)
1766 MAKE_CASE(ARMISD::VRSHRNIMM)
1767 MAKE_CASE(ARMISD::VQSHLsIMM)
1768 MAKE_CASE(ARMISD::VQSHLuIMM)
1769 MAKE_CASE(ARMISD::VQSHLsuIMM)
1770 MAKE_CASE(ARMISD::VQSHRNsIMM)
1771 MAKE_CASE(ARMISD::VQSHRNuIMM)
1772 MAKE_CASE(ARMISD::VQSHRNsuIMM)
1773 MAKE_CASE(ARMISD::VQRSHRNsIMM)
1774 MAKE_CASE(ARMISD::VQRSHRNuIMM)
1775 MAKE_CASE(ARMISD::VQRSHRNsuIMM)
1776 MAKE_CASE(ARMISD::VSLIIMM)
1777 MAKE_CASE(ARMISD::VSRIIMM)
1778 MAKE_CASE(ARMISD::VGETLANEu)
1779 MAKE_CASE(ARMISD::VGETLANEs)
1780 MAKE_CASE(ARMISD::VMOVIMM)
1781 MAKE_CASE(ARMISD::VMVNIMM)
1782 MAKE_CASE(ARMISD::VMOVFPIMM)
1783 MAKE_CASE(ARMISD::VDUP)
1784 MAKE_CASE(ARMISD::VDUPLANE)
1785 MAKE_CASE(ARMISD::VEXT)
1786 MAKE_CASE(ARMISD::VREV64)
1787 MAKE_CASE(ARMISD::VREV32)
1788 MAKE_CASE(ARMISD::VREV16)
1789 MAKE_CASE(ARMISD::VZIP)
1790 MAKE_CASE(ARMISD::VUZP)
1791 MAKE_CASE(ARMISD::VTRN)
1792 MAKE_CASE(ARMISD::VTBL1)
1793 MAKE_CASE(ARMISD::VTBL2)
1794 MAKE_CASE(ARMISD::VMOVN)
1795 MAKE_CASE(ARMISD::VQMOVNs)
1796 MAKE_CASE(ARMISD::VQMOVNu)
1797 MAKE_CASE(ARMISD::VCVTN)
1798 MAKE_CASE(ARMISD::VCVTL)
1799 MAKE_CASE(ARMISD::VIDUP)
1800 MAKE_CASE(ARMISD::VMULLs)
1801 MAKE_CASE(ARMISD::VMULLu)
1802 MAKE_CASE(ARMISD::VQDMULH)
1803 MAKE_CASE(ARMISD::VADDVs)
1804 MAKE_CASE(ARMISD::VADDVu)
1805 MAKE_CASE(ARMISD::VADDVps)
1806 MAKE_CASE(ARMISD::VADDVpu)
1807 MAKE_CASE(ARMISD::VADDLVs)
1808 MAKE_CASE(ARMISD::VADDLVu)
1809 MAKE_CASE(ARMISD::VADDLVAs)
1810 MAKE_CASE(ARMISD::VADDLVAu)
1811 MAKE_CASE(ARMISD::VADDLVps)
1812 MAKE_CASE(ARMISD::VADDLVpu)
1813 MAKE_CASE(ARMISD::VADDLVAps)
1814 MAKE_CASE(ARMISD::VADDLVApu)
1815 MAKE_CASE(ARMISD::VMLAVs)
1816 MAKE_CASE(ARMISD::VMLAVu)
1817 MAKE_CASE(ARMISD::VMLAVps)
1818 MAKE_CASE(ARMISD::VMLAVpu)
1819 MAKE_CASE(ARMISD::VMLALVs)
1820 MAKE_CASE(ARMISD::VMLALVu)
1821 MAKE_CASE(ARMISD::VMLALVps)
1822 MAKE_CASE(ARMISD::VMLALVpu)
1823 MAKE_CASE(ARMISD::VMLALVAs)
1824 MAKE_CASE(ARMISD::VMLALVAu)
1825 MAKE_CASE(ARMISD::VMLALVAps)
1826 MAKE_CASE(ARMISD::VMLALVApu)
1827 MAKE_CASE(ARMISD::VMINVu)
1828 MAKE_CASE(ARMISD::VMINVs)
1829 MAKE_CASE(ARMISD::VMAXVu)
1830 MAKE_CASE(ARMISD::VMAXVs)
1831 MAKE_CASE(ARMISD::UMAAL)
1832 MAKE_CASE(ARMISD::UMLAL)
1833 MAKE_CASE(ARMISD::SMLAL)
1834 MAKE_CASE(ARMISD::SMLALBB)
1835 MAKE_CASE(ARMISD::SMLALBT)
1836 MAKE_CASE(ARMISD::SMLALTB)
1837 MAKE_CASE(ARMISD::SMLALTT)
1838 MAKE_CASE(ARMISD::SMULWB)
1839 MAKE_CASE(ARMISD::SMULWT)
1840 MAKE_CASE(ARMISD::SMLALD)
1841 MAKE_CASE(ARMISD::SMLALDX)
1842 MAKE_CASE(ARMISD::SMLSLD)
1843 MAKE_CASE(ARMISD::SMLSLDX)
1844 MAKE_CASE(ARMISD::SMMLAR)
1845 MAKE_CASE(ARMISD::SMMLSR)
1846 MAKE_CASE(ARMISD::QADD16b)
1847 MAKE_CASE(ARMISD::QSUB16b)
1848 MAKE_CASE(ARMISD::QADD8b)
1849 MAKE_CASE(ARMISD::QSUB8b)
1850 MAKE_CASE(ARMISD::UQADD16b)
1851 MAKE_CASE(ARMISD::UQSUB16b)
1852 MAKE_CASE(ARMISD::UQADD8b)
1853 MAKE_CASE(ARMISD::UQSUB8b)
1854 MAKE_CASE(ARMISD::BUILD_VECTOR)
1855 MAKE_CASE(ARMISD::BFI)
1856 MAKE_CASE(ARMISD::VORRIMM)
1857 MAKE_CASE(ARMISD::VBICIMM)
1858 MAKE_CASE(ARMISD::VBSP)
1859 MAKE_CASE(ARMISD::MEMCPY)
1860 MAKE_CASE(ARMISD::VLD1DUP)
1861 MAKE_CASE(ARMISD::VLD2DUP)
1862 MAKE_CASE(ARMISD::VLD3DUP)
1863 MAKE_CASE(ARMISD::VLD4DUP)
1864 MAKE_CASE(ARMISD::VLD1_UPD)
1865 MAKE_CASE(ARMISD::VLD2_UPD)
1866 MAKE_CASE(ARMISD::VLD3_UPD)
1867 MAKE_CASE(ARMISD::VLD4_UPD)
1868 MAKE_CASE(ARMISD::VLD1x2_UPD)
1869 MAKE_CASE(ARMISD::VLD1x3_UPD)
1870 MAKE_CASE(ARMISD::VLD1x4_UPD)
1871 MAKE_CASE(ARMISD::VLD2LN_UPD)
1872 MAKE_CASE(ARMISD::VLD3LN_UPD)
1873 MAKE_CASE(ARMISD::VLD4LN_UPD)
1874 MAKE_CASE(ARMISD::VLD1DUP_UPD)
1875 MAKE_CASE(ARMISD::VLD2DUP_UPD)
1876 MAKE_CASE(ARMISD::VLD3DUP_UPD)
1877 MAKE_CASE(ARMISD::VLD4DUP_UPD)
1878 MAKE_CASE(ARMISD::VST1_UPD)
1879 MAKE_CASE(ARMISD::VST2_UPD)
1880 MAKE_CASE(ARMISD::VST3_UPD)
1881 MAKE_CASE(ARMISD::VST4_UPD)
1882 MAKE_CASE(ARMISD::VST1x2_UPD)
1883 MAKE_CASE(ARMISD::VST1x3_UPD)
1884 MAKE_CASE(ARMISD::VST1x4_UPD)
1885 MAKE_CASE(ARMISD::VST2LN_UPD)
1886 MAKE_CASE(ARMISD::VST3LN_UPD)
1887 MAKE_CASE(ARMISD::VST4LN_UPD)
1888 MAKE_CASE(ARMISD::WLS)
1889 MAKE_CASE(ARMISD::WLSSETUP)
1890 MAKE_CASE(ARMISD::LE)
1891 MAKE_CASE(ARMISD::LOOP_DEC)
1892 MAKE_CASE(ARMISD::CSINV)
1893 MAKE_CASE(ARMISD::CSNEG)
1894 MAKE_CASE(ARMISD::CSINC)
1895 MAKE_CASE(ARMISD::MEMCPYLOOP)
1896 MAKE_CASE(ARMISD::MEMSETLOOP)
1897#undef MAKE_CASE
1898 }
1899 return nullptr;
1900}
1901
1902EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1903 EVT VT) const {
1904 if (!VT.isVector())
1905 return getPointerTy(DL);
1906
1907 // MVE has a predicate register.
1908 if ((Subtarget->hasMVEIntegerOps() &&
1909 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1910 VT == MVT::v16i8)) ||
1911 (Subtarget->hasMVEFloatOps() &&
1912 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1913 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1914 return VT.changeVectorElementTypeToInteger();
1915}
1916
1917/// getRegClassFor - Return the register class that should be used for the
1918/// specified value type.
1919const TargetRegisterClass *
1920ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1921 (void)isDivergent;
1922 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1923 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1924 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1925 // MVE Q registers.
1926 if (Subtarget->hasNEON()) {
1927 if (VT == MVT::v4i64)
1928 return &ARM::QQPRRegClass;
1929 if (VT == MVT::v8i64)
1930 return &ARM::QQQQPRRegClass;
1931 }
1932 if (Subtarget->hasMVEIntegerOps()) {
1933 if (VT == MVT::v4i64)
1934 return &ARM::MQQPRRegClass;
1935 if (VT == MVT::v8i64)
1936 return &ARM::MQQQQPRRegClass;
1937 }
1938 return TargetLowering::getRegClassFor(VT);
1939}
1940
// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1942// source/dest is aligned and the copy size is large enough. We therefore want
1943// to align such objects passed to memory intrinsics.
1944bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1945 Align &PrefAlign) const {
  if (!isa<MemIntrinsic>(CI))
1947 return false;
1948 MinSize = 8;
1949 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1950 // cycle faster than 4-byte aligned LDM.
1951 PrefAlign =
1952 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1953 return true;
1954}
1955
1956// Create a fast isel object.
1957FastISel *
1958ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1959 const TargetLibraryInfo *libInfo) const {
1960 return ARM::createFastISel(funcInfo, libInfo);
1961}
1962
1963Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1964 unsigned NumVals = N->getNumValues();
1965 if (!NumVals)
1966 return Sched::RegPressure;
1967
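  // Prefer ILP for nodes producing floating-point or vector values, which
  // generally correspond to longer-latency VFP/NEON operations.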
1968 for (unsigned i = 0; i != NumVals; ++i) {
    EVT VT = N->getValueType(i);
1970 if (VT == MVT::Glue || VT == MVT::Other)
1971 continue;
1972 if (VT.isFloatingPoint() || VT.isVector())
1973 return Sched::ILP;
1974 }
1975
1976 if (!N->isMachineOpcode())
1977 return Sched::RegPressure;
1978
  // Loads are scheduled for latency even if the instruction itinerary
  // is not available.
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());

  if (MCID.getNumDefs() == 0)
    return Sched::RegPressure;
  if (!Itins->isEmpty() &&
      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1988 return Sched::ILP;
1989
1990 return Sched::RegPressure;
1991}
1992
1993//===----------------------------------------------------------------------===//
1994// Lowering Code
1995//===----------------------------------------------------------------------===//
1996
1997static bool isSRL16(const SDValue &Op) {
1998 if (Op.getOpcode() != ISD::SRL)
1999 return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2001 return Const->getZExtValue() == 16;
2002 return false;
2003}
2004
2005static bool isSRA16(const SDValue &Op) {
2006 if (Op.getOpcode() != ISD::SRA)
2007 return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2009 return Const->getZExtValue() == 16;
2010 return false;
2011}
2012
2013static bool isSHL16(const SDValue &Op) {
2014 if (Op.getOpcode() != ISD::SHL)
2015 return false;
  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2017 return Const->getZExtValue() == 16;
2018 return false;
2019}
2020
// Check for a signed 16-bit value. We special-case SRA because it keeps
// things simpler when we are also looking for SRAs that aren't sign-extending
// a smaller value. Without the check, we'd need to take extra care with the
// checking order for some operations.
2025static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2026 if (isSRA16(Op))
    return isSHL16(Op.getOperand(0));
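  // A 32-bit value with 17 sign bits holds its entire (signed) value in the
  // low 16 bits.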
2028 return DAG.ComputeNumSignBits(Op) == 17;
2029}
2030
2031/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2032static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
2033 switch (CC) {
2034 default: llvm_unreachable("Unknown condition code!");
2035 case ISD::SETNE: return ARMCC::NE;
2036 case ISD::SETEQ: return ARMCC::EQ;
2037 case ISD::SETGT: return ARMCC::GT;
2038 case ISD::SETGE: return ARMCC::GE;
2039 case ISD::SETLT: return ARMCC::LT;
2040 case ISD::SETLE: return ARMCC::LE;
2041 case ISD::SETUGT: return ARMCC::HI;
2042 case ISD::SETUGE: return ARMCC::HS;
2043 case ISD::SETULT: return ARMCC::LO;
2044 case ISD::SETULE: return ARMCC::LS;
2045 }
2046}
2047
2048/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2049static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
2050 ARMCC::CondCodes &CondCode2) {
2051 CondCode2 = ARMCC::AL;
2052 switch (CC) {
2053 default: llvm_unreachable("Unknown FP condition!");
2054 case ISD::SETEQ:
2055 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2056 case ISD::SETGT:
2057 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2058 case ISD::SETGE:
2059 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2060 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2061 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2062 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2063 case ISD::SETO: CondCode = ARMCC::VC; break;
2064 case ISD::SETUO: CondCode = ARMCC::VS; break;
2065 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2066 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2067 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2068 case ISD::SETLT:
2069 case ISD::SETULT: CondCode = ARMCC::LT; break;
2070 case ISD::SETLE:
2071 case ISD::SETULE: CondCode = ARMCC::LE; break;
2072 case ISD::SETNE:
2073 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2074 }
2075}
2076
2077//===----------------------------------------------------------------------===//
2078// Calling Convention Implementation
2079//===----------------------------------------------------------------------===//
2080
2081/// getEffectiveCallingConv - Get the effective calling convention, taking into
2082/// account presence of floating point hardware and calling convention
2083/// limitations, such as support for variadic functions.
2084CallingConv::ID
2085ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2086 bool isVarArg) const {
2087 switch (CC) {
2088 default:
    report_fatal_error("Unsupported calling convention");
2090 case CallingConv::ARM_AAPCS:
2091 case CallingConv::ARM_APCS:
2092 case CallingConv::GHC:
2093 case CallingConv::CFGuard_Check:
2094 return CC;
2095 case CallingConv::PreserveMost:
2096 return CallingConv::PreserveMost;
2097 case CallingConv::PreserveAll:
2098 return CallingConv::PreserveAll;
2099 case CallingConv::ARM_AAPCS_VFP:
2100 case CallingConv::Swift:
2101 case CallingConv::SwiftTail:
2102 return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
2103 case CallingConv::C:
2104 case CallingConv::Tail:
2105 if (!Subtarget->isAAPCS_ABI())
2106 return CallingConv::ARM_APCS;
2107 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2108 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2109 !isVarArg)
2110 return CallingConv::ARM_AAPCS_VFP;
2111 else
2112 return CallingConv::ARM_AAPCS;
2113 case CallingConv::Fast:
2114 case CallingConv::CXX_FAST_TLS:
2115 if (!Subtarget->isAAPCS_ABI()) {
2116 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2117 return CallingConv::Fast;
2118 return CallingConv::ARM_APCS;
2119 } else if (Subtarget->hasVFP2Base() &&
2120 !Subtarget->isThumb1Only() && !isVarArg)
2121 return CallingConv::ARM_AAPCS_VFP;
2122 else
2123 return CallingConv::ARM_AAPCS;
2124 }
2125}
2126
2127CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
2128 bool isVarArg) const {
  return CCAssignFnForNode(CC, false, isVarArg);
2130}
2131
2132CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
2133 bool isVarArg) const {
  return CCAssignFnForNode(CC, true, isVarArg);
2135}
2136
2137/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2138/// CallingConvention.
2139CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2140 bool Return,
2141 bool isVarArg) const {
2142 switch (getEffectiveCallingConv(CC, isVarArg)) {
2143 default:
    report_fatal_error("Unsupported calling convention");
2145 case CallingConv::ARM_APCS:
2146 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2147 case CallingConv::ARM_AAPCS:
2148 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2149 case CallingConv::ARM_AAPCS_VFP:
2150 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2151 case CallingConv::Fast:
2152 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2153 case CallingConv::GHC:
2154 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2155 case CallingConv::PreserveMost:
2156 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2157 case CallingConv::PreserveAll:
2158 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2159 case CallingConv::CFGuard_Check:
2160 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2161 }
2162}
2163
2164SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2165 MVT LocVT, MVT ValVT, SDValue Val) const {
  Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
                    Val);
  if (Subtarget->hasFullFP16()) {
    Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
  } else {
    Val = DAG.getNode(ISD::TRUNCATE, dl,
                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
    Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
  }
2174 }
2175 return Val;
2176}
2177
2178SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2179 MVT LocVT, MVT ValVT,
2180 SDValue Val) const {
2181 if (Subtarget->hasFullFP16()) {
    Val = DAG.getNode(ARMISD::VMOVrh, dl,
                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  } else {
    Val = DAG.getNode(ISD::BITCAST, dl,
                      MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
    Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
                      MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
  }
  return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2191}
2192
2193/// LowerCallResult - Lower the result values of a call into the
2194/// appropriate copies out of appropriate physical registers.
2195SDValue ARMTargetLowering::LowerCallResult(
2196 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2197 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2198 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2199 SDValue ThisVal) const {
2200 // Assign locations to each value returned by this call.
2201 SmallVector<CCValAssign, 16> RVLocs;
2202 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2203 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2205
2206 // Copy all of the result registers out of their specified physreg.
2207 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2208 CCValAssign VA = RVLocs[i];
2209
2210 // Pass 'this' value directly from the argument to return value, to avoid
2211 // reg unit interference
2212 if (i == 0 && isThisReturn) {
2213 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2214 "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
2216 continue;
2217 }
2218
2219 SDValue Val;
2220 if (VA.needsCustom() &&
2221 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2222 // Handle f64 or half of a v2f64.
2223 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2224 InGlue);
      Chain = Lo.getValue(1);
      InGlue = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Hi.getValue(1);
      InGlue = Hi.getValue(2);
      if (!Subtarget->isLittle())
        std::swap (Lo, Hi);
2234 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2235
2236 if (VA.getLocVT() == MVT::v2f64) {
2237 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2238 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2239 DAG.getConstant(0, dl, MVT::i32));
2240
2241 VA = RVLocs[++i]; // skip ahead to next loc
2242 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
        Chain = Lo.getValue(1);
        InGlue = Lo.getValue(2);
        VA = RVLocs[++i]; // skip ahead to next loc
        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
        Chain = Hi.getValue(1);
        InGlue = Hi.getValue(2);
        if (!Subtarget->isLittle())
          std::swap (Lo, Hi);
2251 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2252 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2253 DAG.getConstant(1, dl, MVT::i32));
2254 }
2255 } else {
      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                               InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
2260 }
2261
2262 switch (VA.getLocInfo()) {
2263 default: llvm_unreachable("Unknown loc info!");
2264 case CCValAssign::Full: break;
2265 case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2267 break;
2268 }
2269
2270 // f16 arguments have their size extended to 4 bytes and passed as if they
2271 // had been copied to the LSBs of a 32-bit register.
2272 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2273 if (VA.needsCustom() &&
2274 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
      Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2276
    InVals.push_back(Val);
2278 }
2279
2280 return Chain;
2281}
2282
2283std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2284 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2285 bool IsTailCall, int SPDiff) const {
2286 SDValue DstAddr;
2287 MachinePointerInfo DstInfo;
2288 int32_t Offset = VA.getLocMemOffset();
2289 MachineFunction &MF = DAG.getMachineFunction();
2290
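  // For tail calls the argument is written into one of the caller's own fixed
  // stack objects (offset by SPDiff); otherwise it is addressed relative to
  // the current stack pointer.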
2291 if (IsTailCall) {
2292 Offset += SPDiff;
    auto PtrVT = getPointerTy(DAG.getDataLayout());
    int Size = VA.getLocVT().getFixedSizeInBits() / 8;
    int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
    DstAddr = DAG.getFrameIndex(FI, PtrVT);
    DstInfo =
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
  } else {
    SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
    DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                          StackPtr, PtrOff);
    DstInfo =
        MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
2305 }
2306
  return std::make_pair(DstAddr, DstInfo);
2308}
2309
2310void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2311 SDValue Chain, SDValue &Arg,
2312 RegsToPassVector &RegsToPass,
2313 CCValAssign &VA, CCValAssign &NextVA,
2314 SDValue &StackPtr,
2315 SmallVectorImpl<SDValue> &MemOpChains,
2316 bool IsTailCall,
2317 int SPDiff) const {
2318 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2319 DAG.getVTList(MVT::i32, MVT::i32), Arg);
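  // VMOVRRD produces the two i32 halves of the f64; big-endian targets pass
  // the high half in the first register of the pair, hence the choice of
  // which result feeds which register below.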
2320 unsigned id = Subtarget->isLittle() ? 0 : 1;
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2322
2323 if (NextVA.isRegLoc())
    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2325 else {
2326 assert(NextVA.isMemLoc());
2327 if (!StackPtr.getNode())
2328 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2329 getPointerTy(DAG.getDataLayout()));
2330
2331 SDValue DstAddr;
2332 MachinePointerInfo DstInfo;
    std::tie(DstAddr, DstInfo) =
        computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2337 }
2338}
2339
2340static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2341 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2342 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2343}
2344
2345/// LowerCall - Lowering a call into a callseq_start <-
/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2347/// nodes.
2348SDValue
2349ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2350 SmallVectorImpl<SDValue> &InVals) const {
2351 SelectionDAG &DAG = CLI.DAG;
2352 SDLoc &dl = CLI.DL;
2353 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2354 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2355 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2356 SDValue Chain = CLI.Chain;
2357 SDValue Callee = CLI.Callee;
2358 bool &isTailCall = CLI.IsTailCall;
2359 CallingConv::ID CallConv = CLI.CallConv;
2360 bool doesNotRet = CLI.DoesNotReturn;
2361 bool isVarArg = CLI.IsVarArg;
2362
2363 MachineFunction &MF = DAG.getMachineFunction();
2364 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2365 MachineFunction::CallSiteInfo CSInfo;
2366 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2367 bool isThisReturn = false;
2368 bool isCmseNSCall = false;
2369 bool isSibCall = false;
2370 bool PreferIndirect = false;
2371 bool GuardWithBTI = false;
2372
2373 // Lower 'returns_twice' calls to a pseudo-instruction.
2374 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2375 !Subtarget->noBTIAtReturnTwice())
2376 GuardWithBTI = AFI->branchTargetEnforcement();
2377
2378 // Determine whether this is a non-secure function call.
2379 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Kind: "cmse_nonsecure_call"))
2380 isCmseNSCall = true;
2381
2382 // Disable tail calls if they're not supported.
2383 if (!Subtarget->supportsTailCall())
2384 isTailCall = false;
2385
2386 // For both the non-secure calls and the returns from a CMSE entry function,
  // the function needs to do some extra work after the call, or before the
  // return, respectively, thus it cannot end with a tail call
2389 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2390 isTailCall = false;
2391
2392 if (isa<GlobalAddressSDNode>(Val: Callee)) {
2393 // If we're optimizing for minimum size and the function is called three or
2394 // more times in this block, we can improve codesize by calling indirectly
2395 // as BLXr has a 16-bit encoding.
2396 auto *GV = cast<GlobalAddressSDNode>(Val&: Callee)->getGlobal();
2397 if (CLI.CB) {
2398 auto *BB = CLI.CB->getParent();
2399 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2400 count_if(Range: GV->users(), P: [&BB](const User *U) {
2401 return isa<Instruction>(Val: U) &&
2402 cast<Instruction>(Val: U)->getParent() == BB;
2403 }) > 2;
2404 }
2405 }
2406 if (isTailCall) {
2407 // Check if it's really possible to do a tail call.
2408 isTailCall = IsEligibleForTailCallOptimization(
2409 Callee, CalleeCC: CallConv, isVarArg, isCalleeStructRet: isStructRet,
2410 isCallerStructRet: MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2411 isIndirect: PreferIndirect);
2412
2413 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2414 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2415 isSibCall = true;
2416
2417 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2418 // detected sibcalls.
2419 if (isTailCall)
2420 ++NumTailCalls;
2421 }
2422
2423 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
2425 "site marked musttail");
2426 // Analyze operands of the call, assigning locations to each operand.
2427 SmallVector<CCValAssign, 16> ArgLocs;
2428 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2429 *DAG.getContext());
2430 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CallConv, isVarArg));
2431
2432 // Get a count of how many bytes are to be pushed on the stack.
2433 unsigned NumBytes = CCInfo.getStackSize();
2434
2435 // SPDiff is the byte offset of the call's argument area from the callee's.
2436 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2437 // by this amount for a tail call. In a sibling call it must be 0 because the
2438 // caller will deallocate the entire stack and the callee still expects its
2439 // arguments to begin at SP+0. Completely unused for non-tail calls.
2440 int SPDiff = 0;
2441
2442 if (isTailCall && !isSibCall) {
2443 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2444 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2445
2446 // Since callee will pop argument stack as a tail call, we must keep the
2447 // popped size 16-byte aligned.
2448 Align StackAlign = DAG.getDataLayout().getStackAlignment();
2449 NumBytes = alignTo(Size: NumBytes, A: StackAlign);
2450
2451 // SPDiff will be negative if this tail call requires more space than we
2452 // would automatically have in our incoming argument space. Positive if we
2453 // can actually shrink the stack.
2454 SPDiff = NumReusableBytes - NumBytes;
2455
2456 // If this call requires more stack than we have available from
2457 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2458 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2459 AFI->setArgRegsSaveSize(-SPDiff);
2460 }
2461
2462 if (isSibCall) {
2463 // For sibling tail calls, memory operands are available in our caller's stack.
2464 NumBytes = 0;
2465 } else {
2466 // Adjust the stack pointer for the new arguments...
2467 // These operations are automatically eliminated by the prolog/epilog pass
2468 Chain = DAG.getCALLSEQ_START(Chain, InSize: isTailCall ? 0 : NumBytes, OutSize: 0, DL: dl);
2469 }
2470
2471 SDValue StackPtr =
2472 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2473
2474 RegsToPassVector RegsToPass;
2475 SmallVector<SDValue, 8> MemOpChains;
2476
2477 // During a tail call, stores to the argument area must happen after all of
2478 // the function's incoming arguments have been loaded because they may alias.
2479 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2480 // there's no point in doing so repeatedly so this tracks whether that's
2481 // happened yet.
2482 bool AfterFormalArgLoads = false;
2483
2484 // Walk the register/memloc assignments, inserting copies/loads. In the case
2485 // of tail call optimization, arguments are handled later.
2486 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2487 i != e;
2488 ++i, ++realArgIdx) {
2489 CCValAssign &VA = ArgLocs[i];
2490 SDValue Arg = OutVals[realArgIdx];
2491 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2492 bool isByVal = Flags.isByVal();
2493
2494 // Promote the value if needed.
2495 switch (VA.getLocInfo()) {
2496 default: llvm_unreachable("Unknown loc info!");
2497 case CCValAssign::Full: break;
2498 case CCValAssign::SExt:
2499 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
2500 break;
2501 case CCValAssign::ZExt:
2502 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
2503 break;
2504 case CCValAssign::AExt:
2505 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: VA.getLocVT(), Operand: Arg);
2506 break;
2507 case CCValAssign::BCvt:
2508 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VA.getLocVT(), Operand: Arg);
2509 break;
2510 }
2511
2512 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2513 Chain = DAG.getStackArgumentTokenFactor(Chain);
2514 AfterFormalArgLoads = true;
2515 }
2516
2517 // f16 arguments have their size extended to 4 bytes and passed as if they
2518 // had been copied to the LSBs of a 32-bit register.
2519 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2520 if (VA.needsCustom() &&
2521 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2522 Arg = MoveFromHPR(dl, DAG, LocVT: VA.getLocVT(), ValVT: VA.getValVT(), Val: Arg);
2523 } else {
      // f16 arguments could have been extended prior to argument lowering.
      // Mask such arguments if this is a CMSE nonsecure call.
      auto ArgVT = Outs[realArgIdx].ArgVT;
      if (isCmseNSCall && (ArgVT == MVT::f16)) {
        auto LocBits = VA.getLocVT().getSizeInBits();
        auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
        SDValue Mask =
            DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
        Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2535 }
2536 }
2537
2538 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2539 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2540 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2541 DAG.getConstant(0, dl, MVT::i32));
2542 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2543 DAG.getConstant(1, dl, MVT::i32));
2544
2545 PassF64ArgInRegs(dl, DAG, Chain, Arg&: Op0, RegsToPass, VA, NextVA&: ArgLocs[++i],
2546 StackPtr, MemOpChains, IsTailCall: isTailCall, SPDiff);
2547
2548 VA = ArgLocs[++i]; // skip ahead to next loc
2549 if (VA.isRegLoc()) {
2550 PassF64ArgInRegs(dl, DAG, Chain, Arg&: Op1, RegsToPass, VA, NextVA&: ArgLocs[++i],
2551 StackPtr, MemOpChains, IsTailCall: isTailCall, SPDiff);
2552 } else {
2553 assert(VA.isMemLoc());
2554 SDValue DstAddr;
2555 MachinePointerInfo DstInfo;
2556 std::tie(args&: DstAddr, args&: DstInfo) =
2557 computeAddrForCallArg(dl, DAG, VA, StackPtr, IsTailCall: isTailCall, SPDiff);
2558 MemOpChains.push_back(Elt: DAG.getStore(Chain, dl, Val: Op1, Ptr: DstAddr, PtrInfo: DstInfo));
2559 }
2560 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2561 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, NextVA&: ArgLocs[++i],
2562 StackPtr, MemOpChains, IsTailCall: isTailCall, SPDiff);
2563 } else if (VA.isRegLoc()) {
2564 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2565 Outs[0].VT == MVT::i32) {
2566 assert(VA.getLocVT() == MVT::i32 &&
2567 "unexpected calling convention register assignment");
2568 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2569 "unexpected use of 'returned'");
2570 isThisReturn = true;
2571 }
2572 const TargetOptions &Options = DAG.getTarget().Options;
2573 if (Options.EmitCallSiteInfo)
2574 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
2575 RegsToPass.push_back(Elt: std::make_pair(x: VA.getLocReg(), y&: Arg));
2576 } else if (isByVal) {
2577 assert(VA.isMemLoc());
2578 unsigned offset = 0;
2579
2580 // True if this byval aggregate will be split between registers
2581 // and memory.
2582 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2583 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2584
2585 if (CurByValIdx < ByValArgsCount) {
2586
2587 unsigned RegBegin, RegEnd;
2588 CCInfo.getInRegsParamInfo(InRegsParamRecordIndex: CurByValIdx, BeginReg&: RegBegin, EndReg&: RegEnd);
2589
2590 EVT PtrVT =
2591 DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
2592 unsigned int i, j;
2593 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2594 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2595 SDValue AddArg = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, N2: Const);
2596 SDValue Load =
2597 DAG.getLoad(VT: PtrVT, dl, Chain, Ptr: AddArg, PtrInfo: MachinePointerInfo(),
2598 Alignment: DAG.InferPtrAlign(Ptr: AddArg));
2599 MemOpChains.push_back(Elt: Load.getValue(R: 1));
2600 RegsToPass.push_back(Elt: std::make_pair(x&: j, y&: Load));
2601 }
2602
2603 // If the parameter size exceeds the register area, the "offset" value
2604 // helps us calculate the stack slot for the remaining part properly.
2605 offset = RegEnd - RegBegin;
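// For example, a 12-byte byval assigned the register range [r2, r4) has its
// first 8 bytes loaded into r2/r3 above; offset == 2 then directs the
// remaining 4 bytes to the stack copy below.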
2606
2607 CCInfo.nextInRegsParam();
2608 }
2609
2610 if (Flags.getByValSize() > 4*offset) {
2611 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
2612 SDValue Dst;
2613 MachinePointerInfo DstInfo;
2614 std::tie(args&: Dst, args&: DstInfo) =
2615 computeAddrForCallArg(dl, DAG, VA, StackPtr, IsTailCall: isTailCall, SPDiff);
2616 SDValue SrcOffset = DAG.getIntPtrConstant(Val: 4*offset, DL: dl);
2617 SDValue Src = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, N2: SrcOffset);
2618 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2619 MVT::i32);
2620 SDValue AlignNode =
2621 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2622
2623 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2624 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2625 MemOpChains.push_back(Elt: DAG.getNode(Opcode: ARMISD::COPY_STRUCT_BYVAL, DL: dl, VTList: VTs,
2626 Ops));
2627 }
2628 } else {
2629 assert(VA.isMemLoc());
2630 SDValue DstAddr;
2631 MachinePointerInfo DstInfo;
2632 std::tie(args&: DstAddr, args&: DstInfo) =
2633 computeAddrForCallArg(dl, DAG, VA, StackPtr, IsTailCall: isTailCall, SPDiff);
2634
2635 SDValue Store = DAG.getStore(Chain, dl, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
2636 MemOpChains.push_back(Elt: Store);
2637 }
2638 }
2639
2640 if (!MemOpChains.empty())
2641 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2642
2643 // Build a sequence of copy-to-reg nodes chained together with token chain
2644 // and flag operands which copy the outgoing args into the appropriate regs.
2645 SDValue InGlue;
2646 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2647 Chain = DAG.getCopyToReg(Chain, dl, Reg: RegsToPass[i].first,
2648 N: RegsToPass[i].second, Glue: InGlue);
2649 InGlue = Chain.getValue(R: 1);
2650 }
2651
2652 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2653 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2654 // node so that legalize doesn't hack it.
2655 bool isDirect = false;
2656
2657 const TargetMachine &TM = getTargetMachine();
2658 const GlobalValue *GVal = nullptr;
2659 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
2660 GVal = G->getGlobal();
2661 bool isStub = !TM.shouldAssumeDSOLocal(GV: GVal) && Subtarget->isTargetMachO();
2662
2663 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2664 bool isLocalARMFunc = false;
2665 auto PtrVt = getPointerTy(DL: DAG.getDataLayout());
2666
2667 if (Subtarget->genLongCalls()) {
2668 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2669 "long-calls codegen is not position independent!");
2670 // Handle a global address or an external symbol. If it's not one of
2671 // those, the target's already in a register, so we don't need to do
2672 // anything extra.
2673 if (isa<GlobalAddressSDNode>(Val: Callee)) {
2674 if (Subtarget->genExecuteOnly()) {
2675 if (Subtarget->useMovt())
2676 ++NumMovwMovt;
2677 Callee = DAG.getNode(Opcode: ARMISD::Wrapper, DL: dl, VT: PtrVt,
2678 Operand: DAG.getTargetGlobalAddress(GV: GVal, DL: dl, VT: PtrVt));
2679 } else {
2680 // Create a constant pool entry for the callee address
2681 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2682 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2683 C: GVal, ID: ARMPCLabelIndex, Kind: ARMCP::CPValue, PCAdj: 0);
2684
2685 // Get the address of the callee into a register
2686 SDValue Addr = DAG.getTargetConstantPool(C: CPV, VT: PtrVt, Align: Align(4));
2687 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2688 Callee = DAG.getLoad(
2689 VT: PtrVt, dl, Chain: DAG.getEntryNode(), Ptr: Addr,
2690 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
2691 }
2692 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
2693 const char *Sym = S->getSymbol();
2694
2695 if (Subtarget->genExecuteOnly()) {
2696 if (Subtarget->useMovt())
2697 ++NumMovwMovt;
2698 Callee = DAG.getNode(Opcode: ARMISD::Wrapper, DL: dl, VT: PtrVt,
2699 Operand: DAG.getTargetGlobalAddress(GV: GVal, DL: dl, VT: PtrVt));
2700 } else {
2701 // Create a constant pool entry for the callee address
2702 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2703 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2704 C&: *DAG.getContext(), s: Sym, ID: ARMPCLabelIndex, PCAdj: 0);
2705
2706 // Get the address of the callee into a register
2707 SDValue Addr = DAG.getTargetConstantPool(C: CPV, VT: PtrVt, Align: Align(4));
2708 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2709 Callee = DAG.getLoad(
2710 VT: PtrVt, dl, Chain: DAG.getEntryNode(), Ptr: Addr,
2711 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
2712 }
2713 }
2714 } else if (isa<GlobalAddressSDNode>(Val: Callee)) {
2715 if (!PreferIndirect) {
2716 isDirect = true;
2717 bool isDef = GVal->isStrongDefinitionForLinker();
2718
2719 // ARM call to a local ARM function is predicable.
2720 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2721 // tBX takes a register source operand.
2722 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2723 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2724 Callee = DAG.getNode(
2725 Opcode: ARMISD::WrapperPIC, DL: dl, VT: PtrVt,
2726 Operand: DAG.getTargetGlobalAddress(GV: GVal, DL: dl, VT: PtrVt, offset: 0, TargetFlags: ARMII::MO_NONLAZY));
2727 Callee = DAG.getLoad(
2728 VT: PtrVt, dl, Chain: DAG.getEntryNode(), Ptr: Callee,
2729 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()), Alignment: MaybeAlign(),
2730 MMOFlags: MachineMemOperand::MODereferenceable |
2731 MachineMemOperand::MOInvariant);
2732 } else if (Subtarget->isTargetCOFF()) {
2733 assert(Subtarget->isTargetWindows() &&
2734 "Windows is the only supported COFF target");
2735 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2736 if (GVal->hasDLLImportStorageClass())
2737 TargetFlags = ARMII::MO_DLLIMPORT;
2738 else if (!TM.shouldAssumeDSOLocal(GV: GVal))
2739 TargetFlags = ARMII::MO_COFFSTUB;
2740 Callee = DAG.getTargetGlobalAddress(GV: GVal, DL: dl, VT: PtrVt, /*offset=*/0,
2741 TargetFlags);
2742 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2743 Callee =
2744 DAG.getLoad(VT: PtrVt, dl, Chain: DAG.getEntryNode(),
2745 Ptr: DAG.getNode(Opcode: ARMISD::Wrapper, DL: dl, VT: PtrVt, Operand: Callee),
2746 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
2747 } else {
2748 Callee = DAG.getTargetGlobalAddress(GV: GVal, DL: dl, VT: PtrVt, offset: 0, TargetFlags: 0);
2749 }
2750 }
2751 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
2752 isDirect = true;
2753 // tBX takes a register source operand.
2754 const char *Sym = S->getSymbol();
2755 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2756 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2757 ARMConstantPoolValue *CPV =
2758 ARMConstantPoolSymbol::Create(C&: *DAG.getContext(), s: Sym,
2759 ID: ARMPCLabelIndex, PCAdj: 4);
2760 SDValue CPAddr = DAG.getTargetConstantPool(C: CPV, VT: PtrVt, Align: Align(4));
2761 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2762 Callee = DAG.getLoad(
2763 VT: PtrVt, dl, Chain: DAG.getEntryNode(), Ptr: CPAddr,
2764 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
2765 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2766 Callee = DAG.getNode(Opcode: ARMISD::PIC_ADD, DL: dl, VT: PtrVt, N1: Callee, N2: PICLabel);
2767 } else {
2768 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVt, TargetFlags: 0);
2769 }
2770 }
2771
2772 if (isCmseNSCall) {
2773 assert(!isARMFunc && !isDirect &&
2774 "Cannot handle call to ARM function or direct call");
2775 if (NumBytes > 0) {
2776 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
2777 "call to non-secure function would "
2778 "require passing arguments on stack",
2779 dl.getDebugLoc());
2780 DAG.getContext()->diagnose(DI: Diag);
2781 }
2782 if (isStructRet) {
2783 DiagnosticInfoUnsupported Diag(
2784 DAG.getMachineFunction().getFunction(),
2785 "call to non-secure function would return value through pointer",
2786 dl.getDebugLoc());
2787 DAG.getContext()->diagnose(DI: Diag);
2788 }
2789 }
2790
2791 // FIXME: handle tail calls differently.
2792 unsigned CallOpc;
2793 if (Subtarget->isThumb()) {
2794 if (GuardWithBTI)
2795 CallOpc = ARMISD::t2CALL_BTI;
2796 else if (isCmseNSCall)
2797 CallOpc = ARMISD::tSECALL;
2798 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2799 CallOpc = ARMISD::CALL_NOLINK;
2800 else
2801 CallOpc = ARMISD::CALL;
2802 } else {
2803 if (!isDirect && !Subtarget->hasV5TOps())
2804 CallOpc = ARMISD::CALL_NOLINK;
2805 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2806 // Emit regular call when code size is the priority
2807 !Subtarget->hasMinSize())
2808 // "mov lr, pc; b _foo" to avoid confusing the RSP
2809 CallOpc = ARMISD::CALL_NOLINK;
2810 else
2811 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2812 }
2813
2814 // We don't usually want to end the call-sequence here because we would tidy
2815 // the frame up *after* the call; however, in the ABI-changing tail-call case
2816 // we've carefully laid out the parameters so that when sp is reset they'll be
2817 // in the correct location.
2818 if (isTailCall && !isSibCall) {
2819 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL: dl);
2820 InGlue = Chain.getValue(R: 1);
2821 }
2822
2823 std::vector<SDValue> Ops;
2824 Ops.push_back(x: Chain);
2825 Ops.push_back(x: Callee);
2826
2827 if (isTailCall) {
2828 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2829 }
2830
2831 // Add argument registers to the end of the list so that they are known live
2832 // into the call.
2833 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2834 Ops.push_back(x: DAG.getRegister(Reg: RegsToPass[i].first,
2835 VT: RegsToPass[i].second.getValueType()));
2836
2837 // Add a register mask operand representing the call-preserved registers.
2838 const uint32_t *Mask;
2839 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2840 if (isThisReturn) {
2841 // For 'this' returns, use the R0-preserving mask if applicable
2842 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2843 if (!Mask) {
2844 // Set isThisReturn to false if the calling convention is not one that
2845 // allows 'returned' to be modeled in this way, so LowerCallResult does
2846 // not try to pass 'this' straight through
2847 isThisReturn = false;
2848 Mask = ARI->getCallPreservedMask(MF, CallConv);
2849 }
2850 } else
2851 Mask = ARI->getCallPreservedMask(MF, CallConv);
2852
2853 assert(Mask && "Missing call preserved mask for calling convention");
2854 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
2855
2856 if (InGlue.getNode())
2857 Ops.push_back(x: InGlue);
2858
2859 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2860 if (isTailCall) {
2861 MF.getFrameInfo().setHasTailCall();
2862 SDValue Ret = DAG.getNode(Opcode: ARMISD::TC_RETURN, DL: dl, VTList: NodeTys, Ops);
2863 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
2864 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
2865 return Ret;
2866 }
2867
2868 // Returns a chain and a flag for retval copy to use.
2869 Chain = DAG.getNode(Opcode: CallOpc, DL: dl, VTList: NodeTys, Ops);
2870 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
2871 InGlue = Chain.getValue(R: 1);
2872 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
2873
2874 // If we're guaranteeing tail-calls will be honoured, the callee must
2875 // pop its own argument stack on return. But this call is *not* a tail call so
2876 // we need to undo that after it returns to restore the status quo.
2877 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2878 uint64_t CalleePopBytes =
2879 canGuaranteeTCO(CC: CallConv, GuaranteeTailCalls: TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : -1ULL;
2880
2881 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL: dl);
2882 if (!Ins.empty())
2883 InGlue = Chain.getValue(R: 1);
2884
2885 // Handle result values, copying them out of physregs into vregs that we
2886 // return.
2887 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2888 InVals, isThisReturn,
2889 ThisVal: isThisReturn ? OutVals[0] : SDValue());
2890}
2891
2892/// HandleByVal - Every parameter *after* a byval parameter is passed
2893/// on the stack. Remember the next parameter register to allocate,
2894 /// and then confiscate the rest of the parameter registers to ensure
2895/// this.
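/// For example, assuming no arguments have been placed on the stack yet, an
/// 8-byte-aligned, 16-byte byval arriving when r1 is the next free register
/// wastes r1, occupies r2-r3, and leaves Size == 8 bytes to be passed on the
/// stack.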
2896void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2897 Align Alignment) const {
2898 // Byval (as with any stack) slots are always at least 4 byte aligned.
2899 Alignment = std::max(a: Alignment, b: Align(4));
2900
2901 unsigned Reg = State->AllocateReg(GPRArgRegs);
2902 if (!Reg)
2903 return;
2904
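// Byval slots with alignment greater than 4 must start in a suitably aligned
// register: under the AAPCS an 8-byte-aligned byval begins in an even-numbered
// register, so skip (waste) registers until that holds.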
2905 unsigned AlignInRegs = Alignment.value() / 4;
2906 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2907 for (unsigned i = 0; i < Waste; ++i)
2908 Reg = State->AllocateReg(GPRArgRegs);
2909
2910 if (!Reg)
2911 return;
2912
2913 unsigned Excess = 4 * (ARM::R4 - Reg);
2914
2915 // Special case when NSAA != SP and the parameter size is greater than the
2916 // size of all remaining GPR regs. In that case we can't split the parameter,
2917 // we must pass it entirely on the stack. We also must set the NCRN to R4, so
2918 // all remaining registers are wasted.
2919 const unsigned NSAAOffset = State->getStackSize();
2920 if (NSAAOffset != 0 && Size > Excess) {
2921 while (State->AllocateReg(GPRArgRegs))
2922 ;
2923 return;
2924 }
2925
2926 // The first register for the byval parameter is the first register that
2927 // wasn't allocated before this method was called, so it is "Reg".
2928 // If the parameter is small enough to fit in the range [Reg, r4), then
2929 // the end (one past the last) register is Reg + size-of-param-in-regs;
2930 // otherwise the parameter is split between registers and stack, and the
2931 // end register is r4.
2932 unsigned ByValRegBegin = Reg;
2933 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2934 State->addInRegsParamInfo(RegBegin: ByValRegBegin, RegEnd: ByValRegEnd);
2935 // Note, the first register was already allocated at the start of this
2936 // function, so only allocate the remaining registers we need.
2937 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2938 State->AllocateReg(GPRArgRegs);
2939 // A byval parameter that is split between registers and memory needs its
2940 // size truncated here.
2941 // In the case where the entire structure fits in registers, we set the
2942 // size in memory to zero.
2943 Size = std::max<int>(a: Size - Excess, b: 0);
2944}
2945
2946/// MatchingStackOffset - Return true if the given stack call argument is
2947/// already available in the same position (relatively) of the caller's
2948/// incoming argument stack.
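/// This lets the tail-call lowering reuse an incoming stack argument in place
/// instead of copying it to a new stack slot.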
2949static
2950bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2951 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2952 const TargetInstrInfo *TII) {
2953 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2954 int FI = std::numeric_limits<int>::max();
2955 if (Arg.getOpcode() == ISD::CopyFromReg) {
2956 Register VR = cast<RegisterSDNode>(Val: Arg.getOperand(i: 1))->getReg();
2957 if (!VR.isVirtual())
2958 return false;
2959 MachineInstr *Def = MRI->getVRegDef(Reg: VR);
2960 if (!Def)
2961 return false;
2962 if (!Flags.isByVal()) {
2963 if (!TII->isLoadFromStackSlot(MI: *Def, FrameIndex&: FI))
2964 return false;
2965 } else {
2966 return false;
2967 }
2968 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val&: Arg)) {
2969 if (Flags.isByVal())
2970 // ByVal argument is passed in as a pointer but it's now being
2971 // dereferenced. e.g.
2972 // define @foo(%struct.X* %A) {
2973 // tail call @bar(%struct.X* byval %A)
2974 // }
2975 return false;
2976 SDValue Ptr = Ld->getBasePtr();
2977 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Val&: Ptr);
2978 if (!FINode)
2979 return false;
2980 FI = FINode->getIndex();
2981 } else
2982 return false;
2983
2984 assert(FI != std::numeric_limits<int>::max());
2985 if (!MFI.isFixedObjectIndex(ObjectIdx: FI))
2986 return false;
2987 return Offset == MFI.getObjectOffset(ObjectIdx: FI) && Bytes == MFI.getObjectSize(ObjectIdx: FI);
2988}
2989
2990/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2991/// for tail call optimization. Targets which want to do tail call
2992/// optimization should implement this function.
2993bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2994 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2995 bool isCalleeStructRet, bool isCallerStructRet,
2996 const SmallVectorImpl<ISD::OutputArg> &Outs,
2997 const SmallVectorImpl<SDValue> &OutVals,
2998 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2999 const bool isIndirect) const {
3000 MachineFunction &MF = DAG.getMachineFunction();
3001 const Function &CallerF = MF.getFunction();
3002 CallingConv::ID CallerCC = CallerF.getCallingConv();
3003
3004 assert(Subtarget->supportsTailCall());
3005
3006 // Indirect tail calls cannot be optimized for Thumb1 if the args
3007 // to the call take up r0-r3. The reason is that there are no legal registers
3008 // left to hold the pointer to the function to be called.
3009 // Similarly, if the function uses return address sign and authentication,
3010 // r12 is needed to hold the PAC and is not available to hold the callee
3011 // address.
3012 if (Outs.size() >= 4 &&
3013 (!isa<GlobalAddressSDNode>(Val: Callee.getNode()) || isIndirect)) {
3014 if (Subtarget->isThumb1Only())
3015 return false;
3016 // Conservatively assume the function spills LR.
3017 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(SpillsLR: true))
3018 return false;
3019 }
3020
3021 // Look for obvious safe cases to perform tail call optimization that do not
3022 // require ABI changes. This is what gcc calls sibcall.
3023
3024 // Exception-handling functions need a special set of instructions to indicate
3025 // a return to the hardware. Tail-calling another function would probably
3026 // break this.
3027 if (CallerF.hasFnAttribute(Kind: "interrupt"))
3028 return false;
3029
3030 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
3031 return CalleeCC == CallerCC;
3032
3033 // Also avoid sibcall optimization if either caller or callee uses struct
3034 // return semantics.
3035 if (isCalleeStructRet || isCallerStructRet)
3036 return false;
3037
3038 // Externally-defined functions with weak linkage should not be
3039 // tail-called on ARM when the OS does not support dynamic
3040 // pre-emption of symbols, as the AAELF spec requires normal calls
3041 // to undefined weak functions to be replaced with a NOP or jump to the
3042 // next instruction. The behaviour of branch instructions in this
3043 // situation (as used for tail calls) is implementation-defined, so we
3044 // cannot rely on the linker replacing the tail call with a return.
3045 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
3046 const GlobalValue *GV = G->getGlobal();
3047 const Triple &TT = getTargetMachine().getTargetTriple();
3048 if (GV->hasExternalWeakLinkage() &&
3049 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3050 return false;
3051 }
3052
3053 // Check that the call results are passed in the same way.
3054 LLVMContext &C = *DAG.getContext();
3055 if (!CCState::resultsCompatible(
3056 CalleeCC: getEffectiveCallingConv(CC: CalleeCC, isVarArg),
3057 CallerCC: getEffectiveCallingConv(CC: CallerCC, isVarArg: CallerF.isVarArg()), MF, C, Ins,
3058 CalleeFn: CCAssignFnForReturn(CC: CalleeCC, isVarArg),
3059 CallerFn: CCAssignFnForReturn(CC: CallerCC, isVarArg: CallerF.isVarArg())))
3060 return false;
3061 // The callee has to preserve all registers the caller needs to preserve.
3062 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3063 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3064 if (CalleeCC != CallerCC) {
3065 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3066 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3067 return false;
3068 }
3069
3070 // If the caller's vararg or byval argument has been split between registers
3071 // and stack, do not perform a tail call, since part of the argument is in the
3072 // caller's local frame.
3073 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3074 if (AFI_Caller->getArgRegsSaveSize())
3075 return false;
3076
3077 // If the callee takes no arguments then go on to check the results of the
3078 // call.
3079 if (!Outs.empty()) {
3080 // Check if stack adjustment is needed. For now, do not do this if any
3081 // argument is passed on the stack.
3082 SmallVector<CCValAssign, 16> ArgLocs;
3083 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3084 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, isVarArg));
3085 if (CCInfo.getStackSize()) {
3086 // Check if the arguments are already laid out in the right way as
3087 // the caller's fixed stack objects.
3088 MachineFrameInfo &MFI = MF.getFrameInfo();
3089 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3090 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3091 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3092 i != e;
3093 ++i, ++realArgIdx) {
3094 CCValAssign &VA = ArgLocs[i];
3095 EVT RegVT = VA.getLocVT();
3096 SDValue Arg = OutVals[realArgIdx];
3097 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3098 if (VA.getLocInfo() == CCValAssign::Indirect)
3099 return false;
3100 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3101 // f64 and vector types are split into multiple registers or
3102 // register/stack-slot combinations. The types will not match
3103 // the registers; give up on memory f64 refs until we figure
3104 // out what to do about this.
3105 if (!VA.isRegLoc())
3106 return false;
3107 if (!ArgLocs[++i].isRegLoc())
3108 return false;
3109 if (RegVT == MVT::v2f64) {
3110 if (!ArgLocs[++i].isRegLoc())
3111 return false;
3112 if (!ArgLocs[++i].isRegLoc())
3113 return false;
3114 }
3115 } else if (!VA.isRegLoc()) {
3116 if (!MatchingStackOffset(Arg, Offset: VA.getLocMemOffset(), Flags,
3117 MFI, MRI, TII))
3118 return false;
3119 }
3120 }
3121 }
3122
3123 const MachineRegisterInfo &MRI = MF.getRegInfo();
3124 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
3125 return false;
3126 }
3127
3128 return true;
3129}
3130
3131bool
3132ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3133 MachineFunction &MF, bool isVarArg,
3134 const SmallVectorImpl<ISD::OutputArg> &Outs,
3135 LLVMContext &Context) const {
3136 SmallVector<CCValAssign, 16> RVLocs;
3137 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3138 return CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, isVarArg));
3139}
3140
3141static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3142 const SDLoc &DL, SelectionDAG &DAG) {
3143 const MachineFunction &MF = DAG.getMachineFunction();
3144 const Function &F = MF.getFunction();
3145
3146 StringRef IntKind = F.getFnAttribute(Kind: "interrupt").getValueAsString();
3147
3148 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3149 // version of the "preferred return address". These offsets affect the return
3150 // instruction if this is a return from PL1 without hypervisor extensions.
3151 // IRQ/FIQ: +4 "subs pc, lr, #4"
3152 // SWI: 0 "subs pc, lr, #0"
3153 // ABORT: +4 "subs pc, lr, #4"
3154 // UNDEF: +4/+2 "subs pc, lr, #0"
3155 // UNDEF varies depending on whether the exception came from ARM or Thumb
3156 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
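// For example, an "IRQ" handler returns with "subs pc, lr, #4"; the offset
// chosen below is inserted as an extra operand on the INTRET_GLUE node.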
3157
3158 int64_t LROffset;
3159 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3160 IntKind == "ABORT")
3161 LROffset = 4;
3162 else if (IntKind == "SWI" || IntKind == "UNDEF")
3163 LROffset = 0;
3164 else
3165 report_fatal_error(reason: "Unsupported interrupt attribute. If present, value "
3166 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3167
3168 RetOps.insert(RetOps.begin() + 1,
3169 DAG.getConstant(LROffset, DL, MVT::i32, false));
3170
3171 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3172}
3173
3174SDValue
3175ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3176 bool isVarArg,
3177 const SmallVectorImpl<ISD::OutputArg> &Outs,
3178 const SmallVectorImpl<SDValue> &OutVals,
3179 const SDLoc &dl, SelectionDAG &DAG) const {
3180 // CCValAssign - represent the assignment of the return value to a location.
3181 SmallVector<CCValAssign, 16> RVLocs;
3182
3183 // CCState - Info about the registers and stack slots.
3184 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3185 *DAG.getContext());
3186
3187 // Analyze outgoing return values.
3188 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, isVarArg));
3189
3190 SDValue Glue;
3191 SmallVector<SDValue, 4> RetOps;
3192 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3193 bool isLittleEndian = Subtarget->isLittle();
3194
3195 MachineFunction &MF = DAG.getMachineFunction();
3196 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3197 AFI->setReturnRegsCount(RVLocs.size());
3198
3199 // Report error if cmse entry function returns structure through first ptr arg.
3200 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3201 // Note: using an empty SDLoc(), as the first line of the function is a
3202 // better place to report than the last line.
3203 DiagnosticInfoUnsupported Diag(
3204 DAG.getMachineFunction().getFunction(),
3205 "secure entry function would return value through pointer",
3206 SDLoc().getDebugLoc());
3207 DAG.getContext()->diagnose(DI: Diag);
3208 }
3209
3210 // Copy the result values into the output registers.
3211 for (unsigned i = 0, realRVLocIdx = 0;
3212 i != RVLocs.size();
3213 ++i, ++realRVLocIdx) {
3214 CCValAssign &VA = RVLocs[i];
3215 assert(VA.isRegLoc() && "Can only return in registers!");
3216
3217 SDValue Arg = OutVals[realRVLocIdx];
3218 bool ReturnF16 = false;
3219
3220 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3221 // Half-precision return values can be returned like this:
3222 //
3223 // t11: f16 = fadd ...
3224 // t12: i16 = bitcast t11
3225 // t13: i32 = zero_extend t12
3226 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3227 //
3228 // to avoid code generation for bitcasts, we simply set Arg to the node
3229 // that produces the f16 value, t11 in this case.
3230 //
3231 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3232 SDValue ZE = Arg.getOperand(i: 0);
3233 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3234 SDValue BC = ZE.getOperand(i: 0);
3235 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3236 Arg = BC.getOperand(i: 0);
3237 ReturnF16 = true;
3238 }
3239 }
3240 }
3241 }
3242
3243 switch (VA.getLocInfo()) {
3244 default: llvm_unreachable("Unknown loc info!");
3245 case CCValAssign::Full: break;
3246 case CCValAssign::BCvt:
3247 if (!ReturnF16)
3248 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VA.getLocVT(), Operand: Arg);
3249 break;
3250 }
3251
3252 // Mask f16 arguments if this is a CMSE nonsecure entry.
3253 auto RetVT = Outs[realRVLocIdx].ArgVT;
3254 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3255 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3256 Arg = MoveFromHPR(dl, DAG, LocVT: VA.getLocVT(), ValVT: VA.getValVT(), Val: Arg);
3257 } else {
3258 auto LocBits = VA.getLocVT().getSizeInBits();
3259 auto MaskValue = APInt::getLowBitsSet(numBits: LocBits, loBitsSet: RetVT.getSizeInBits());
3260 SDValue Mask =
3261 DAG.getConstant(Val: MaskValue, DL: dl, VT: MVT::getIntegerVT(BitWidth: LocBits));
3262 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::getIntegerVT(BitWidth: LocBits), Operand: Arg);
3263 Arg = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::getIntegerVT(BitWidth: LocBits), N1: Arg, N2: Mask);
3264 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VA.getLocVT(), Operand: Arg);
3265 }
3266 }
3267
3268 if (VA.needsCustom() &&
3269 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3270 if (VA.getLocVT() == MVT::v2f64) {
3271 // Extract the first half and return it in two registers.
3272 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3273 DAG.getConstant(0, dl, MVT::i32));
3274 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3275 DAG.getVTList(MVT::i32, MVT::i32), Half);
3276
3277 Chain =
3278 DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(),
3279 N: HalfGPRs.getValue(R: isLittleEndian ? 0 : 1), Glue);
3280 Glue = Chain.getValue(R: 1);
3281 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3282 VA = RVLocs[++i]; // skip ahead to next loc
3283 Chain =
3284 DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(),
3285 N: HalfGPRs.getValue(R: isLittleEndian ? 1 : 0), Glue);
3286 Glue = Chain.getValue(R: 1);
3287 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3288 VA = RVLocs[++i]; // skip ahead to next loc
3289
3290 // Extract the 2nd half and fall through to handle it as an f64 value.
3291 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3292 DAG.getConstant(1, dl, MVT::i32));
3293 }
3294 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3295 // available.
3296 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3297 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3298 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(),
3299 N: fmrrd.getValue(R: isLittleEndian ? 0 : 1), Glue);
3300 Glue = Chain.getValue(R: 1);
3301 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3302 VA = RVLocs[++i]; // skip ahead to next loc
3303 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(),
3304 N: fmrrd.getValue(R: isLittleEndian ? 1 : 0), Glue);
3305 } else
3306 Chain = DAG.getCopyToReg(Chain, dl, Reg: VA.getLocReg(), N: Arg, Glue);
3307
3308 // Guarantee that all emitted copies are glued together, so that nothing
3309 // can be scheduled between them and the return.
3310 Glue = Chain.getValue(R: 1);
3311 RetOps.push_back(Elt: DAG.getRegister(
3312 Reg: VA.getLocReg(), VT: ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3313 }
3314 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3315 const MCPhysReg *I =
3316 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3317 if (I) {
3318 for (; *I; ++I) {
3319 if (ARM::GPRRegClass.contains(*I))
3320 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3321 else if (ARM::DPRRegClass.contains(*I))
3322 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
3323 else
3324 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3325 }
3326 }
3327
3328 // Update chain and glue.
3329 RetOps[0] = Chain;
3330 if (Glue.getNode())
3331 RetOps.push_back(Elt: Glue);
3332
3333 // CPUs which aren't M-class use a special sequence to return from
3334 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3335 // though we use "subs pc, lr, #N").
3336 //
3337 // M-class CPUs actually use a normal return sequence with a special
3338 // (hardware-provided) value in LR, so the normal code path works.
3339 if (DAG.getMachineFunction().getFunction().hasFnAttribute(Kind: "interrupt") &&
3340 !Subtarget->isMClass()) {
3341 if (Subtarget->isThumb1Only())
3342 report_fatal_error(reason: "interrupt attribute is not supported in Thumb1");
3343 return LowerInterruptReturn(RetOps, DL: dl, DAG);
3344 }
3345
3346 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE :
3347 ARMISD::RET_GLUE;
3348 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3349}
3350
3351bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3352 if (N->getNumValues() != 1)
3353 return false;
3354 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
3355 return false;
3356
3357 SDValue TCChain = Chain;
3358 SDNode *Copy = *N->use_begin();
3359 if (Copy->getOpcode() == ISD::CopyToReg) {
3360 // If the copy has a glue operand, we conservatively assume it isn't safe to
3361 // perform a tail call.
3362 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3363 return false;
3364 TCChain = Copy->getOperand(Num: 0);
3365 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3366 SDNode *VMov = Copy;
3367 // f64 returned in a pair of GPRs.
3368 SmallPtrSet<SDNode*, 2> Copies;
3369 for (SDNode *U : VMov->uses()) {
3370 if (U->getOpcode() != ISD::CopyToReg)
3371 return false;
3372 Copies.insert(Ptr: U);
3373 }
3374 if (Copies.size() > 2)
3375 return false;
3376
3377 for (SDNode *U : VMov->uses()) {
3378 SDValue UseChain = U->getOperand(Num: 0);
3379 if (Copies.count(Ptr: UseChain.getNode()))
3380 // Second CopyToReg
3381 Copy = U;
3382 else {
3383 // We are at the top of this chain.
3384 // If the copy has a glue operand, we conservatively assume it
3385 // isn't safe to perform a tail call.
3386 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3387 return false;
3388 // First CopyToReg
3389 TCChain = UseChain;
3390 }
3391 }
3392 } else if (Copy->getOpcode() == ISD::BITCAST) {
3393 // f32 returned in a single GPR.
3394 if (!Copy->hasOneUse())
3395 return false;
3396 Copy = *Copy->use_begin();
3397 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(NUses: 1, Value: 0))
3398 return false;
3399 // If the copy has a glue operand, we conservatively assume it isn't safe to
3400 // perform a tail call.
3401 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3402 return false;
3403 TCChain = Copy->getOperand(Num: 0);
3404 } else {
3405 return false;
3406 }
3407
3408 bool HasRet = false;
3409 for (const SDNode *U : Copy->uses()) {
3410 if (U->getOpcode() != ARMISD::RET_GLUE &&
3411 U->getOpcode() != ARMISD::INTRET_GLUE)
3412 return false;
3413 HasRet = true;
3414 }
3415
3416 if (!HasRet)
3417 return false;
3418
3419 Chain = TCChain;
3420 return true;
3421}
3422
3423bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3424 if (!Subtarget->supportsTailCall())
3425 return false;
3426
3427 if (!CI->isTailCall())
3428 return false;
3429
3430 return true;
3431}
3432
3433 // Trying to write a 64-bit value, so we need to split it into two 32-bit
3434 // values first, and then pass the low and high parts through.
3435static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3436 SDLoc DL(Op);
3437 SDValue WriteValue = Op->getOperand(Num: 2);
3438
3439 // This function is only supposed to be called for i64 type argument.
3440 assert(WriteValue.getValueType() == MVT::i64
3441 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3442
3443 SDValue Lo, Hi;
3444 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3445 SDValue Ops[] = { Op->getOperand(Num: 0), Op->getOperand(Num: 1), Lo, Hi };
3446 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3447}
3448
3449// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3450 // their target counterparts wrapped in the ARMISD::Wrapper node. Suppose N is
3451// one of the above mentioned nodes. It has to be wrapped because otherwise
3452// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3453 // be used to form addressing modes. These wrapped nodes will be selected
3454// into MOVi.
3455SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3456 SelectionDAG &DAG) const {
3457 EVT PtrVT = Op.getValueType();
3458 // FIXME there is no actual debug info here
3459 SDLoc dl(Op);
3460 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
3461 SDValue Res;
3462
3463 // When generating execute-only code Constant Pools must be promoted to the
3464 // global data section. It's a bit ugly that we can't share them across basic
3465 // blocks, but this way we guarantee that execute-only behaves correctly with
3466 // position-independent addressing modes.
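// The promoted constant gets a private per-function name (private global
// prefix + "CP" + function number + "_" + a unique id, e.g. ".LCP1_2" on ELF),
// so entries from different functions never collide.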
3467 if (Subtarget->genExecuteOnly()) {
3468 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3469 auto T = const_cast<Type*>(CP->getType());
3470 auto C = const_cast<Constant*>(CP->getConstVal());
3471 auto M = const_cast<Module*>(DAG.getMachineFunction().
3472 getFunction().getParent());
3473 auto GV = new GlobalVariable(
3474 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3475 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3476 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3477 Twine(AFI->createPICLabelUId())
3478 );
3479 SDValue GA = DAG.getTargetGlobalAddress(GV: dyn_cast<GlobalValue>(Val: GV),
3480 DL: dl, VT: PtrVT);
3481 return LowerGlobalAddress(Op: GA, DAG);
3482 }
3483
3484 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3485 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3486 Align CPAlign = CP->getAlign();
3487 if (Subtarget->isThumb1Only())
3488 CPAlign = std::max(a: CPAlign, b: Align(4));
3489 if (CP->isMachineConstantPoolEntry())
3490 Res =
3491 DAG.getTargetConstantPool(C: CP->getMachineCPVal(), VT: PtrVT, Align: CPAlign);
3492 else
3493 Res = DAG.getTargetConstantPool(C: CP->getConstVal(), VT: PtrVT, Align: CPAlign);
3494 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3495}
3496
3497unsigned ARMTargetLowering::getJumpTableEncoding() const {
3498 // If we don't have a 32-bit pc-relative branch instruction then the jump
3499 // table consists of block addresses. Usually this is inline, but for
3500 // execute-only it must be placed out-of-line.
3501 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3502 return MachineJumpTableInfo::EK_BlockAddress;
3503 return MachineJumpTableInfo::EK_Inline;
3504}
3505
3506SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3507 SelectionDAG &DAG) const {
3508 MachineFunction &MF = DAG.getMachineFunction();
3509 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3510 unsigned ARMPCLabelIndex = 0;
3511 SDLoc DL(Op);
3512 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3513 const BlockAddress *BA = cast<BlockAddressSDNode>(Val&: Op)->getBlockAddress();
3514 SDValue CPAddr;
3515 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3516 if (!IsPositionIndependent) {
3517 CPAddr = DAG.getTargetConstantPool(C: BA, VT: PtrVT, Align: Align(4));
3518 } else {
3519 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3520 ARMPCLabelIndex = AFI->createPICLabelUId();
3521 ARMConstantPoolValue *CPV =
3522 ARMConstantPoolConstant::Create(C: BA, ID: ARMPCLabelIndex,
3523 Kind: ARMCP::CPBlockAddress, PCAdj);
3524 CPAddr = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
3525 }
3526 CPAddr = DAG.getNode(Opcode: ARMISD::Wrapper, DL, VT: PtrVT, Operand: CPAddr);
3527 SDValue Result = DAG.getLoad(
3528 VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: CPAddr,
3529 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3530 if (!IsPositionIndependent)
3531 return Result;
3532 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3533 return DAG.getNode(Opcode: ARMISD::PIC_ADD, DL, VT: PtrVT, N1: Result, N2: PICLabel);
3534}
3535
3536/// Convert a TLS address reference into the correct sequence of loads
3537/// and calls to compute the variable's address for Darwin, and return an
3538/// SDValue containing the final node.
3539
3540/// Darwin only has one TLS scheme which must be capable of dealing with the
3541/// fully general situation, in the worst case. This means:
3542/// + "extern __thread" declaration.
3543/// + Defined in a possibly unknown dynamic library.
3544///
3545/// The general system is that each __thread variable has a [3 x i32] descriptor
3546/// which contains information used by the runtime to calculate the address. The
3547/// only part of this the compiler needs to know about is the first word, which
3548/// contains a function pointer that must be called with the address of the
3549/// entire descriptor in "r0".
3550///
3551/// Since this descriptor may be in a different unit, in general access must
3552/// proceed along the usual ARM rules. A common sequence to produce is:
3553///
3554/// movw rT1, :lower16:_var$non_lazy_ptr
3555/// movt rT1, :upper16:_var$non_lazy_ptr
3556/// ldr r0, [rT1]
3557/// ldr rT2, [r0]
3558/// blx rT2
3559/// [...address now in r0...]
3560SDValue
3561ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3562 SelectionDAG &DAG) const {
3563 assert(Subtarget->isTargetDarwin() &&
3564 "This function expects a Darwin target");
3565 SDLoc DL(Op);
3566
3567 // First step is to get the address of the actual global symbol. This is where
3568 // the TLS descriptor lives.
3569 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3570
3571 // The first entry in the descriptor is a function pointer that we must call
3572 // to obtain the address of the variable.
3573 SDValue Chain = DAG.getEntryNode();
3574 SDValue FuncTLVGet = DAG.getLoad(
3575 MVT::i32, DL, Chain, DescAddr,
3576 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3577 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3578 MachineMemOperand::MOInvariant);
3579 Chain = FuncTLVGet.getValue(R: 1);
3580
3581 MachineFunction &F = DAG.getMachineFunction();
3582 MachineFrameInfo &MFI = F.getFrameInfo();
3583 MFI.setAdjustsStack(true);
3584
3585 // TLS calls preserve all registers except those that absolutely must be
3586 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3587 // silly).
3588 auto TRI =
3589 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3590 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3591 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3592
3593 // Finally, we can make the call. This is just a degenerate version of a
3594 // normal ARM call node: r0 takes the address of the descriptor, and
3595 // returns the address of the variable in this thread.
3596 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3597 Chain =
3598 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3599 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3600 DAG.getRegisterMask(Mask), Chain.getValue(1));
3601 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3602}
3603
3604SDValue
3605ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3606 SelectionDAG &DAG) const {
3607 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3608
3609 SDValue Chain = DAG.getEntryNode();
3610 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3611 SDLoc DL(Op);
3612
3613 // Load the current TEB (thread environment block)
3614 SDValue Ops[] = {Chain,
3615 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3616 DAG.getTargetConstant(15, DL, MVT::i32),
3617 DAG.getTargetConstant(0, DL, MVT::i32),
3618 DAG.getTargetConstant(13, DL, MVT::i32),
3619 DAG.getTargetConstant(0, DL, MVT::i32),
3620 DAG.getTargetConstant(2, DL, MVT::i32)};
3621 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3622 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3623
3624 SDValue TEB = CurrentTEB.getValue(R: 0);
3625 Chain = CurrentTEB.getValue(R: 1);
3626
3627 // Load the ThreadLocalStoragePointer from the TEB
3628 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3629 SDValue TLSArray =
3630 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x2c, DL));
3631 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
3632
3633 // The pointer to the thread's TLS data area is located at offset
3634 // TLSIndex * 4 into the TLS array.
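// In other words, this module's TLS base is *(*(TEB + 0x2c) + _tls_index * 4),
// which the loads below compute step by step.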
3635
3636 // Load the TLS index from the C runtime
3637 SDValue TLSIndex =
3638 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: ARMII::MO_NO_FLAG);
3639 TLSIndex = DAG.getNode(Opcode: ARMISD::Wrapper, DL, VT: PtrVT, Operand: TLSIndex);
3640 TLSIndex = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSIndex, PtrInfo: MachinePointerInfo());
3641
3642 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3643 DAG.getConstant(2, DL, MVT::i32));
3644 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
3645 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
3646 PtrInfo: MachinePointerInfo());
3647
3648 // Get the offset of the start of the .tls section (section base)
3649 const auto *GA = cast<GlobalAddressSDNode>(Val&: Op);
3650 auto *CPV = ARMConstantPoolConstant::Create(GV: GA->getGlobal(), Modifier: ARMCP::SECREL);
3651 SDValue Offset = DAG.getLoad(
3652 PtrVT, DL, Chain,
3653 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3654 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3655 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3656
3657 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLS, N2: Offset);
3658}
3659
3660// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3661SDValue
3662ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3663 SelectionDAG &DAG) const {
3664 SDLoc dl(GA);
3665 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3666 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3667 MachineFunction &MF = DAG.getMachineFunction();
3668 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3669 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3670 ARMConstantPoolValue *CPV =
3671 ARMConstantPoolConstant::Create(C: GA->getGlobal(), ID: ARMPCLabelIndex,
3672 Kind: ARMCP::CPValue, PCAdj, Modifier: ARMCP::TLSGD, AddCurrentAddress: true);
3673 SDValue Argument = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
3674 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3675 Argument = DAG.getLoad(
3676 VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: Argument,
3677 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3678 SDValue Chain = Argument.getValue(R: 1);
3679
3680 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3681 Argument = DAG.getNode(Opcode: ARMISD::PIC_ADD, DL: dl, VT: PtrVT, N1: Argument, N2: PICLabel);
3682
3683 // call __tls_get_addr.
3684 ArgListTy Args;
3685 ArgListEntry Entry;
3686 Entry.Node = Argument;
3687 Entry.Ty = (Type *) Type::getInt32Ty(C&: *DAG.getContext());
3688 Args.push_back(x: Entry);
3689
3690 // FIXME: is there useful debug info available here?
3691 TargetLowering::CallLoweringInfo CLI(DAG);
3692 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3693 CC: CallingConv::C, ResultType: Type::getInt32Ty(C&: *DAG.getContext()),
3694 Target: DAG.getExternalSymbol(Sym: "__tls_get_addr", VT: PtrVT), ArgsList: std::move(Args));
3695
3696 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3697 return CallResult.first;
3698}
3699
3700// Lower ISD::GlobalTLSAddress using the "initial exec" or
3701// "local exec" model.
3702SDValue
3703ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3704 SelectionDAG &DAG,
3705 TLSModel::Model model) const {
3706 const GlobalValue *GV = GA->getGlobal();
3707 SDLoc dl(GA);
3708 SDValue Offset;
3709 SDValue Chain = DAG.getEntryNode();
3710 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3711 // Get the Thread Pointer
3712 SDValue ThreadPointer = DAG.getNode(Opcode: ARMISD::THREAD_POINTER, DL: dl, VT: PtrVT);
3713
3714 if (model == TLSModel::InitialExec) {
3715 MachineFunction &MF = DAG.getMachineFunction();
3716 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3717 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3718 // Initial exec model.
3719 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3720 ARMConstantPoolValue *CPV =
3721 ARMConstantPoolConstant::Create(C: GA->getGlobal(), ID: ARMPCLabelIndex,
3722 Kind: ARMCP::CPValue, PCAdj, Modifier: ARMCP::GOTTPOFF,
3723 AddCurrentAddress: true);
3724 Offset = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
3725 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3726 Offset = DAG.getLoad(
3727 VT: PtrVT, dl, Chain, Ptr: Offset,
3728 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3729 Chain = Offset.getValue(R: 1);
3730
3731 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3732 Offset = DAG.getNode(Opcode: ARMISD::PIC_ADD, DL: dl, VT: PtrVT, N1: Offset, N2: PICLabel);
3733
3734 Offset = DAG.getLoad(
3735 VT: PtrVT, dl, Chain, Ptr: Offset,
3736 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3737 } else {
3738 // local exec model
3739 assert(model == TLSModel::LocalExec);
3740 ARMConstantPoolValue *CPV =
3741 ARMConstantPoolConstant::Create(GV, Modifier: ARMCP::TPOFF);
3742 Offset = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
3743 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3744 Offset = DAG.getLoad(
3745 VT: PtrVT, dl, Chain, Ptr: Offset,
3746 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3747 }
3748
3749 // The address of the thread local variable is the sum of the thread
3750 // pointer and the offset of the variable.
3751 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: ThreadPointer, N2: Offset);
3752}
3753
3754SDValue
3755ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3756 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
3757 if (DAG.getTarget().useEmulatedTLS())
3758 return LowerToTLSEmulatedModel(GA, DAG);
3759
3760 if (Subtarget->isTargetDarwin())
3761 return LowerGlobalTLSAddressDarwin(Op, DAG);
3762
3763 if (Subtarget->isTargetWindows())
3764 return LowerGlobalTLSAddressWindows(Op, DAG);
3765
3766 // TODO: implement the "local dynamic" model
3767 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3768 TLSModel::Model model = getTargetMachine().getTLSModel(GV: GA->getGlobal());
3769
3770 switch (model) {
3771 case TLSModel::GeneralDynamic:
3772 case TLSModel::LocalDynamic:
3773 return LowerToTLSGeneralDynamicModel(GA, DAG);
3774 case TLSModel::InitialExec:
3775 case TLSModel::LocalExec:
3776 return LowerToTLSExecModels(GA, DAG, model);
3777 }
3778 llvm_unreachable("bogus TLS model");
3779}
3780
3781/// Return true if all users of V are within function F, looking through
3782/// ConstantExprs.
3783static bool allUsersAreInFunction(const Value *V, const Function *F) {
3784 SmallVector<const User*,4> Worklist(V->users());
3785 while (!Worklist.empty()) {
3786 auto *U = Worklist.pop_back_val();
3787 if (isa<ConstantExpr>(Val: U)) {
3788 append_range(C&: Worklist, R: U->users());
3789 continue;
3790 }
3791
3792 auto *I = dyn_cast<Instruction>(Val: U);
3793 if (!I || I->getParent()->getParent() != F)
3794 return false;
3795 }
3796 return true;
3797}
3798
3799static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3800 const GlobalValue *GV, SelectionDAG &DAG,
3801 EVT PtrVT, const SDLoc &dl) {
3802 // If we're creating a pool entry for a constant global with unnamed address,
3803 // and the global is small enough, we can emit it inline into the constant pool
3804 // to save ourselves an indirection.
3805 //
3806 // This is a win if the constant is only used in one function (so it doesn't
3807 // need to be duplicated) or duplicating the constant wouldn't increase code
3808 // size (implying the constant is no larger than 4 bytes).
3809 const Function &F = DAG.getMachineFunction().getFunction();
3810
3811 // We rely on this decision to inline being idempotent and unrelated to the
3812 // use-site. We know that if we inline a variable at one use site, we'll
3813 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3814 // doesn't know about this optimization, so bail out if it's enabled; otherwise
3815 // we could decide to inline here (and thus never emit the GV) while fast-isel
3816 // generated code still requires the GV.
3817 if (!EnableConstpoolPromotion ||
3818 DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3819 return SDValue();
3820
3821 auto *GVar = dyn_cast<GlobalVariable>(Val: GV);
3822 if (!GVar || !GVar->hasInitializer() ||
3823 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3824 !GVar->hasLocalLinkage())
3825 return SDValue();
3826
3827 // If we inline a value that contains relocations, we move the relocations
3828 // from .data to .text. This is not allowed in position-independent code.
3829 auto *Init = GVar->getInitializer();
3830 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3831 Init->needsDynamicRelocation())
3832 return SDValue();
3833
3834 // The constant islands pass can only really deal with alignment requests
3835 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3836 // any type wanting greater alignment requirements than 4 bytes. We also
3837 // can only promote constants that are multiples of 4 bytes in size or
3838 // are paddable to a multiple of 4. Currently we only try and pad constants
3839 // that are strings for simplicity.
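// For example, a 6-byte string initializer is padded with two zero bytes to 8
// bytes before being placed in the constant pool.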
3840 auto *CDAInit = dyn_cast<ConstantDataArray>(Val: Init);
3841 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Ty: Init->getType());
3842 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GV: GVar);
3843 unsigned RequiredPadding = 4 - (Size % 4);
3844 bool PaddingPossible =
3845 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3846 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3847 Size == 0)
3848 return SDValue();
3849
3850 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3851 MachineFunction &MF = DAG.getMachineFunction();
3852 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3853
3854 // We can't bloat the constant pool too much, else the ConstantIslands pass
3855 // may fail to converge. If we haven't promoted this global yet (it may have
3856 // multiple uses), and promoting it would increase the constant pool size
3857 // (Size > 4), ensure we have space to do so up to ConstpoolPromotionMaxTotal.
3858 if (!AFI->getGlobalsPromotedToConstantPool().count(Ptr: GVar) && Size > 4)
3859 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3860 ConstpoolPromotionMaxTotal)
3861 return SDValue();
3862
3863 // This is only valid if all users are in a single function; we can't clone
3864 // the constant in general. The LLVM IR unnamed_addr allows merging
3865 // constants, but not cloning them.
3866 //
3867 // We could potentially allow cloning if we could prove all uses of the
3868 // constant in the current function don't care about the address, like
3869 // printf format strings. But that isn't implemented for now.
3870 if (!allUsersAreInFunction(V: GVar, F: &F))
3871 return SDValue();
3872
3873 // We're going to inline this global. Pad it out if needed.
3874 if (RequiredPadding != 4) {
3875 StringRef S = CDAInit->getAsString();
3876
3877 SmallVector<uint8_t,16> V(S.size());
3878 std::copy(first: S.bytes_begin(), last: S.bytes_end(), result: V.begin());
3879 while (RequiredPadding--)
3880 V.push_back(Elt: 0);
3881 Init = ConstantDataArray::get(Context&: *DAG.getContext(), Elts&: V);
3882 }
3883
3884 auto CPVal = ARMConstantPoolConstant::Create(GV: GVar, Initializer: Init);
3885 SDValue CPAddr = DAG.getTargetConstantPool(C: CPVal, VT: PtrVT, Align: Align(4));
3886 if (!AFI->getGlobalsPromotedToConstantPool().count(Ptr: GVar)) {
3887 AFI->markGlobalAsPromotedToConstantPool(GV: GVar);
3888 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3889 PaddedSize - 4);
3890 }
3891 ++NumConstpoolPromoted;
3892 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3893}
3894
3895bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3896 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Val: GV))
3897 if (!(GV = GA->getAliaseeObject()))
3898 return false;
3899 if (const auto *V = dyn_cast<GlobalVariable>(Val: GV))
3900 return V->isConstant();
3901 return isa<Function>(Val: GV);
3902}
3903
3904SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3905 SelectionDAG &DAG) const {
3906 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3907 default: llvm_unreachable("unknown object format");
3908 case Triple::COFF:
3909 return LowerGlobalAddressWindows(Op, DAG);
3910 case Triple::ELF:
3911 return LowerGlobalAddressELF(Op, DAG);
3912 case Triple::MachO:
3913 return LowerGlobalAddressDarwin(Op, DAG);
3914 }
3915}
3916
3917SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3918 SelectionDAG &DAG) const {
3919 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3920 SDLoc dl(Op);
3921 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
3922 bool IsRO = isReadOnly(GV);
3923
3924  // Only promote to a constant pool when not generating an execute-only (XO)
3924  // text section.
3925 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3926 if (SDValue V = promoteToConstantPool(TLI: this, GV, DAG, PtrVT, dl))
3927 return V;
3928
3929 if (isPositionIndependent()) {
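    // Under PIC, DSO-local globals can be addressed PC-relatively; anything
    // else is referenced through a GOT entry, loaded below.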
3930 SDValue G = DAG.getTargetGlobalAddress(
3931 GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3932 SDValue Result = DAG.getNode(Opcode: ARMISD::WrapperPIC, DL: dl, VT: PtrVT, Operand: G);
3933 if (!GV->isDSOLocal())
3934 Result =
3935 DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: Result,
3936 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
3937 return Result;
3938 } else if (Subtarget->isROPI() && IsRO) {
3939 // PC-relative.
3940 SDValue G = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT);
3941 SDValue Result = DAG.getNode(Opcode: ARMISD::WrapperPIC, DL: dl, VT: PtrVT, Operand: G);
3942 return Result;
3943 } else if (Subtarget->isRWPI() && !IsRO) {
3944 // SB-relative.
3945 SDValue RelAddr;
3946 if (Subtarget->useMovt()) {
3947 ++NumMovwMovt;
3948 SDValue G = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: ARMII::MO_SBREL);
3949 RelAddr = DAG.getNode(Opcode: ARMISD::Wrapper, DL: dl, VT: PtrVT, Operand: G);
3950 } else { // use literal pool for address constant
3951 ARMConstantPoolValue *CPV =
3952 ARMConstantPoolConstant::Create(GV, Modifier: ARMCP::SBREL);
3953 SDValue CPAddr = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
3954 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3955 RelAddr = DAG.getLoad(
3956 VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: CPAddr,
3957 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3958 }
3959 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3960 SDValue Result = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: SB, N2: RelAddr);
3961 return Result;
3962 }
3963
3964  // If we have T2 ops, we can materialize the address directly via a movw/movt
3965  // pair. This is always cheaper. If we need to generate Execute Only code and
3966  // only have Thumb1 available, we can't use a constant pool and are forced to
3967  // use immediate relocations.
3968 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3969 if (Subtarget->useMovt())
3970 ++NumMovwMovt;
3971 // FIXME: Once remat is capable of dealing with instructions with register
3972 // operands, expand this into two nodes.
3973 return DAG.getNode(Opcode: ARMISD::Wrapper, DL: dl, VT: PtrVT,
3974 Operand: DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT));
3975 } else {
3976 SDValue CPAddr = DAG.getTargetConstantPool(C: GV, VT: PtrVT, Align: Align(4));
3977 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3978 return DAG.getLoad(
3979 VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: CPAddr,
3980 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
3981 }
3982}
3983
3984SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3985 SelectionDAG &DAG) const {
3986 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3987 "ROPI/RWPI not currently supported for Darwin");
3988 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
3989 SDLoc dl(Op);
3990 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
3991
3992 if (Subtarget->useMovt())
3993 ++NumMovwMovt;
3994
3995 // FIXME: Once remat is capable of dealing with instructions with register
3996  // operands, expand this into multiple nodes.
3997 unsigned Wrapper =
3998 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3999
4000 SDValue G = DAG.getTargetGlobalAddress(GV, DL: dl, VT: PtrVT, offset: 0, TargetFlags: ARMII::MO_NONLAZY);
4001 SDValue Result = DAG.getNode(Opcode: Wrapper, DL: dl, VT: PtrVT, Operand: G);
4002
4003 if (Subtarget->isGVIndirectSymbol(GV))
4004 Result = DAG.getLoad(VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: Result,
4005 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
4006 return Result;
4007}
4008
4009SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
4010 SelectionDAG &DAG) const {
4011 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
4012 assert(Subtarget->useMovt() &&
4013 "Windows on ARM expects to use movw/movt");
4014 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
4015 "ROPI/RWPI not currently supported for Windows");
4016
4017 const TargetMachine &TM = getTargetMachine();
4018 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
4019 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
4020 if (GV->hasDLLImportStorageClass())
4021 TargetFlags = ARMII::MO_DLLIMPORT;
4022 else if (!TM.shouldAssumeDSOLocal(GV))
4023 TargetFlags = ARMII::MO_COFFSTUB;
4024 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
4025 SDValue Result;
4026 SDLoc DL(Op);
4027
4028 ++NumMovwMovt;
4029
4030 // FIXME: Once remat is capable of dealing with instructions with register
4031 // operands, expand this into two nodes.
4032 Result = DAG.getNode(Opcode: ARMISD::Wrapper, DL, VT: PtrVT,
4033 Operand: DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, /*offset=*/0,
4034 TargetFlags));
4035 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
4036 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
4037 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
4038 return Result;
4039}
4040
4041SDValue
4042ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
4043 SDLoc dl(Op);
4044 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
4045 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
4046 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
4047 Op.getOperand(1), Val);
4048}
4049
4050SDValue
4051ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
4052 SDLoc dl(Op);
4053 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
4054 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
4055}
4056
4057SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
4058 SelectionDAG &DAG) const {
4059 SDLoc dl(Op);
4060 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
4061 Op.getOperand(0));
4062}
4063
4064SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
4065 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
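  // The intrinsic ID is operand 0, unless operand 0 is the chain (MVT::Other),
  // in which case it is operand 1; the comparison below computes that index.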
4066 unsigned IntNo =
4067 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
4068 switch (IntNo) {
4069 default:
4070 return SDValue(); // Don't custom lower most intrinsics.
4071 case Intrinsic::arm_gnu_eabi_mcount: {
4072 MachineFunction &MF = DAG.getMachineFunction();
4073 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
4074 SDLoc dl(Op);
4075 SDValue Chain = Op.getOperand(i: 0);
4076 // call "\01__gnu_mcount_nc"
4077 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
4078 const uint32_t *Mask =
4079 ARI->getCallPreservedMask(MF: DAG.getMachineFunction(), CallingConv::C);
4080 assert(Mask && "Missing call preserved mask for calling convention");
4081    // Mark LR as an implicit live-in.
4082 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
4083 SDValue ReturnAddress =
4084 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg, VT: PtrVT);
4085 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
4086 SDValue Callee =
4087 DAG.getTargetExternalSymbol(Sym: "\01__gnu_mcount_nc", VT: PtrVT, TargetFlags: 0);
4088 SDValue RegisterMask = DAG.getRegisterMask(RegMask: Mask);
4089 if (Subtarget->isThumb())
4090 return SDValue(
4091 DAG.getMachineNode(
4092 ARM::tBL_PUSHLR, dl, ResultTys,
4093 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
4094 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
4095 0);
4096 return SDValue(
4097 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
4098 {ReturnAddress, Callee, RegisterMask, Chain}),
4099 0);
4100 }
4101 }
4102}
4103
4104SDValue
4105ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
4106 const ARMSubtarget *Subtarget) const {
4107 unsigned IntNo = Op.getConstantOperandVal(i: 0);
4108 SDLoc dl(Op);
4109 switch (IntNo) {
4110 default: return SDValue(); // Don't custom lower most intrinsics.
4111 case Intrinsic::thread_pointer: {
4112 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
4113 return DAG.getNode(Opcode: ARMISD::THREAD_POINTER, DL: dl, VT: PtrVT);
4114 }
4115 case Intrinsic::arm_cls: {
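    // Lower cls(x) as clz(((x ^ (x >> 31)) << 1) | 1): XORing with the
    // sign-spread turns leading sign bits into leading zeros, and the trailing
    // set bit caps the count at 31 (the result for 0 and -1).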
4116 const SDValue &Operand = Op.getOperand(i: 1);
4117 const EVT VTy = Op.getValueType();
4118 SDValue SRA =
4119 DAG.getNode(Opcode: ISD::SRA, DL: dl, VT: VTy, N1: Operand, N2: DAG.getConstant(Val: 31, DL: dl, VT: VTy));
4120 SDValue XOR = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: VTy, N1: SRA, N2: Operand);
4121 SDValue SHL =
4122 DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: VTy, N1: XOR, N2: DAG.getConstant(Val: 1, DL: dl, VT: VTy));
4123 SDValue OR =
4124 DAG.getNode(Opcode: ISD::OR, DL: dl, VT: VTy, N1: SHL, N2: DAG.getConstant(Val: 1, DL: dl, VT: VTy));
4125 SDValue Result = DAG.getNode(Opcode: ISD::CTLZ, DL: dl, VT: VTy, Operand: OR);
4126 return Result;
4127 }
4128 case Intrinsic::arm_cls64: {
4129 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
4130 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
4131 const SDValue &Operand = Op.getOperand(i: 1);
4132 const EVT VTy = Op.getValueType();
4133 SDValue Lo, Hi;
4134 std::tie(args&: Lo, args&: Hi) = DAG.SplitScalar(N: Operand, DL: dl, LoVT: VTy, HiVT: VTy);
4135 SDValue Constant0 = DAG.getConstant(Val: 0, DL: dl, VT: VTy);
4136 SDValue Constant1 = DAG.getConstant(Val: 1, DL: dl, VT: VTy);
4137 SDValue Constant31 = DAG.getConstant(Val: 31, DL: dl, VT: VTy);
4138 SDValue SRAHi = DAG.getNode(Opcode: ISD::SRA, DL: dl, VT: VTy, N1: Hi, N2: Constant31);
4139 SDValue XORHi = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: VTy, N1: SRAHi, N2: Hi);
4140 SDValue SHLHi = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: VTy, N1: XORHi, N2: Constant1);
4141 SDValue ORHi = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: VTy, N1: SHLHi, N2: Constant1);
4142 SDValue CLSHi = DAG.getNode(Opcode: ISD::CTLZ, DL: dl, VT: VTy, Operand: ORHi);
4143 SDValue CheckLo =
4144 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
4145 SDValue HiIsZero =
4146 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
4147 SDValue AdjustedLo =
4148 DAG.getSelect(DL: dl, VT: VTy, Cond: HiIsZero, LHS: Lo, RHS: DAG.getNOT(DL: dl, Val: Lo, VT: VTy));
4149 SDValue CLZAdjustedLo = DAG.getNode(Opcode: ISD::CTLZ, DL: dl, VT: VTy, Operand: AdjustedLo);
4150 SDValue Result =
4151 DAG.getSelect(DL: dl, VT: VTy, Cond: CheckLo,
4152 LHS: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: VTy, N1: CLZAdjustedLo, N2: Constant31), RHS: CLSHi);
4153 return Result;
4154 }
4155 case Intrinsic::eh_sjlj_lsda: {
4156 MachineFunction &MF = DAG.getMachineFunction();
4157 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4158 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
4159 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
4160 SDValue CPAddr;
4161 bool IsPositionIndependent = isPositionIndependent();
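    // The PIC add reads the PC ahead of the instruction itself (+8 in ARM
    // mode, +4 in Thumb), so bias the constant-pool entry accordingly.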
4162 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
4163 ARMConstantPoolValue *CPV =
4164 ARMConstantPoolConstant::Create(C: &MF.getFunction(), ID: ARMPCLabelIndex,
4165 Kind: ARMCP::CPLSDA, PCAdj);
4166 CPAddr = DAG.getTargetConstantPool(C: CPV, VT: PtrVT, Align: Align(4));
4167 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
4168 SDValue Result = DAG.getLoad(
4169 VT: PtrVT, dl, Chain: DAG.getEntryNode(), Ptr: CPAddr,
4170 PtrInfo: MachinePointerInfo::getConstantPool(MF&: DAG.getMachineFunction()));
4171
4172 if (IsPositionIndependent) {
4173 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
4174 Result = DAG.getNode(Opcode: ARMISD::PIC_ADD, DL: dl, VT: PtrVT, N1: Result, N2: PICLabel);
4175 }
4176 return Result;
4177 }
4178 case Intrinsic::arm_neon_vabs:
4179 return DAG.getNode(Opcode: ISD::ABS, DL: SDLoc(Op), VT: Op.getValueType(),
4180 Operand: Op.getOperand(i: 1));
4181 case Intrinsic::arm_neon_vmulls:
4182 case Intrinsic::arm_neon_vmullu: {
4183 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
4184 ? ARMISD::VMULLs : ARMISD::VMULLu;
4185 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(Op), VT: Op.getValueType(),
4186 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4187 }
4188 case Intrinsic::arm_neon_vminnm:
4189 case Intrinsic::arm_neon_vmaxnm: {
4190 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
4191 ? ISD::FMINNUM : ISD::FMAXNUM;
4192 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(Op), VT: Op.getValueType(),
4193 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4194 }
4195 case Intrinsic::arm_neon_vminu:
4196 case Intrinsic::arm_neon_vmaxu: {
4197 if (Op.getValueType().isFloatingPoint())
4198 return SDValue();
4199 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
4200 ? ISD::UMIN : ISD::UMAX;
4201 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(Op), VT: Op.getValueType(),
4202 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4203 }
4204 case Intrinsic::arm_neon_vmins:
4205 case Intrinsic::arm_neon_vmaxs: {
4206 // v{min,max}s is overloaded between signed integers and floats.
4207 if (!Op.getValueType().isFloatingPoint()) {
4208 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4209 ? ISD::SMIN : ISD::SMAX;
4210 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(Op), VT: Op.getValueType(),
4211 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4212 }
4213 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4214 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4215 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(Op), VT: Op.getValueType(),
4216 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4217 }
4218 case Intrinsic::arm_neon_vtbl1:
4219 return DAG.getNode(Opcode: ARMISD::VTBL1, DL: SDLoc(Op), VT: Op.getValueType(),
4220 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
4221 case Intrinsic::arm_neon_vtbl2:
4222 return DAG.getNode(Opcode: ARMISD::VTBL2, DL: SDLoc(Op), VT: Op.getValueType(),
4223 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
4224 case Intrinsic::arm_mve_pred_i2v:
4225 case Intrinsic::arm_mve_pred_v2i:
4226 return DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: SDLoc(Op), VT: Op.getValueType(),
4227 Operand: Op.getOperand(i: 1));
4228 case Intrinsic::arm_mve_vreinterpretq:
4229 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: SDLoc(Op), VT: Op.getValueType(),
4230 Operand: Op.getOperand(i: 1));
4231 case Intrinsic::arm_mve_lsll:
4232 return DAG.getNode(Opcode: ARMISD::LSLL, DL: SDLoc(Op), VTList: Op->getVTList(),
4233 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
4234 case Intrinsic::arm_mve_asrl:
4235 return DAG.getNode(Opcode: ARMISD::ASRL, DL: SDLoc(Op), VTList: Op->getVTList(),
4236 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
4237 }
4238}
4239
4240static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4241 const ARMSubtarget *Subtarget) {
4242 SDLoc dl(Op);
4243 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(i: 2));
4244 if (SSID == SyncScope::SingleThread)
4245 return Op;
4246
4247 if (!Subtarget->hasDataBarrier()) {
4248    // Some ARMv6 CPUs can support data barriers with an mcr instruction.
4249 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4250 // here.
4251 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4252 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4253 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4254 DAG.getConstant(0, dl, MVT::i32));
4255 }
4256
4257 AtomicOrdering Ord =
4258 static_cast<AtomicOrdering>(Op.getConstantOperandVal(i: 1));
4259 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4260 if (Subtarget->isMClass()) {
4261 // Only a full system barrier exists in the M-class architectures.
4262 Domain = ARM_MB::SY;
4263 } else if (Subtarget->preferISHSTBarriers() &&
4264 Ord == AtomicOrdering::Release) {
4265 // Swift happens to implement ISHST barriers in a way that's compatible with
4266    // Release semantics but weaker than ISH, so we'd be fools not to use
4267 // it. Beware: other processors probably don't!
4268 Domain = ARM_MB::ISHST;
4269 }
4270
4271 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4272 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4273 DAG.getConstant(Domain, dl, MVT::i32));
4274}
4275
4276static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4277 const ARMSubtarget *Subtarget) {
4278  // ARM pre-v5TE and Thumb1 do not have preload instructions.
4279 if (!(Subtarget->isThumb2() ||
4280 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4281 // Just preserve the chain.
4282 return Op.getOperand(i: 0);
4283
4284 SDLoc dl(Op);
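  // Operand 2 of the prefetch intrinsic is the rw flag (0 = read, 1 = write);
  // invert it so isRead is 1 for read prefetches.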
4285 unsigned isRead = ~Op.getConstantOperandVal(i: 2) & 1;
4286 if (!isRead &&
4287 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4288    // Only ARMv7 with the MP extension has PLDW; drop write prefetches otherwise.
4289 return Op.getOperand(i: 0);
4290
4291 unsigned isData = Op.getConstantOperandVal(i: 4);
4292 if (Subtarget->isThumb()) {
4293 // Invert the bits.
4294 isRead = ~isRead & 1;
4295 isData = ~isData & 1;
4296 }
4297
4298 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4299 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4300 DAG.getConstant(isData, dl, MVT::i32));
4301}
4302
4303static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4304 MachineFunction &MF = DAG.getMachineFunction();
4305 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4306
4307 // vastart just stores the address of the VarArgsFrameIndex slot into the
4308 // memory location argument.
4309 SDLoc dl(Op);
4310 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DL: DAG.getDataLayout());
4311 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFrameIndex(), VT: PtrVT);
4312 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
4313 return DAG.getStore(Chain: Op.getOperand(i: 0), dl, Val: FR, Ptr: Op.getOperand(i: 1),
4314 PtrInfo: MachinePointerInfo(SV));
4315}
4316
4317SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4318 CCValAssign &NextVA,
4319 SDValue &Root,
4320 SelectionDAG &DAG,
4321 const SDLoc &dl) const {
4322 MachineFunction &MF = DAG.getMachineFunction();
4323 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4324
4325 const TargetRegisterClass *RC;
4326 if (AFI->isThumb1OnlyFunction())
4327 RC = &ARM::tGPRRegClass;
4328 else
4329 RC = &ARM::GPRRegClass;
4330
4331 // Transform the arguments stored in physical registers into virtual ones.
4332 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4333 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4334
4335 SDValue ArgValue2;
4336 if (NextVA.isMemLoc()) {
4337 MachineFrameInfo &MFI = MF.getFrameInfo();
4338 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: NextVA.getLocMemOffset(), IsImmutable: true);
4339
4340 // Create load node to retrieve arguments from the stack.
4341 SDValue FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
4342 ArgValue2 = DAG.getLoad(
4343 MVT::i32, dl, Root, FIN,
4344 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4345 } else {
4346 Reg = MF.addLiveIn(PReg: NextVA.getLocReg(), RC);
4347 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4348 }
4349 if (!Subtarget->isLittle())
4350 std::swap (a&: ArgValue, b&: ArgValue2);
4351 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4352}
4353
4354// The remaining GPRs hold either the beginning of variable-argument
4355// data, or the beginning of an aggregate passed by value (usually
4356// byval). Either way, we allocate stack slots adjacent to the data
4357// provided by our caller, and store the unallocated registers there.
4358// If this is a variadic function, the va_list pointer will begin with
4359// these values; otherwise, this reassembles a (byval) structure that
4360// was split between registers and memory.
4361// Return: The frame index the registers were stored into.
4362int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4363 const SDLoc &dl, SDValue &Chain,
4364 const Value *OrigArg,
4365 unsigned InRegsParamRecordIdx,
4366 int ArgOffset, unsigned ArgSize) const {
4367  // Currently, two use-cases are possible:
4368  // Case #1. Non-var-args function, and we meet the first byval parameter.
4369  //          Set up the first unallocated register as the first byval register;
4370  //          eat all remaining registers
4371  //          (these two actions are performed by the HandleByVal method).
4372  //          Then, here, we initialize the stack frame with
4373  //          "store-reg" instructions.
4374  // Case #2. Var-args function that doesn't contain byval parameters.
4375  //          The same: eat all remaining unallocated registers and
4376  //          initialize the stack frame.
4377
4378 MachineFunction &MF = DAG.getMachineFunction();
4379 MachineFrameInfo &MFI = MF.getFrameInfo();
4380 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4381 unsigned RBegin, REnd;
4382 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4383 CCInfo.getInRegsParamInfo(InRegsParamRecordIndex: InRegsParamRecordIdx, BeginReg&: RBegin, EndReg&: REnd);
4384 } else {
4385 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4386 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4387 REnd = ARM::R4;
4388 }
4389
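  // The register save area sits immediately below the first stack-passed
  // argument, so when registers are being saved the frame object starts at a
  // negative offset of 4 bytes per saved register.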
4390 if (REnd != RBegin)
4391 ArgOffset = -4 * (ARM::R4 - RBegin);
4392
4393 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
4394 int FrameIndex = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: false);
4395 SDValue FIN = DAG.getFrameIndex(FI: FrameIndex, VT: PtrVT);
4396
4397 SmallVector<SDValue, 4> MemOps;
4398 const TargetRegisterClass *RC =
4399 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4400
4401 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4402 Register VReg = MF.addLiveIn(PReg: Reg, RC);
4403 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4404 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl, Val, Ptr: FIN,
4405 PtrInfo: MachinePointerInfo(OrigArg, 4 * i));
4406 MemOps.push_back(Elt: Store);
4407 FIN = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 4, DL: dl, VT: PtrVT));
4408 }
4409
4410 if (!MemOps.empty())
4411 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4412 return FrameIndex;
4413}
4414
4415// Set up the stack frame that the va_list pointer will start from.
4416void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4417 const SDLoc &dl, SDValue &Chain,
4418 unsigned ArgOffset,
4419 unsigned TotalArgRegsSaveSize,
4420 bool ForceMutable) const {
4421 MachineFunction &MF = DAG.getMachineFunction();
4422 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4423
4424 // Try to store any remaining integer argument regs
4425 // to their spots on the stack so that they may be loaded by dereferencing
4426 // the result of va_next.
4427  // If there are no regs to be stored, just point the address after the last
4428  // argument passed via the stack.
4429 int FrameIndex = StoreByValRegs(
4430 CCInfo, DAG, dl, Chain, OrigArg: nullptr, InRegsParamRecordIdx: CCInfo.getInRegsParamsCount(),
4431 ArgOffset: CCInfo.getStackSize(), ArgSize: std::max(a: 4U, b: TotalArgRegsSaveSize));
4432 AFI->setVarArgsFrameIndex(FrameIndex);
4433}
4434
4435bool ARMTargetLowering::splitValueIntoRegisterParts(
4436 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4437 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4438 EVT ValueVT = Val.getValueType();
4439 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
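    // A half-precision value passed in an f32 register occupies the low 16
    // bits of that register: bitcast to the narrow integer type, any-extend to
    // the part width, and bitcast the result back to f32.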
4440 unsigned ValueBits = ValueVT.getSizeInBits();
4441 unsigned PartBits = PartVT.getSizeInBits();
4442 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::getIntegerVT(BitWidth: ValueBits), Operand: Val);
4443 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::getIntegerVT(BitWidth: PartBits), Operand: Val);
4444 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PartVT, Operand: Val);
4445 Parts[0] = Val;
4446 return true;
4447 }
4448 return false;
4449}
4450
4451SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4452 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4453 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4454 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4455 unsigned ValueBits = ValueVT.getSizeInBits();
4456 unsigned PartBits = PartVT.getSizeInBits();
4457 SDValue Val = Parts[0];
4458
4459 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::getIntegerVT(BitWidth: PartBits), Operand: Val);
4460 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::getIntegerVT(BitWidth: ValueBits), Operand: Val);
4461 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValueVT, Operand: Val);
4462 return Val;
4463 }
4464 return SDValue();
4465}
4466
4467SDValue ARMTargetLowering::LowerFormalArguments(
4468 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4469 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4470 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4471 MachineFunction &MF = DAG.getMachineFunction();
4472 MachineFrameInfo &MFI = MF.getFrameInfo();
4473
4474 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4475
4476 // Assign locations to all of the incoming arguments.
4477 SmallVector<CCValAssign, 16> ArgLocs;
4478 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4479 *DAG.getContext());
4480 CCInfo.AnalyzeFormalArguments(Ins, Fn: CCAssignFnForCall(CC: CallConv, isVarArg));
4481
4482 SmallVector<SDValue, 16> ArgValues;
4483 SDValue ArgValue;
4484 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4485 unsigned CurArgIdx = 0;
4486
4487 // Initially ArgRegsSaveSize is zero.
4488  // Then we increase this value each time we meet a byval parameter.
4489  // We also increase this value in the case of a varargs function.
4490 AFI->setArgRegsSaveSize(0);
4491
4492 // Calculate the amount of stack space that we need to allocate to store
4493 // byval and variadic arguments that are passed in registers.
4494 // We need to know this before we allocate the first byval or variadic
4495 // argument, as they will be allocated a stack slot below the CFA (Canonical
4496 // Frame Address, the stack pointer at entry to the function).
4497 unsigned ArgRegBegin = ARM::R4;
4498 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4499 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4500 break;
4501
4502 CCValAssign &VA = ArgLocs[i];
4503 unsigned Index = VA.getValNo();
4504 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4505 if (!Flags.isByVal())
4506 continue;
4507
4508 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4509 unsigned RBegin, REnd;
4510 CCInfo.getInRegsParamInfo(InRegsParamRecordIndex: CCInfo.getInRegsParamsProcessed(), BeginReg&: RBegin, EndReg&: REnd);
4511 ArgRegBegin = std::min(a: ArgRegBegin, b: RBegin);
4512
4513 CCInfo.nextInRegsParam();
4514 }
4515 CCInfo.rewindByValRegsInfo();
4516
4517 int lastInsIndex = -1;
4518 if (isVarArg && MFI.hasVAStart()) {
4519 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4520 if (RegIdx != std::size(GPRArgRegs))
4521 ArgRegBegin = std::min(a: ArgRegBegin, b: (unsigned)GPRArgRegs[RegIdx]);
4522 }
4523
4524 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4525 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4526 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
4527
4528 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4529 CCValAssign &VA = ArgLocs[i];
4530 if (Ins[VA.getValNo()].isOrigArg()) {
4531 std::advance(i&: CurOrigArg,
4532 n: Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4533 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4534 }
4535 // Arguments stored in registers.
4536 if (VA.isRegLoc()) {
4537 EVT RegVT = VA.getLocVT();
4538
4539 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4540 // f64 and vector types are split up into multiple registers or
4541 // combinations of registers and stack slots.
4542 SDValue ArgValue1 =
4543 GetF64FormalArgument(VA, NextVA&: ArgLocs[++i], Root&: Chain, DAG, dl);
4544 VA = ArgLocs[++i]; // skip ahead to next loc
4545 SDValue ArgValue2;
4546 if (VA.isMemLoc()) {
4547 int FI = MFI.CreateFixedObject(Size: 8, SPOffset: VA.getLocMemOffset(), IsImmutable: true);
4548 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4549 ArgValue2 = DAG.getLoad(
4550 MVT::f64, dl, Chain, FIN,
4551 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4552 } else {
4553 ArgValue2 = GetF64FormalArgument(VA, NextVA&: ArgLocs[++i], Root&: Chain, DAG, dl);
4554 }
4555 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4556 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4557 ArgValue1, DAG.getIntPtrConstant(0, dl));
4558 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4559 ArgValue2, DAG.getIntPtrConstant(1, dl));
4560 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4561 ArgValue = GetF64FormalArgument(VA, NextVA&: ArgLocs[++i], Root&: Chain, DAG, dl);
4562 } else {
4563 const TargetRegisterClass *RC;
4564
4565 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4566 RC = &ARM::HPRRegClass;
4567 else if (RegVT == MVT::f32)
4568 RC = &ARM::SPRRegClass;
4569 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4570 RegVT == MVT::v4bf16)
4571 RC = &ARM::DPRRegClass;
4572 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4573 RegVT == MVT::v8bf16)
4574 RC = &ARM::QPRRegClass;
4575 else if (RegVT == MVT::i32)
4576 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4577 : &ARM::GPRRegClass;
4578 else
4579 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4580
4581 // Transform the arguments in physical registers into virtual ones.
4582 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
4583 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, VT: RegVT);
4584
4585 // If this value is passed in r0 and has the returned attribute (e.g.
4586 // C++ 'structors), record this fact for later use.
4587 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4588 AFI->setPreservesR0();
4589 }
4590 }
4591
4592 // If this is an 8 or 16-bit value, it is really passed promoted
4593 // to 32 bits. Insert an assert[sz]ext to capture this, then
4594 // truncate to the right size.
4595 switch (VA.getLocInfo()) {
4596 default: llvm_unreachable("Unknown loc info!");
4597 case CCValAssign::Full: break;
4598 case CCValAssign::BCvt:
4599 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
4600 break;
4601 case CCValAssign::SExt:
4602 ArgValue = DAG.getNode(Opcode: ISD::AssertSext, DL: dl, VT: RegVT, N1: ArgValue,
4603 N2: DAG.getValueType(VA.getValVT()));
4604 ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
4605 break;
4606 case CCValAssign::ZExt:
4607 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL: dl, VT: RegVT, N1: ArgValue,
4608 N2: DAG.getValueType(VA.getValVT()));
4609 ArgValue = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: VA.getValVT(), Operand: ArgValue);
4610 break;
4611 }
4612
4613      // f16 arguments have their size extended to 4 bytes and are passed as if
4614      // they had been copied to the LSBs of a 32-bit register.
4615      // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
4616 if (VA.needsCustom() &&
4617 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4618 ArgValue = MoveToHPR(dl, DAG, LocVT: VA.getLocVT(), ValVT: VA.getValVT(), Val: ArgValue);
4619
4620 InVals.push_back(Elt: ArgValue);
4621    } else { // !VA.isRegLoc()
4622 // Only arguments passed on the stack should make it here.
4623 assert(VA.isMemLoc());
4624 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4625
4626 int index = VA.getValNo();
4627
4628 // Some Ins[] entries become multiple ArgLoc[] entries.
4629 // Process them only once.
4630 if (index != lastInsIndex)
4631 {
4632 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4633 // FIXME: For now, all byval parameter objects are marked mutable.
4634 // This can be changed with more analysis.
4635        // In case of tail call optimization, mark all arguments mutable, since
4636        // they could be overwritten by the lowering of arguments in case of a
4637        // tail call.
4638 if (Flags.isByVal()) {
4639 assert(Ins[index].isOrigArg() &&
4640 "Byval arguments cannot be implicit");
4641 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4642
4643 int FrameIndex = StoreByValRegs(
4644 CCInfo, DAG, dl, Chain, OrigArg: &*CurOrigArg, InRegsParamRecordIdx: CurByValIndex,
4645 ArgOffset: VA.getLocMemOffset(), ArgSize: Flags.getByValSize());
4646 InVals.push_back(Elt: DAG.getFrameIndex(FI: FrameIndex, VT: PtrVT));
4647 CCInfo.nextInRegsParam();
4648 } else {
4649 unsigned FIOffset = VA.getLocMemOffset();
4650 int FI = MFI.CreateFixedObject(Size: VA.getLocVT().getSizeInBits()/8,
4651 SPOffset: FIOffset, IsImmutable: true);
4652
4653 // Create load nodes to retrieve arguments from the stack.
4654 SDValue FIN = DAG.getFrameIndex(FI, VT: PtrVT);
4655 InVals.push_back(Elt: DAG.getLoad(VT: VA.getValVT(), dl, Chain, Ptr: FIN,
4656 PtrInfo: MachinePointerInfo::getFixedStack(
4657 MF&: DAG.getMachineFunction(), FI)));
4658 }
4659 lastInsIndex = index;
4660 }
4661 }
4662 }
4663
4664 // varargs
4665 if (isVarArg && MFI.hasVAStart()) {
4666 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, ArgOffset: CCInfo.getStackSize(),
4667 TotalArgRegsSaveSize);
4668 if (AFI->isCmseNSEntryFunction()) {
4669 DiagnosticInfoUnsupported Diag(
4670 DAG.getMachineFunction().getFunction(),
4671 "secure entry function must not be variadic", dl.getDebugLoc());
4672 DAG.getContext()->diagnose(DI: Diag);
4673 }
4674 }
4675
4676 unsigned StackArgSize = CCInfo.getStackSize();
4677 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4678 if (canGuaranteeTCO(CC: CallConv, GuaranteeTailCalls: TailCallOpt)) {
4679 // The only way to guarantee a tail call is if the callee restores its
4680 // argument area, but it must also keep the stack aligned when doing so.
4681 const DataLayout &DL = DAG.getDataLayout();
4682 StackArgSize = alignTo(Size: StackArgSize, A: DL.getStackAlignment());
4683
4684 AFI->setArgumentStackToRestore(StackArgSize);
4685 }
4686 AFI->setArgumentStackSize(StackArgSize);
4687
4688 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4689 DiagnosticInfoUnsupported Diag(
4690 DAG.getMachineFunction().getFunction(),
4691 "secure entry function requires arguments on stack", dl.getDebugLoc());
4692 DAG.getContext()->diagnose(DI: Diag);
4693 }
4694
4695 return Chain;
4696}
4697
4698/// isFloatingPointZero - Return true if this is +0.0.
4699static bool isFloatingPointZero(SDValue Op) {
4700 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op))
4701 return CFP->getValueAPF().isPosZero();
4702 else if (ISD::isEXTLoad(N: Op.getNode()) || ISD::isNON_EXTLoad(N: Op.getNode())) {
4703 // Maybe this has already been legalized into the constant pool?
4704 if (Op.getOperand(i: 1).getOpcode() == ARMISD::Wrapper) {
4705 SDValue WrapperOp = Op.getOperand(i: 1).getOperand(i: 0);
4706 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Val&: WrapperOp))
4707 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: CP->getConstVal()))
4708 return CFP->getValueAPF().isPosZero();
4709 }
4710 } else if (Op->getOpcode() == ISD::BITCAST &&
4711 Op->getValueType(0) == MVT::f64) {
4712 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4713 // created by LowerConstantFP().
4714 SDValue BitcastOp = Op->getOperand(Num: 0);
4715 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4716 isNullConstant(V: BitcastOp->getOperand(Num: 0)))
4717 return true;
4718 }
4719 return false;
4720}
4721
4722/// Returns an appropriate ARM CMP (cmp) and the corresponding condition code for
4723/// the given operands.
4724SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4725 SDValue &ARMcc, SelectionDAG &DAG,
4726 const SDLoc &dl) const {
4727 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
4728 unsigned C = RHSC->getZExtValue();
4729 if (!isLegalICmpImmediate(Imm: (int32_t)C)) {
4730 // Constant does not fit, try adjusting it by one.
4731 switch (CC) {
4732 default: break;
4733 case ISD::SETLT:
4734 case ISD::SETGE:
4735 if (C != 0x80000000 && isLegalICmpImmediate(Imm: C-1)) {
4736 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4737 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4738 }
4739 break;
4740 case ISD::SETULT:
4741 case ISD::SETUGE:
4742 if (C != 0 && isLegalICmpImmediate(Imm: C-1)) {
4743 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4744 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4745 }
4746 break;
4747 case ISD::SETLE:
4748 case ISD::SETGT:
4749 if (C != 0x7fffffff && isLegalICmpImmediate(Imm: C+1)) {
4750 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4751 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4752 }
4753 break;
4754 case ISD::SETULE:
4755 case ISD::SETUGT:
4756 if (C != 0xffffffff && isLegalICmpImmediate(Imm: C+1)) {
4757 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4758 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4759 }
4760 break;
4761 }
4762 }
4763 } else if ((ARM_AM::getShiftOpcForNode(Opcode: LHS.getOpcode()) != ARM_AM::no_shift) &&
4764 (ARM_AM::getShiftOpcForNode(Opcode: RHS.getOpcode()) == ARM_AM::no_shift)) {
4765 // In ARM and Thumb-2, the compare instructions can shift their second
4766 // operand.
4767 CC = ISD::getSetCCSwappedOperands(Operation: CC);
4768 std::swap(a&: LHS, b&: RHS);
4769 }
4770
4771 // Thumb1 has very limited immediate modes, so turning an "and" into a
4772 // shift can save multiple instructions.
4773 //
4774 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4775 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4776 // own. If it's the operand to an unsigned comparison with an immediate,
4777 // we can eliminate one of the shifts: we transform
4778 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4779 //
4780 // We avoid transforming cases which aren't profitable due to encoding
4781 // details:
4782 //
4783 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4784 // would not; in that case, we're essentially trading one immediate load for
4785 // another.
4786 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4787 // 3. C2 is zero; we have other code for this special case.
4788 //
4789 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4790 // instruction, since the AND is always one instruction anyway, but we could
4791 // use narrow instructions in some cases.
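  // For example (with hypothetical constants), "(x & 0x3ff) == 0x300" becomes
  // "(x << 22) == (0x300 << 22)", folding the mask into the shift.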
4792 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4793 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4794 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4795 !isSignedIntSetCC(CC)) {
4796 unsigned Mask = LHS.getConstantOperandVal(i: 1);
4797 auto *RHSC = cast<ConstantSDNode>(Val: RHS.getNode());
4798 uint64_t RHSV = RHSC->getZExtValue();
4799 if (isMask_32(Value: Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4800 unsigned ShiftBits = llvm::countl_zero(Val: Mask);
4801 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4802 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4803 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4804 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4805 }
4806 }
4807 }
4808
4809 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4810 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4811 // way a cmp would.
4812 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4813 // some tweaks to the heuristics for the previous and->shift transform.
4814 // FIXME: Optimize cases where the LHS isn't a shift.
4815 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4816 isa<ConstantSDNode>(Val: RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4817 CC == ISD::SETUGT && isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
4818 LHS.getConstantOperandVal(i: 1) < 31) {
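    // After "lsls x, c+1", C holds bit 31 of (x << c) and Z is set only when
    // no bits of (x << c) below bit 31 survive, so the HI condition (C set,
    // Z clear) is exactly "(x << c) > 0x80000000U".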
4819 unsigned ShiftAmt = LHS.getConstantOperandVal(i: 1) + 1;
4820 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4821 DAG.getVTList(MVT::i32, MVT::i32),
4822 LHS.getOperand(0),
4823 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4824 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4825 Shift.getValue(1), SDValue());
4826 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4827 return Chain.getValue(R: 1);
4828 }
4829
4830 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4831
4832 // If the RHS is a constant zero then the V (overflow) flag will never be
4833 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4834 // simpler for other passes (like the peephole optimiser) to deal with.
4835 if (isNullConstant(V: RHS)) {
4836 switch (CondCode) {
4837 default: break;
4838 case ARMCC::GE:
4839 CondCode = ARMCC::PL;
4840 break;
4841 case ARMCC::LT:
4842 CondCode = ARMCC::MI;
4843 break;
4844 }
4845 }
4846
4847 ARMISD::NodeType CompareType;
4848 switch (CondCode) {
4849 default:
4850 CompareType = ARMISD::CMP;
4851 break;
4852 case ARMCC::EQ:
4853 case ARMCC::NE:
4854 // Uses only Z Flag
4855 CompareType = ARMISD::CMPZ;
4856 break;
4857 }
4858 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4859 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4860}
4861
4862/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4863SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4864 SelectionDAG &DAG, const SDLoc &dl,
4865 bool Signaling) const {
4866 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4867 SDValue Cmp;
4868 if (!isFloatingPointZero(RHS))
4869 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
4870 dl, MVT::Glue, LHS, RHS);
4871 else
4872 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
4873 dl, MVT::Glue, LHS);
4874 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4875}
4876
4877/// duplicateCmp - Glue values can have only one use, so this function
4878/// duplicates a comparison node.
4879SDValue
4880ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4881 unsigned Opc = Cmp.getOpcode();
4882 SDLoc DL(Cmp);
4883 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4884 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4885
4886 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4887 Cmp = Cmp.getOperand(i: 0);
4888 Opc = Cmp.getOpcode();
4889 if (Opc == ARMISD::CMPFP)
4890 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4891 else {
4892 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4893 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4894 }
4895 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4896}
4897
4898// This function returns three things: the arithmetic computation itself
4899// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4900// comparison and the condition code define the case in which the arithmetic
4901// computation *does not* overflow.
4902std::pair<SDValue, SDValue>
4903ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4904 SDValue &ARMcc) const {
4905 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4906
4907 SDValue Value, OverflowCmp;
4908 SDValue LHS = Op.getOperand(i: 0);
4909 SDValue RHS = Op.getOperand(i: 1);
4910 SDLoc dl(Op);
4911
4912 // FIXME: We are currently always generating CMPs because we don't support
4913 // generating CMN through the backend. This is not as good as the natural
4914 // CMP case because it causes a register dependency and cannot be folded
4915 // later.
4916
4917 switch (Op.getOpcode()) {
4918 default:
4919 llvm_unreachable("Unknown overflow instruction!");
4920 case ISD::SADDO:
4921 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4922 Value = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
4923 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4924 break;
4925 case ISD::UADDO:
4926 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4927 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4928 // We do not use it in the USUBO case as Value may not be used.
4929 Value = DAG.getNode(ARMISD::ADDC, dl,
4930 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4931 .getValue(0);
4932 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4933 break;
4934 case ISD::SSUBO:
4935 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4936 Value = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
4937 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4938 break;
4939 case ISD::USUBO:
4940 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4941 Value = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS);
4942 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4943 break;
4944 case ISD::UMULO:
4945 // We generate a UMUL_LOHI and then check if the high word is 0.
4946 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4947 Value = DAG.getNode(Opcode: ISD::UMUL_LOHI, DL: dl,
4948 VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: Op.getValueType()),
4949 N1: LHS, N2: RHS);
4950 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4951 DAG.getConstant(0, dl, MVT::i32));
4952 Value = Value.getValue(R: 0); // We only want the low 32 bits for the result.
4953 break;
4954 case ISD::SMULO:
4955 // We generate a SMUL_LOHI and then check if all the bits of the high word
4956 // are the same as the sign bit of the low word.
4957 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4958 Value = DAG.getNode(Opcode: ISD::SMUL_LOHI, DL: dl,
4959 VTList: DAG.getVTList(VT1: Op.getValueType(), VT2: Op.getValueType()),
4960 N1: LHS, N2: RHS);
4961 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4962 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4963 Value.getValue(0),
4964 DAG.getConstant(31, dl, MVT::i32)));
4965 Value = Value.getValue(R: 0); // We only want the low 32 bits for the result.
4966 break;
4967 } // switch (...)
4968
4969 return std::make_pair(x&: Value, y&: OverflowCmp);
4970}
4971
4972SDValue
4973ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4974 // Let legalize expand this if it isn't a legal type yet.
4975 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
4976 return SDValue();
4977
4978 SDValue Value, OverflowCmp;
4979 SDValue ARMcc;
4980 std::tie(args&: Value, args&: OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4981 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4982 SDLoc dl(Op);
4983 // We use 0 and 1 as false and true values.
4984 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4985 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4986 EVT VT = Op.getValueType();
4987
4988 SDValue Overflow = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: TVal, N2: FVal,
4989 N3: ARMcc, N4: CCR, N5: OverflowCmp);
4990
4991 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4992 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow);
4993}
4994
4995static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4996 SelectionDAG &DAG) {
4997 SDLoc DL(BoolCarry);
4998 EVT CarryVT = BoolCarry.getValueType();
4999
5000 // This converts the boolean value carry into the carry flag by doing
5001 // ARMISD::SUBC Carry, 1
5002 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
5003 DAG.getVTList(CarryVT, MVT::i32),
5004 BoolCarry, DAG.getConstant(1, DL, CarryVT));
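  // Subtracting 1 borrows exactly when the boolean carry is 0, and ARM encodes
  // "no borrow" as C = 1, so the resulting C flag equals the boolean carry.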
5005 return Carry.getValue(R: 1);
5006}
5007
5008static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
5009 SelectionDAG &DAG) {
5010 SDLoc DL(Flags);
5011
5012 // Now convert the carry flag into a boolean carry. We do this
5013  // using ARMISD::ADDE 0, 0, Carry.
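  // ADDE computes 0 + 0 + C, which materializes the carry flag as a 0/1
  // integer in the first result.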
5014 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
5015 DAG.getConstant(0, DL, MVT::i32),
5016 DAG.getConstant(0, DL, MVT::i32), Flags);
5017}
5018
5019SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
5020 SelectionDAG &DAG) const {
5021 // Let legalize expand this if it isn't a legal type yet.
5022 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
5023 return SDValue();
5024
5025 SDValue LHS = Op.getOperand(i: 0);
5026 SDValue RHS = Op.getOperand(i: 1);
5027 SDLoc dl(Op);
5028
5029 EVT VT = Op.getValueType();
5030 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
5031 SDValue Value;
5032 SDValue Overflow;
5033 switch (Op.getOpcode()) {
5034 default:
5035 llvm_unreachable("Unknown overflow instruction!");
5036 case ISD::UADDO:
5037 Value = DAG.getNode(Opcode: ARMISD::ADDC, DL: dl, VTList: VTs, N1: LHS, N2: RHS);
5038 // Convert the carry flag into a boolean value.
5039 Overflow = ConvertCarryFlagToBooleanCarry(Flags: Value.getValue(R: 1), VT, DAG);
5040 break;
5041 case ISD::USUBO: {
5042 Value = DAG.getNode(Opcode: ARMISD::SUBC, DL: dl, VTList: VTs, N1: LHS, N2: RHS);
5043 // Convert the carry flag into a boolean value.
5044 Overflow = ConvertCarryFlagToBooleanCarry(Flags: Value.getValue(R: 1), VT, DAG);
5045    // ARMISD::SUBC produces a carry of 0 when we have to borrow, so turn it
5046    // into an overflow value by computing 1 - C.
5047 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
5048 DAG.getConstant(1, dl, MVT::i32), Overflow);
5049 break;
5050 }
5051 }
5052
5053 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow);
5054}
5055
5056static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
5057 const ARMSubtarget *Subtarget) {
5058 EVT VT = Op.getValueType();
5059 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
5060 return SDValue();
5061 if (!VT.isSimple())
5062 return SDValue();
5063
5064 unsigned NewOpcode;
5065 switch (VT.getSimpleVT().SimpleTy) {
5066 default:
5067 return SDValue();
5068 case MVT::i8:
5069 switch (Op->getOpcode()) {
5070 case ISD::UADDSAT:
5071 NewOpcode = ARMISD::UQADD8b;
5072 break;
5073 case ISD::SADDSAT:
5074 NewOpcode = ARMISD::QADD8b;
5075 break;
5076 case ISD::USUBSAT:
5077 NewOpcode = ARMISD::UQSUB8b;
5078 break;
5079 case ISD::SSUBSAT:
5080 NewOpcode = ARMISD::QSUB8b;
5081 break;
5082 }
5083 break;
5084 case MVT::i16:
5085 switch (Op->getOpcode()) {
5086 case ISD::UADDSAT:
5087 NewOpcode = ARMISD::UQADD16b;
5088 break;
5089 case ISD::SADDSAT:
5090 NewOpcode = ARMISD::QADD16b;
5091 break;
5092 case ISD::USUBSAT:
5093 NewOpcode = ARMISD::UQSUB16b;
5094 break;
5095 case ISD::SSUBSAT:
5096 NewOpcode = ARMISD::QSUB16b;
5097 break;
5098 }
5099 break;
5100 }
5101
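  // Perform the scalar i8/i16 saturating operation on sign-extended i32
  // operands via the matching parallel-saturation node, then truncate the
  // result back to the original narrow type.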
5102 SDLoc dl(Op);
5103 SDValue Add =
5104 DAG.getNode(NewOpcode, dl, MVT::i32,
5105 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
5106 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
5107 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Add);
5108}
5109
5110SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
5111 SDValue Cond = Op.getOperand(i: 0);
5112 SDValue SelectTrue = Op.getOperand(i: 1);
5113 SDValue SelectFalse = Op.getOperand(i: 2);
5114 SDLoc dl(Op);
5115 unsigned Opc = Cond.getOpcode();
5116
5117 if (Cond.getResNo() == 1 &&
5118 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5119 Opc == ISD::USUBO)) {
5120 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Cond->getValueType(ResNo: 0)))
5121 return SDValue();
5122
5123 SDValue Value, OverflowCmp;
5124 SDValue ARMcc;
5125 std::tie(args&: Value, args&: OverflowCmp) = getARMXALUOOp(Op: Cond, DAG, ARMcc);
5126 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5127 EVT VT = Op.getValueType();
5128
5129 return getCMOV(dl, VT, FalseVal: SelectTrue, TrueVal: SelectFalse, ARMcc, CCR,
5130 Cmp: OverflowCmp, DAG);
5131 }
5132
5133 // Convert:
5134 //
5135 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
5136 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
5137 //
5138 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
5139 const ConstantSDNode *CMOVTrue =
5140 dyn_cast<ConstantSDNode>(Val: Cond.getOperand(i: 0));
5141 const ConstantSDNode *CMOVFalse =
5142 dyn_cast<ConstantSDNode>(Val: Cond.getOperand(i: 1));
5143
5144 if (CMOVTrue && CMOVFalse) {
5145 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
5146 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
5147
5148 SDValue True;
5149 SDValue False;
5150 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
5151 True = SelectTrue;
5152 False = SelectFalse;
5153 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
5154 True = SelectFalse;
5155 False = SelectTrue;
5156 }
5157
5158 if (True.getNode() && False.getNode()) {
5159 EVT VT = Op.getValueType();
5160 SDValue ARMcc = Cond.getOperand(i: 2);
5161 SDValue CCR = Cond.getOperand(i: 3);
5162 SDValue Cmp = duplicateCmp(Cmp: Cond.getOperand(i: 4), DAG);
5163 assert(True.getValueType() == VT);
5164 return getCMOV(dl, VT, FalseVal: True, TrueVal: False, ARMcc, CCR, Cmp, DAG);
5165 }
5166 }
5167 }
5168
5169 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
5170 // undefined bits before doing a full-word comparison with zero.
5171 Cond = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: Cond.getValueType(), N1: Cond,
5172 N2: DAG.getConstant(Val: 1, DL: dl, VT: Cond.getValueType()));
5173
5174 return DAG.getSelectCC(DL: dl, LHS: Cond,
5175 RHS: DAG.getConstant(Val: 0, DL: dl, VT: Cond.getValueType()),
5176 True: SelectTrue, False: SelectFalse, Cond: ISD::SETNE);
5177}
5178
5179static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
5180 bool &swpCmpOps, bool &swpVselOps) {
5181 // Start by selecting the GE condition code for opcodes that return true for
5182 // 'equality'
5183 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
5184 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
5185 CondCode = ARMCC::GE;
5186
5187 // and GT for opcodes that return false for 'equality'.
5188 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
5189 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
5190 CondCode = ARMCC::GT;
5191
5192 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
5193 // to swap the compare operands.
5194 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
5195 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
5196 swpCmpOps = true;
5197
5198 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
5199 // If we have an unordered opcode, we need to swap the operands to the VSEL
5200 // instruction (effectively negating the condition).
5201 //
5202 // This also has the effect of swapping which one of 'less' or 'greater'
5203 // returns true, so we also swap the compare operands. It also switches
5204 // whether we return true for 'equality', so we compensate by picking the
5205 // opposite condition code to our original choice.
5206 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
5207 CC == ISD::SETUGT) {
5208 swpCmpOps = !swpCmpOps;
5209 swpVselOps = !swpVselOps;
5210 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
5211 }
5212
5213 // 'ordered' is 'anything but unordered', so use the VS condition code and
5214 // swap the VSEL operands.
5215 if (CC == ISD::SETO) {
5216 CondCode = ARMCC::VS;
5217 swpVselOps = true;
5218 }
5219
5220 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
5221 // code and swap the VSEL operands. Also do this if we don't care about the
5222 // unordered case.
5223 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
5224 CondCode = ARMCC::EQ;
5225 swpVselOps = true;
5226 }
5227}
5228
5229SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5230 SDValue TrueVal, SDValue ARMcc, SDValue CCR,
5231 SDValue Cmp, SelectionDAG &DAG) const {
5232 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5233 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5234 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5235 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5236 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5237
5238 SDValue TrueLow = TrueVal.getValue(R: 0);
5239 SDValue TrueHigh = TrueVal.getValue(R: 1);
5240 SDValue FalseLow = FalseVal.getValue(R: 0);
5241 SDValue FalseHigh = FalseVal.getValue(R: 1);
5242
5243 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5244 ARMcc, CCR, Cmp);
5245 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5246 ARMcc, CCR, duplicateCmp(Cmp, DAG));
5247
5248 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5249 } else {
5250 return DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: FalseVal, N2: TrueVal, N3: ARMcc, N4: CCR,
5251 N5: Cmp);
5252 }
5253}
5254
5255static bool isGTorGE(ISD::CondCode CC) {
5256 return CC == ISD::SETGT || CC == ISD::SETGE;
5257}
5258
5259static bool isLTorLE(ISD::CondCode CC) {
5260 return CC == ISD::SETLT || CC == ISD::SETLE;
5261}
5262
5263// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5264// All of these conditions (and their <= and >= counterparts) will do:
5265// x < k ? k : x
5266// x > k ? x : k
5267// k < x ? x : k
5268// k > x ? k : x
5269static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5270 const SDValue TrueVal, const SDValue FalseVal,
5271 const ISD::CondCode CC, const SDValue K) {
5272 return (isGTorGE(CC) &&
5273 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5274 (isLTorLE(CC) &&
5275 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5276}
5277
5278// Check if two chained conditionals could be converted into SSAT or USAT.
5279//
// SSAT can replace a set of two conditional selectors that bound a number to
// the interval [~k, k], where k + 1 is a power of 2. Writing -k - 1 as ~k,
// here are some examples:
//
//    x < ~k ? ~k : (x > k ? k : x)
//    x < ~k ? ~k : (x < k ? x : k)
//    x > ~k ? (x > k ? k : x) : ~k
//    x < k ? (x < ~k ? ~k : x) : k
// etc.
5288//
5289// LLVM canonicalizes these to either a min(max()) or a max(min())
5290// pattern. This function tries to match one of these and will return a SSAT
5291// node if successful.
5292//
// USAT works similarly to SSAT, but bounds the value to the interval [0, k],
// where k + 1 is a power of 2.
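//
// For example, the two selects that clamp x to [-128, 127] (-128 == ~127 and
// 127 + 1 = 128 is a power of 2) become a single SSAT, and the two selects
// that clamp x to [0, 255] become a single USAT.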
5295static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5296 EVT VT = Op.getValueType();
5297 SDValue V1 = Op.getOperand(i: 0);
5298 SDValue K1 = Op.getOperand(i: 1);
5299 SDValue TrueVal1 = Op.getOperand(i: 2);
5300 SDValue FalseVal1 = Op.getOperand(i: 3);
5301 ISD::CondCode CC1 = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
5302
5303 const SDValue Op2 = isa<ConstantSDNode>(Val: TrueVal1) ? FalseVal1 : TrueVal1;
5304 if (Op2.getOpcode() != ISD::SELECT_CC)
5305 return SDValue();
5306
5307 SDValue V2 = Op2.getOperand(i: 0);
5308 SDValue K2 = Op2.getOperand(i: 1);
5309 SDValue TrueVal2 = Op2.getOperand(i: 2);
5310 SDValue FalseVal2 = Op2.getOperand(i: 3);
5311 ISD::CondCode CC2 = cast<CondCodeSDNode>(Val: Op2.getOperand(i: 4))->get();
5312
5313 SDValue V1Tmp = V1;
5314 SDValue V2Tmp = V2;
5315
5316 // Check that the registers and the constants match a max(min()) or min(max())
5317 // pattern
5318 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5319 K2 != FalseVal2 ||
5320 !((isGTorGE(CC: CC1) && isLTorLE(CC: CC2)) || (isLTorLE(CC: CC1) && isGTorGE(CC: CC2))))
5321 return SDValue();
5322
5323 // Check that the constant in the lower-bound check is
5324 // the opposite of the constant in the upper-bound check
5325 // in 1's complement.
5326 if (!isa<ConstantSDNode>(Val: K1) || !isa<ConstantSDNode>(Val: K2))
5327 return SDValue();
5328
5329 int64_t Val1 = cast<ConstantSDNode>(Val&: K1)->getSExtValue();
5330 int64_t Val2 = cast<ConstantSDNode>(Val&: K2)->getSExtValue();
5331 int64_t PosVal = std::max(a: Val1, b: Val2);
5332 int64_t NegVal = std::min(a: Val1, b: Val2);
5333
5334 if (!((Val1 > Val2 && isLTorLE(CC: CC1)) || (Val1 < Val2 && isLTorLE(CC: CC2))) ||
5335 !isPowerOf2_64(Value: PosVal + 1))
5336 return SDValue();
5337
5338 // Handle the difference between USAT (unsigned) and SSAT (signed)
5339 // saturation
5340 // At this point, PosVal is guaranteed to be positive
5341 uint64_t K = PosVal;
5342 SDLoc dl(Op);
5343 if (Val1 == ~Val2)
5344 return DAG.getNode(Opcode: ARMISD::SSAT, DL: dl, VT, N1: V2Tmp,
5345 N2: DAG.getConstant(Val: llvm::countr_one(Value: K), DL: dl, VT));
5346 if (NegVal == 0)
5347 return DAG.getNode(Opcode: ARMISD::USAT, DL: dl, VT, N1: V2Tmp,
5348 N2: DAG.getConstant(Val: llvm::countr_one(Value: K), DL: dl, VT));
5349
5350 return SDValue();
5351}
5352
5353// Check if a condition of the type x < k ? k : x can be converted into a
5354// bit operation instead of conditional moves.
5355// Currently this is allowed given:
5356// - The conditions and values match up
5357// - k is 0 or -1 (all ones)
// This function will not check the last condition; that is up to the caller.
// It returns true if the transformation can be made, and in that case
// returns x in V and k in SatK.
5361static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5362 SDValue &SatK)
5363{
5364 SDValue LHS = Op.getOperand(i: 0);
5365 SDValue RHS = Op.getOperand(i: 1);
5366 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
5367 SDValue TrueVal = Op.getOperand(i: 2);
5368 SDValue FalseVal = Op.getOperand(i: 3);
5369
5370 SDValue *K = isa<ConstantSDNode>(Val: LHS) ? &LHS : isa<ConstantSDNode>(Val: RHS)
5371 ? &RHS
5372 : nullptr;
5373
5374 // No constant operation in comparison, early out
5375 if (!K)
5376 return false;
5377
5378 SDValue KTmp = isa<ConstantSDNode>(Val: TrueVal) ? TrueVal : FalseVal;
5379 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5380 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5381
5382 // If the constant on left and right side, or variable on left and right,
5383 // does not match, early out
5384 if (*K != KTmp || V != VTmp)
5385 return false;
5386
5387 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, K: *K)) {
5388 SatK = *K;
5389 return true;
5390 }
5391
5392 return false;
5393}
5394
5395bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5396 if (VT == MVT::f32)
5397 return !Subtarget->hasVFP2Base();
5398 if (VT == MVT::f64)
5399 return !Subtarget->hasFP64();
5400 if (VT == MVT::f16)
5401 return !Subtarget->hasFullFP16();
5402 return false;
5403}
5404
5405SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5406 EVT VT = Op.getValueType();
5407 SDLoc dl(Op);
5408
5409 // Try to convert two saturating conditional selects into a single SSAT
5410 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5411 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5412 return SatValue;
5413
  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1.
  // On ARM and Thumb-2, which have flexible operand 2, this will result in a
  // single instruction. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations.
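  // For example, the k == 0 case computes max(x, 0) as x & ~(x >> 31) and the
  // k == -1 case computes max(x, -1) as x | (x >> 31), using an arithmetic
  // shift to splat the sign bit.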
5420 SDValue LowerSatConstant;
5421 SDValue SatValue;
5422 if (VT == MVT::i32 &&
5423 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5424 SDValue ShiftV = DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: SatValue,
5425 N2: DAG.getConstant(Val: 31, DL: dl, VT));
5426 if (isNullConstant(V: LowerSatConstant)) {
5427 SDValue NotShiftV = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: ShiftV,
5428 N2: DAG.getAllOnesConstant(DL: dl, VT));
5429 return DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SatValue, N2: NotShiftV);
5430 } else if (isAllOnesConstant(V: LowerSatConstant))
5431 return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: SatValue, N2: ShiftV);
5432 }
5433
5434 SDValue LHS = Op.getOperand(i: 0);
5435 SDValue RHS = Op.getOperand(i: 1);
5436 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
5437 SDValue TrueVal = Op.getOperand(i: 2);
5438 SDValue FalseVal = Op.getOperand(i: 3);
5439 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FalseVal);
5440 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TrueVal);
5441
5442 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5443 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5444 unsigned TVal = CTVal->getZExtValue();
5445 unsigned FVal = CFVal->getZExtValue();
5446 unsigned Opcode = 0;
5447
5448 if (TVal == ~FVal) {
5449 Opcode = ARMISD::CSINV;
5450 } else if (TVal == ~FVal + 1) {
5451 Opcode = ARMISD::CSNEG;
5452 } else if (TVal + 1 == FVal) {
5453 Opcode = ARMISD::CSINC;
5454 } else if (TVal == FVal + 1) {
5455 Opcode = ARMISD::CSINC;
5456 std::swap(a&: TrueVal, b&: FalseVal);
5457 std::swap(a&: TVal, b&: FVal);
5458 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
5459 }
5460
5461 if (Opcode) {
5462 // If one of the constants is cheaper than another, materialise the
5463 // cheaper one and let the csel generate the other.
5464 if (Opcode != ARMISD::CSINC &&
5465 HasLowerConstantMaterializationCost(Val1: FVal, Val2: TVal, Subtarget)) {
5466 std::swap(a&: TrueVal, b&: FalseVal);
5467 std::swap(a&: TVal, b&: FVal);
5468 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
5469 }
5470
      // Attempt to use ZR by checking whether TVal is 0, possibly inverting
      // the condition to get there. CSINC is not invertible like the other
      // two (~(~a) == a and -(-a) == a, but (a + 1) + 1 != a).
5474 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5475 std::swap(a&: TrueVal, b&: FalseVal);
5476 std::swap(a&: TVal, b&: FVal);
5477 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
5478 }
5479
5480 // Drops F's value because we can get it by inverting/negating TVal.
5481 FalseVal = TrueVal;
5482
5483 SDValue ARMcc;
5484 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5485 EVT VT = TrueVal.getValueType();
5486 return DAG.getNode(Opcode, DL: dl, VT, N1: TrueVal, N2: FalseVal, N3: ARMcc, N4: Cmp);
5487 }
5488 }
5489
5490 if (isUnsupportedFloatingType(VT: LHS.getValueType())) {
5491 DAG.getTargetLoweringInfo().softenSetCCOperands(
5492 DAG, VT: LHS.getValueType(), NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS);
5493
5494 // If softenSetCCOperands only returned one value, we should compare it to
5495 // zero.
5496 if (!RHS.getNode()) {
5497 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
5498 CC = ISD::SETNE;
5499 }
5500 }
5501
5502 if (LHS.getValueType() == MVT::i32) {
5503 // Try to generate VSEL on ARMv8.
5504 // The VSEL instruction can't use all the usual ARM condition
5505 // codes: it only has two bits to select the condition code, so it's
5506 // constrained to use only GE, GT, VS and EQ.
5507 //
5508 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5509 // swap the operands of the previous compare instruction (effectively
5510 // inverting the compare condition, swapping 'less' and 'greater') and
5511 // sometimes need to swap the operands to the VSEL (which inverts the
5512 // condition in the sense of firing whenever the previous condition didn't)
5513 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5514 TrueVal.getValueType() == MVT::f32 ||
5515 TrueVal.getValueType() == MVT::f64)) {
5516 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5517 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5518 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5519 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
5520 std::swap(a&: TrueVal, b&: FalseVal);
5521 }
5522 }
5523
5524 SDValue ARMcc;
5525 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5526 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support.
5528 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5529 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5530 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5531 }
5532
5533 ARMCC::CondCodes CondCode, CondCode2;
5534 FPCCToARMCC(CC, CondCode, CondCode2);
5535
5536 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5537 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5538 // must use VSEL (limited condition codes), due to not having conditional f16
5539 // moves.
5540 if (Subtarget->hasFPARMv8Base() &&
5541 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5542 (TrueVal.getValueType() == MVT::f16 ||
5543 TrueVal.getValueType() == MVT::f32 ||
5544 TrueVal.getValueType() == MVT::f64)) {
5545 bool swpCmpOps = false;
5546 bool swpVselOps = false;
5547 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5548
5549 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5550 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5551 if (swpCmpOps)
5552 std::swap(a&: LHS, b&: RHS);
5553 if (swpVselOps)
5554 std::swap(a&: TrueVal, b&: FalseVal);
5555 }
5556 }
5557
5558 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5559 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5560 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5561 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
5562 if (CondCode2 != ARMCC::AL) {
5563 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5564 // FIXME: Needs another CMP because flag can have but one use.
5565 SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
5566 Result = getCMOV(dl, VT, FalseVal: Result, TrueVal, ARMcc: ARMcc2, CCR, Cmp: Cmp2, DAG);
5567 }
5568 return Result;
5569}
5570
5571/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5572/// to morph to an integer compare sequence.
5573static bool canChangeToInt(SDValue Op, bool &SeenZero,
5574 const ARMSubtarget *Subtarget) {
5575 SDNode *N = Op.getNode();
5576 if (!N->hasOneUse())
5577 // Otherwise it requires moving the value from fp to integer registers.
5578 return false;
5579 if (!N->getNumValues())
5580 return false;
5581 EVT VT = Op.getValueType();
5582 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5583 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5584 // vmrs are very slow, e.g. cortex-a8.
5585 return false;
5586
5587 if (isFloatingPointZero(Op)) {
5588 SeenZero = true;
5589 return true;
5590 }
5591 return ISD::isNormalLoad(N);
5592}
5593
5594static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5595 if (isFloatingPointZero(Op))
5596 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5597
5598 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5599 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5600 Ld->getPointerInfo(), Ld->getAlign(),
5601 Ld->getMemOperand()->getFlags());
5602
5603 llvm_unreachable("Unknown VFP cmp argument!");
5604}
5605
5606static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5607 SDValue &RetVal1, SDValue &RetVal2) {
5608 SDLoc dl(Op);
5609
5610 if (isFloatingPointZero(Op)) {
5611 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5612 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5613 return;
5614 }
5615
5616 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val&: Op)) {
5617 SDValue Ptr = Ld->getBasePtr();
5618 RetVal1 =
5619 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5620 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5621
5622 EVT PtrType = Ptr.getValueType();
5623 SDValue NewPtr = DAG.getNode(Opcode: ISD::ADD, DL: dl,
5624 VT: PtrType, N1: Ptr, N2: DAG.getConstant(Val: 4, DL: dl, VT: PtrType));
5625 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5626 Ld->getPointerInfo().getWithOffset(4),
5627 commonAlignment(Ld->getAlign(), 4),
5628 Ld->getMemOperand()->getFlags());
5629 return;
5630 }
5631
5632 llvm_unreachable("Unknown VFP cmp argument!");
5633}
5634
5635/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5636/// f32 and even f64 comparisons to integer ones.
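/// For example, an equality compare against +0.0 can instead mask off the
/// sign bit of each operand's bit pattern and compare the i32 magnitudes,
/// since +0.0 and -0.0 differ only in the sign bit.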
5637SDValue
5638ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5639 SDValue Chain = Op.getOperand(i: 0);
5640 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
5641 SDValue LHS = Op.getOperand(i: 2);
5642 SDValue RHS = Op.getOperand(i: 3);
5643 SDValue Dest = Op.getOperand(i: 4);
5644 SDLoc dl(Op);
5645
5646 bool LHSSeenZero = false;
5647 bool LHSOk = canChangeToInt(Op: LHS, SeenZero&: LHSSeenZero, Subtarget);
5648 bool RHSSeenZero = false;
5649 bool RHSOk = canChangeToInt(Op: RHS, SeenZero&: RHSSeenZero, Subtarget);
5650 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5651 // If unsafe fp math optimization is enabled and there are no other uses of
5652 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5653 // to an integer comparison.
5654 if (CC == ISD::SETOEQ)
5655 CC = ISD::SETEQ;
5656 else if (CC == ISD::SETUNE)
5657 CC = ISD::SETNE;
5658
5659 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5660 SDValue ARMcc;
5661 if (LHS.getValueType() == MVT::f32) {
5662 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5663 bitcastf32Toi32(LHS, DAG), Mask);
5664 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5665 bitcastf32Toi32(RHS, DAG), Mask);
5666 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5667 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5668 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5669 Chain, Dest, ARMcc, CCR, Cmp);
5670 }
5671
5672 SDValue LHS1, LHS2;
5673 SDValue RHS1, RHS2;
5674 expandf64Toi32(Op: LHS, DAG, RetVal1&: LHS1, RetVal2&: LHS2);
5675 expandf64Toi32(Op: RHS, DAG, RetVal1&: RHS1, RetVal2&: RHS2);
5676 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5677 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5678 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5679 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5680 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5681 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5682 return DAG.getNode(Opcode: ARMISD::BCC_i64, DL: dl, VTList, Ops);
5683 }
5684
5685 return SDValue();
5686}
5687
5688SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5689 SDValue Chain = Op.getOperand(i: 0);
5690 SDValue Cond = Op.getOperand(i: 1);
5691 SDValue Dest = Op.getOperand(i: 2);
5692 SDLoc dl(Op);
5693
5694 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5695 // instruction.
5696 unsigned Opc = Cond.getOpcode();
5697 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5698 !Subtarget->isThumb1Only();
5699 if (Cond.getResNo() == 1 &&
5700 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5701 Opc == ISD::USUBO || OptimizeMul)) {
5702 // Only lower legal XALUO ops.
5703 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Cond->getValueType(ResNo: 0)))
5704 return SDValue();
5705
5706 // The actual operation with overflow check.
5707 SDValue Value, OverflowCmp;
5708 SDValue ARMcc;
5709 std::tie(args&: Value, args&: OverflowCmp) = getARMXALUOOp(Op: Cond, DAG, ARMcc);
5710
5711 // Reverse the condition code.
5712 ARMCC::CondCodes CondCode =
5713 (ARMCC::CondCodes)cast<const ConstantSDNode>(Val&: ARMcc)->getZExtValue();
5714 CondCode = ARMCC::getOppositeCondition(CC: CondCode);
5715 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5716 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5717
5718 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5719 OverflowCmp);
5720 }
5721
5722 return SDValue();
5723}
5724
5725SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5726 SDValue Chain = Op.getOperand(i: 0);
5727 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
5728 SDValue LHS = Op.getOperand(i: 2);
5729 SDValue RHS = Op.getOperand(i: 3);
5730 SDValue Dest = Op.getOperand(i: 4);
5731 SDLoc dl(Op);
5732
5733 if (isUnsupportedFloatingType(VT: LHS.getValueType())) {
5734 DAG.getTargetLoweringInfo().softenSetCCOperands(
5735 DAG, VT: LHS.getValueType(), NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS);
5736
5737 // If softenSetCCOperands only returned one value, we should compare it to
5738 // zero.
5739 if (!RHS.getNode()) {
5740 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
5741 CC = ISD::SETNE;
5742 }
5743 }
5744
5745 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5746 // instruction.
5747 unsigned Opc = LHS.getOpcode();
5748 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5749 !Subtarget->isThumb1Only();
5750 if (LHS.getResNo() == 1 && (isOneConstant(V: RHS) || isNullConstant(V: RHS)) &&
5751 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5752 Opc == ISD::USUBO || OptimizeMul) &&
5753 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5754 // Only lower legal XALUO ops.
5755 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
5756 return SDValue();
5757
5758 // The actual operation with overflow check.
5759 SDValue Value, OverflowCmp;
5760 SDValue ARMcc;
5761 std::tie(args&: Value, args&: OverflowCmp) = getARMXALUOOp(Op: LHS.getValue(R: 0), DAG, ARMcc);
5762
5763 if ((CC == ISD::SETNE) != isOneConstant(V: RHS)) {
5764 // Reverse the condition code.
5765 ARMCC::CondCodes CondCode =
5766 (ARMCC::CondCodes)cast<const ConstantSDNode>(Val&: ARMcc)->getZExtValue();
5767 CondCode = ARMCC::getOppositeCondition(CC: CondCode);
5768 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5769 }
5770 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5771
5772 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5773 OverflowCmp);
5774 }
5775
5776 if (LHS.getValueType() == MVT::i32) {
5777 SDValue ARMcc;
5778 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5779 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5780 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5781 Chain, Dest, ARMcc, CCR, Cmp);
5782 }
5783
5784 if (getTargetMachine().Options.UnsafeFPMath &&
5785 (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5786 CC == ISD::SETNE || CC == ISD::SETUNE)) {
5787 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5788 return Result;
5789 }
5790
5791 ARMCC::CondCodes CondCode, CondCode2;
5792 FPCCToARMCC(CC, CondCode, CondCode2);
5793
5794 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5795 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5796 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5797 SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5798 SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5799 SDValue Res = DAG.getNode(Opcode: ARMISD::BRCOND, DL: dl, VTList, Ops);
5800 if (CondCode2 != ARMCC::AL) {
5801 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5802 SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(R: 1) };
5803 Res = DAG.getNode(Opcode: ARMISD::BRCOND, DL: dl, VTList, Ops);
5804 }
5805 return Res;
5806}
5807
5808SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5809 SDValue Chain = Op.getOperand(i: 0);
5810 SDValue Table = Op.getOperand(i: 1);
5811 SDValue Index = Op.getOperand(i: 2);
5812 SDLoc dl(Op);
5813
5814 EVT PTy = getPointerTy(DL: DAG.getDataLayout());
5815 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Table);
5816 SDValue JTI = DAG.getTargetJumpTable(JTI: JT->getIndex(), VT: PTy);
5817 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5818 Index = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: PTy, N1: Index, N2: DAG.getConstant(Val: 4, DL: dl, VT: PTy));
5819 SDValue Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PTy, N1: Table, N2: Index);
  if (Subtarget->isThumb2() ||
      (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, they jump into the
    // jump table, which does another jump to the destination. This also makes
    // it easier to translate it to TBB / TBH later (Thumb2 only).
5824 // FIXME: This might not work if the function is extremely large.
5825 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5826 Addr, Op.getOperand(2), JTI);
5827 }
5828 if (isPositionIndependent() || Subtarget->isROPI()) {
5829 Addr =
5830 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5831 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5832 Chain = Addr.getValue(R: 1);
5833 Addr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PTy, N1: Table, N2: Addr);
5834 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5835 } else {
5836 Addr =
5837 DAG.getLoad(VT: PTy, dl, Chain, Ptr: Addr,
5838 PtrInfo: MachinePointerInfo::getJumpTable(MF&: DAG.getMachineFunction()));
5839 Chain = Addr.getValue(R: 1);
5840 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5841 }
5842}
5843
5844static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5845 EVT VT = Op.getValueType();
5846 SDLoc dl(Op);
5847
5848 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5849 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5850 return Op;
5851 return DAG.UnrollVectorOp(N: Op.getNode());
5852 }
5853
5854 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5855
5856 EVT NewTy;
5857 const EVT OpTy = Op.getOperand(i: 0).getValueType();
5858 if (OpTy == MVT::v4f32)
5859 NewTy = MVT::v4i32;
5860 else if (OpTy == MVT::v4f16 && HasFullFP16)
5861 NewTy = MVT::v4i16;
5862 else if (OpTy == MVT::v8f16 && HasFullFP16)
5863 NewTy = MVT::v8i16;
5864 else
5865 llvm_unreachable("Invalid type for custom lowering!");
5866
5867 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5868 return DAG.UnrollVectorOp(N: Op.getNode());
5869
5870 Op = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: NewTy, Operand: Op.getOperand(i: 0));
5871 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Op);
5872}
5873
5874SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5875 EVT VT = Op.getValueType();
5876 if (VT.isVector())
5877 return LowerVectorFP_TO_INT(Op, DAG);
5878
5879 bool IsStrict = Op->isStrictFPOpcode();
5880 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
5881
5882 if (isUnsupportedFloatingType(VT: SrcVal.getValueType())) {
5883 RTLIB::Libcall LC;
5884 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5885 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5886 LC = RTLIB::getFPTOSINT(OpVT: SrcVal.getValueType(),
5887 RetVT: Op.getValueType());
5888 else
5889 LC = RTLIB::getFPTOUINT(OpVT: SrcVal.getValueType(),
5890 RetVT: Op.getValueType());
5891 SDLoc Loc(Op);
5892 MakeLibCallOptions CallOptions;
5893 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
5894 SDValue Result;
5895 std::tie(args&: Result, args&: Chain) = makeLibCall(DAG, LC, RetVT: Op.getValueType(), Ops: SrcVal,
5896 CallOptions, dl: Loc, Chain);
5897 return IsStrict ? DAG.getMergeValues(Ops: {Result, Chain}, dl: Loc) : Result;
5898 }
5899
5900 // FIXME: Remove this when we have strict fp instruction selection patterns
5901 if (IsStrict) {
5902 SDLoc Loc(Op);
5903 SDValue Result =
5904 DAG.getNode(Opcode: Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5905 : ISD::FP_TO_UINT,
5906 DL: Loc, VT: Op.getValueType(), Operand: SrcVal);
5907 return DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl: Loc);
5908 }
5909
5910 return Op;
5911}
5912
5913static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5914 const ARMSubtarget *Subtarget) {
5915 EVT VT = Op.getValueType();
5916 EVT ToVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
5917 EVT FromVT = Op.getOperand(i: 0).getValueType();
5918
5919 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5920 return Op;
5921 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5922 Subtarget->hasFP64())
5923 return Op;
5924 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5925 Subtarget->hasFullFP16())
5926 return Op;
5927 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5928 Subtarget->hasMVEFloatOps())
5929 return Op;
5930 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5931 Subtarget->hasMVEFloatOps())
5932 return Op;
5933
5934 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5935 return SDValue();
5936
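  // Otherwise emit a full-width saturating conversion and clamp the result
  // explicitly. For example, a signed saturate to i16 uses BW == 15 and
  // clamps to [-(1 << 15), (1 << 15) - 1] = [-32768, 32767] via SMIN/SMAX.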
5937 SDLoc DL(Op);
5938 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5939 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5940 SDValue CVT = DAG.getNode(Opcode: Op.getOpcode(), DL, VT, N1: Op.getOperand(i: 0),
5941 N2: DAG.getValueType(VT.getScalarType()));
5942 SDValue Max = DAG.getNode(Opcode: IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, N1: CVT,
5943 N2: DAG.getConstant(Val: (1 << BW) - 1, DL, VT));
5944 if (IsSigned)
5945 Max = DAG.getNode(Opcode: ISD::SMAX, DL, VT, N1: Max,
5946 N2: DAG.getConstant(Val: -(1 << BW), DL, VT));
5947 return Max;
5948}
5949
5950static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5951 EVT VT = Op.getValueType();
5952 SDLoc dl(Op);
5953
5954 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5955 if (VT.getVectorElementType() == MVT::f32)
5956 return Op;
5957 return DAG.UnrollVectorOp(N: Op.getNode());
5958 }
5959
5960 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5961 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5962 "Invalid type for custom lowering!");
5963
5964 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5965
5966 EVT DestVecType;
5967 if (VT == MVT::v4f32)
5968 DestVecType = MVT::v4i32;
5969 else if (VT == MVT::v4f16 && HasFullFP16)
5970 DestVecType = MVT::v4i16;
5971 else if (VT == MVT::v8f16 && HasFullFP16)
5972 DestVecType = MVT::v8i16;
5973 else
5974 return DAG.UnrollVectorOp(N: Op.getNode());
5975
5976 unsigned CastOpc;
5977 unsigned Opc;
5978 switch (Op.getOpcode()) {
5979 default: llvm_unreachable("Invalid opcode!");
5980 case ISD::SINT_TO_FP:
5981 CastOpc = ISD::SIGN_EXTEND;
5982 Opc = ISD::SINT_TO_FP;
5983 break;
5984 case ISD::UINT_TO_FP:
5985 CastOpc = ISD::ZERO_EXTEND;
5986 Opc = ISD::UINT_TO_FP;
5987 break;
5988 }
5989
5990 Op = DAG.getNode(Opcode: CastOpc, DL: dl, VT: DestVecType, Operand: Op.getOperand(i: 0));
5991 return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: Op);
5992}
5993
5994SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5995 EVT VT = Op.getValueType();
5996 if (VT.isVector())
5997 return LowerVectorINT_TO_FP(Op, DAG);
5998 if (isUnsupportedFloatingType(VT)) {
5999 RTLIB::Libcall LC;
6000 if (Op.getOpcode() == ISD::SINT_TO_FP)
6001 LC = RTLIB::getSINTTOFP(OpVT: Op.getOperand(i: 0).getValueType(),
6002 RetVT: Op.getValueType());
6003 else
6004 LC = RTLIB::getUINTTOFP(OpVT: Op.getOperand(i: 0).getValueType(),
6005 RetVT: Op.getValueType());
6006 MakeLibCallOptions CallOptions;
6007 return makeLibCall(DAG, LC, RetVT: Op.getValueType(), Ops: Op.getOperand(i: 0),
6008 CallOptions, dl: SDLoc(Op)).first;
6009 }
6010
6011 return Op;
6012}
6013
6014SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Implement fcopysign by transferring the sign bit: with NEON this builds a
  // VBSL-style bit select, otherwise the sign bit is masked and ORed in
  // integer registers.
6016 SDValue Tmp0 = Op.getOperand(i: 0);
6017 SDValue Tmp1 = Op.getOperand(i: 1);
6018 SDLoc dl(Op);
6019 EVT VT = Op.getValueType();
6020 EVT SrcVT = Tmp1.getValueType();
6021 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
6022 Tmp0.getOpcode() == ARMISD::VMOVDRR;
6023 bool UseNEON = !InGPR && Subtarget->hasNEON();
6024
6025 if (UseNEON) {
6026 // Use VBSL to copy the sign bit.
6027 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode: 0x6, Val: 0x80);
6028 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
6029 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
6030 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
6031 if (VT == MVT::f64)
6032 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6033 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
6034 DAG.getConstant(32, dl, MVT::i32));
6035 else /*if (VT == MVT::f32)*/
6036 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
6037 if (SrcVT == MVT::f32) {
6038 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
6039 if (VT == MVT::f64)
6040 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
6041 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
6042 DAG.getConstant(32, dl, MVT::i32));
6043 } else if (VT == MVT::f32)
6044 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
6045 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
6046 DAG.getConstant(32, dl, MVT::i32));
6047 Tmp0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: OpVT, Operand: Tmp0);
6048 Tmp1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: OpVT, Operand: Tmp1);
6049
6050 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
6051 dl, MVT::i32);
6052 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
6053 SDValue MaskNot = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: OpVT, N1: Mask,
6054 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: OpVT, Operand: AllOnes));
6055
6056 SDValue Res = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: OpVT,
6057 N1: DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Tmp1, N2: Mask),
6058 N2: DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Tmp0, N2: MaskNot));
6059 if (VT == MVT::f32) {
6060 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
6061 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
6062 DAG.getConstant(0, dl, MVT::i32));
6063 } else {
6064 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
6065 }
6066
6067 return Res;
6068 }
6069
6070 // Bitcast operand 1 to i32.
6071 if (SrcVT == MVT::f64)
6072 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6073 Tmp1).getValue(1);
6074 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
6075
6076 // Or in the signbit with integer operations.
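  // That is, result = (bits(Tmp0) & 0x7fffffff) | (bits(Tmp1) & 0x80000000):
  // keep Tmp0's magnitude and take the sign bit from Tmp1.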
6077 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
6078 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
6079 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
6080 if (VT == MVT::f32) {
6081 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
6082 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
6083 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
6084 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
6085 }
6086
6087 // f64: Or the high part with signbit and then combine two parts.
6088 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
6089 Tmp0);
6090 SDValue Lo = Tmp0.getValue(R: 0);
6091 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
6092 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
6093 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
6094}
6095
6096SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
6097 MachineFunction &MF = DAG.getMachineFunction();
6098 MachineFrameInfo &MFI = MF.getFrameInfo();
6099 MFI.setReturnAddressIsTaken(true);
6100
6101 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
6102 return SDValue();
6103
6104 EVT VT = Op.getValueType();
6105 SDLoc dl(Op);
6106 unsigned Depth = Op.getConstantOperandVal(i: 0);
6107 if (Depth) {
6108 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6109 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
6110 return DAG.getLoad(VT, dl, Chain: DAG.getEntryNode(),
6111 Ptr: DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: FrameAddr, N2: Offset),
6112 PtrInfo: MachinePointerInfo());
6113 }
6114
6115 // Return LR, which contains the return address. Mark it an implicit live-in.
6116 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
6117 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg, VT);
6118}
6119
6120SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
6121 const ARMBaseRegisterInfo &ARI =
6122 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
6123 MachineFunction &MF = DAG.getMachineFunction();
6124 MachineFrameInfo &MFI = MF.getFrameInfo();
6125 MFI.setFrameAddressIsTaken(true);
6126
6127 EVT VT = Op.getValueType();
6128 SDLoc dl(Op); // FIXME probably not meaningful
6129 unsigned Depth = Op.getConstantOperandVal(i: 0);
6130 Register FrameReg = ARI.getFrameRegister(MF);
6131 SDValue FrameAddr = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg: FrameReg, VT);
6132 while (Depth--)
6133 FrameAddr = DAG.getLoad(VT, dl, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
6134 PtrInfo: MachinePointerInfo());
6135 return FrameAddr;
6136}
6137
6138// FIXME? Maybe this could be a TableGen attribute on some registers and
6139// this table could be generated automatically from RegInfo.
6140Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
6141 const MachineFunction &MF) const {
6142 Register Reg = StringSwitch<unsigned>(RegName)
6143 .Case("sp", ARM::SP)
6144 .Default(0);
6145 if (Reg)
6146 return Reg;
6147 report_fatal_error(reason: Twine("Invalid register name \""
6148 + StringRef(RegName) + "\"."));
6149}
6150
6151// Result is 64 bit value so split into two 32 bit values and return as a
6152// pair of values.
6153static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
6154 SelectionDAG &DAG) {
6155 SDLoc DL(N);
6156
6157 // This function is only supposed to be called for i64 type destination.
6158 assert(N->getValueType(0) == MVT::i64
6159 && "ExpandREAD_REGISTER called for non-i64 type result.");
6160
6161 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
6162 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
6163 N->getOperand(0),
6164 N->getOperand(1));
6165
6166 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
6167 Read.getValue(1)));
6168 Results.push_back(Elt: Read.getOperand(i: 0));
6169}
6170
6171/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
6172/// When \p DstVT, the destination type of \p BC, is on the vector
6173/// register bank and the source of bitcast, \p Op, operates on the same bank,
6174/// it might be possible to combine them, such that everything stays on the
6175/// vector register bank.
/// \return The node that would replace \p BC, if the combine
/// is possible.
6178static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
6179 SelectionDAG &DAG) {
6180 SDValue Op = BC->getOperand(Num: 0);
6181 EVT DstVT = BC->getValueType(ResNo: 0);
6182
6183 // The only vector instruction that can produce a scalar (remember,
6184 // since the bitcast was about to be turned into VMOVDRR, the source
6185 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
6186 // Moreover, we can do this combine only if there is one use.
6187 // Finally, if the destination type is not a vector, there is not
6188 // much point on forcing everything on the vector bank.
6189 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6190 !Op.hasOneUse())
6191 return SDValue();
6192
6193 // If the index is not constant, we will introduce an additional
6194 // multiply that will stick.
6195 // Give up in that case.
6196 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
6197 if (!Index)
6198 return SDValue();
6199 unsigned DstNumElt = DstVT.getVectorNumElements();
6200
6201 // Compute the new index.
6202 const APInt &APIntIndex = Index->getAPIntValue();
6203 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6204 NewIndex *= APIntIndex;
6205 // Check if the new constant index fits into i32.
6206 if (NewIndex.getBitWidth() > 32)
6207 return SDValue();
6208
6209 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6210 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
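  // For example, with DstVT == v2f32 and index 1 into a v2i64 source:
  //   v2f32 (bitcast (i64 extractelt v2i64 %src, 1))
  //     -> v2f32 (extract_subvector (v4f32 (bitcast %src)), 2)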
6211 SDLoc dl(Op);
6212 SDValue ExtractSrc = Op.getOperand(i: 0);
6213 EVT VecVT = EVT::getVectorVT(
6214 Context&: *DAG.getContext(), VT: DstVT.getScalarType(),
6215 NumElements: ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6216 SDValue BitCast = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: ExtractSrc);
6217 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6218 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6219}
6220
6221/// ExpandBITCAST - If the target supports VFP, this function is called to
6222/// expand a bit convert where either the source or destination type is i64 to
6223/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6224/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6225/// vectors), since the legalizer won't know what to do with that.
6226SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6227 const ARMSubtarget *Subtarget) const {
6228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6229 SDLoc dl(N);
6230 SDValue Op = N->getOperand(Num: 0);
6231
6232 // This function is only supposed to be called for i16 and i64 types, either
6233 // as the source or destination of the bit convert.
6234 EVT SrcVT = Op.getValueType();
6235 EVT DstVT = N->getValueType(ResNo: 0);
6236
6237 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6238 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6239 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6240 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6241
6242 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6243 (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
6244 return DAG.getNode(
6245 ISD::TRUNCATE, SDLoc(N), DstVT,
6246 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6247
6248 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6249 return SDValue();
6250
6251 // Turn i64->f64 into VMOVDRR.
6252 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
6253 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6254 // if we can combine the bitcast with its source.
6255 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(BC: N, DAG))
6256 return Val;
6257 SDValue Lo, Hi;
6258 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6259 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6260 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6261 }
6262
6263 // Turn f64->i64 into VMOVRRD.
6264 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
6265 SDValue Cvt;
6266 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6267 SrcVT.getVectorNumElements() > 1)
6268 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6269 DAG.getVTList(MVT::i32, MVT::i32),
6270 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6271 else
6272 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6273 DAG.getVTList(MVT::i32, MVT::i32), Op);
6274 // Merge the pieces into a single i64 value.
6275 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6276 }
6277
6278 return SDValue();
6279}
6280
6281/// getZeroVector - Returns a vector of specified type with all zero elements.
6282/// Zero vectors are used to represent vector negation and in those cases
6283/// will be implemented with the NEON VNEG instruction. However, VNEG does
6284/// not support i64 elements, so sometimes the zero vectors will need to be
6285/// explicitly constructed. Regardless, use a canonical VMOV to create the
6286/// zero vector.
6287static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6288 assert(VT.isVector() && "Expected a vector type");
6289 // The canonical modified immediate encoding of a zero vector is....0!
6290 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6291 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6292 SDValue Vmov = DAG.getNode(Opcode: ARMISD::VMOVIMM, DL: dl, VT: VmovVT, Operand: EncodedVal);
6293 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vmov);
6294}
6295
/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
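/// For shift amounts below the register width the lowering computes
///   Lo = (Lo >> Amt) | (Hi << (VTBits - Amt)) and Hi = Hi >> Amt
/// (arithmetic or logical per the opcode); larger amounts select the
/// "big shift" values via CMOV on Amt - VTBits >= 0.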
6298SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6299 SelectionDAG &DAG) const {
6300 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6301 EVT VT = Op.getValueType();
6302 unsigned VTBits = VT.getSizeInBits();
6303 SDLoc dl(Op);
6304 SDValue ShOpLo = Op.getOperand(i: 0);
6305 SDValue ShOpHi = Op.getOperand(i: 1);
6306 SDValue ShAmt = Op.getOperand(i: 2);
6307 SDValue ARMcc;
6308 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6309 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6310
6311 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6312
6313 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6314 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6315 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: ShAmt);
6316 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6317 DAG.getConstant(VTBits, dl, MVT::i32));
6318 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: RevShAmt);
6319 SDValue LoSmallShift = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2);
6320 SDValue LoBigShift = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ExtraShAmt);
6321 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6322 ISD::SETGE, ARMcc, DAG, dl);
6323 SDValue Lo = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: LoSmallShift, N2: LoBigShift,
6324 N3: ARMcc, N4: CCR, N5: CmpLo);
6325
6326 SDValue HiSmallShift = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt);
6327 SDValue HiBigShift = Opc == ISD::SRA
6328 ? DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi,
6329 N2: DAG.getConstant(Val: VTBits - 1, DL: dl, VT))
6330 : DAG.getConstant(Val: 0, DL: dl, VT);
6331 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6332 ISD::SETGE, ARMcc, DAG, dl);
6333 SDValue Hi = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: HiSmallShift, N2: HiBigShift,
6334 N3: ARMcc, N4: CCR, N5: CmpHi);
6335
6336 SDValue Ops[2] = { Lo, Hi };
6337 return DAG.getMergeValues(Ops, dl);
6338}
6339
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6342SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6343 SelectionDAG &DAG) const {
6344 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6345 EVT VT = Op.getValueType();
6346 unsigned VTBits = VT.getSizeInBits();
6347 SDLoc dl(Op);
6348 SDValue ShOpLo = Op.getOperand(i: 0);
6349 SDValue ShOpHi = Op.getOperand(i: 1);
6350 SDValue ShAmt = Op.getOperand(i: 2);
6351 SDValue ARMcc;
6352 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6353
6354 assert(Op.getOpcode() == ISD::SHL_PARTS);
6355 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6356 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6357 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: RevShAmt);
6358 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: ShAmt);
6359 SDValue HiSmallShift = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2);
6360
6361 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6362 DAG.getConstant(VTBits, dl, MVT::i32));
6363 SDValue HiBigShift = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ExtraShAmt);
6364 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6365 ISD::SETGE, ARMcc, DAG, dl);
6366 SDValue Hi = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: HiSmallShift, N2: HiBigShift,
6367 N3: ARMcc, N4: CCR, N5: CmpHi);
6368
6369 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6370 ISD::SETGE, ARMcc, DAG, dl);
6371 SDValue LoSmallShift = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt);
6372 SDValue Lo = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: LoSmallShift,
6373 N2: DAG.getConstant(Val: 0, DL: dl, VT), N3: ARMcc, N4: CCR, N5: CmpLo);
6374
6375 SDValue Ops[2] = { Lo, Hi };
6376 return DAG.getMergeValues(Ops, dl);
6377}
6378
6379SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6380 SelectionDAG &DAG) const {
6381 // The rounding mode is in bits 23:22 of the FPSCR.
6382 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
  // so that the shift and the mask get folded into a bitfield extract.
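  // For example, FPSCR rounding mode 3 (round toward zero) gives
  // ((3 << 22) + (1 << 22)) >> 22 = 4, and 4 & 3 = 0, matching FLT_ROUNDS'
  // encoding of "toward zero".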
6385 SDLoc dl(Op);
6386 SDValue Chain = Op.getOperand(i: 0);
6387 SDValue Ops[] = {Chain,
6388 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6389
6390 SDValue FPSCR =
6391 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6392 Chain = FPSCR.getValue(R: 1);
6393 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6394 DAG.getConstant(1U << 22, dl, MVT::i32));
6395 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6396 DAG.getConstant(22, dl, MVT::i32));
6397 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6398 DAG.getConstant(3, dl, MVT::i32));
6399 return DAG.getMergeValues(Ops: {And, Chain}, dl);
6400}
6401
6402SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6403 SelectionDAG &DAG) const {
6404 SDLoc DL(Op);
6405 SDValue Chain = Op->getOperand(Num: 0);
6406 SDValue RMValue = Op->getOperand(Num: 1);
6407
6408 // The rounding mode is in bits 23:22 of the FPSCR.
6409 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6410 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // ((arg - 1) & 3) << 22.
6412 //
6413 // It is expected that the argument of llvm.set.rounding is within the
6414 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6415 // responsibility of the code generated llvm.set.rounding to ensure this
6416 // condition.
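  // For example, llvm.set.rounding(0) (toward zero) computes (0 - 1) & 3 = 3,
  // the FPSCR RZ encoding; likewise 1 (nearest) -> 0 (RN), 2 -> 1 (RP) and
  // 3 -> 2 (RM).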
6417
6418 // Calculate new value of FPSCR[23:22].
6419 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6420 DAG.getConstant(1, DL, MVT::i32));
6421 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6422 DAG.getConstant(0x3, DL, MVT::i32));
6423 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6424 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6425
6426 // Get current value of FPSCR.
6427 SDValue Ops[] = {Chain,
6428 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6429 SDValue FPSCR =
6430 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6431 Chain = FPSCR.getValue(R: 1);
6432 FPSCR = FPSCR.getValue(R: 0);
6433
6434 // Put new rounding mode into FPSCR[23:22].
6435 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6436 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6437 DAG.getConstant(RMMask, DL, MVT::i32));
6438 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6439 SDValue Ops2[] = {
6440 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6441 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6442}
6443
6444SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6445 SelectionDAG &DAG) const {
6446 SDLoc DL(Op);
6447 SDValue Chain = Op->getOperand(Num: 0);
6448 SDValue Mode = Op->getOperand(Num: 1);
6449
6450 // Generate nodes to build:
6451 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6452 SDValue Ops[] = {Chain,
6453 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6454 SDValue FPSCR =
6455 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6456 Chain = FPSCR.getValue(R: 1);
6457 FPSCR = FPSCR.getValue(R: 0);
6458
6459 SDValue FPSCRMasked =
6460 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6461 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6462 SDValue InputMasked =
6463 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6464 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6465 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6466
6467 SDValue Ops2[] = {
6468 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6469 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6470}
6471
6472SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6473 SelectionDAG &DAG) const {
6474 SDLoc DL(Op);
6475 SDValue Chain = Op->getOperand(Num: 0);
6476
6477 // To get the default FP mode all control bits are cleared:
6478 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6479 SDValue Ops[] = {Chain,
6480 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6481 SDValue FPSCR =
6482 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6483 Chain = FPSCR.getValue(R: 1);
6484 FPSCR = FPSCR.getValue(R: 0);
6485
6486 SDValue FPSCRMasked = DAG.getNode(
6487 ISD::AND, DL, MVT::i32, FPSCR,
6488 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6489 SDValue Ops2[] = {Chain,
6490 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6491 FPSCRMasked};
6492 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6493}
6494
6495static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6496 const ARMSubtarget *ST) {
6497 SDLoc dl(N);
6498 EVT VT = N->getValueType(ResNo: 0);
6499 if (VT.isVector() && ST->hasNEON()) {
6500
6501 // Compute the least significant set bit: LSB = X & -X
6502 SDValue X = N->getOperand(Num: 0);
6503 SDValue NX = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: getZeroVector(VT, DAG, dl), N2: X);
6504 SDValue LSB = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: X, N2: NX);
6505
6506 EVT ElemTy = VT.getVectorElementType();
6507
6508 if (ElemTy == MVT::i8) {
6509 // Compute with: cttz(x) = ctpop(lsb - 1)
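      // For example, x = 0b0101'1000: LSB = 0b0000'1000, LSB - 1 = 0b0000'0111
      // and ctpop(LSB - 1) = 3 == cttz(x).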
6510 SDValue One = DAG.getNode(Opcode: ARMISD::VMOVIMM, DL: dl, VT,
6511 Operand: DAG.getTargetConstant(Val: 1, DL: dl, VT: ElemTy));
6512 SDValue Bits = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LSB, N2: One);
6513 return DAG.getNode(Opcode: ISD::CTPOP, DL: dl, VT, Operand: Bits);
6514 }
6515
6516 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6517 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6518 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6519 unsigned NumBits = ElemTy.getSizeInBits();
6520 SDValue WidthMinus1 =
6521 DAG.getNode(Opcode: ARMISD::VMOVIMM, DL: dl, VT,
6522 Operand: DAG.getTargetConstant(Val: NumBits - 1, DL: dl, VT: ElemTy));
6523 SDValue CTLZ = DAG.getNode(Opcode: ISD::CTLZ, DL: dl, VT, Operand: LSB);
6524 return DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: WidthMinus1, N2: CTLZ);
6525 }
6526
6527 // Compute with: cttz(x) = ctpop(lsb - 1)
6528
6529 // Compute LSB - 1.
6530 SDValue Bits;
6531 if (ElemTy == MVT::i64) {
6532 // Load constant 0xffff'ffff'ffff'ffff to register.
6533 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6534 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6535 Bits = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: LSB, N2: FF);
6536 } else {
6537 SDValue One = DAG.getNode(Opcode: ARMISD::VMOVIMM, DL: dl, VT,
6538 Operand: DAG.getTargetConstant(Val: 1, DL: dl, VT: ElemTy));
6539 Bits = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LSB, N2: One);
6540 }
6541 return DAG.getNode(Opcode: ISD::CTPOP, DL: dl, VT, Operand: Bits);
6542 }
6543
6544 if (!ST->hasV6T2Ops())
6545 return SDValue();
6546
6547 SDValue rbit = DAG.getNode(Opcode: ISD::BITREVERSE, DL: dl, VT, Operand: N->getOperand(Num: 0));
6548 return DAG.getNode(Opcode: ISD::CTLZ, DL: dl, VT, Operand: rbit);
6549}
6550
6551static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6552 const ARMSubtarget *ST) {
6553 EVT VT = N->getValueType(ResNo: 0);
6554 SDLoc DL(N);
6555
6556 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6557 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6558 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6559 "Unexpected type for custom ctpop lowering");
6560
6561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6562 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6563 SDValue Res = DAG.getBitcast(VT: VT8Bit, V: N->getOperand(Num: 0));
6564 Res = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Res);
6565
6566 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
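  // For example, for VT == v4i32 the v16i8 ctpop result is widened twice:
  // vpaddlu to v8i16, then vpaddlu again to v4i32.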
6567 unsigned EltSize = 8;
6568 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6569 while (EltSize != VT.getScalarSizeInBits()) {
6570 SmallVector<SDValue, 8> Ops;
6571 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6572 TLI.getPointerTy(DAG.getDataLayout())));
6573 Ops.push_back(Elt: Res);
6574
6575 EltSize *= 2;
6576 NumElts /= 2;
6577 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
6578 Res = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: WidenVT, Ops);
6579 }
6580
6581 return Res;
6582}
6583
6584 /// getVShiftImm - Check if this is a valid build_vector for the immediate
6585/// operand of a vector shift operation, where all the elements of the
6586/// build_vector must have the same constant integer value.
6587static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6588 // Ignore bit_converts.
6589 while (Op.getOpcode() == ISD::BITCAST)
6590 Op = Op.getOperand(i: 0);
6591 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
6592 APInt SplatBits, SplatUndef;
6593 unsigned SplatBitSize;
6594 bool HasAnyUndefs;
6595 if (!BVN ||
6596 !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6597 MinSplatBits: ElementBits) ||
6598 SplatBitSize > ElementBits)
6599 return false;
6600 Cnt = SplatBits.getSExtValue();
6601 return true;
6602}
6603
6604/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6605/// operand of a vector shift left operation. That value must be in the range:
6606/// 0 <= Value < ElementBits for a left shift; or
6607/// 0 <= Value <= ElementBits for a long left shift.
6608static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6609 assert(VT.isVector() && "vector shift count is not a vector type");
6610 int64_t ElementBits = VT.getScalarSizeInBits();
6611 if (!getVShiftImm(Op, ElementBits, Cnt))
6612 return false;
6613 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6614}
6615
6616/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6617/// operand of a vector shift right operation. For a shift opcode, the value
6618 /// is positive, but for an intrinsic the shift count must be negative. The
6619/// absolute value must be in the range:
6620/// 1 <= |Value| <= ElementBits for a right shift; or
6621/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
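/// For illustration: with 32-bit elements, a shift opcode accepts a splat of
/// 1..32 (1..16 when narrowing), whereas an intrinsic encodes the same shift
/// as a splat of -32..-1 and Cnt is returned negated.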
6622static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6623 int64_t &Cnt) {
6624 assert(VT.isVector() && "vector shift count is not a vector type");
6625 int64_t ElementBits = VT.getScalarSizeInBits();
6626 if (!getVShiftImm(Op, ElementBits, Cnt))
6627 return false;
6628 if (!isIntrinsic)
6629 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6630 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6631 Cnt = -Cnt;
6632 return true;
6633 }
6634 return false;
6635}
6636
6637static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6638 const ARMSubtarget *ST) {
6639 EVT VT = N->getValueType(ResNo: 0);
6640 SDLoc dl(N);
6641 int64_t Cnt;
6642
6643 if (!VT.isVector())
6644 return SDValue();
6645
6646 // We essentially have two forms here. Shift by an immediate and shift by a
6647 // vector register (there are also shift by a gpr, but that is just handled
6648 // with a tablegen pattern). We cannot easily match shift by an immediate in
6649 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6650 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6651 // signed or unsigned, and a negative shift indicates a shift right).
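  // For example, (srl x, splat 3) becomes VSHRuIMM(x, 3), whereas
  // (srl x, y) with a non-constant y becomes VSHLu(x, 0 - y).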
6652 if (N->getOpcode() == ISD::SHL) {
6653 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6654 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6655 DAG.getConstant(Cnt, dl, MVT::i32));
6656 return DAG.getNode(Opcode: ARMISD::VSHLu, DL: dl, VT, N1: N->getOperand(Num: 0),
6657 N2: N->getOperand(Num: 1));
6658 }
6659
6660 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6661 "unexpected vector shift opcode");
6662
6663 if (isVShiftRImm(Op: N->getOperand(Num: 1), VT, isNarrow: false, isIntrinsic: false, Cnt)) {
6664 unsigned VShiftOpc =
6665 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6666 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6667 DAG.getConstant(Cnt, dl, MVT::i32));
6668 }
6669
6670 // Other right shifts we don't have operations for (we use a shift left by a
6671 // negative number).
6672 EVT ShiftVT = N->getOperand(Num: 1).getValueType();
6673 SDValue NegatedCount = DAG.getNode(
6674 Opcode: ISD::SUB, DL: dl, VT: ShiftVT, N1: getZeroVector(VT: ShiftVT, DAG, dl), N2: N->getOperand(Num: 1));
6675 unsigned VShiftOpc =
6676 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6677 return DAG.getNode(Opcode: VShiftOpc, DL: dl, VT, N1: N->getOperand(Num: 0), N2: NegatedCount);
6678}
6679
6680static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6681 const ARMSubtarget *ST) {
6682 EVT VT = N->getValueType(ResNo: 0);
6683 SDLoc dl(N);
6684
6685 // We can get here for a node like i32 = ISD::SHL i32, i64
6686 if (VT != MVT::i64)
6687 return SDValue();
6688
6689 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6690 N->getOpcode() == ISD::SHL) &&
6691 "Unknown shift to lower!");
6692
6693 unsigned ShOpc = N->getOpcode();
6694 if (ST->hasMVEIntegerOps()) {
6695 SDValue ShAmt = N->getOperand(Num: 1);
6696 unsigned ShPartsOpc = ARMISD::LSLL;
6697 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(Val&: ShAmt);
6698
6699 // If the shift amount is a constant equal to zero or at least 32, or a
6700 // non-constant wider than 64 bits, fall back to the default expansion.
6701 if ((!Con && ShAmt->getValueType(ResNo: 0).getSizeInBits() > 64) ||
6702 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(RHS: 32))))
6703 return SDValue();
6704
6705 // Extract the lower 32 bits of the shift amount if it's not an i32
6706 if (ShAmt->getValueType(0) != MVT::i32)
6707 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6708
6709 if (ShOpc == ISD::SRL) {
6710 if (!Con)
6711 // There is no t2LSRLr instruction so negate and perform an lsll if the
6712 // shift amount is in a register, emulating a right shift.
6713 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6714 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6715 else
6716 // Else generate an lsrl on the immediate shift amount
6717 ShPartsOpc = ARMISD::LSRL;
6718 } else if (ShOpc == ISD::SRA)
6719 ShPartsOpc = ARMISD::ASRL;
6720
6721 // Split Lower/Upper 32 bits of the destination/source
6722 SDValue Lo, Hi;
6723 std::tie(Lo, Hi) =
6724 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6725 // Generate the shift operation as computed above
6726 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6727 ShAmt);
6728 // The upper 32 bits come from the second return value of lsll
6729 Hi = SDValue(Lo.getNode(), 1);
6730 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6731 }
6732
6733 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6734 if (!isOneConstant(V: N->getOperand(Num: 1)) || N->getOpcode() == ISD::SHL)
6735 return SDValue();
6736
6737 // If we are in thumb mode, we don't have RRX.
6738 if (ST->isThumb1Only())
6739 return SDValue();
6740
6741 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6742 SDValue Lo, Hi;
6743 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6744
6745 // First, build a SRA_GLUE/SRL_GLUE op, which shifts the top part by one and
6746 // captures the result into a carry flag.
6747 unsigned Opc =
      N->getOpcode() == ISD::SRL ? ARMISD::SRL_GLUE : ARMISD::SRA_GLUE;
6748 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6749
6750 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6751 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6752
6753 // Merge the pieces into a single i64 value.
6754 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6755}
6756
6757static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6758 const ARMSubtarget *ST) {
6759 bool Invert = false;
6760 bool Swap = false;
6761 unsigned Opc = ARMCC::AL;
6762
6763 SDValue Op0 = Op.getOperand(i: 0);
6764 SDValue Op1 = Op.getOperand(i: 1);
6765 SDValue CC = Op.getOperand(i: 2);
6766 EVT VT = Op.getValueType();
6767 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
6768 SDLoc dl(Op);
6769
6770 EVT CmpVT;
6771 if (ST->hasNEON())
6772 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
6773 else {
6774 assert(ST->hasMVEIntegerOps() &&
6775 "No hardware support for integer vector comparison!");
6776
6777 if (Op.getValueType().getVectorElementType() != MVT::i1)
6778 return SDValue();
6779
6780 // Make sure we expand floating point setcc to scalar if we do not have
6781 // mve.fp, so that we can handle them from there.
6782 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6783 return SDValue();
6784
6785 CmpVT = VT;
6786 }
6787
6788 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6789 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6790 // Special-case integer 64-bit equality comparisons. They aren't legal,
6791 // but they can be lowered with a few vector instructions.
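    // In outline: bitcast both operands to v(2N)i32, compare the 32-bit
    // halves for equality, then AND each half with its VREV64-swapped
    // neighbour so a 64-bit lane is all-ones only if both halves matched;
    // SETNE simply inverts that result.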
6792 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6793 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6794 SDValue CastOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: SplitVT, Operand: Op0);
6795 SDValue CastOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: SplitVT, Operand: Op1);
6796 SDValue Cmp = DAG.getNode(Opcode: ISD::SETCC, DL: dl, VT: SplitVT, N1: CastOp0, N2: CastOp1,
6797 N3: DAG.getCondCode(Cond: ISD::SETEQ));
6798 SDValue Reversed = DAG.getNode(Opcode: ARMISD::VREV64, DL: dl, VT: SplitVT, Operand: Cmp);
6799 SDValue Merged = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: SplitVT, N1: Cmp, N2: Reversed);
6800 Merged = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: CmpVT, Operand: Merged);
6801 if (SetCCOpcode == ISD::SETNE)
6802 Merged = DAG.getNOT(DL: dl, Val: Merged, VT: CmpVT);
6803 Merged = DAG.getSExtOrTrunc(Op: Merged, DL: dl, VT);
6804 return Merged;
6805 }
6806
6807 if (CmpVT.getVectorElementType() == MVT::i64)
6808 // 64-bit comparisons are not legal in general.
6809 return SDValue();
6810
6811 if (Op1.getValueType().isFloatingPoint()) {
6812 switch (SetCCOpcode) {
6813 default: llvm_unreachable("Illegal FP comparison");
6814 case ISD::SETUNE:
6815 case ISD::SETNE:
6816 if (ST->hasMVEFloatOps()) {
6817 Opc = ARMCC::NE; break;
6818 } else {
6819 Invert = true; [[fallthrough]];
6820 }
6821 case ISD::SETOEQ:
6822 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6823 case ISD::SETOLT:
6824 case ISD::SETLT: Swap = true; [[fallthrough]];
6825 case ISD::SETOGT:
6826 case ISD::SETGT: Opc = ARMCC::GT; break;
6827 case ISD::SETOLE:
6828 case ISD::SETLE: Swap = true; [[fallthrough]];
6829 case ISD::SETOGE:
6830 case ISD::SETGE: Opc = ARMCC::GE; break;
6831 case ISD::SETUGE: Swap = true; [[fallthrough]];
6832 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6833 case ISD::SETUGT: Swap = true; [[fallthrough]];
6834 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6835 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6836 case ISD::SETONE: {
6837 // Expand this to (OLT | OGT).
6838 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6839 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6840 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6841 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6842 SDValue Result = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: TmpOp0, N2: TmpOp1);
6843 if (Invert)
6844 Result = DAG.getNOT(DL: dl, Val: Result, VT);
6845 return Result;
6846 }
6847 case ISD::SETUO: Invert = true; [[fallthrough]];
6848 case ISD::SETO: {
6849 // Expand this to (OLT | OGE).
6850 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6851 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6852 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6853 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6854 SDValue Result = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: TmpOp0, N2: TmpOp1);
6855 if (Invert)
6856 Result = DAG.getNOT(DL: dl, Val: Result, VT);
6857 return Result;
6858 }
6859 }
6860 } else {
6861 // Integer comparisons.
6862 switch (SetCCOpcode) {
6863 default: llvm_unreachable("Illegal integer comparison");
6864 case ISD::SETNE:
6865 if (ST->hasMVEIntegerOps()) {
6866 Opc = ARMCC::NE; break;
6867 } else {
6868 Invert = true; [[fallthrough]];
6869 }
6870 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6871 case ISD::SETLT: Swap = true; [[fallthrough]];
6872 case ISD::SETGT: Opc = ARMCC::GT; break;
6873 case ISD::SETLE: Swap = true; [[fallthrough]];
6874 case ISD::SETGE: Opc = ARMCC::GE; break;
6875 case ISD::SETULT: Swap = true; [[fallthrough]];
6876 case ISD::SETUGT: Opc = ARMCC::HI; break;
6877 case ISD::SETULE: Swap = true; [[fallthrough]];
6878 case ISD::SETUGE: Opc = ARMCC::HS; break;
6879 }
6880
6881 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6882 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6883 SDValue AndOp;
6884 if (ISD::isBuildVectorAllZeros(N: Op1.getNode()))
6885 AndOp = Op0;
6886 else if (ISD::isBuildVectorAllZeros(N: Op0.getNode()))
6887 AndOp = Op1;
6888
6889 // Ignore bitconvert.
6890 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6891 AndOp = AndOp.getOperand(i: 0);
6892
6893 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6894 Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: CmpVT, Operand: AndOp.getOperand(i: 0));
6895 Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: CmpVT, Operand: AndOp.getOperand(i: 1));
6896 SDValue Result = DAG.getNode(Opcode: ARMISD::VTST, DL: dl, VT: CmpVT, N1: Op0, N2: Op1);
6897 if (!Invert)
6898 Result = DAG.getNOT(DL: dl, Val: Result, VT);
6899 return Result;
6900 }
6901 }
6902 }
6903
6904 if (Swap)
6905 std::swap(a&: Op0, b&: Op1);
6906
6907 // If one of the operands is a constant vector zero, attempt to fold the
6908 // comparison to a specialized compare-against-zero form.
6909 if (ISD::isBuildVectorAllZeros(N: Op0.getNode()) &&
6910 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6911 Opc == ARMCC::NE)) {
6912 if (Opc == ARMCC::GE)
6913 Opc = ARMCC::LE;
6914 else if (Opc == ARMCC::GT)
6915 Opc = ARMCC::LT;
6916 std::swap(a&: Op0, b&: Op1);
6917 }
6918
6919 SDValue Result;
6920 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6921 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6922 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6923 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6924 DAG.getConstant(Opc, dl, MVT::i32));
6925 else
6926 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6927 DAG.getConstant(Opc, dl, MVT::i32));
6928
6929 Result = DAG.getSExtOrTrunc(Op: Result, DL: dl, VT);
6930
6931 if (Invert)
6932 Result = DAG.getNOT(DL: dl, Val: Result, VT);
6933
6934 return Result;
6935}
6936
6937static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6938 SDValue LHS = Op.getOperand(i: 0);
6939 SDValue RHS = Op.getOperand(i: 1);
6940 SDValue Carry = Op.getOperand(i: 2);
6941 SDValue Cond = Op.getOperand(i: 3);
6942 SDLoc DL(Op);
6943
6944 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6945
6946 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6947 // have to invert the carry first.
6948 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6949 DAG.getConstant(1, DL, MVT::i32), Carry);
6950 // This converts the boolean value carry into the carry flag.
6951 Carry = ConvertBooleanCarryToCarryFlag(BoolCarry: Carry, DAG);
6952
6953 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6954 SDValue Cmp = DAG.getNode(Opcode: ARMISD::SUBE, DL, VTList: VTs, N1: LHS, N2: RHS, N3: Carry);
6955
6956 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6957 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6958 SDValue ARMcc = DAG.getConstant(
6959 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6960 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6961 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6962 Cmp.getValue(1), SDValue());
6963 return DAG.getNode(Opcode: ARMISD::CMOV, DL, VT: Op.getValueType(), N1: FVal, N2: TVal, N3: ARMcc,
6964 N4: CCR, N5: Chain.getValue(R: 1));
6965}
6966
6967/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6968/// valid vector constant for a NEON or MVE instruction with a "modified
6969/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6970static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6971 unsigned SplatBitSize, SelectionDAG &DAG,
6972 const SDLoc &dl, EVT &VT, EVT VectorVT,
6973 VMOVModImmType type) {
6974 unsigned OpCmode, Imm;
6975 bool is128Bits = VectorVT.is128BitVector();
6976
6977 // SplatBitSize is set to the smallest size that splats the vector, so a
6978 // zero vector will always have SplatBitSize == 8. However, NEON modified
6979 // immediate instructions other than VMOV do not support the 8-bit encoding
6980 // of a zero vector, and the default encoding of zero is supposed to be the
6981 // 32-bit version.
6982 if (SplatBits == 0)
6983 SplatBitSize = 32;
6984
6985 switch (SplatBitSize) {
6986 case 8:
6987 if (type != VMOVModImm)
6988 return SDValue();
6989 // Any 1-byte value is OK. Op=0, Cmode=1110.
6990 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6991 OpCmode = 0xe;
6992 Imm = SplatBits;
6993 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6994 break;
6995
6996 case 16:
6997 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6998 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6999 if ((SplatBits & ~0xff) == 0) {
7000 // Value = 0x00nn: Op=x, Cmode=100x.
7001 OpCmode = 0x8;
7002 Imm = SplatBits;
7003 break;
7004 }
7005 if ((SplatBits & ~0xff00) == 0) {
7006 // Value = 0xnn00: Op=x, Cmode=101x.
7007 OpCmode = 0xa;
7008 Imm = SplatBits >> 8;
7009 break;
7010 }
7011 return SDValue();
7012
7013 case 32:
7014 // NEON's 32-bit VMOV supports splat values where:
7015 // * only one byte is nonzero, or
7016 // * the least significant byte is 0xff and the second byte is nonzero, or
7017 // * the least significant 2 bytes are 0xff and the third is nonzero.
7018 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
7019 if ((SplatBits & ~0xff) == 0) {
7020 // Value = 0x000000nn: Op=x, Cmode=000x.
7021 OpCmode = 0;
7022 Imm = SplatBits;
7023 break;
7024 }
7025 if ((SplatBits & ~0xff00) == 0) {
7026 // Value = 0x0000nn00: Op=x, Cmode=001x.
7027 OpCmode = 0x2;
7028 Imm = SplatBits >> 8;
7029 break;
7030 }
7031 if ((SplatBits & ~0xff0000) == 0) {
7032 // Value = 0x00nn0000: Op=x, Cmode=010x.
7033 OpCmode = 0x4;
7034 Imm = SplatBits >> 16;
7035 break;
7036 }
7037 if ((SplatBits & ~0xff000000) == 0) {
7038 // Value = 0xnn000000: Op=x, Cmode=011x.
7039 OpCmode = 0x6;
7040 Imm = SplatBits >> 24;
7041 break;
7042 }
7043
7044 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
7045 if (type == OtherModImm) return SDValue();
7046
7047 if ((SplatBits & ~0xffff) == 0 &&
7048 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
7049 // Value = 0x0000nnff: Op=x, Cmode=1100.
7050 OpCmode = 0xc;
7051 Imm = SplatBits >> 8;
7052 break;
7053 }
7054
7055 // cmode == 0b1101 is not supported for MVE VMVN
7056 if (type == MVEVMVNModImm)
7057 return SDValue();
7058
7059 if ((SplatBits & ~0xffffff) == 0 &&
7060 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
7061 // Value = 0x00nnffff: Op=x, Cmode=1101.
7062 OpCmode = 0xd;
7063 Imm = SplatBits >> 16;
7064 break;
7065 }
7066
7067 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
7068 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
7069 // VMOV.I32. A (very) minor optimization would be to replicate the value
7070 // and fall through here to test for a valid 64-bit splat. But, then the
7071 // caller would also need to check and handle the change in size.
7072 return SDValue();
7073
7074 case 64: {
7075 if (type != VMOVModImm)
7076 return SDValue();
7077 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
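    // For illustration: SplatBits = 0x00ff0000ff0000ff has bytes 0, 3 and 6
    // set, giving Imm = 0b01001001.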
7078 uint64_t BitMask = 0xff;
7079 unsigned ImmMask = 1;
7080 Imm = 0;
7081 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
7082 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
7083 Imm |= ImmMask;
7084 } else if ((SplatBits & BitMask) != 0) {
7085 return SDValue();
7086 }
7087 BitMask <<= 8;
7088 ImmMask <<= 1;
7089 }
7090
7091 if (DAG.getDataLayout().isBigEndian()) {
7092 // Reverse the order of elements within the vector.
7093 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
7094 unsigned Mask = (1 << BytesPerElem) - 1;
7095 unsigned NumElems = 8 / BytesPerElem;
7096 unsigned NewImm = 0;
7097 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
7098 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
7099 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
7100 }
7101 Imm = NewImm;
7102 }
7103
7104 // Op=1, Cmode=1110.
7105 OpCmode = 0x1e;
7106 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
7107 break;
7108 }
7109
7110 default:
7111 llvm_unreachable("unexpected size for isVMOVModifiedImm");
7112 }
7113
7114 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Val: Imm);
7115 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
7116}
7117
7118SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
7119 const ARMSubtarget *ST) const {
7120 EVT VT = Op.getValueType();
7121 bool IsDouble = (VT == MVT::f64);
7122 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Val&: Op);
7123 const APFloat &FPVal = CFP->getValueAPF();
7124
7125 // Prevent floating-point constants from using literal loads
7126 // when execute-only is enabled.
7127 if (ST->genExecuteOnly()) {
7128 // We shouldn't trigger this for v6m execute-only
7129 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
7130 "Unexpected architecture");
7131
7132 // If we can represent the constant as an immediate, don't lower it
7133 if (isFPImmLegal(Imm: FPVal, VT))
7134 return Op;
7135 // Otherwise, construct as integer, and move to float register
7136 APInt INTVal = FPVal.bitcastToAPInt();
7137 SDLoc DL(CFP);
7138 switch (VT.getSimpleVT().SimpleTy) {
7139 default:
7140 llvm_unreachable("Unknown floating point type!");
7141 break;
7142 case MVT::f64: {
7143 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
7144 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
7145 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
7146 }
7147 case MVT::f32:
7148 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
7149 DAG.getConstant(INTVal, DL, MVT::i32));
7150 }
7151 }
7152
7153 if (!ST->hasVFP3Base())
7154 return SDValue();
7155
7156 // Use the default (constant pool) lowering for double constants when we have
7157 // an SP-only FPU
7158 if (IsDouble && !Subtarget->hasFP64())
7159 return SDValue();
7160
7161 // Try splatting with a VMOV.f32...
7162 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPImm: FPVal) : ARM_AM::getFP32Imm(FPImm: FPVal);
7163
7164 if (ImmVal != -1) {
7165 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
7166 // We have code in place to select a valid ConstantFP already, no need to
7167 // do any mangling.
7168 return Op;
7169 }
7170
7171 // It's a float and we are trying to use NEON operations where
7172 // possible. Lower it to a splat followed by an extract.
7173 SDLoc DL(Op);
7174 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
7175 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
7176 NewVal);
7177 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
7178 DAG.getConstant(0, DL, MVT::i32));
7179 }
7180
7181 // The rest of our options are NEON only, make sure that's allowed before
7182 // proceeding..
7183 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
7184 return SDValue();
7185
7186 EVT VMovVT;
7187 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
7188
7189 // It wouldn't really be worth bothering for doubles except for one very
7190 // important value, which does happen to match: 0.0. So make sure we don't do
7191 // anything stupid.
7192 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
7193 return SDValue();
7194
7195 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
7196 SDValue NewVal = isVMOVModifiedImm(SplatBits: iVal & 0xffffffffU, SplatUndef: 0, SplatBitSize: 32, DAG, dl: SDLoc(Op),
7197 VT&: VMovVT, VectorVT: VT, type: VMOVModImm);
7198 if (NewVal != SDValue()) {
7199 SDLoc DL(Op);
7200 SDValue VecConstant = DAG.getNode(Opcode: ARMISD::VMOVIMM, DL, VT: VMovVT,
7201 Operand: NewVal);
7202 if (IsDouble)
7203 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7204
7205 // It's a float: cast and extract a vector element.
7206 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7207 VecConstant);
7208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7209 DAG.getConstant(0, DL, MVT::i32));
7210 }
7211
7212 // Finally, try a VMVN.i32
7213 NewVal = isVMOVModifiedImm(SplatBits: ~iVal & 0xffffffffU, SplatUndef: 0, SplatBitSize: 32, DAG, dl: SDLoc(Op), VT&: VMovVT,
7214 VectorVT: VT, type: VMVNModImm);
7215 if (NewVal != SDValue()) {
7216 SDLoc DL(Op);
7217 SDValue VecConstant = DAG.getNode(Opcode: ARMISD::VMVNIMM, DL, VT: VMovVT, Operand: NewVal);
7218
7219 if (IsDouble)
7220 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7221
7222 // It's a float: cast and extract a vector element.
7223 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7224 VecConstant);
7225 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7226 DAG.getConstant(0, DL, MVT::i32));
7227 }
7228
7229 return SDValue();
7230}
7231
7232 // Check if a VEXT instruction can handle the shuffle mask when the vector
7233 // sources of the shuffle are the same.
7234static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7235 unsigned NumElts = VT.getVectorNumElements();
7236
7237 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7238 if (M[0] < 0)
7239 return false;
7240
7241 Imm = M[0];
7242
7243 // If this is a VEXT shuffle, the immediate value is the index of the first
7244 // element. The other shuffle indices must be the successive elements after
7245 // the first one.
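  // For illustration: for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is accepted
  // with Imm = 3; the expected index wraps from 7 back around to 0.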
7246 unsigned ExpectedElt = Imm;
7247 for (unsigned i = 1; i < NumElts; ++i) {
7248 // Increment the expected index. If it wraps around, just follow it
7249 // back to index zero and keep going.
7250 ++ExpectedElt;
7251 if (ExpectedElt == NumElts)
7252 ExpectedElt = 0;
7253
7254 if (M[i] < 0) continue; // ignore UNDEF indices
7255 if (ExpectedElt != static_cast<unsigned>(M[i]))
7256 return false;
7257 }
7258
7259 return true;
7260}
7261
7262static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7263 bool &ReverseVEXT, unsigned &Imm) {
7264 unsigned NumElts = VT.getVectorNumElements();
7265 ReverseVEXT = false;
7266
7267 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7268 if (M[0] < 0)
7269 return false;
7270
7271 Imm = M[0];
7272
7273 // If this is a VEXT shuffle, the immediate value is the index of the first
7274 // element. The other shuffle indices must be the successive elements after
7275 // the first one.
7276 unsigned ExpectedElt = Imm;
7277 for (unsigned i = 1; i < NumElts; ++i) {
7278 // Increment the expected index. If it wraps around, it may still be
7279 // a VEXT but the source vectors must be swapped.
7280 ExpectedElt += 1;
7281 if (ExpectedElt == NumElts * 2) {
7282 ExpectedElt = 0;
7283 ReverseVEXT = true;
7284 }
7285
7286 if (M[i] < 0) continue; // ignore UNDEF indices
7287 if (ExpectedElt != static_cast<unsigned>(M[i]))
7288 return false;
7289 }
7290
7291 // Adjust the index value if the source operands will be swapped.
7292 if (ReverseVEXT)
7293 Imm -= NumElts;
7294
7295 return true;
7296}
7297
7298static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7299 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7300 // range, then 0 is placed into the resulting vector. So pretty much any mask
7301 // of 8 elements can work here.
7302 return VT == MVT::v8i8 && M.size() == 8;
7303}
7304
7305static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7306 unsigned Index) {
7307 if (Mask.size() == Elements * 2)
7308 return Index / Elements;
7309 return Mask[Index] == 0 ? 0 : 1;
7310}
7311
7312// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7313// checking that pairs of elements in the shuffle mask represent the same index
7314// in each vector, incrementing the expected index by 2 at each step.
7315// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7316// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7317// v2={e,f,g,h}
7318// WhichResult gives the offset for each element in the mask based on which
7319// of the two results it belongs to.
7320//
7321// The transpose can be represented either as:
7322// result1 = shufflevector v1, v2, result1_shuffle_mask
7323// result2 = shufflevector v1, v2, result2_shuffle_mask
7324// where v1/v2 and the shuffle masks have the same number of elements
7325// (here WhichResult (see below) indicates which result is being checked)
7326//
7327// or as:
7328// results = shufflevector v1, v2, shuffle_mask
7329// where both results are returned in one vector and the shuffle mask has twice
7330// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7331// want to check the low half and high half of the shuffle mask as if it were
7332// the other case
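// For illustration: with v4i32 inputs the combined form uses an 8-element
// mask such as <0, 4, 2, 6, 1, 5, 3, 7>, whose low half matches result1 and
// whose high half matches result2 (and WhichResult is forced to 0).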
7333static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7334 unsigned EltSz = VT.getScalarSizeInBits();
7335 if (EltSz == 64)
7336 return false;
7337
7338 unsigned NumElts = VT.getVectorNumElements();
7339 if (M.size() != NumElts && M.size() != NumElts*2)
7340 return false;
7341
7342 // If the mask is twice as long as the input vector then we need to check the
7343 // upper and lower parts of the mask with a matching value for WhichResult
7344 // FIXME: A mask with only even values will be rejected in case the first
7345 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7346 // M[0] is used to determine WhichResult
7347 for (unsigned i = 0; i < M.size(); i += NumElts) {
7348 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7349 for (unsigned j = 0; j < NumElts; j += 2) {
7350 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7351 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7352 return false;
7353 }
7354 }
7355
7356 if (M.size() == NumElts*2)
7357 WhichResult = 0;
7358
7359 return true;
7360}
7361
7362/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7363/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7364/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7365static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7366 unsigned EltSz = VT.getScalarSizeInBits();
7367 if (EltSz == 64)
7368 return false;
7369
7370 unsigned NumElts = VT.getVectorNumElements();
7371 if (M.size() != NumElts && M.size() != NumElts*2)
7372 return false;
7373
7374 for (unsigned i = 0; i < M.size(); i += NumElts) {
7375 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7376 for (unsigned j = 0; j < NumElts; j += 2) {
7377 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7378 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7379 return false;
7380 }
7381 }
7382
7383 if (M.size() == NumElts*2)
7384 WhichResult = 0;
7385
7386 return true;
7387}
7388
7389// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7390// that the mask elements are either all even and in steps of size 2 or all odd
7391// and in steps of size 2.
7392// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7393// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7394// v2={e,f,g,h}
7395 // Requires checks similar to those of isVTRNMask with respect to how the
7396 // results are returned.
7397static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7398 unsigned EltSz = VT.getScalarSizeInBits();
7399 if (EltSz == 64)
7400 return false;
7401
7402 unsigned NumElts = VT.getVectorNumElements();
7403 if (M.size() != NumElts && M.size() != NumElts*2)
7404 return false;
7405
7406 for (unsigned i = 0; i < M.size(); i += NumElts) {
7407 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7408 for (unsigned j = 0; j < NumElts; ++j) {
7409 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7410 return false;
7411 }
7412 }
7413
7414 if (M.size() == NumElts*2)
7415 WhichResult = 0;
7416
7417 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7418 if (VT.is64BitVector() && EltSz == 32)
7419 return false;
7420
7421 return true;
7422}
7423
7424/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7425/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7426 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7427static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7428 unsigned EltSz = VT.getScalarSizeInBits();
7429 if (EltSz == 64)
7430 return false;
7431
7432 unsigned NumElts = VT.getVectorNumElements();
7433 if (M.size() != NumElts && M.size() != NumElts*2)
7434 return false;
7435
7436 unsigned Half = NumElts / 2;
7437 for (unsigned i = 0; i < M.size(); i += NumElts) {
7438 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7439 for (unsigned j = 0; j < NumElts; j += Half) {
7440 unsigned Idx = WhichResult;
7441 for (unsigned k = 0; k < Half; ++k) {
7442 int MIdx = M[i + j + k];
7443 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7444 return false;
7445 Idx += 2;
7446 }
7447 }
7448 }
7449
7450 if (M.size() == NumElts*2)
7451 WhichResult = 0;
7452
7453 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7454 if (VT.is64BitVector() && EltSz == 32)
7455 return false;
7456
7457 return true;
7458}
7459
7460// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7461// that pairs of elements of the shufflemask represent the same index in each
7462// vector incrementing sequentially through the vectors.
7463// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7464// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7465// v2={e,f,g,h}
7466 // Requires checks similar to those of isVTRNMask with respect to how the
7467 // results are returned.
7468static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7469 unsigned EltSz = VT.getScalarSizeInBits();
7470 if (EltSz == 64)
7471 return false;
7472
7473 unsigned NumElts = VT.getVectorNumElements();
7474 if (M.size() != NumElts && M.size() != NumElts*2)
7475 return false;
7476
7477 for (unsigned i = 0; i < M.size(); i += NumElts) {
7478 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7479 unsigned Idx = WhichResult * NumElts / 2;
7480 for (unsigned j = 0; j < NumElts; j += 2) {
7481 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7482 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7483 return false;
7484 Idx += 1;
7485 }
7486 }
7487
7488 if (M.size() == NumElts*2)
7489 WhichResult = 0;
7490
7491 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7492 if (VT.is64BitVector() && EltSz == 32)
7493 return false;
7494
7495 return true;
7496}
7497
7498/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7499/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7500/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7501static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7502 unsigned EltSz = VT.getScalarSizeInBits();
7503 if (EltSz == 64)
7504 return false;
7505
7506 unsigned NumElts = VT.getVectorNumElements();
7507 if (M.size() != NumElts && M.size() != NumElts*2)
7508 return false;
7509
7510 for (unsigned i = 0; i < M.size(); i += NumElts) {
7511 WhichResult = SelectPairHalf(Elements: NumElts, Mask: M, Index: i);
7512 unsigned Idx = WhichResult * NumElts / 2;
7513 for (unsigned j = 0; j < NumElts; j += 2) {
7514 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7515 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7516 return false;
7517 Idx += 1;
7518 }
7519 }
7520
7521 if (M.size() == NumElts*2)
7522 WhichResult = 0;
7523
7524 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7525 if (VT.is64BitVector() && EltSz == 32)
7526 return false;
7527
7528 return true;
7529}
7530
7531/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7532/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7533static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7534 unsigned &WhichResult,
7535 bool &isV_UNDEF) {
7536 isV_UNDEF = false;
7537 if (isVTRNMask(M: ShuffleMask, VT, WhichResult))
7538 return ARMISD::VTRN;
7539 if (isVUZPMask(M: ShuffleMask, VT, WhichResult))
7540 return ARMISD::VUZP;
7541 if (isVZIPMask(M: ShuffleMask, VT, WhichResult))
7542 return ARMISD::VZIP;
7543
7544 isV_UNDEF = true;
7545 if (isVTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult))
7546 return ARMISD::VTRN;
7547 if (isVUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult))
7548 return ARMISD::VUZP;
7549 if (isVZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult))
7550 return ARMISD::VZIP;
7551
7552 return 0;
7553}
7554
7555 /// \return true if this is a reverse operation on a vector.
7556static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7557 unsigned NumElts = VT.getVectorNumElements();
7558 // Make sure the mask has the right size.
7559 if (NumElts != M.size())
7560 return false;
7561
7562 // Look for <15, ..., 3, -1, 1, 0>.
7563 for (unsigned i = 0; i != NumElts; ++i)
7564 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7565 return false;
7566
7567 return true;
7568}
7569
7570static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7571 unsigned NumElts = VT.getVectorNumElements();
7572 // Make sure the mask has the right size.
7573 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7574 return false;
7575
7576 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7577 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7578 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7579 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7580 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7581 int Ofs = Top ? 1 : 0;
7582 int Upper = SingleSource ? 0 : NumElts;
7583 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7584 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7585 return false;
7586 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7587 return false;
7588 }
7589 return true;
7590}
7591
7592static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7593 unsigned NumElts = VT.getVectorNumElements();
7594 // Make sure the mask has the right size.
7595 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7596 return false;
7597
7598 // If Top
7599 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7600 // This inserts Input2 into Input1
7601 // else if not Top
7602 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7603 // This inserts Input1 into Input2
7604 unsigned Offset = Top ? 0 : 1;
7605 unsigned N = SingleSource ? 0 : NumElts;
7606 for (unsigned i = 0; i < NumElts; i += 2) {
7607 if (M[i] >= 0 && M[i] != (int)i)
7608 return false;
7609 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7610 return false;
7611 }
7612
7613 return true;
7614}
7615
7616static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7617 unsigned NumElts = ToVT.getVectorNumElements();
7618 if (NumElts != M.size())
7619 return false;
7620
7621 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7622 // looking for patterns of:
7623 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7624 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7625
7626 unsigned Off0 = rev ? NumElts / 2 : 0;
7627 unsigned Off1 = rev ? 0 : NumElts / 2;
7628 for (unsigned i = 0; i < NumElts; i += 2) {
7629 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7630 return false;
7631 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7632 return false;
7633 }
7634
7635 return true;
7636}
7637
7638// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7639// from a pair of inputs. For example:
7640// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7641// FP_ROUND(EXTRACT_ELT(Y, 0),
7642// FP_ROUND(EXTRACT_ELT(X, 1),
7643// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7644static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7645 const ARMSubtarget *ST) {
7646 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7647 if (!ST->hasMVEFloatOps())
7648 return SDValue();
7649
7650 SDLoc dl(BV);
7651 EVT VT = BV.getValueType();
7652 if (VT != MVT::v8f16)
7653 return SDValue();
7654
7655 // We are looking for a buildvector of fptrunc elements, where all the
7656 // elements are interleavingly extracted from two sources. Check the first two
7657 // items are valid enough and extract some info from them (they are checked
7658 // properly in the loop below).
7659 if (BV.getOperand(i: 0).getOpcode() != ISD::FP_ROUND ||
7660 BV.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7661 BV.getOperand(i: 0).getOperand(i: 0).getConstantOperandVal(i: 1) != 0)
7662 return SDValue();
7663 if (BV.getOperand(i: 1).getOpcode() != ISD::FP_ROUND ||
7664 BV.getOperand(i: 1).getOperand(i: 0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7665 BV.getOperand(i: 1).getOperand(i: 0).getConstantOperandVal(i: 1) != 0)
7666 return SDValue();
7667 SDValue Op0 = BV.getOperand(i: 0).getOperand(i: 0).getOperand(i: 0);
7668 SDValue Op1 = BV.getOperand(i: 1).getOperand(i: 0).getOperand(i: 0);
7669 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7670 return SDValue();
7671
7672 // Check all the values in the BuildVector line up with our expectations.
7673 for (unsigned i = 1; i < 4; i++) {
7674 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7675 return Trunc.getOpcode() == ISD::FP_ROUND &&
7676 Trunc.getOperand(i: 0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7677 Trunc.getOperand(i: 0).getOperand(i: 0) == Op &&
7678 Trunc.getOperand(i: 0).getConstantOperandVal(i: 1) == Idx;
7679 };
7680 if (!Check(BV.getOperand(i: i * 2 + 0), Op0, i))
7681 return SDValue();
7682 if (!Check(BV.getOperand(i: i * 2 + 1), Op1, i))
7683 return SDValue();
7684 }
7685
7686 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7687 DAG.getConstant(0, dl, MVT::i32));
7688 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7689 DAG.getConstant(1, dl, MVT::i32));
7690}
7691
7692// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7693// from a single input on alternating lanes. For example:
7694 // BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7695 // FP_EXTEND(EXTRACT_ELT(X, 2),
7696 // FP_EXTEND(EXTRACT_ELT(X, 4), ...)
7697static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7698 const ARMSubtarget *ST) {
7699 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7700 if (!ST->hasMVEFloatOps())
7701 return SDValue();
7702
7703 SDLoc dl(BV);
7704 EVT VT = BV.getValueType();
7705 if (VT != MVT::v4f32)
7706 return SDValue();
7707
7708 // We are looking for a buildvector of fpext elements, where all the
7709 // elements are alternating lanes from a single source. For example <0,2,4,6>
7710 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7711 // info from them (they are checked properly in the loop below).
7712 if (BV.getOperand(i: 0).getOpcode() != ISD::FP_EXTEND ||
7713 BV.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7714 return SDValue();
7715 SDValue Op0 = BV.getOperand(i: 0).getOperand(i: 0).getOperand(i: 0);
7716 int Offset = BV.getOperand(i: 0).getOperand(i: 0).getConstantOperandVal(i: 1);
7717 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7718 return SDValue();
7719
7720 // Check all the values in the BuildVector line up with our expectations.
7721 for (unsigned i = 1; i < 4; i++) {
7722 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7723 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7724 Trunc.getOperand(i: 0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7725 Trunc.getOperand(i: 0).getOperand(i: 0) == Op &&
7726 Trunc.getOperand(i: 0).getConstantOperandVal(i: 1) == Idx;
7727 };
7728 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7729 return SDValue();
7730 }
7731
7732 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7733 DAG.getConstant(Offset, dl, MVT::i32));
7734}
7735
7736// If N is an integer constant that can be moved into a register in one
7737// instruction, return an SDValue of such a constant (will become a MOV
7738// instruction). Otherwise return null.
7739static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7740 const ARMSubtarget *ST, const SDLoc &dl) {
7741 uint64_t Val;
7742 if (!isa<ConstantSDNode>(Val: N))
7743 return SDValue();
7744 Val = N->getAsZExtVal();
7745
7746 if (ST->isThumb1Only()) {
7747 if (Val <= 255 || ~Val <= 255)
7748 return DAG.getConstant(Val, dl, MVT::i32);
7749 } else {
7750 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7751 return DAG.getConstant(Val, dl, MVT::i32);
7752 }
7753 return SDValue();
7754}
7755
7756static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7757 const ARMSubtarget *ST) {
7758 SDLoc dl(Op);
7759 EVT VT = Op.getValueType();
7760
7761 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7762
7763 unsigned NumElts = VT.getVectorNumElements();
7764 unsigned BoolMask;
7765 unsigned BitsPerBool;
7766 if (NumElts == 2) {
7767 BitsPerBool = 8;
7768 BoolMask = 0xff;
7769 } else if (NumElts == 4) {
7770 BitsPerBool = 4;
7771 BoolMask = 0xf;
7772 } else if (NumElts == 8) {
7773 BitsPerBool = 2;
7774 BoolMask = 0x3;
7775 } else if (NumElts == 16) {
7776 BitsPerBool = 1;
7777 BoolMask = 0x1;
7778 } else
7779 return SDValue();
7780
7781 // If this is a single value copied into all lanes (a splat), we can just sign
7782 // extend that single value
7783 SDValue FirstOp = Op.getOperand(i: 0);
7784 if (!isa<ConstantSDNode>(Val: FirstOp) &&
7785 llvm::all_of(Range: llvm::drop_begin(RangeOrContainer: Op->ops()), P: [&FirstOp](const SDUse &U) {
7786 return U.get().isUndef() || U.get() == FirstOp;
7787 })) {
7788 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7789 DAG.getValueType(MVT::i1));
7790 return DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: dl, VT: Op.getValueType(), Operand: Ext);
7791 }
7792
7793 // First create base with bits set where known
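  // For illustration: for v4i1 (BitsPerBool == 4, BoolMask == 0xf), constant
  // lanes <1, 0, 1, 1> give Bits32 = 0xff0f.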
7794 unsigned Bits32 = 0;
7795 for (unsigned i = 0; i < NumElts; ++i) {
7796 SDValue V = Op.getOperand(i);
7797 if (!isa<ConstantSDNode>(Val: V) && !V.isUndef())
7798 continue;
7799 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7800 if (BitSet)
7801 Bits32 |= BoolMask << (i * BitsPerBool);
7802 }
7803
7804 // Add in unknown nodes
7805 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7806 DAG.getConstant(Bits32, dl, MVT::i32));
7807 for (unsigned i = 0; i < NumElts; ++i) {
7808 SDValue V = Op.getOperand(i);
7809 if (isa<ConstantSDNode>(Val: V) || V.isUndef())
7810 continue;
7811 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7812 DAG.getConstant(i, dl, MVT::i32));
7813 }
7814
7815 return Base;
7816}
7817
7818static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7819 const ARMSubtarget *ST) {
7820 if (!ST->hasMVEIntegerOps())
7821 return SDValue();
7822
7823 // We are looking for a buildvector where each element is Op[0] + i*N
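  // For example, <x, x+2, x+4, x+6, x+8, x+10, x+12, x+14> for v8i16 becomes
  // a VIDUP of x with step 2.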
7824 EVT VT = Op.getValueType();
7825 SDValue Op0 = Op.getOperand(i: 0);
7826 unsigned NumElts = VT.getVectorNumElements();
7827
7828 // Get the increment value from operand 1
7829 SDValue Op1 = Op.getOperand(i: 1);
7830 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(i: 0) != Op0 ||
7831 !isa<ConstantSDNode>(Val: Op1.getOperand(i: 1)))
7832 return SDValue();
7833 unsigned N = Op1.getConstantOperandVal(i: 1);
7834 if (N != 1 && N != 2 && N != 4 && N != 8)
7835 return SDValue();
7836
7837 // Check that each other operand matches
7838 for (unsigned I = 2; I < NumElts; I++) {
7839 SDValue OpI = Op.getOperand(i: I);
7840 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(i: 0) != Op0 ||
7841 !isa<ConstantSDNode>(Val: OpI.getOperand(i: 1)) ||
7842 OpI.getConstantOperandVal(i: 1) != I * N)
7843 return SDValue();
7844 }
7845
7846 SDLoc DL(Op);
7847 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7848 DAG.getConstant(N, DL, MVT::i32));
7849}
7850
7851// Returns true if the operation N can be treated as qr instruction variant at
7852// operand Op.
7853static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7854 switch (N->getOpcode()) {
7855 case ISD::ADD:
7856 case ISD::MUL:
7857 case ISD::SADDSAT:
7858 case ISD::UADDSAT:
7859 return true;
7860 case ISD::SUB:
7861 case ISD::SSUBSAT:
7862 case ISD::USUBSAT:
7863 return N->getOperand(Num: 1).getNode() == Op;
7864 case ISD::INTRINSIC_WO_CHAIN:
7865 switch (N->getConstantOperandVal(Num: 0)) {
7866 case Intrinsic::arm_mve_add_predicated:
7867 case Intrinsic::arm_mve_mul_predicated:
7868 case Intrinsic::arm_mve_qadd_predicated:
7869 case Intrinsic::arm_mve_vhadd:
7870 case Intrinsic::arm_mve_hadd_predicated:
7871 case Intrinsic::arm_mve_vqdmulh:
7872 case Intrinsic::arm_mve_qdmulh_predicated:
7873 case Intrinsic::arm_mve_vqrdmulh:
7874 case Intrinsic::arm_mve_qrdmulh_predicated:
7875 case Intrinsic::arm_mve_vqdmull:
7876 case Intrinsic::arm_mve_vqdmull_predicated:
7877 return true;
7878 case Intrinsic::arm_mve_sub_predicated:
7879 case Intrinsic::arm_mve_qsub_predicated:
7880 case Intrinsic::arm_mve_vhsub:
7881 case Intrinsic::arm_mve_hsub_predicated:
7882 return N->getOperand(Num: 2).getNode() == Op;
7883 default:
7884 return false;
7885 }
7886 default:
7887 return false;
7888 }
7889}
7890
7891// If this is a case we can't handle, return null and let the default
7892// expansion code take care of it.
7893SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7894 const ARMSubtarget *ST) const {
7895 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
7896 SDLoc dl(Op);
7897 EVT VT = Op.getValueType();
7898
7899 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7900 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7901
7902 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7903 return R;
7904
7905 APInt SplatBits, SplatUndef;
7906 unsigned SplatBitSize;
7907 bool HasAnyUndefs;
7908 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7909 if (SplatUndef.isAllOnes())
7910 return DAG.getUNDEF(VT);
7911
7912 // If all the users of this constant splat are qr instruction variants,
7913 // generate a vdup of the constant.
7914 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7915 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7916 all_of(Range: BVN->uses(),
7917 P: [BVN](const SDNode *U) { return IsQRMVEInstruction(N: U, Op: BVN); })) {
7918 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7919 : SplatBitSize == 16 ? MVT::v8i16
7920 : MVT::v16i8;
7921 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7922 SDValue VDup = DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT: DupVT, Operand: Const);
7923 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT, Operand: VDup);
7924 }
7925
7926 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7927 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7928 // Check if an immediate VMOV works.
7929 EVT VmovVT;
7930 SDValue Val =
7931 isVMOVModifiedImm(SplatBits: SplatBits.getZExtValue(), SplatUndef: SplatUndef.getZExtValue(),
7932 SplatBitSize, DAG, dl, VT&: VmovVT, VectorVT: VT, type: VMOVModImm);
7933
7934 if (Val.getNode()) {
7935 SDValue Vmov = DAG.getNode(Opcode: ARMISD::VMOVIMM, DL: dl, VT: VmovVT, Operand: Val);
7936 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vmov);
7937 }
7938
7939 // Try an immediate VMVN.
7940 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7941 Val = isVMOVModifiedImm(
7942 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7943 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7944 if (Val.getNode()) {
7945 SDValue Vmov = DAG.getNode(Opcode: ARMISD::VMVNIMM, DL: dl, VT: VmovVT, Operand: Val);
7946 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vmov);
7947 }
7948
7949 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7950 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7951 int ImmVal = ARM_AM::getFP32Imm(Imm: SplatBits);
7952 if (ImmVal != -1) {
7953 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7954 return DAG.getNode(Opcode: ARMISD::VMOVFPIMM, DL: dl, VT, Operand: Val);
7955 }
7956 }
7957
7958 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7959 // type.
7960 if (ST->hasMVEIntegerOps() &&
7961 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7962 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7963 : SplatBitSize == 16 ? MVT::v8i16
7964 : MVT::v16i8;
7965 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7966 SDValue VDup = DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT: DupVT, Operand: Const);
7967 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT, Operand: VDup);
7968 }
7969 }
7970 }
7971
7972 // Scan through the operands to see if only one value is used.
7973 //
7974 // As an optimisation, even if more than one value is used it may be more
7975 // profitable to splat with one value then change some lanes.
7976 //
7977 // Heuristically we decide to do this if the vector has a "dominant" value,
7978 // defined as splatted to more than half of the lanes.
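  // For example, <a, a, a, b> has dominant value 'a': it is lowered as a
  // VDUP of 'a' followed by a single INSERT_VECTOR_ELT of 'b' into lane 3.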
7979 unsigned NumElts = VT.getVectorNumElements();
7980 bool isOnlyLowElement = true;
7981 bool usesOnlyOneValue = true;
7982 bool hasDominantValue = false;
7983 bool isConstant = true;
7984
7985 // Map of the number of times a particular SDValue appears in the
7986 // element list.
7987 DenseMap<SDValue, unsigned> ValueCounts;
7988 SDValue Value;
7989 for (unsigned i = 0; i < NumElts; ++i) {
7990 SDValue V = Op.getOperand(i);
7991 if (V.isUndef())
7992 continue;
7993 if (i > 0)
7994 isOnlyLowElement = false;
7995 if (!isa<ConstantFPSDNode>(Val: V) && !isa<ConstantSDNode>(Val: V))
7996 isConstant = false;
7997
7998 ValueCounts.insert(KV: std::make_pair(x&: V, y: 0));
7999 unsigned &Count = ValueCounts[V];
8000
8001 // Is this value dominant? (takes up more than half of the lanes)
8002 if (++Count > (NumElts / 2)) {
8003 hasDominantValue = true;
8004 Value = V;
8005 }
8006 }
8007 if (ValueCounts.size() != 1)
8008 usesOnlyOneValue = false;
8009 if (!Value.getNode() && !ValueCounts.empty())
8010 Value = ValueCounts.begin()->first;
8011
8012 if (ValueCounts.empty())
8013 return DAG.getUNDEF(VT);
8014
8015 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
8016 // Keep going if we are hitting this case.
8017 if (isOnlyLowElement && !ISD::isNormalLoad(N: Value.getNode()))
8018 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Value);
8019
8020 unsigned EltSize = VT.getScalarSizeInBits();
8021
8022 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
8023 // i32 and try again.
8024 if (hasDominantValue && EltSize <= 32) {
8025 if (!isConstant) {
8026 SDValue N;
8027
8028 // If we are VDUPing a value that comes directly from a vector, that will
8029 // cause an unnecessary move to and from a GPR, where instead we could
8030 // just use VDUPLANE. We can only do this if the lane being extracted
8031 // is at a constant index, as the VDUP from lane instructions only have
8032 // constant-index forms.
8033 ConstantSDNode *constIndex;
8034 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8035 (constIndex = dyn_cast<ConstantSDNode>(Val: Value->getOperand(Num: 1)))) {
8036 // We need to create a new undef vector to use for the VDUPLANE if the
8037 // size of the vector from which we get the value is different than the
8038 // size of the vector that we need to create. We will insert the element
8039 // such that the register coalescer will remove unnecessary copies.
8040 if (VT != Value->getOperand(Num: 0).getValueType()) {
8041 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
8042 VT.getVectorNumElements();
8043 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8044 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
8045 Value, DAG.getConstant(index, dl, MVT::i32)),
8046 DAG.getConstant(index, dl, MVT::i32));
8047 } else
8048 N = DAG.getNode(Opcode: ARMISD::VDUPLANE, DL: dl, VT,
8049 N1: Value->getOperand(Num: 0), N2: Value->getOperand(Num: 1));
8050 } else
8051 N = DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT, Operand: Value);
8052
8053 if (!usesOnlyOneValue) {
8054 // The dominant value was splatted as 'N', but we now have to insert
8055 // all differing elements.
8056 for (unsigned I = 0; I < NumElts; ++I) {
8057 if (Op.getOperand(i: I) == Value)
8058 continue;
8059 SmallVector<SDValue, 3> Ops;
8060 Ops.push_back(Elt: N);
8061 Ops.push_back(Elt: Op.getOperand(i: I));
8062 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
8063 N = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, Ops);
8064 }
8065 }
8066 return N;
8067 }
8068 if (VT.getVectorElementType().isFloatingPoint()) {
8069 SmallVector<SDValue, 8> Ops;
8070 MVT FVT = VT.getVectorElementType().getSimpleVT();
8071 assert(FVT == MVT::f32 || FVT == MVT::f16);
8072 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
8073 for (unsigned i = 0; i < NumElts; ++i)
8074 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IVT,
8075 Operand: Op.getOperand(i)));
8076 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: IVT, NumElements: NumElts);
8077 SDValue Val = DAG.getBuildVector(VT: VecVT, DL: dl, Ops);
8078 Val = LowerBUILD_VECTOR(Op: Val, DAG, ST);
8079 if (Val.getNode())
8080 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
8081 }
8082 if (usesOnlyOneValue) {
8083 SDValue Val = IsSingleInstrConstant(N: Value, DAG, ST, dl);
8084 if (isConstant && Val.getNode())
8085 return DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT, Operand: Val);
8086 }
8087 }
8088
8089 // If all elements are constants and the case above didn't get hit, fall back
8090 // to the default expansion, which will generate a load from the constant
8091 // pool.
8092 if (isConstant)
8093 return SDValue();
8094
8095 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
8096 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
8097 // length <= 2.
8098 if (NumElts >= 4)
8099 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
8100 return shuffle;
8101
8102 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
8103 // VCVT's
8104 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(BV: Op, DAG, ST: Subtarget))
8105 return VCVT;
8106 if (SDValue VCVT = LowerBuildVectorOfFPExt(BV: Op, DAG, ST: Subtarget))
8107 return VCVT;
8108
8109 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
8110 // If we haven't found an efficient lowering, try splitting a 128-bit vector
8111 // into two 64-bit vectors; we might discover a better way to lower it.
8112 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
8113 EVT ExtVT = VT.getVectorElementType();
8114 EVT HVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: ExtVT, NumElements: NumElts / 2);
8115 SDValue Lower = DAG.getBuildVector(VT: HVT, DL: dl, Ops: ArrayRef(&Ops[0], NumElts / 2));
8116 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
8117 Lower = LowerBUILD_VECTOR(Op: Lower, DAG, ST);
8118 SDValue Upper =
8119 DAG.getBuildVector(VT: HVT, DL: dl, Ops: ArrayRef(&Ops[NumElts / 2], NumElts / 2));
8120 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
8121 Upper = LowerBUILD_VECTOR(Op: Upper, DAG, ST);
8122 if (Lower && Upper)
8123 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: Lower, N2: Upper);
8124 }
8125
8126 // Vectors with 32- or 64-bit elements can be built by directly assigning
8127 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
8128 // will be legalized.
8129 if (EltSize >= 32) {
8130 // Do the expansion with floating-point types, since that is what the VFP
8131 // registers are defined to use, and since i64 is not legal.
8132 EVT EltVT = EVT::getFloatingPointVT(BitWidth: EltSize);
8133 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts);
8134 SmallVector<SDValue, 8> Ops;
8135 for (unsigned i = 0; i < NumElts; ++i)
8136 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: EltVT, Operand: Op.getOperand(i)));
8137 SDValue Val = DAG.getNode(Opcode: ARMISD::BUILD_VECTOR, DL: dl, VT: VecVT, Ops);
8138 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
8139 }
8140
8141 // If all else fails, just use a sequence of INSERT_VECTOR_ELTs when we
8142 // know the default expansion would otherwise fall back on something even
8143 // worse. For a vector with one or two non-undef values the default is
8144 // scalar_to_vector for the elements followed by a shuffle (provided the
8145 // shuffle is valid for the target); for everything else it is
8146 // materialization element by element on the stack followed by a load.
8147 if (!isConstant && !usesOnlyOneValue) {
8148 SDValue Vec = DAG.getUNDEF(VT);
8149 for (unsigned i = 0 ; i < NumElts; ++i) {
8150 SDValue V = Op.getOperand(i);
8151 if (V.isUndef())
8152 continue;
8153 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
8154 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Vec, N2: V, N3: LaneIdx);
8155 }
8156 return Vec;
8157 }
8158
8159 return SDValue();
8160}
8161
8162// Gather data to see if the operation can be modelled as a
8163// shuffle in combination with VEXTs.
8164SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
8165 SelectionDAG &DAG) const {
8166 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
8167 SDLoc dl(Op);
8168 EVT VT = Op.getValueType();
8169 unsigned NumElts = VT.getVectorNumElements();
8170
8171 struct ShuffleSourceInfo {
8172 SDValue Vec;
8173 unsigned MinElt = std::numeric_limits<unsigned>::max();
8174 unsigned MaxElt = 0;
8175
8176 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
8177 // be compatible with the shuffle we intend to construct. As a result
8178 // ShuffleVec will be some sliding window into the original Vec.
8179 SDValue ShuffleVec;
8180
8181 // Code should guarantee that element i in Vec starts at element "WindowBase
8182 // + i * WindowScale" in ShuffleVec.
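// For example, if Vec has i16 elements but the shuffle is built from i8
// lanes, WindowScale is 2 and element i of Vec starts at shuffle lane
// WindowBase + 2 * i (covering two i8 lanes).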
8183 int WindowBase = 0;
8184 int WindowScale = 1;
8185
8186 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
8187
8188 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
8189 };
8190
8191 // First gather all vectors used as an immediate source for this BUILD_VECTOR
8192 // node.
8193 SmallVector<ShuffleSourceInfo, 2> Sources;
8194 for (unsigned i = 0; i < NumElts; ++i) {
8195 SDValue V = Op.getOperand(i);
8196 if (V.isUndef())
8197 continue;
8198 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
8199 // A shuffle can only come from building a vector from various
8200 // elements of other vectors.
8201 return SDValue();
8202 } else if (!isa<ConstantSDNode>(Val: V.getOperand(i: 1))) {
8203 // Furthermore, shuffles require a constant mask, whereas extractelts
8204 // accept variable indices.
8205 return SDValue();
8206 }
8207
8208 // Add this element source to the list if it's not already there.
8209 SDValue SourceVec = V.getOperand(i: 0);
8210 auto Source = llvm::find(Range&: Sources, Val: SourceVec);
8211 if (Source == Sources.end())
8212 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
8213
8214 // Update the minimum and maximum lane number seen.
8215 unsigned EltNo = V.getConstantOperandVal(i: 1);
8216 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
8217 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
8218 }
8219
8220 // Currently only do something sane when at most two source vectors
8221 // are involved.
8222 if (Sources.size() > 2)
8223 return SDValue();
8224
8225 // Find out the smallest element size among result and two sources, and use
8226 // it as element size to build the shuffle_vector.
8227 EVT SmallestEltTy = VT.getVectorElementType();
8228 for (auto &Source : Sources) {
8229 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8230 if (SrcEltTy.bitsLT(VT: SmallestEltTy))
8231 SmallestEltTy = SrcEltTy;
8232 }
8233 unsigned ResMultiplier =
8234 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8235 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8236 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
8237
8238 // If the source vector is too wide or too narrow, we may nevertheless be able
8239 // to construct a compatible shuffle either by concatenating it with UNDEF or
8240 // extracting a suitable range of elements.
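// For example, a v4i16 source feeding a v8i16 result is padded with UNDEF
// via CONCAT_VECTORS, while a v16i16 source is narrowed with
// EXTRACT_SUBVECTOR, or with a VEXT if the used lanes straddle both halves.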
8241 for (auto &Src : Sources) {
8242 EVT SrcVT = Src.ShuffleVec.getValueType();
8243
8244 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8245 uint64_t VTSize = VT.getFixedSizeInBits();
8246 if (SrcVTSize == VTSize)
8247 continue;
8248
8249 // This stage of the search produces a source with the same element type as
8250 // the original, but with a total width matching the BUILD_VECTOR output.
8251 EVT EltVT = SrcVT.getVectorElementType();
8252 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8253 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
8254
8255 if (SrcVTSize < VTSize) {
8256 if (2 * SrcVTSize != VTSize)
8257 return SDValue();
8258 // We can pad out the smaller vector for free, so if it's part of a
8259 // shuffle...
8260 Src.ShuffleVec =
8261 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: DestVT, N1: Src.ShuffleVec,
8262 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
8263 continue;
8264 }
8265
8266 if (SrcVTSize != 2 * VTSize)
8267 return SDValue();
8268
8269 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8270 // Span too large for a VEXT to cope
8271 return SDValue();
8272 }
8273
8274 if (Src.MinElt >= NumSrcElts) {
8275 // The extraction can just take the second half
8276 Src.ShuffleVec =
8277 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8278 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8279 Src.WindowBase = -NumSrcElts;
8280 } else if (Src.MaxElt < NumSrcElts) {
8281 // The extraction can just take the first half
8282 Src.ShuffleVec =
8283 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8284 DAG.getConstant(0, dl, MVT::i32));
8285 } else {
8286 // An actual VEXT is needed
8287 SDValue VEXTSrc1 =
8288 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8289 DAG.getConstant(0, dl, MVT::i32));
8290 SDValue VEXTSrc2 =
8291 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8292 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8293
8294 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8295 VEXTSrc2,
8296 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8297 Src.WindowBase = -Src.MinElt;
8298 }
8299 }
8300
8301 // Another possible incompatibility occurs from the vector element types. We
8302 // can fix this by bitcasting the source vectors to the same type we intend
8303 // for the shuffle.
8304 for (auto &Src : Sources) {
8305 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8306 if (SrcEltTy == SmallestEltTy)
8307 continue;
8308 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8309 Src.ShuffleVec = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec);
8310 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8311 Src.WindowBase *= Src.WindowScale;
8312 }
8313
8314 // Final check before we try to actually produce a shuffle.
8315 LLVM_DEBUG(for (auto Src
8316 : Sources)
8317 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
8318
8319 // The stars all align, our next step is to produce the mask for the shuffle.
8320 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8321 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8322 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8323 SDValue Entry = Op.getOperand(i);
8324 if (Entry.isUndef())
8325 continue;
8326
8327 auto Src = llvm::find(Range&: Sources, Val: Entry.getOperand(i: 0));
8328 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
8329
8330 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8331 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8332 // segment.
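// For example, an i8 lane extracted into an i32 BUILD_VECTOR element only
// defines min(8, 32) = 8 bits, i.e. a single lane of an i8-typed shuffle.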
8333 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
8334 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
8335 b: VT.getScalarSizeInBits());
8336 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8337
8338 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8339 // starting at the appropriate offset.
8340 int *LaneMask = &Mask[i * ResMultiplier];
8341
8342 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8343 ExtractBase += NumElts * (Src - Sources.begin());
8344 for (int j = 0; j < LanesDefined; ++j)
8345 LaneMask[j] = ExtractBase + j;
8346 }
8347
8349 // We can't handle more than two sources. This should have already
8350 // been checked before this point.
8351 assert(Sources.size() <= 2 && "Too many sources!");
8352
8353 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
8354 for (unsigned i = 0; i < Sources.size(); ++i)
8355 ShuffleOps[i] = Sources[i].ShuffleVec;
8356
8357 SDValue Shuffle = buildLegalVectorShuffle(VT: ShuffleVT, DL: dl, N0: ShuffleOps[0],
8358 N1: ShuffleOps[1], Mask, DAG);
8359 if (!Shuffle)
8360 return SDValue();
8361 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT, Operand: Shuffle);
8362}
8363
8364enum ShuffleOpCodes {
8365 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8366 OP_VREV,
8367 OP_VDUP0,
8368 OP_VDUP1,
8369 OP_VDUP2,
8370 OP_VDUP3,
8371 OP_VEXT1,
8372 OP_VEXT2,
8373 OP_VEXT3,
8374 OP_VUZPL, // VUZP, left result
8375 OP_VUZPR, // VUZP, right result
8376 OP_VZIPL, // VZIP, left result
8377 OP_VZIPR, // VZIP, right result
8378 OP_VTRNL, // VTRN, left result
8379 OP_VTRNR // VTRN, right result
8380};
8381
8382static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8383 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8384 switch (OpNum) {
8385 case OP_COPY:
8386 case OP_VREV:
8387 case OP_VDUP0:
8388 case OP_VDUP1:
8389 case OP_VDUP2:
8390 case OP_VDUP3:
8391 return true;
8392 }
8393 return false;
8394}
8395
8396/// isShuffleMaskLegal - Targets can use this to indicate that they only
8397/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8398/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8399/// are assumed to be legal.
8400bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8401 if (VT.getVectorNumElements() == 4 &&
8402 (VT.is128BitVector() || VT.is64BitVector())) {
8403 unsigned PFIndexes[4];
8404 for (unsigned i = 0; i != 4; ++i) {
8405 if (M[i] < 0)
8406 PFIndexes[i] = 8;
8407 else
8408 PFIndexes[i] = M[i];
8409 }
8410
8411 // Compute the index in the perfect shuffle table.
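// Each mask entry acts as a base-9 digit (0-7 for a lane, 8 for undef), so
// e.g. the mask <0, u, 2, 3> maps to index ((0*9 + 8)*9 + 2)*9 + 3 = 669.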
8412 unsigned PFTableIndex =
8413 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8414 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8415 unsigned Cost = (PFEntry >> 30);
8416
8417 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8418 return true;
8419 }
8420
8421 bool ReverseVEXT, isV_UNDEF;
8422 unsigned Imm, WhichResult;
8423
8424 unsigned EltSize = VT.getScalarSizeInBits();
8425 if (EltSize >= 32 ||
8426 ShuffleVectorSDNode::isSplatMask(Mask: &M[0], VT) ||
8427 ShuffleVectorInst::isIdentityMask(Mask: M, NumSrcElts: M.size()) ||
8428 isVREVMask(M, VT, BlockSize: 64) ||
8429 isVREVMask(M, VT, BlockSize: 32) ||
8430 isVREVMask(M, VT, BlockSize: 16))
8431 return true;
8432 else if (Subtarget->hasNEON() &&
8433 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8434 isVTBLMask(M, VT) ||
8435 isNEONTwoResultShuffleMask(ShuffleMask: M, VT, WhichResult, isV_UNDEF)))
8436 return true;
8437 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8438 isReverseMask(M, VT))
8439 return true;
8440 else if (Subtarget->hasMVEIntegerOps() &&
8441 (isVMOVNMask(M, VT, Top: true, SingleSource: false) ||
8442 isVMOVNMask(M, VT, Top: false, SingleSource: false) || isVMOVNMask(M, VT, Top: true, SingleSource: true)))
8443 return true;
8444 else if (Subtarget->hasMVEIntegerOps() &&
8445 (isTruncMask(M, VT, Top: false, SingleSource: false) ||
8446 isTruncMask(M, VT, Top: false, SingleSource: true) ||
8447 isTruncMask(M, VT, Top: true, SingleSource: false) || isTruncMask(M, VT, Top: true, SingleSource: true)))
8448 return true;
8449 else
8450 return false;
8451}
8452
8453/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8454/// the specified operations to build the shuffle.
8455static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8456 SDValue RHS, SelectionDAG &DAG,
8457 const SDLoc &dl) {
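// Each perfect-shuffle table entry packs a 2-bit cost, a 4-bit opcode and two
// 13-bit operand IDs: [Cost:31-30][OpNum:29-26][LHSID:25-13][RHSID:12-0].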
8458 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8459 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8460 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8461
8462 if (OpNum == OP_COPY) {
8463 if (LHSID == (1*9+2)*9+3) return LHS;
8464 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8465 return RHS;
8466 }
8467
8468 SDValue OpLHS, OpRHS;
8469 OpLHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8470 OpRHS = GeneratePerfectShuffle(PFEntry: PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8471 EVT VT = OpLHS.getValueType();
8472
8473 switch (OpNum) {
8474 default: llvm_unreachable("Unknown shuffle opcode!");
8475 case OP_VREV:
8476 // VREV divides the vector in half and swaps within the half.
8477 if (VT.getScalarSizeInBits() == 32)
8478 return DAG.getNode(Opcode: ARMISD::VREV64, DL: dl, VT, Operand: OpLHS);
8479 // vrev <4 x i16> -> VREV32
8480 if (VT.getScalarSizeInBits() == 16)
8481 return DAG.getNode(Opcode: ARMISD::VREV32, DL: dl, VT, Operand: OpLHS);
8482 // vrev <4 x i8> -> VREV16
8483 assert(VT.getScalarSizeInBits() == 8);
8484 return DAG.getNode(Opcode: ARMISD::VREV16, DL: dl, VT, Operand: OpLHS);
8485 case OP_VDUP0:
8486 case OP_VDUP1:
8487 case OP_VDUP2:
8488 case OP_VDUP3:
8489 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8490 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8491 case OP_VEXT1:
8492 case OP_VEXT2:
8493 case OP_VEXT3:
8494 return DAG.getNode(ARMISD::VEXT, dl, VT,
8495 OpLHS, OpRHS,
8496 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8497 case OP_VUZPL:
8498 case OP_VUZPR:
8499 return DAG.getNode(Opcode: ARMISD::VUZP, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT),
8500 N1: OpLHS, N2: OpRHS).getValue(R: OpNum-OP_VUZPL);
8501 case OP_VZIPL:
8502 case OP_VZIPR:
8503 return DAG.getNode(Opcode: ARMISD::VZIP, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT),
8504 N1: OpLHS, N2: OpRHS).getValue(R: OpNum-OP_VZIPL);
8505 case OP_VTRNL:
8506 case OP_VTRNR:
8507 return DAG.getNode(Opcode: ARMISD::VTRN, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT),
8508 N1: OpLHS, N2: OpRHS).getValue(R: OpNum-OP_VTRNL);
8509 }
8510}
8511
8512static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8513 ArrayRef<int> ShuffleMask,
8514 SelectionDAG &DAG) {
8515 // Check to see if we can use the VTBL instruction.
8516 SDValue V1 = Op.getOperand(i: 0);
8517 SDValue V2 = Op.getOperand(i: 1);
8518 SDLoc DL(Op);
8519
8520 SmallVector<SDValue, 8> VTBLMask;
8521 for (int I : ShuffleMask)
8522 VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32));
8523
8524 if (V2.getNode()->isUndef())
8525 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8526 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8527
8528 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8529 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8530}
8531
8532static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8533 SDLoc DL(Op);
8534 EVT VT = Op.getValueType();
8535
8536 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8537 "Expect an v8i16/v16i8 type");
8538 SDValue OpLHS = DAG.getNode(Opcode: ARMISD::VREV64, DL, VT, Operand: Op.getOperand(i: 0));
8539 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8540 // extract the first 8 bytes into the top double word and the last 8 bytes
8541 // into the bottom double word, through a new vector shuffle that will be
8542 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8543 std::vector<int> NewMask;
8544 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8545 NewMask.push_back(x: VT.getVectorNumElements() / 2 + i);
8546 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8547 NewMask.push_back(x: i);
8548 return DAG.getVectorShuffle(VT, dl: DL, N1: OpLHS, N2: OpLHS, Mask: NewMask);
8549}
8550
8551static EVT getVectorTyFromPredicateVector(EVT VT) {
8552 switch (VT.getSimpleVT().SimpleTy) {
8553 case MVT::v2i1:
8554 return MVT::v2f64;
8555 case MVT::v4i1:
8556 return MVT::v4i32;
8557 case MVT::v8i1:
8558 return MVT::v8i16;
8559 case MVT::v16i1:
8560 return MVT::v16i8;
8561 default:
8562 llvm_unreachable("Unexpected vector predicate type");
8563 }
8564}
8565
8566static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8567 SelectionDAG &DAG) {
8568 // Converting from boolean predicates to integers involves creating a vector
8569 // of all ones or all zeroes and selecting the lanes based upon the real
8570 // predicate.
8571 SDValue AllOnes =
8572 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8573 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8574
8575 SDValue AllZeroes =
8576 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8577 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8578
8579 // Get full vector type from predicate type
8580 EVT NewVT = getVectorTyFromPredicateVector(VT);
8581
8582 SDValue RecastV1;
8583 // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
8584 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8585 // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
8586 // since we know in hardware the sizes are really the same.
8587 if (VT != MVT::v16i1)
8588 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8589 else
8590 RecastV1 = Pred;
8591
8592 // Select either all ones or zeroes depending upon the real predicate bits.
8593 SDValue PredAsVector =
8594 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8595
8596 // Recast our new predicate-as-integer v16i8 vector into something
8597 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8598 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: NewVT, Operand: PredAsVector);
8599}
8600
8601static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8602 const ARMSubtarget *ST) {
8603 EVT VT = Op.getValueType();
8604 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
8605 ArrayRef<int> ShuffleMask = SVN->getMask();
8606
8607 assert(ST->hasMVEIntegerOps() &&
8608 "No support for vector shuffle of boolean predicates");
8609
8610 SDValue V1 = Op.getOperand(i: 0);
8611 SDValue V2 = Op.getOperand(i: 1);
8612 SDLoc dl(Op);
8613 if (isReverseMask(M: ShuffleMask, VT)) {
8614 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8615 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8616 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8617 DAG.getConstant(16, dl, MVT::i32));
8618 return DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: dl, VT, Operand: srl);
8619 }
8620
8621 // Until we can come up with optimised cases for every single vector
8622 // shuffle in existence we have chosen the least painful strategy. This is
8623 // to essentially promote the boolean predicate to an 8-bit integer, where
8624 // each predicate represents a byte. Then we fall back on a normal integer
8625 // vector shuffle and convert the result back into a predicate vector. In
8626 // many cases the generated code might be even better than scalar code
8627 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8628 // fields in a register into 8 other arbitrary 2-bit fields!
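// For example, a v8i1 operand is promoted to a v8i16 whose lanes are either
// all-ones or all-zero, the shuffle is performed as a normal v8i16 shuffle,
// and the result is compared against zero (VCMPZ NE) to re-form the v8i1
// predicate.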
8629 SDValue PredAsVector1 = PromoteMVEPredVector(dl, Pred: V1, VT, DAG);
8630 EVT NewVT = PredAsVector1.getValueType();
8631 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(VT: NewVT)
8632 : PromoteMVEPredVector(dl, Pred: V2, VT, DAG);
8633 assert(PredAsVector2.getValueType() == NewVT &&
8634 "Expected identical vector type in expanded i1 shuffle!");
8635
8636 // Do the shuffle!
8637 SDValue Shuffled = DAG.getVectorShuffle(VT: NewVT, dl, N1: PredAsVector1,
8638 N2: PredAsVector2, Mask: ShuffleMask);
8639
8640 // Now return the result of comparing the shuffled vector with zero,
8641 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8642 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8643 if (VT == MVT::v2i1) {
8644 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8645 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8646 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8647 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8648 }
8649 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8650 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8651}
8652
8653static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8654 ArrayRef<int> ShuffleMask,
8655 SelectionDAG &DAG) {
8656 // Attempt to lower the vector shuffle using as many whole register movs as
8657 // possible. This is useful for types smaller than 32 bits, which would
8658 // often otherwise become a series of GPR movs.
8659 SDLoc dl(Op);
8660 EVT VT = Op.getValueType();
8661 if (VT.getScalarSizeInBits() >= 32)
8662 return SDValue();
8663
8664 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8665 "Unexpected vector type");
8666 int NumElts = VT.getVectorNumElements();
8667 int QuarterSize = NumElts / 4;
8668 // The four final parts of the vector, as i32's
8669 SDValue Parts[4];
8670
8671 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8672 // <u,u,u,u>), returning the vmov lane index
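// For example, with a v16i8 shuffle a first-quarter mask of <4,5,6,7>
// corresponds to whole 32-bit lane 1 of the first input, so that quarter can
// be produced by a single lane mov.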
8673 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8674 // Detect which mov lane this would be from the first non-undef element.
8675 int MovIdx = -1;
8676 for (int i = 0; i < Length; i++) {
8677 if (ShuffleMask[Start + i] >= 0) {
8678 if (ShuffleMask[Start + i] % Length != i)
8679 return -1;
8680 MovIdx = ShuffleMask[Start + i] / Length;
8681 break;
8682 }
8683 }
8684 // If all items are undef, leave this for other combines
8685 if (MovIdx == -1)
8686 return -1;
8687 // Check the remaining values are the correct part of the same mov
8688 for (int i = 1; i < Length; i++) {
8689 if (ShuffleMask[Start + i] >= 0 &&
8690 (ShuffleMask[Start + i] / Length != MovIdx ||
8691 ShuffleMask[Start + i] % Length != i))
8692 return -1;
8693 }
8694 return MovIdx;
8695 };
8696
8697 for (int Part = 0; Part < 4; ++Part) {
8698 // Does this part look like a mov
8699 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8700 if (Elt != -1) {
8701 SDValue Input = Op->getOperand(Num: 0);
8702 if (Elt >= 4) {
8703 Input = Op->getOperand(Num: 1);
8704 Elt -= 4;
8705 }
8706 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8707 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8708 DAG.getConstant(Elt, dl, MVT::i32));
8709 }
8710 }
8711
8712 // Nothing interesting found, just return
8713 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8714 return SDValue();
8715
8716 // The other parts need to be built with the old shuffle vector, cast to a
8717 // v4i32 and extract_vector_elts
8718 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8719 SmallVector<int, 16> NewShuffleMask;
8720 for (int Part = 0; Part < 4; ++Part)
8721 for (int i = 0; i < QuarterSize; i++)
8722 NewShuffleMask.push_back(
8723 Elt: Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8724 SDValue NewShuffle = DAG.getVectorShuffle(
8725 VT, dl, N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1), Mask: NewShuffleMask);
8726 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8727
8728 for (int Part = 0; Part < 4; ++Part)
8729 if (!Parts[Part])
8730 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8731 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8732 }
8733 // Build a vector out of the various parts and bitcast it back to the original
8734 // type.
8735 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8736 return DAG.getBitcast(VT, V: NewVec);
8737}
8738
8739static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8740 ArrayRef<int> ShuffleMask,
8741 SelectionDAG &DAG) {
8742 SDValue V1 = Op.getOperand(i: 0);
8743 SDValue V2 = Op.getOperand(i: 1);
8744 EVT VT = Op.getValueType();
8745 unsigned NumElts = VT.getVectorNumElements();
8746
8747 // A One-Off Identity mask is one that is mostly an identity mask from a
8748 // single source but contains a single element out-of-place, either from a
8749 // different vector or from another position in the same vector. As opposed to
8750 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8751 // pair directly.
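// For example, the v4i32 mask <0,1,6,3> is an identity of V1 except for
// lane 2, which takes lane 2 of V2; we can extract that one element and
// insert it into V1 instead of rebuilding the whole vector.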
8752 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8753 int &OffElement) {
8754 OffElement = -1;
8755 int NonUndef = 0;
8756 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8757 if (Mask[i] == -1)
8758 continue;
8759 NonUndef++;
8760 if (Mask[i] != i + BaseOffset) {
8761 if (OffElement == -1)
8762 OffElement = i;
8763 else
8764 return false;
8765 }
8766 }
8767 return NonUndef > 2 && OffElement != -1;
8768 };
8769 int OffElement;
8770 SDValue VInput;
8771 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8772 VInput = V1;
8773 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8774 VInput = V2;
8775 else
8776 return SDValue();
8777
8778 SDLoc dl(Op);
8779 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8780 ? MVT::i32
8781 : VT.getScalarType();
8782 SDValue Elt = DAG.getNode(
8783 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: SVT,
8784 N1: ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8785 N2: DAG.getVectorIdxConstant(Val: ShuffleMask[OffElement] % NumElts, DL: dl));
8786 return DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: VInput, N2: Elt,
8787 N3: DAG.getVectorIdxConstant(Val: OffElement % NumElts, DL: dl));
8788}
8789
8790static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8791 const ARMSubtarget *ST) {
8792 SDValue V1 = Op.getOperand(i: 0);
8793 SDValue V2 = Op.getOperand(i: 1);
8794 SDLoc dl(Op);
8795 EVT VT = Op.getValueType();
8796 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
8797 unsigned EltSize = VT.getScalarSizeInBits();
8798
8799 if (ST->hasMVEIntegerOps() && EltSize == 1)
8800 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8801
8802 // Convert shuffles that are directly supported on NEON to target-specific
8803 // DAG nodes, instead of keeping them as shuffles and matching them again
8804 // during code selection. This is more efficient and avoids the possibility
8805 // of inconsistencies between legalization and selection.
8806 // FIXME: floating-point vectors should be canonicalized to integer vectors
8807 // of the same size so that they get CSEd properly.
8808 ArrayRef<int> ShuffleMask = SVN->getMask();
8809
8810 if (EltSize <= 32) {
8811 if (SVN->isSplat()) {
8812 int Lane = SVN->getSplatIndex();
8813 // If this is undef splat, generate it via "just" vdup, if possible.
8814 if (Lane == -1) Lane = 0;
8815
8816 // Test if V1 is a SCALAR_TO_VECTOR.
8817 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8818 return DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT, Operand: V1.getOperand(i: 0));
8819 }
8820 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8821 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8822 // reaches it).
8823 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8824 !isa<ConstantSDNode>(Val: V1.getOperand(i: 0))) {
8825 bool IsScalarToVector = true;
8826 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8827 if (!V1.getOperand(i).isUndef()) {
8828 IsScalarToVector = false;
8829 break;
8830 }
8831 if (IsScalarToVector)
8832 return DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT, Operand: V1.getOperand(i: 0));
8833 }
8834 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8835 DAG.getConstant(Lane, dl, MVT::i32));
8836 }
8837
8838 bool ReverseVEXT = false;
8839 unsigned Imm = 0;
8840 if (ST->hasNEON() && isVEXTMask(M: ShuffleMask, VT, ReverseVEXT, Imm)) {
8841 if (ReverseVEXT)
8842 std::swap(a&: V1, b&: V2);
8843 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8844 DAG.getConstant(Imm, dl, MVT::i32));
8845 }
8846
8847 if (isVREVMask(M: ShuffleMask, VT, BlockSize: 64))
8848 return DAG.getNode(Opcode: ARMISD::VREV64, DL: dl, VT, Operand: V1);
8849 if (isVREVMask(M: ShuffleMask, VT, BlockSize: 32))
8850 return DAG.getNode(Opcode: ARMISD::VREV32, DL: dl, VT, Operand: V1);
8851 if (isVREVMask(M: ShuffleMask, VT, BlockSize: 16))
8852 return DAG.getNode(Opcode: ARMISD::VREV16, DL: dl, VT, Operand: V1);
8853
8854 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(M: ShuffleMask, VT, Imm)) {
8855 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8856 DAG.getConstant(Imm, dl, MVT::i32));
8857 }
8858
8859 // Check for Neon shuffles that modify both input vectors in place.
8860 // If both results are used, i.e., if there are two shuffles with the same
8861 // source operands and with masks corresponding to both results of one of
8862 // these operations, DAG memoization will ensure that a single node is
8863 // used for both shuffles.
8864 unsigned WhichResult = 0;
8865 bool isV_UNDEF = false;
8866 if (ST->hasNEON()) {
8867 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8868 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8869 if (isV_UNDEF)
8870 V2 = V1;
8871 return DAG.getNode(Opcode: ShuffleOpc, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: V1, N2: V2)
8872 .getValue(R: WhichResult);
8873 }
8874 }
8875 if (ST->hasMVEIntegerOps()) {
8876 if (isVMOVNMask(ShuffleMask, VT, false, false))
8877 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8878 DAG.getConstant(0, dl, MVT::i32));
8879 if (isVMOVNMask(ShuffleMask, VT, true, false))
8880 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8881 DAG.getConstant(1, dl, MVT::i32));
8882 if (isVMOVNMask(ShuffleMask, VT, true, true))
8883 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8884 DAG.getConstant(1, dl, MVT::i32));
8885 }
8886
8887 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8888 // shuffles that produce a result larger than their operands with:
8889 // shuffle(concat(v1, undef), concat(v2, undef))
8890 // ->
8891 // shuffle(concat(v1, v2), undef)
8892 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8893 //
8894 // This is useful in the general case, but there are special cases where
8895 // native shuffles produce larger results: the two-result ops.
8896 //
8897 // Look through the concat when lowering them:
8898 // shuffle(concat(v1, v2), undef)
8899 // ->
8900 // concat(VZIP(v1, v2):0, :1)
8901 //
8902 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8903 SDValue SubV1 = V1->getOperand(Num: 0);
8904 SDValue SubV2 = V1->getOperand(Num: 1);
8905 EVT SubVT = SubV1.getValueType();
8906
8907 // We expect these to have been canonicalized to -1.
8908 assert(llvm::all_of(ShuffleMask, [&](int i) {
8909 return i < (int)VT.getVectorNumElements();
8910 }) && "Unexpected shuffle index into UNDEF operand!");
8911
8912 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8913 ShuffleMask, VT: SubVT, WhichResult, isV_UNDEF)) {
8914 if (isV_UNDEF)
8915 SubV2 = SubV1;
8916 assert((WhichResult == 0) &&
8917 "In-place shuffle of concat can only have one result!");
8918 SDValue Res = DAG.getNode(Opcode: ShuffleOpc, DL: dl, VTList: DAG.getVTList(VT1: SubVT, VT2: SubVT),
8919 N1: SubV1, N2: SubV2);
8920 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: Res.getValue(R: 0),
8921 N2: Res.getValue(R: 1));
8922 }
8923 }
8924 }
8925
8926 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8927 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8928 return V;
8929
8930 for (bool Top : {false, true}) {
8931 for (bool SingleSource : {false, true}) {
8932 if (isTruncMask(M: ShuffleMask, VT, Top, SingleSource)) {
8933 MVT FromSVT = MVT::getIntegerVT(BitWidth: EltSize * 2);
8934 MVT FromVT = MVT::getVectorVT(VT: FromSVT, NumElements: ShuffleMask.size() / 2);
8935 SDValue Lo = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT: FromVT, Operand: V1);
8936 SDValue Hi = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT: FromVT,
8937 Operand: SingleSource ? V1 : V2);
8938 if (Top) {
8939 SDValue Amt = DAG.getConstant(Val: EltSize, DL: dl, VT: FromVT);
8940 Lo = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: FromVT, N1: Lo, N2: Amt);
8941 Hi = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: FromVT, N1: Hi, N2: Amt);
8942 }
8943 return DAG.getNode(Opcode: ARMISD::MVETRUNC, DL: dl, VT, N1: Lo, N2: Hi);
8944 }
8945 }
8946 }
8947 }
8948
8949 // If the shuffle is not directly supported and it has 4 elements, use
8950 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8951 unsigned NumElts = VT.getVectorNumElements();
8952 if (NumElts == 4) {
8953 unsigned PFIndexes[4];
8954 for (unsigned i = 0; i != 4; ++i) {
8955 if (ShuffleMask[i] < 0)
8956 PFIndexes[i] = 8;
8957 else
8958 PFIndexes[i] = ShuffleMask[i];
8959 }
8960
8961 // Compute the index in the perfect shuffle table.
8962 unsigned PFTableIndex =
8963 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8964 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8965 unsigned Cost = (PFEntry >> 30);
8966
8967 if (Cost <= 4) {
8968 if (ST->hasNEON())
8969 return GeneratePerfectShuffle(PFEntry, LHS: V1, RHS: V2, DAG, dl);
8970 else if (isLegalMVEShuffleOp(PFEntry)) {
8971 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8972 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8973 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8974 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8975 if (isLegalMVEShuffleOp(PFEntry: PFEntryLHS) && isLegalMVEShuffleOp(PFEntry: PFEntryRHS))
8976 return GeneratePerfectShuffle(PFEntry, LHS: V1, RHS: V2, DAG, dl);
8977 }
8978 }
8979 }
8980
8981 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8982 if (EltSize >= 32) {
8983 // Do the expansion with floating-point types, since that is what the VFP
8984 // registers are defined to use, and since i64 is not legal.
8985 EVT EltVT = EVT::getFloatingPointVT(BitWidth: EltSize);
8986 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts);
8987 V1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: V1);
8988 V2 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: V2);
8989 SmallVector<SDValue, 8> Ops;
8990 for (unsigned i = 0; i < NumElts; ++i) {
8991 if (ShuffleMask[i] < 0)
8992 Ops.push_back(Elt: DAG.getUNDEF(VT: EltVT));
8993 else
8994 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8995 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8996 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8997 dl, MVT::i32)));
8998 }
8999 SDValue Val = DAG.getNode(Opcode: ARMISD::BUILD_VECTOR, DL: dl, VT: VecVT, Ops);
9000 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val);
9001 }
9002
9003 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
9004 isReverseMask(ShuffleMask, VT))
9005 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
9006
9007 if (ST->hasNEON() && VT == MVT::v8i8)
9008 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
9009 return NewOp;
9010
9011 if (ST->hasMVEIntegerOps())
9012 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
9013 return NewOp;
9014
9015 return SDValue();
9016}
9017
9018static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9019 const ARMSubtarget *ST) {
9020 EVT VecVT = Op.getOperand(i: 0).getValueType();
9021 SDLoc dl(Op);
9022
9023 assert(ST->hasMVEIntegerOps() &&
9024 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9025
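// In MVE each predicate bit corresponds to one byte of the 128-bit vector,
// so an i1 lane of e.g. a v8i1 (a 16-bit element) occupies two predicate
// bits. Move the predicate to a GPR, sign-extend the new element from i1 so
// it becomes all-ones or all-zero, BFI it into the lane's bit range, and
// cast the result back to a predicate.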
9026 SDValue Conv =
9027 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9028 unsigned Lane = Op.getConstantOperandVal(i: 2);
9029 unsigned LaneWidth =
9030 getVectorTyFromPredicateVector(VT: VecVT).getScalarSizeInBits() / 8;
9031 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
9032 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
9033 Op.getOperand(1), DAG.getValueType(MVT::i1));
9034 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
9035 DAG.getConstant(~Mask, dl, MVT::i32));
9036 return DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: dl, VT: Op.getValueType(), Operand: BFI);
9037}
9038
9039SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
9040 SelectionDAG &DAG) const {
9041 // INSERT_VECTOR_ELT is legal only for immediate indexes.
9042 SDValue Lane = Op.getOperand(i: 2);
9043 if (!isa<ConstantSDNode>(Val: Lane))
9044 return SDValue();
9045
9046 SDValue Elt = Op.getOperand(i: 1);
9047 EVT EltVT = Elt.getValueType();
9048
9049 if (Subtarget->hasMVEIntegerOps() &&
9050 Op.getValueType().getScalarSizeInBits() == 1)
9051 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, ST: Subtarget);
9052
9053 if (getTypeAction(Context&: *DAG.getContext(), VT: EltVT) ==
9054 TargetLowering::TypeSoftPromoteHalf) {
9055 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
9056 // but the type system will try to do that if we don't intervene.
9057 // Reinterpret any such vector-element insertion as one with the
9058 // corresponding integer types.
9059
9060 SDLoc dl(Op);
9061
9062 EVT IEltVT = MVT::getIntegerVT(BitWidth: EltVT.getScalarSizeInBits());
9063 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
9064 TargetLowering::TypeSoftPromoteHalf);
9065
9066 SDValue VecIn = Op.getOperand(i: 0);
9067 EVT VecVT = VecIn.getValueType();
9068 EVT IVecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: IEltVT,
9069 NumElements: VecVT.getVectorNumElements());
9070
9071 SDValue IElt = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IEltVT, Operand: Elt);
9072 SDValue IVecIn = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: IVecVT, Operand: VecIn);
9073 SDValue IVecOut = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: IVecVT,
9074 N1: IVecIn, N2: IElt, N3: Lane);
9075 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecVT, Operand: IVecOut);
9076 }
9077
9078 return Op;
9079}
9080
9081static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
9082 const ARMSubtarget *ST) {
9083 EVT VecVT = Op.getOperand(i: 0).getValueType();
9084 SDLoc dl(Op);
9085
9086 assert(ST->hasMVEIntegerOps() &&
9087 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
9088
9089 SDValue Conv =
9090 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
9091 unsigned Lane = Op.getConstantOperandVal(i: 1);
9092 unsigned LaneWidth =
9093 getVectorTyFromPredicateVector(VT: VecVT).getScalarSizeInBits() / 8;
9094 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
9095 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
9096 return Shift;
9097}
9098
9099static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
9100 const ARMSubtarget *ST) {
9101 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
9102 SDValue Lane = Op.getOperand(i: 1);
9103 if (!isa<ConstantSDNode>(Val: Lane))
9104 return SDValue();
9105
9106 SDValue Vec = Op.getOperand(i: 0);
9107 EVT VT = Vec.getValueType();
9108
9109 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9110 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
9111
9112 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
9113 SDLoc dl(Op);
9114 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
9115 }
9116
9117 return Op;
9118}
9119
9120static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
9121 const ARMSubtarget *ST) {
9122 SDLoc dl(Op);
9123 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
9124 "Unexpected custom CONCAT_VECTORS lowering");
9125 assert(isPowerOf2_32(Op.getNumOperands()) &&
9126 "Unexpected custom CONCAT_VECTORS lowering");
9127 assert(ST->hasMVEIntegerOps() &&
9128 "CONCAT_VECTORS lowering only supported for MVE");
9129
9130 auto ConcatPair = [&](SDValue V1, SDValue V2) {
9131 EVT Op1VT = V1.getValueType();
9132 EVT Op2VT = V2.getValueType();
9133 assert(Op1VT == Op2VT && "Operand types don't match!");
9134 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9135 "Unexpected i1 concat operations!");
9136 EVT VT = Op1VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
9137
9138 SDValue NewV1 = PromoteMVEPredVector(dl, Pred: V1, VT: Op1VT, DAG);
9139 SDValue NewV2 = PromoteMVEPredVector(dl, Pred: V2, VT: Op2VT, DAG);
9140
9141 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
9142 // promoted to v8i16, etc.
9143 MVT ElType =
9144 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9145 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
9146
9147 EVT ConcatVT = MVT::getVectorVT(VT: ElType, NumElements: NumElts);
9148 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9149 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9150 // ConcatVT.
9151 SDValue ConVec =
9152 DAG.getNode(Opcode: ARMISD::MVETRUNC, DL: dl, VT: ConcatVT, N1: NewV1, N2: NewV2);
9153 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9154 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9155 }
9156
9157 // Extract the vector elements from Op1 and Op2 one by one and truncate them
9158 // to be the right size for the destination. For example, if Op1 is v4i1
9159 // then the promoted vector is v4i32. The result of concatenation gives a
9160 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
9161 // needs truncating to i16 and inserting in the result.
9162 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
9163 EVT NewVT = NewV.getValueType();
9164 EVT ConcatVT = ConVec.getValueType();
9165 unsigned ExtScale = 1;
9166 if (NewVT == MVT::v2f64) {
9167 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9168 ExtScale = 2;
9169 }
9170 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
9171 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9172 DAG.getIntPtrConstant(i * ExtScale, dl));
9173 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
9174 DAG.getConstant(j, dl, MVT::i32));
9175 }
9176 return ConVec;
9177 };
9178 unsigned j = 0;
9179 SDValue ConVec = DAG.getNode(Opcode: ISD::UNDEF, DL: dl, VT: ConcatVT);
9180 ConVec = ExtractInto(NewV1, ConVec, j);
9181 ConVec = ExtractInto(NewV2, ConVec, j);
9182
9183 // Now return the result of comparing the subvector with zero, which will
9184 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9185 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9186 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9187 };
9188
9189 // Concat each pair of subvectors and pack into the lower half of the array.
9190 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
9191 while (ConcatOps.size() > 1) {
9192 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
9193 SDValue V1 = ConcatOps[I];
9194 SDValue V2 = ConcatOps[I + 1];
9195 ConcatOps[I / 2] = ConcatPair(V1, V2);
9196 }
9197 ConcatOps.resize(N: ConcatOps.size() / 2);
9198 }
9199 return ConcatOps[0];
9200}
9201
9202static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9203 const ARMSubtarget *ST) {
9204 EVT VT = Op->getValueType(ResNo: 0);
9205 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9206 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9207
9208 // The only time a CONCAT_VECTORS operation can have legal types is when
9209 // two 64-bit vectors are concatenated to a 128-bit vector.
9210 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9211 "unexpected CONCAT_VECTORS");
9212 SDLoc dl(Op);
9213 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9214 SDValue Op0 = Op.getOperand(i: 0);
9215 SDValue Op1 = Op.getOperand(i: 1);
9216 if (!Op0.isUndef())
9217 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9218 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9219 DAG.getIntPtrConstant(0, dl));
9220 if (!Op1.isUndef())
9221 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9222 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9223 DAG.getIntPtrConstant(1, dl));
9224 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Val);
9225}
9226
9227static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9228 const ARMSubtarget *ST) {
9229 SDValue V1 = Op.getOperand(i: 0);
9230 SDValue V2 = Op.getOperand(i: 1);
9231 SDLoc dl(Op);
9232 EVT VT = Op.getValueType();
9233 EVT Op1VT = V1.getValueType();
9234 unsigned NumElts = VT.getVectorNumElements();
9235 unsigned Index = V2->getAsZExtVal();
9236
9237 assert(VT.getScalarSizeInBits() == 1 &&
9238 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9239 assert(ST->hasMVEIntegerOps() &&
9240 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9241
9242 SDValue NewV1 = PromoteMVEPredVector(dl, Pred: V1, VT: Op1VT, DAG);
9243
9244 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9245 // promoted to v8i16, etc.
9246
9247 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9248
9249 if (NumElts == 2) {
9250 EVT SubVT = MVT::v4i32;
9251 SDValue SubVec = DAG.getNode(Opcode: ISD::UNDEF, DL: dl, VT: SubVT);
9252 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9253 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9254 DAG.getIntPtrConstant(i, dl));
9255 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9256 DAG.getConstant(j, dl, MVT::i32));
9257 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9258 DAG.getConstant(j + 1, dl, MVT::i32));
9259 }
9260 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9261 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9262 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9263 }
9264
9265 EVT SubVT = MVT::getVectorVT(VT: ElType, NumElements: NumElts);
9266 SDValue SubVec = DAG.getNode(Opcode: ISD::UNDEF, DL: dl, VT: SubVT);
9267 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9268 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9269 DAG.getIntPtrConstant(i, dl));
9270 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9271 DAG.getConstant(j, dl, MVT::i32));
9272 }
9273
9274 // Now return the result of comparing the subvector with zero,
9275 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9276 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9277 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9278}
9279
9280// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9281static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9282 const ARMSubtarget *ST) {
9283 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9284 EVT VT = N->getValueType(ResNo: 0);
9285 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9286 "Expected a vector i1 type!");
9287 SDValue Op = N->getOperand(Num: 0);
9288 EVT FromVT = Op.getValueType();
9289 SDLoc DL(N);
9290
9291 SDValue And =
9292 DAG.getNode(Opcode: ISD::AND, DL, VT: FromVT, N1: Op, N2: DAG.getConstant(Val: 1, DL, VT: FromVT));
9293 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: And, N2: DAG.getConstant(Val: 0, DL, VT: FromVT),
9294 N3: DAG.getCondCode(Cond: ISD::SETNE));
9295}
9296
9297static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9298 const ARMSubtarget *Subtarget) {
9299 if (!Subtarget->hasMVEIntegerOps())
9300 return SDValue();
9301
9302 EVT ToVT = N->getValueType(ResNo: 0);
9303 if (ToVT.getScalarType() == MVT::i1)
9304 return LowerTruncatei1(N, DAG, ST: Subtarget);
9305
9306 // MVE does not have a single instruction to perform the truncation of a v4i32
9307 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9308 // Most of the instructions in MVE follow the 'Beats' system, where moving
9309 // values from different lanes is usually something that the instructions
9310 // avoid.
9311 //
9312 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9313 // which take the top/bottom half of a larger lane and extend it (or do the
9314 // opposite, truncating into the top/bottom lane from a larger lane). Note
9315 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9316 // bottom 16bits from each vector lane. This works really well with T/B
9317 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9318 // to move order.
9319 //
9320 // But truncates and sext/zext are always going to be fairly common from llvm.
9321 // We have several options for how to deal with them:
9322 // - Wherever possible combine them into an instruction that makes them
9323 // "free". This includes loads/stores, which can perform the trunc as part
9324 // of the memory operation. Or certain shuffles that can be turned into
9325 // VMOVN/VMOVL.
9326 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9327 // trunc(mul(sext(a), sext(b))) may become
9328 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9329 // this case can use VMULL). This is performed in the
9330 // MVELaneInterleavingPass.
9331 // - Otherwise we have an option. By default we would expand the
9332 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9333 // registers. One for each vector lane in the vector. This can obviously be
9334 // very expensive.
9335 // - The other option is to use the fact that loads/store can extend/truncate
9336 // to turn a trunc into two truncating stack stores and a stack reload. This
9337 // becomes 3 back-to-back memory operations, but at least that is less than
9338 // all the insert/extracts.
9339 //
9340 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9341 // are either optimized where they can be, or eventually lowered into stack
9342 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9343 // too early, where other instructions would be better, and stops us from
9344 // having to reconstruct multiple buildvector shuffles into loads/stores.
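// For example, a trunc from v16i16 to v16i8 is split into two v8i16 halves
// below and emitted as a single MVETRUNC of the two halves.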
9345 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9346 return SDValue();
9347 EVT FromVT = N->getOperand(Num: 0).getValueType();
9348 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9349 return SDValue();
9350
9351 SDValue Lo, Hi;
9352 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
9353 SDLoc DL(N);
9354 return DAG.getNode(Opcode: ARMISD::MVETRUNC, DL, VT: ToVT, N1: Lo, N2: Hi);
9355}
9356
9357static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9358 const ARMSubtarget *Subtarget) {
9359 if (!Subtarget->hasMVEIntegerOps())
9360 return SDValue();
9361
9362 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9363
9364 EVT ToVT = N->getValueType(ResNo: 0);
9365 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9366 return SDValue();
9367 SDValue Op = N->getOperand(Num: 0);
9368 EVT FromVT = Op.getValueType();
9369 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9370 return SDValue();
9371
9372 SDLoc DL(N);
9373 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
9374 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9375 ExtVT = MVT::v8i16;
9376
9377 unsigned Opcode =
9378 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9379 SDValue Ext = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: ExtVT, VT2: ExtVT), N: Op);
9380 SDValue Ext1 = Ext.getValue(R: 1);
9381
9382 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9383 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9384 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9385 }
9386
9387 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ToVT, N1: Ext, N2: Ext1);
9388}
9389
9390/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9391/// element has been zero/sign-extended, depending on the isSigned parameter,
9392/// from an integer type half its size.
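/// For example, a v4i32 BUILD_VECTOR whose constants all fit in 16 bits can
/// be treated as zero- or sign-extended from v4i16.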
9393static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9394 bool isSigned) {
9395 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9396 EVT VT = N->getValueType(ResNo: 0);
9397 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9398 SDNode *BVN = N->getOperand(Num: 0).getNode();
9399 if (BVN->getValueType(0) != MVT::v4i32 ||
9400 BVN->getOpcode() != ISD::BUILD_VECTOR)
9401 return false;
9402 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9403 unsigned HiElt = 1 - LoElt;
9404 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(Val: BVN->getOperand(Num: LoElt));
9405 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(Val: BVN->getOperand(Num: HiElt));
9406 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(Val: BVN->getOperand(Num: LoElt+2));
9407 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(Val: BVN->getOperand(Num: HiElt+2));
9408 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9409 return false;
9410 if (isSigned) {
9411 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9412 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9413 return true;
9414 } else {
9415 if (Hi0->isZero() && Hi1->isZero())
9416 return true;
9417 }
9418 return false;
9419 }
9420
9421 if (N->getOpcode() != ISD::BUILD_VECTOR)
9422 return false;
9423
9424 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9425 SDNode *Elt = N->getOperand(Num: i).getNode();
9426 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
9427 unsigned EltSize = VT.getScalarSizeInBits();
9428 unsigned HalfSize = EltSize / 2;
9429 if (isSigned) {
9430 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
9431 return false;
9432 } else {
9433 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
9434 return false;
9435 }
9436 continue;
9437 }
9438 return false;
9439 }
9440
9441 return true;
9442}
9443
9444/// isSignExtended - Check if a node is a vector value that is sign-extended
9445/// or a constant BUILD_VECTOR with sign-extended elements.
9446static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9447 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9448 return true;
9449 if (isExtendedBUILD_VECTOR(N, DAG, isSigned: true))
9450 return true;
9451 return false;
9452}
9453
9454/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9455/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9456static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9457 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9458 ISD::isZEXTLoad(N))
9459 return true;
9460 if (isExtendedBUILD_VECTOR(N, DAG, isSigned: false))
9461 return true;
9462 return false;
9463}
9464
9465static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9466 if (OrigVT.getSizeInBits() >= 64)
9467 return OrigVT;
9468
9469 assert(OrigVT.isSimple() && "Expecting a simple value type");
9470
9471 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9472 switch (OrigSimpleTy) {
9473 default: llvm_unreachable("Unexpected Vector Type");
9474 case MVT::v2i8:
9475 case MVT::v2i16:
9476 return MVT::v2i32;
9477 case MVT::v4i8:
9478 return MVT::v4i16;
9479 }
9480}
9481
9482/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9483/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9484/// We insert the required extension here to get the vector to fill a D register.
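/// For instance (illustrative only), a v4i8 value whose use had been extended
/// to a 128-bit v4i32 is instead extended here only to v4i16, so that the
/// 64-bit D register operand expected by VMULL is fully defined.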
9485static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9486 const EVT &OrigTy,
9487 const EVT &ExtTy,
9488 unsigned ExtOpcode) {
9489 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9490 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9491 // 64-bits we need to insert a new extension so that it will be 64-bits.
9492 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9493 if (OrigTy.getSizeInBits() >= 64)
9494 return N;
9495
9496 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9497 EVT NewVT = getExtensionTo64Bits(OrigVT: OrigTy);
9498
9499 return DAG.getNode(Opcode: ExtOpcode, DL: SDLoc(N), VT: NewVT, Operand: N);
9500}
9501
9502/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9503/// does not do any sign/zero extension. If the original vector is less
9504/// than 64 bits, an appropriate extension will be added after the load to
9505/// reach a total size of 64 bits. We have to add the extension separately
9506/// because ARM does not have a sign/zero extending load for vectors.
9507static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
9508 EVT ExtendedTy = getExtensionTo64Bits(OrigVT: LD->getMemoryVT());
9509
9510 // The load already has the right type.
9511 if (ExtendedTy == LD->getMemoryVT())
9512 return DAG.getLoad(VT: LD->getMemoryVT(), dl: SDLoc(LD), Chain: LD->getChain(),
9513 Ptr: LD->getBasePtr(), PtrInfo: LD->getPointerInfo(), Alignment: LD->getAlign(),
9514 MMOFlags: LD->getMemOperand()->getFlags());
9515
9516 // We need to create a zextload/sextload. We cannot just create a load
9517 // followed by a sext/zext node because LowerMUL is also run during normal
9518 // operation legalization where we can't create illegal types.
9519 return DAG.getExtLoad(ExtType: LD->getExtensionType(), dl: SDLoc(LD), VT: ExtendedTy,
9520 Chain: LD->getChain(), Ptr: LD->getBasePtr(), PtrInfo: LD->getPointerInfo(),
9521 MemVT: LD->getMemoryVT(), Alignment: LD->getAlign(),
9522 MMOFlags: LD->getMemOperand()->getFlags());
9523}
9524
9525/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9526/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9527/// the unextended value. The unextended vector should be 64 bits so that it can
9528/// be used as an operand to a VMULL instruction. If the original vector size
9529 /// before extension is less than 64 bits, we add an extension to resize
9530/// the vector to 64 bits.
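/// E.g. (roughly), a zero-extending load from v4i16 memory into v4i32 is
/// replaced here by a plain v4i16 load; the old load's users are rewritten to
/// use a zext of it, and the VMULL can then consume the narrow 64-bit value.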
9531static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9532 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9533 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9534 return AddRequiredExtensionForVMULL(N: N->getOperand(Num: 0), DAG,
9535 OrigTy: N->getOperand(Num: 0)->getValueType(ResNo: 0),
9536 ExtTy: N->getValueType(ResNo: 0),
9537 ExtOpcode: N->getOpcode());
9538
9539 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
9540 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9541 "Expected extending load");
9542
9543 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9544 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: newLoad.getValue(R: 1));
9545 unsigned Opcode = ISD::isSEXTLoad(N: LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9546 SDValue extLoad =
9547 DAG.getNode(Opcode, DL: SDLoc(newLoad), VT: LD->getValueType(ResNo: 0), Operand: newLoad);
9548 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 0), To: extLoad);
9549
9550 return newLoad;
9551 }
9552
9553 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9554 // have been legalized as a BITCAST from v4i32.
9555 if (N->getOpcode() == ISD::BITCAST) {
9556 SDNode *BVN = N->getOperand(Num: 0).getNode();
9557 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9558 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9559 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9560 return DAG.getBuildVector(
9561 MVT::v2i32, SDLoc(N),
9562 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9563 }
9564 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9565 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9566 EVT VT = N->getValueType(ResNo: 0);
9567 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9568 unsigned NumElts = VT.getVectorNumElements();
9569 MVT TruncVT = MVT::getIntegerVT(BitWidth: EltSize);
9570 SmallVector<SDValue, 8> Ops;
9571 SDLoc dl(N);
9572 for (unsigned i = 0; i != NumElts; ++i) {
9573 const APInt &CInt = N->getConstantOperandAPInt(Num: i);
9574 // Element types smaller than 32 bits are not legal, so use i32 elements.
9575 // The values are implicitly truncated so sext vs. zext doesn't matter.
9576 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9577 }
9578 return DAG.getBuildVector(VT: MVT::getVectorVT(VT: TruncVT, NumElements: NumElts), DL: dl, Ops);
9579}
9580
9581static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9582 unsigned Opcode = N->getOpcode();
9583 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9584 SDNode *N0 = N->getOperand(Num: 0).getNode();
9585 SDNode *N1 = N->getOperand(Num: 1).getNode();
9586 return N0->hasOneUse() && N1->hasOneUse() &&
9587 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
9588 }
9589 return false;
9590}
9591
9592static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9593 unsigned Opcode = N->getOpcode();
9594 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9595 SDNode *N0 = N->getOperand(Num: 0).getNode();
9596 SDNode *N1 = N->getOperand(Num: 1).getNode();
9597 return N0->hasOneUse() && N1->hasOneUse() &&
9598 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
9599 }
9600 return false;
9601}
9602
9603static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9604 // Multiplications are only custom-lowered for 128-bit vectors so that
9605 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
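// For example (a sketch), a v2i64 multiply of two values sign-extended from
// v2i32 is matched below as ARMISD::VMULLs of the two unextended v2i32
// operands.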
9606 EVT VT = Op.getValueType();
9607 assert(VT.is128BitVector() && VT.isInteger() &&
9608 "unexpected type for custom-lowering ISD::MUL");
9609 SDNode *N0 = Op.getOperand(i: 0).getNode();
9610 SDNode *N1 = Op.getOperand(i: 1).getNode();
9611 unsigned NewOpc = 0;
9612 bool isMLA = false;
9613 bool isN0SExt = isSignExtended(N: N0, DAG);
9614 bool isN1SExt = isSignExtended(N: N1, DAG);
9615 if (isN0SExt && isN1SExt)
9616 NewOpc = ARMISD::VMULLs;
9617 else {
9618 bool isN0ZExt = isZeroExtended(N: N0, DAG);
9619 bool isN1ZExt = isZeroExtended(N: N1, DAG);
9620 if (isN0ZExt && isN1ZExt)
9621 NewOpc = ARMISD::VMULLu;
9622 else if (isN1SExt || isN1ZExt) {
9623 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9624 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9625 if (isN1SExt && isAddSubSExt(N: N0, DAG)) {
9626 NewOpc = ARMISD::VMULLs;
9627 isMLA = true;
9628 } else if (isN1ZExt && isAddSubZExt(N: N0, DAG)) {
9629 NewOpc = ARMISD::VMULLu;
9630 isMLA = true;
9631 } else if (isN0ZExt && isAddSubZExt(N: N1, DAG)) {
9632 std::swap(a&: N0, b&: N1);
9633 NewOpc = ARMISD::VMULLu;
9634 isMLA = true;
9635 }
9636 }
9637
9638 if (!NewOpc) {
9639 if (VT == MVT::v2i64)
9640 // Fall through to expand this. It is not legal.
9641 return SDValue();
9642 else
9643 // Other vector multiplications are legal.
9644 return Op;
9645 }
9646 }
9647
9648 // Legalize to a VMULL instruction.
9649 SDLoc DL(Op);
9650 SDValue Op0;
9651 SDValue Op1 = SkipExtensionForVMULL(N: N1, DAG);
9652 if (!isMLA) {
9653 Op0 = SkipExtensionForVMULL(N: N0, DAG);
9654 assert(Op0.getValueType().is64BitVector() &&
9655 Op1.getValueType().is64BitVector() &&
9656 "unexpected types for extended operands to VMULL");
9657 return DAG.getNode(Opcode: NewOpc, DL, VT, N1: Op0, N2: Op1);
9658 }
9659
9660 // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
9661 // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
9662 // vmull q0, d4, d6
9663 // vmlal q0, d5, d6
9664 // is faster than
9665 // vaddl q0, d4, d5
9666 // vmovl q1, d6
9667 // vmul q0, q0, q1
9668 SDValue N00 = SkipExtensionForVMULL(N: N0->getOperand(Num: 0).getNode(), DAG);
9669 SDValue N01 = SkipExtensionForVMULL(N: N0->getOperand(Num: 1).getNode(), DAG);
9670 EVT Op1VT = Op1.getValueType();
9671 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT,
9672 N1: DAG.getNode(Opcode: NewOpc, DL, VT,
9673 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N00), N2: Op1),
9674 N2: DAG.getNode(Opcode: NewOpc, DL, VT,
9675 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N01), N2: Op1));
9676}
9677
9678static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9679 SelectionDAG &DAG) {
9680 // TODO: Should this propagate fast-math-flags?
9681
9682 // Convert to float
9683 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9684 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9685 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9686 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9687 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9688 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9689 // Get reciprocal estimate.
9690 // float4 recip = vrecpeq_f32(yf);
9691 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9692 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9693 Y);
9694 // Because char has a smaller range than uchar, we can actually get away
9695 // without any Newton steps. This requires that we use a weird bias
9696 // of 0xb000, however (again, this has been exhaustively tested).
9697 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9698 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9699 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9700 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9701 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9702 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9703 // Convert back to short.
9704 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9705 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9706 return X;
9707}
9708
9709static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9710 SelectionDAG &DAG) {
9711 // TODO: Should this propagate fast-math-flags?
9712
9713 SDValue N2;
9714 // Convert to float.
9715 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9716 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9717 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9718 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9719 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9720 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9721
9722 // Use reciprocal estimate and one refinement step.
9723 // float4 recip = vrecpeq_f32(yf);
9724 // recip *= vrecpsq_f32(yf, recip);
9725 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9726 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9727 N1);
9728 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9729 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9730 N1, N2);
9731 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9732 // Because short has a smaller range than ushort, we can actually get away
9733 // with only a single Newton step. This requires that we use a weird bias
9734 // of 0x89, however (again, this has been exhaustively tested).
9735 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9736 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9737 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9738 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9739 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9740 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9741 // Convert back to integer and return.
9742 // return vmovn_s32(vcvt_s32_f32(result));
9743 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9744 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9745 return N0;
9746}
9747
9748static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9749 const ARMSubtarget *ST) {
9750 EVT VT = Op.getValueType();
9751 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9752 "unexpected type for custom-lowering ISD::SDIV");
9753
9754 SDLoc dl(Op);
9755 SDValue N0 = Op.getOperand(i: 0);
9756 SDValue N1 = Op.getOperand(i: 1);
9757 SDValue N2, N3;
9758
9759 if (VT == MVT::v8i8) {
9760 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9761 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9762
9763 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9764 DAG.getIntPtrConstant(4, dl));
9765 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9766 DAG.getIntPtrConstant(4, dl));
9767 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9768 DAG.getIntPtrConstant(0, dl));
9769 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9770 DAG.getIntPtrConstant(0, dl));
9771
9772 N0 = LowerSDIV_v4i8(X: N0, Y: N1, dl, DAG); // v4i16
9773 N2 = LowerSDIV_v4i8(X: N2, Y: N3, dl, DAG); // v4i16
9774
9775 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9776 N0 = LowerCONCAT_VECTORS(Op: N0, DAG, ST);
9777
9778 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9779 return N0;
9780 }
9781 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9782}
9783
9784static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9785 const ARMSubtarget *ST) {
9786 // TODO: Should this propagate fast-math-flags?
9787 EVT VT = Op.getValueType();
9788 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9789 "unexpected type for custom-lowering ISD::UDIV");
9790
9791 SDLoc dl(Op);
9792 SDValue N0 = Op.getOperand(i: 0);
9793 SDValue N1 = Op.getOperand(i: 1);
9794 SDValue N2, N3;
9795
9796 if (VT == MVT::v8i8) {
9797 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9798 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9799
9800 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9801 DAG.getIntPtrConstant(4, dl));
9802 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9803 DAG.getIntPtrConstant(4, dl));
9804 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9805 DAG.getIntPtrConstant(0, dl));
9806 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9807 DAG.getIntPtrConstant(0, dl));
9808
9809 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9810 N2 = LowerSDIV_v4i16(N0: N2, N1: N3, dl, DAG); // v4i16
9811
9812 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9813 N0 = LowerCONCAT_VECTORS(Op: N0, DAG, ST);
9814
9815 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9816 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9817 MVT::i32),
9818 N0);
9819 return N0;
9820 }
9821
9822 // v4i16 udiv ... Convert to float.
9823 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9824 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9825 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9826 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9827 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9828 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9829
9830 // Use reciprocal estimate and two refinement steps.
9831 // float4 recip = vrecpeq_f32(yf);
9832 // recip *= vrecpsq_f32(yf, recip);
9833 // recip *= vrecpsq_f32(yf, recip);
9834 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9835 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9836 BN1);
9837 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9838 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9839 BN1, N2);
9840 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9841 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9842 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9843 BN1, N2);
9844 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9845 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9846 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9847 // and that it will never cause us to return an answer too large).
9848 // float4 result = as_float4(as_int4(xf*recip) + 2);
9849 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9850 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9851 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9852 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9853 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9854 // Convert back to integer and return.
9855 // return vmovn_u32(vcvt_s32_f32(result));
9856 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9857 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9858 return N0;
9859}
9860
9861static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9862 SDNode *N = Op.getNode();
9863 EVT VT = N->getValueType(ResNo: 0);
9864 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9865
9866 SDValue Carry = Op.getOperand(i: 2);
9867
9868 SDLoc DL(Op);
9869
9870 SDValue Result;
9871 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9872 // This converts the boolean value carry into the carry flag.
9873 Carry = ConvertBooleanCarryToCarryFlag(BoolCarry: Carry, DAG);
9874
9875 // Do the addition proper using the carry flag we wanted.
9876 Result = DAG.getNode(Opcode: ARMISD::ADDE, DL, VTList: VTs, N1: Op.getOperand(i: 0),
9877 N2: Op.getOperand(i: 1), N3: Carry);
9878
9879 // Now convert the carry flag into a boolean value.
9880 Carry = ConvertCarryFlagToBooleanCarry(Flags: Result.getValue(R: 1), VT, DAG);
9881 } else {
9882 // ARMISD::SUBE expects a carry, not a borrow as ISD::USUBO_CARRY provides,
9883 // so we have to invert the incoming carry first.
9884 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9885 DAG.getConstant(1, DL, MVT::i32), Carry);
9886 // This converts the boolean value carry into the carry flag.
9887 Carry = ConvertBooleanCarryToCarryFlag(BoolCarry: Carry, DAG);
9888
9889 // Do the subtraction proper using the carry flag we wanted.
9890 Result = DAG.getNode(Opcode: ARMISD::SUBE, DL, VTList: VTs, N1: Op.getOperand(i: 0),
9891 N2: Op.getOperand(i: 1), N3: Carry);
9892
9893 // Now convert the carry flag into a boolean value.
9894 Carry = ConvertCarryFlagToBooleanCarry(Flags: Result.getValue(R: 1), VT, DAG);
9895 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9896 // by ISD::USUBO_CARRY, so compute 1 - C.
9897 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9898 DAG.getConstant(1, DL, MVT::i32), Carry);
9899 }
9900
9901 // Return both values.
9902 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: N->getVTList(), N1: Result, N2: Carry);
9903}
9904
9905SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
9906 assert(Subtarget->isTargetDarwin());
9907
9908 // For iOS, we want to call an alternative entry point: __sincos_stret,
9909 // where the return values are passed via sret.
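// A rough sketch of the sret path (taken below when the subtarget uses the
// APCS ABI): the call becomes __sincos_stret(sret <stack slot>, x), and the
// sin and cos results are then reloaded from the two fields of that slot.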
9910 SDLoc dl(Op);
9911 SDValue Arg = Op.getOperand(i: 0);
9912 EVT ArgVT = Arg.getValueType();
9913 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
9914 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
9915
9916 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9917 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9918
9919 // Pair of floats / doubles used to pass the result.
9920 Type *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
9921 auto &DL = DAG.getDataLayout();
9922
9923 ArgListTy Args;
9924 bool ShouldUseSRet = Subtarget->isAPCS_ABI();
9925 SDValue SRet;
9926 if (ShouldUseSRet) {
9927 // Create stack object for sret.
9928 const uint64_t ByteSize = DL.getTypeAllocSize(Ty: RetTy);
9929 const Align StackAlign = DL.getPrefTypeAlign(Ty: RetTy);
9930 int FrameIdx = MFI.CreateStackObject(Size: ByteSize, Alignment: StackAlign, isSpillSlot: false);
9931 SRet = DAG.getFrameIndex(FI: FrameIdx, VT: TLI.getPointerTy(DL));
9932
9933 ArgListEntry Entry;
9934 Entry.Node = SRet;
9935 Entry.Ty = PointerType::getUnqual(C&: RetTy->getContext());
9936 Entry.IsSExt = false;
9937 Entry.IsZExt = false;
9938 Entry.IsSRet = true;
9939 Args.push_back(x: Entry);
9940 RetTy = Type::getVoidTy(C&: *DAG.getContext());
9941 }
9942
9943 ArgListEntry Entry;
9944 Entry.Node = Arg;
9945 Entry.Ty = ArgTy;
9946 Entry.IsSExt = false;
9947 Entry.IsZExt = false;
9948 Args.push_back(x: Entry);
9949
9950 RTLIB::Libcall LC =
9951 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
9952 const char *LibcallName = getLibcallName(Call: LC);
9953 CallingConv::ID CC = getLibcallCallingConv(Call: LC);
9954 SDValue Callee = DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL));
9955
9956 TargetLowering::CallLoweringInfo CLI(DAG);
9957 CLI.setDebugLoc(dl)
9958 .setChain(DAG.getEntryNode())
9959 .setCallee(CC, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
9960 .setDiscardResult(ShouldUseSRet);
9961 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
9962
9963 if (!ShouldUseSRet)
9964 return CallResult.first;
9965
9966 SDValue LoadSin =
9967 DAG.getLoad(VT: ArgVT, dl, Chain: CallResult.second, Ptr: SRet, PtrInfo: MachinePointerInfo());
9968
9969 // Address of cos field.
9970 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: SRet,
9971 N2: DAG.getIntPtrConstant(Val: ArgVT.getStoreSize(), DL: dl));
9972 SDValue LoadCos =
9973 DAG.getLoad(VT: ArgVT, dl, Chain: LoadSin.getValue(R: 1), Ptr: Add, PtrInfo: MachinePointerInfo());
9974
9975 SDVTList Tys = DAG.getVTList(VT1: ArgVT, VT2: ArgVT);
9976 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: Tys,
9977 N1: LoadSin.getValue(R: 0), N2: LoadCos.getValue(R: 0));
9978}
9979
9980SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9981 bool Signed,
9982 SDValue &Chain) const {
9983 EVT VT = Op.getValueType();
9984 assert((VT == MVT::i32 || VT == MVT::i64) &&
9985 "unexpected type for custom lowering DIV");
9986 SDLoc dl(Op);
9987
9988 const auto &DL = DAG.getDataLayout();
9989 const auto &TLI = DAG.getTargetLoweringInfo();
9990
9991 const char *Name = nullptr;
9992 if (Signed)
9993 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
9994 else
9995 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
9996
9997 SDValue ES = DAG.getExternalSymbol(Sym: Name, VT: TLI.getPointerTy(DL));
9998
9999 ARMTargetLowering::ArgListTy Args;
10000
10001 for (auto AI : {1, 0}) {
10002 ArgListEntry Arg;
10003 Arg.Node = Op.getOperand(i: AI);
10004 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(Context&: *DAG.getContext());
10005 Args.push_back(x: Arg);
10006 }
10007
10008 CallLoweringInfo CLI(DAG);
10009 CLI.setDebugLoc(dl)
10010 .setChain(Chain)
10011 .setCallee(CC: CallingConv::ARM_AAPCS_VFP, ResultType: VT.getTypeForEVT(Context&: *DAG.getContext()),
10012 Target: ES, ArgsList: std::move(Args));
10013
10014 return LowerCallTo(CLI).first;
10015}
10016
10017 // This is a code size optimisation: return the original SDIV node to
10018 // DAGCombiner when we don't want to expand SDIV into a sequence of
10019 // instructions, and an empty SDValue otherwise, which causes the
10020 // SDIV to be expanded in DAGCombine.
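// For example (illustrative, assuming -Oz on a Thumb2 target with hardware
// divide), "x / 8" keeps the SDIV, since the 8 fits in a 2-byte MOVS, whereas
// "x / 1024" returns an empty SDValue and is expanded by DAGCombine into the
// usual shift-based power-of-two sequence.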
10021SDValue
10022ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
10023 SelectionDAG &DAG,
10024 SmallVectorImpl<SDNode *> &Created) const {
10025 // TODO: Support SREM
10026 if (N->getOpcode() != ISD::SDIV)
10027 return SDValue();
10028
10029 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
10030 const bool MinSize = ST.hasMinSize();
10031 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
10032 : ST.hasDivideInARMMode();
10033
10034 // Don't touch vector types; rewriting this may lead to scalarizing
10035 // the int divs.
10036 if (N->getOperand(Num: 0).getValueType().isVector())
10037 return SDValue();
10038
10039 // Bail if MinSize is not set; in both ARM and Thumb mode we also need
10040 // hwdiv support for this to be really profitable.
10041 if (!(MinSize && HasDivide))
10042 return SDValue();
10043
10044 // ARM mode is a bit simpler than Thumb: we can handle large power-of-2
10045 // immediates with a single MOV instruction; no further checks are required,
10046 // so just return the sdiv node.
10047 if (!ST.isThumb())
10048 return SDValue(N, 0);
10049
10050 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
10051 // and thus lose the code size benefit of a MOVS that requires only 2 bytes.
10052 // TargetTransformInfo's 'getIntImmCodeSizeCost' could be helpful here, but
10053 // as it would be doing exactly this check, it's not worth the trouble to get TTI.
10054 if (Divisor.sgt(RHS: 128))
10055 return SDValue();
10056
10057 return SDValue(N, 0);
10058}
10059
10060SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
10061 bool Signed) const {
10062 assert(Op.getValueType() == MVT::i32 &&
10063 "unexpected type for custom lowering DIV");
10064 SDLoc dl(Op);
10065
10066 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
10067 DAG.getEntryNode(), Op.getOperand(1));
10068
10069 return LowerWindowsDIVLibCall(Op, DAG, Signed, Chain&: DBZCHK);
10070}
10071
10072static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
10073 SDLoc DL(N);
10074 SDValue Op = N->getOperand(Num: 1);
10075 if (N->getValueType(0) == MVT::i32)
10076 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
10077 SDValue Lo, Hi;
10078 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
10079 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
10080 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
10081}
10082
10083void ARMTargetLowering::ExpandDIV_Windows(
10084 SDValue Op, SelectionDAG &DAG, bool Signed,
10085 SmallVectorImpl<SDValue> &Results) const {
10086 const auto &DL = DAG.getDataLayout();
10087 const auto &TLI = DAG.getTargetLoweringInfo();
10088
10089 assert(Op.getValueType() == MVT::i64 &&
10090 "unexpected type for custom lowering DIV");
10091 SDLoc dl(Op);
10092
10093 SDValue DBZCHK = WinDBZCheckDenominator(DAG, N: Op.getNode(), InChain: DAG.getEntryNode());
10094
10095 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, Chain&: DBZCHK);
10096
10097 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
10098 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
10099 DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
10100 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
10101
10102 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
10103}
10104
10105static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
10106 LoadSDNode *LD = cast<LoadSDNode>(Val: Op.getNode());
10107 EVT MemVT = LD->getMemoryVT();
10108 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10109 MemVT == MVT::v16i1) &&
10110 "Expected a predicate type!");
10111 assert(MemVT == Op.getValueType());
10112 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
10113 "Expected a non-extending load");
10114 assert(LD->isUnindexed() && "Expected an unindexed load");
10115
10116 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16-bit
10117 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
10118 // need to make sure that the 8/4/2 bits are actually loaded into the correct
10119 // place, which means loading the value and then shuffling the values into
10120 // the bottom bits of the predicate.
10121 // Equally, a VLDR for a v16i1 will actually load 32 bits (so will be
10122 // incorrect for BE).
10123 // Speaking of BE, the rest of llvm apparently assumes the reverse order of a
10124 // natural VMSR(load), so the value needs to be reversed.
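// A rough illustration of the lowering below: a v4i1 load becomes an
// extending scalar load of an i4 into an i32 (bit-reversed and shifted down
// for BE), a PREDICATE_CAST to v16i1, and finally an EXTRACT_SUBVECTOR back
// down to v4i1.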
10125
10126 SDLoc dl(Op);
10127 SDValue Load = DAG.getExtLoad(
10128 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
10129 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
10130 LD->getMemOperand());
10131 SDValue Val = Load;
10132 if (DAG.getDataLayout().isBigEndian())
10133 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
10134 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
10135 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
10136 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
10137 if (MemVT != MVT::v16i1)
10138 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
10139 DAG.getConstant(0, dl, MVT::i32));
10140 return DAG.getMergeValues(Ops: {Pred, Load.getValue(R: 1)}, dl);
10141}
10142
10143void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
10144 SelectionDAG &DAG) const {
10145 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
10146 EVT MemVT = LD->getMemoryVT();
10147 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
10148
10149 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10150 !Subtarget->isThumb1Only() && LD->isVolatile() &&
10151 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10152 SDLoc dl(N);
10153 SDValue Result = DAG.getMemIntrinsicNode(
10154 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
10155 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
10156 SDValue Lo = Result.getValue(R: DAG.getDataLayout().isLittleEndian() ? 0 : 1);
10157 SDValue Hi = Result.getValue(R: DAG.getDataLayout().isLittleEndian() ? 1 : 0);
10158 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
10159 Results.append(IL: {Pair, Result.getValue(R: 2)});
10160 }
10161}
10162
10163static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
10164 StoreSDNode *ST = cast<StoreSDNode>(Val: Op.getNode());
10165 EVT MemVT = ST->getMemoryVT();
10166 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10167 MemVT == MVT::v16i1) &&
10168 "Expected a predicate type!");
10169 assert(MemVT == ST->getValue().getValueType());
10170 assert(!ST->isTruncatingStore() && "Expected a non-truncating store");
10171 assert(ST->isUnindexed() && "Expected an unindexed store");
10172
10173 // Only store the v2i1, v4i1 or v8i1 worth of bits, via a buildvector with the
10174 // top bits unset and a scalar truncating store.
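// Sketch: a v4i1 store extracts the 4 lanes into a v16i1 BUILD_VECTOR (the
// remaining lanes undef), applies a PREDICATE_CAST to i32, and then emits a
// truncating i4 store of that scalar.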
10175 SDLoc dl(Op);
10176 SDValue Build = ST->getValue();
10177 if (MemVT != MVT::v16i1) {
10178 SmallVector<SDValue, 16> Ops;
10179 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
10180 unsigned Elt = DAG.getDataLayout().isBigEndian()
10181 ? MemVT.getVectorNumElements() - I - 1
10182 : I;
10183 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
10184 DAG.getConstant(Elt, dl, MVT::i32)));
10185 }
10186 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
10187 Ops.push_back(DAG.getUNDEF(MVT::i32));
10188 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
10189 }
10190 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
10191 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
10192 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
10193 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
10194 DAG.getConstant(16, dl, MVT::i32));
10195 return DAG.getTruncStore(
10196 Chain: ST->getChain(), dl, Val: GRP, Ptr: ST->getBasePtr(),
10197 SVT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits()),
10198 MMO: ST->getMemOperand());
10199}
10200
10201static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
10202 const ARMSubtarget *Subtarget) {
10203 StoreSDNode *ST = cast<StoreSDNode>(Val: Op.getNode());
10204 EVT MemVT = ST->getMemoryVT();
10205 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10206
10207 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
10208 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10209 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10210 SDNode *N = Op.getNode();
10211 SDLoc dl(N);
10212
10213 SDValue Lo = DAG.getNode(
10214 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10215 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10216 MVT::i32));
10217 SDValue Hi = DAG.getNode(
10218 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10219 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10220 MVT::i32));
10221
10222 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10223 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10224 MemVT, ST->getMemOperand());
10225 } else if (Subtarget->hasMVEIntegerOps() &&
10226 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10227 MemVT == MVT::v16i1))) {
10228 return LowerPredicateStore(Op, DAG);
10229 }
10230
10231 return SDValue();
10232}
10233
10234static bool isZeroVector(SDValue N) {
10235 return (ISD::isBuildVectorAllZeros(N: N.getNode()) ||
10236 (N->getOpcode() == ARMISD::VMOVIMM &&
10237 isNullConstant(V: N->getOperand(Num: 0))));
10238}
10239
10240static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
10241 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Val: Op.getNode());
10242 MVT VT = Op.getSimpleValueType();
10243 SDValue Mask = N->getMask();
10244 SDValue PassThru = N->getPassThru();
10245 SDLoc dl(Op);
10246
10247 if (isZeroVector(N: PassThru))
10248 return Op;
10249
10250 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10251 // zero too, and other values are lowered to a select.
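// E.g. (a sketch): masked.load(ptr, mask, passthru %p) becomes a masked load
// with a VMOVIMM #0 passthru, followed by vselect(mask, load, %p) unless %p
// is undef or already a (possibly bitcast) zero vector.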
10252 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10253 DAG.getTargetConstant(0, dl, MVT::i32));
10254 SDValue NewLoad = DAG.getMaskedLoad(
10255 VT, dl, Chain: N->getChain(), Base: N->getBasePtr(), Offset: N->getOffset(), Mask, Src0: ZeroVec,
10256 MemVT: N->getMemoryVT(), MMO: N->getMemOperand(), AM: N->getAddressingMode(),
10257 N->getExtensionType(), IsExpanding: N->isExpandingLoad());
10258 SDValue Combo = NewLoad;
10259 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10260 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10261 isZeroVector(N: PassThru->getOperand(Num: 0));
10262 if (!PassThru.isUndef() && !PassThruIsCastZero)
10263 Combo = DAG.getNode(Opcode: ISD::VSELECT, DL: dl, VT, N1: Mask, N2: NewLoad, N3: PassThru);
10264 return DAG.getMergeValues(Ops: {Combo, NewLoad.getValue(R: 1)}, dl);
10265}
10266
10267static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
10268 const ARMSubtarget *ST) {
10269 if (!ST->hasMVEIntegerOps())
10270 return SDValue();
10271
10272 SDLoc dl(Op);
10273 unsigned BaseOpcode = 0;
10274 switch (Op->getOpcode()) {
10275 default: llvm_unreachable("Expected VECREDUCE opcode");
10276 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10277 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10278 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10279 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10280 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10281 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10282 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10283 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10284 }
10285
10286 SDValue Op0 = Op->getOperand(Num: 0);
10287 EVT VT = Op0.getValueType();
10288 EVT EltVT = VT.getVectorElementType();
10289 unsigned NumElts = VT.getVectorNumElements();
10290 unsigned NumActiveLanes = NumElts;
10291
10292 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10293 NumActiveLanes == 2) &&
10294 "Only expected a power-of-2 vector size");
10295
10296 // Combine X with Rev(X) using BaseOpcode until 4 elements remain. Going down
10297 // to 4 vector elements allows us to easily extract the elements from the lanes.
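// For instance (roughly), a v8i16 VECREDUCE_MUL first computes
//   X = mul(X, VREV32(X)) // 8 -> 4 active lanes
// and then multiplies the four remaining lanes (elements 0, 2, 4 and 6) as
// scalars.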
10298 while (NumActiveLanes > 4) {
10299 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10300 SDValue Rev = DAG.getNode(Opcode: RevOpcode, DL: dl, VT, Operand: Op0);
10301 Op0 = DAG.getNode(Opcode: BaseOpcode, DL: dl, VT, N1: Op0, N2: Rev);
10302 NumActiveLanes /= 2;
10303 }
10304
10305 SDValue Res;
10306 if (NumActiveLanes == 4) {
10307 // The remaining 4 elements are combined sequentially using BaseOpcode.
10308 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10309 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10310 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10311 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10312 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10313 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10314 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10315 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10316 SDValue Res0 = DAG.getNode(Opcode: BaseOpcode, DL: dl, VT: EltVT, N1: Ext0, N2: Ext1, Flags: Op->getFlags());
10317 SDValue Res1 = DAG.getNode(Opcode: BaseOpcode, DL: dl, VT: EltVT, N1: Ext2, N2: Ext3, Flags: Op->getFlags());
10318 Res = DAG.getNode(Opcode: BaseOpcode, DL: dl, VT: EltVT, N1: Res0, N2: Res1, Flags: Op->getFlags());
10319 } else {
10320 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10321 DAG.getConstant(0, dl, MVT::i32));
10322 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10323 DAG.getConstant(1, dl, MVT::i32));
10324 Res = DAG.getNode(Opcode: BaseOpcode, DL: dl, VT: EltVT, N1: Ext0, N2: Ext1, Flags: Op->getFlags());
10325 }
10326
10327 // Result type may be wider than element type.
10328 if (EltVT != Op->getValueType(ResNo: 0))
10329 Res = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: Op->getValueType(ResNo: 0), Operand: Res);
10330 return Res;
10331}
10332
10333static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10334 const ARMSubtarget *ST) {
10335 if (!ST->hasMVEFloatOps())
10336 return SDValue();
10337 return LowerVecReduce(Op, DAG, ST);
10338}
10339
10340static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10341 const ARMSubtarget *ST) {
10342 if (!ST->hasNEON())
10343 return SDValue();
10344
10345 SDLoc dl(Op);
10346 SDValue Op0 = Op->getOperand(Num: 0);
10347 EVT VT = Op0.getValueType();
10348 EVT EltVT = VT.getVectorElementType();
10349
10350 unsigned PairwiseIntrinsic = 0;
10351 switch (Op->getOpcode()) {
10352 default:
10353 llvm_unreachable("Expected VECREDUCE opcode");
10354 case ISD::VECREDUCE_UMIN:
10355 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10356 break;
10357 case ISD::VECREDUCE_UMAX:
10358 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10359 break;
10360 case ISD::VECREDUCE_SMIN:
10361 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10362 break;
10363 case ISD::VECREDUCE_SMAX:
10364 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10365 break;
10366 }
10367 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10368
10369 unsigned NumElts = VT.getVectorNumElements();
10370 unsigned NumActiveLanes = NumElts;
10371
10372 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10373 NumActiveLanes == 2) &&
10374 "Only expected a power-of-2 vector size");
10375
10376 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10377 if (VT.is128BitVector()) {
10378 SDValue Lo, Hi;
10379 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Op0, DL: dl);
10380 VT = Lo.getValueType();
10381 Op0 = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, Ops: {PairwiseOp, Lo, Hi});
10382 NumActiveLanes /= 2;
10383 }
10384
10385 // Use pairwise reductions until one lane remains
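// Sketch: a v4i32 VECREDUCE_UMIN is first split into two v2i32 halves, then
//   d0 = vpmin.u32(lo, hi)  ; 2 active lanes
//   d0 = vpmin.u32(d0, d0)  ; 1 active lane
// and the result is extracted from lane 0.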
10386 while (NumActiveLanes > 1) {
10387 Op0 = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, Ops: {PairwiseOp, Op0, Op0});
10388 NumActiveLanes /= 2;
10389 }
10390
10391 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10392 DAG.getConstant(0, dl, MVT::i32));
10393
10394 // Result type may be wider than element type.
10395 if (EltVT != Op.getValueType()) {
10396 unsigned Extend = 0;
10397 switch (Op->getOpcode()) {
10398 default:
10399 llvm_unreachable("Expected VECREDUCE opcode");
10400 case ISD::VECREDUCE_UMIN:
10401 case ISD::VECREDUCE_UMAX:
10402 Extend = ISD::ZERO_EXTEND;
10403 break;
10404 case ISD::VECREDUCE_SMIN:
10405 case ISD::VECREDUCE_SMAX:
10406 Extend = ISD::SIGN_EXTEND;
10407 break;
10408 }
10409 Res = DAG.getNode(Opcode: Extend, DL: dl, VT: Op.getValueType(), Operand: Res);
10410 }
10411 return Res;
10412}
10413
10414static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10415 if (isStrongerThanMonotonic(AO: cast<AtomicSDNode>(Val&: Op)->getSuccessOrdering()))
10416 // Acquire/Release load/store is not legal for targets without a dmb or
10417 // equivalent available.
10418 return SDValue();
10419
10420 // Monotonic load/store is legal for all targets.
10421 return Op;
10422}
10423
10424static void ReplaceREADCYCLECOUNTER(SDNode *N,
10425 SmallVectorImpl<SDValue> &Results,
10426 SelectionDAG &DAG,
10427 const ARMSubtarget *Subtarget) {
10428 SDLoc DL(N);
10429 // Under Power Management extensions, the cycle-count is:
10430 // mrc p15, #0, <Rt>, c9, c13, #0
10431 SDValue Ops[] = { N->getOperand(0), // Chain
10432 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10433 DAG.getTargetConstant(15, DL, MVT::i32),
10434 DAG.getTargetConstant(0, DL, MVT::i32),
10435 DAG.getTargetConstant(9, DL, MVT::i32),
10436 DAG.getTargetConstant(13, DL, MVT::i32),
10437 DAG.getTargetConstant(0, DL, MVT::i32)
10438 };
10439
10440 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10441 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10442 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10443 DAG.getConstant(0, DL, MVT::i32)));
10444 Results.push_back(Elt: Cycles32.getValue(R: 1));
10445}
10446
10447static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
10448 SDLoc dl(V.getNode());
10449 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10450 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10451 if (isBigEndian)
10452 std::swap (VLo, VHi);
10453 SDValue RegClass =
10454 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10455 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10456 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10457 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
10458 return SDValue(
10459 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10460}
10461
10462static void ReplaceCMP_SWAP_64Results(SDNode *N,
10463 SmallVectorImpl<SDValue> & Results,
10464 SelectionDAG &DAG) {
10465 assert(N->getValueType(0) == MVT::i64 &&
10466 "AtomicCmpSwap on types less than 64 should be legal");
10467 SDValue Ops[] = {N->getOperand(Num: 1),
10468 createGPRPairNode(DAG, V: N->getOperand(Num: 2)),
10469 createGPRPairNode(DAG, V: N->getOperand(Num: 3)),
10470 N->getOperand(Num: 0)};
10471 SDNode *CmpSwap = DAG.getMachineNode(
10472 ARM::CMP_SWAP_64, SDLoc(N),
10473 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
10474
10475 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
10476 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
10477
10478 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10479
10480 SDValue Lo =
10481 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10482 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10483 SDValue Hi =
10484 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10485 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10486 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10487 Results.push_back(Elt: SDValue(CmpSwap, 2));
10488}
10489
10490SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10491 SDLoc dl(Op);
10492 EVT VT = Op.getValueType();
10493 SDValue Chain = Op.getOperand(i: 0);
10494 SDValue LHS = Op.getOperand(i: 1);
10495 SDValue RHS = Op.getOperand(i: 2);
10496 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
10497 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10498
10499 // If we don't have instructions of this float type then soften to a libcall
10500 // and use SETCC instead.
10501 if (isUnsupportedFloatingType(VT: LHS.getValueType())) {
10502 DAG.getTargetLoweringInfo().softenSetCCOperands(
10503 DAG, VT: LHS.getValueType(), NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain, IsSignaling);
10504 if (!RHS.getNode()) {
10505 RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType());
10506 CC = ISD::SETNE;
10507 }
10508 SDValue Result = DAG.getNode(Opcode: ISD::SETCC, DL: dl, VT, N1: LHS, N2: RHS,
10509 N3: DAG.getCondCode(Cond: CC));
10510 return DAG.getMergeValues(Ops: {Result, Chain}, dl);
10511 }
10512
10513 ARMCC::CondCodes CondCode, CondCode2;
10514 FPCCToARMCC(CC, CondCode, CondCode2);
10515
10516 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
10517 // in CMPFP and CMPFPE, but instead it should be made explicit by these
10518 // instructions using a chain instead of glue. This would also fix the problem
10519 // here (and also in LowerSELECT_CC) where we generate two comparisons when
10520 // CondCode2 != AL.
10521 SDValue True = DAG.getConstant(Val: 1, DL: dl, VT);
10522 SDValue False = DAG.getConstant(Val: 0, DL: dl, VT);
10523 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10524 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
10525 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, Signaling: IsSignaling);
10526 SDValue Result = getCMOV(dl, VT, FalseVal: False, TrueVal: True, ARMcc, CCR, Cmp, DAG);
10527 if (CondCode2 != ARMCC::AL) {
10528 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10529 Cmp = getVFPCmp(LHS, RHS, DAG, dl, Signaling: IsSignaling);
10530 Result = getCMOV(dl, VT, FalseVal: Result, TrueVal: True, ARMcc, CCR, Cmp, DAG);
10531 }
10532 return DAG.getMergeValues(Ops: {Result, Chain}, dl);
10533}
10534
10535SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10536 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10537
10538 EVT VT = getPointerTy(DL: DAG.getDataLayout());
10539 SDLoc DL(Op);
10540 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
10541 return DAG.getFrameIndex(FI, VT);
10542}
10543
10544SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10545 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10546 switch (Op.getOpcode()) {
10547 default: llvm_unreachable("Don't know how to custom lower this!");
10548 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10549 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10550 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10551 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10552 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10553 case ISD::SELECT: return LowerSELECT(Op, DAG);
10554 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10555 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10556 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10557 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10558 case ISD::VASTART: return LowerVASTART(Op, DAG);
10559 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10560 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10561 case ISD::SINT_TO_FP:
10562 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10563 case ISD::STRICT_FP_TO_SINT:
10564 case ISD::STRICT_FP_TO_UINT:
10565 case ISD::FP_TO_SINT:
10566 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10567 case ISD::FP_TO_SINT_SAT:
10568 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10569 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10570 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10571 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10572 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10573 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10574 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10575 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10576 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10577 Subtarget);
10578 case ISD::BITCAST: return ExpandBITCAST(N: Op.getNode(), DAG, Subtarget);
10579 case ISD::SHL:
10580 case ISD::SRL:
10581 case ISD::SRA: return LowerShift(N: Op.getNode(), DAG, ST: Subtarget);
10582 case ISD::SREM: return LowerREM(N: Op.getNode(), DAG);
10583 case ISD::UREM: return LowerREM(N: Op.getNode(), DAG);
10584 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10585 case ISD::SRL_PARTS:
10586 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10587 case ISD::CTTZ:
10588 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(N: Op.getNode(), DAG, ST: Subtarget);
10589 case ISD::CTPOP: return LowerCTPOP(N: Op.getNode(), DAG, ST: Subtarget);
10590 case ISD::SETCC: return LowerVSETCC(Op, DAG, ST: Subtarget);
10591 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10592 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, ST: Subtarget);
10593 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, ST: Subtarget);
10594 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, ST: Subtarget);
10595 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, ST: Subtarget);
10596 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10597 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, ST: Subtarget);
10598 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, ST: Subtarget);
10599 case ISD::TRUNCATE: return LowerTruncate(N: Op.getNode(), DAG, Subtarget);
10600 case ISD::SIGN_EXTEND:
10601 case ISD::ZERO_EXTEND: return LowerVectorExtend(N: Op.getNode(), DAG, Subtarget);
10602 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10603 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10604 case ISD::SET_FPMODE:
10605 return LowerSET_FPMODE(Op, DAG);
10606 case ISD::RESET_FPMODE:
10607 return LowerRESET_FPMODE(Op, DAG);
10608 case ISD::MUL: return LowerMUL(Op, DAG);
10609 case ISD::SDIV:
10610 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10611 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10612 return LowerSDIV(Op, DAG, ST: Subtarget);
10613 case ISD::UDIV:
10614 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10615 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10616 return LowerUDIV(Op, DAG, ST: Subtarget);
10617 case ISD::UADDO_CARRY:
10618 case ISD::USUBO_CARRY:
10619 return LowerUADDSUBO_CARRY(Op, DAG);
10620 case ISD::SADDO:
10621 case ISD::SSUBO:
10622 return LowerSignedALUO(Op, DAG);
10623 case ISD::UADDO:
10624 case ISD::USUBO:
10625 return LowerUnsignedALUO(Op, DAG);
10626 case ISD::SADDSAT:
10627 case ISD::SSUBSAT:
10628 case ISD::UADDSAT:
10629 case ISD::USUBSAT:
10630 return LowerADDSUBSAT(Op, DAG, Subtarget);
10631 case ISD::LOAD:
10632 return LowerPredicateLoad(Op, DAG);
10633 case ISD::STORE:
10634 return LowerSTORE(Op, DAG, Subtarget);
10635 case ISD::MLOAD:
10636 return LowerMLOAD(Op, DAG);
10637 case ISD::VECREDUCE_MUL:
10638 case ISD::VECREDUCE_AND:
10639 case ISD::VECREDUCE_OR:
10640 case ISD::VECREDUCE_XOR:
10641 return LowerVecReduce(Op, DAG, ST: Subtarget);
10642 case ISD::VECREDUCE_FADD:
10643 case ISD::VECREDUCE_FMUL:
10644 case ISD::VECREDUCE_FMIN:
10645 case ISD::VECREDUCE_FMAX:
10646 return LowerVecReduceF(Op, DAG, ST: Subtarget);
10647 case ISD::VECREDUCE_UMIN:
10648 case ISD::VECREDUCE_UMAX:
10649 case ISD::VECREDUCE_SMIN:
10650 case ISD::VECREDUCE_SMAX:
10651 return LowerVecReduceMinMax(Op, DAG, ST: Subtarget);
10652 case ISD::ATOMIC_LOAD:
10653 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
10654 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
10655 case ISD::SDIVREM:
10656 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10657 case ISD::DYNAMIC_STACKALLOC:
10658 if (Subtarget->isTargetWindows())
10659 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10660 llvm_unreachable("Don't know how to custom lower this!");
10661 case ISD::STRICT_FP_ROUND:
10662 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10663 case ISD::STRICT_FP_EXTEND:
10664 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10665 case ISD::STRICT_FSETCC:
10666 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10667 case ISD::SPONENTRY:
10668 return LowerSPONENTRY(Op, DAG);
10669 case ARMISD::WIN__DBZCHK: return SDValue();
10670 }
10671}
10672
10673static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10674 SelectionDAG &DAG) {
10675 unsigned IntNo = N->getConstantOperandVal(Num: 0);
10676 unsigned Opc = 0;
10677 if (IntNo == Intrinsic::arm_smlald)
10678 Opc = ARMISD::SMLALD;
10679 else if (IntNo == Intrinsic::arm_smlaldx)
10680 Opc = ARMISD::SMLALDX;
10681 else if (IntNo == Intrinsic::arm_smlsld)
10682 Opc = ARMISD::SMLSLD;
10683 else if (IntNo == Intrinsic::arm_smlsldx)
10684 Opc = ARMISD::SMLSLDX;
10685 else
10686 return;
10687
10688 SDLoc dl(N);
10689 SDValue Lo, Hi;
10690 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10691
10692 SDValue LongMul = DAG.getNode(Opc, dl,
10693 DAG.getVTList(MVT::i32, MVT::i32),
10694 N->getOperand(1), N->getOperand(2),
10695 Lo, Hi);
10696 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10697 LongMul.getValue(0), LongMul.getValue(1)));
10698}
10699
10700 /// ReplaceNodeResults - Replace the results of a node with an illegal result
10701 /// type with new values built out of custom code.
10702void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10703 SmallVectorImpl<SDValue> &Results,
10704 SelectionDAG &DAG) const {
10705 SDValue Res;
10706 switch (N->getOpcode()) {
10707 default:
10708 llvm_unreachable("Don't know how to custom expand this!");
10709 case ISD::READ_REGISTER:
10710 ExpandREAD_REGISTER(N, Results, DAG);
10711 break;
10712 case ISD::BITCAST:
10713 Res = ExpandBITCAST(N, DAG, Subtarget);
10714 break;
10715 case ISD::SRL:
10716 case ISD::SRA:
10717 case ISD::SHL:
10718 Res = Expand64BitShift(N, DAG, ST: Subtarget);
10719 break;
10720 case ISD::SREM:
10721 case ISD::UREM:
10722 Res = LowerREM(N, DAG);
10723 break;
10724 case ISD::SDIVREM:
10725 case ISD::UDIVREM:
10726 Res = LowerDivRem(Op: SDValue(N, 0), DAG);
10727 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10728 Results.push_back(Elt: Res.getValue(R: 0));
10729 Results.push_back(Elt: Res.getValue(R: 1));
10730 return;
10731 case ISD::SADDSAT:
10732 case ISD::SSUBSAT:
10733 case ISD::UADDSAT:
10734 case ISD::USUBSAT:
10735 Res = LowerADDSUBSAT(Op: SDValue(N, 0), DAG, Subtarget);
10736 break;
10737 case ISD::READCYCLECOUNTER:
10738 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10739 return;
10740 case ISD::UDIV:
10741 case ISD::SDIV:
10742 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10743 return ExpandDIV_Windows(Op: SDValue(N, 0), DAG, Signed: N->getOpcode() == ISD::SDIV,
10744 Results);
10745 case ISD::ATOMIC_CMP_SWAP:
10746 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10747 return;
10748 case ISD::INTRINSIC_WO_CHAIN:
10749 return ReplaceLongIntrinsic(N, Results, DAG);
10750 case ISD::LOAD:
10751 LowerLOAD(N, Results, DAG);
10752 break;
10753 case ISD::TRUNCATE:
10754 Res = LowerTruncate(N, DAG, Subtarget);
10755 break;
10756 case ISD::SIGN_EXTEND:
10757 case ISD::ZERO_EXTEND:
10758 Res = LowerVectorExtend(N, DAG, Subtarget);
10759 break;
10760 case ISD::FP_TO_SINT_SAT:
10761 case ISD::FP_TO_UINT_SAT:
10762 Res = LowerFP_TO_INT_SAT(Op: SDValue(N, 0), DAG, Subtarget);
10763 break;
10764 }
10765 if (Res.getNode())
10766 Results.push_back(Elt: Res);
10767}
10768
10769//===----------------------------------------------------------------------===//
10770// ARM Scheduler Hooks
10771//===----------------------------------------------------------------------===//
10772
10773/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10774/// registers the function context.
10775void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10776 MachineBasicBlock *MBB,
10777 MachineBasicBlock *DispatchBB,
10778 int FI) const {
10779 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10780 "ROPI/RWPI not currently supported with SjLj");
10781 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10782 DebugLoc dl = MI.getDebugLoc();
10783 MachineFunction *MF = MBB->getParent();
10784 MachineRegisterInfo *MRI = &MF->getRegInfo();
10785 MachineConstantPool *MCP = MF->getConstantPool();
10786 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10787 const Function &F = MF->getFunction();
10788
10789 bool isThumb = Subtarget->isThumb();
10790 bool isThumb2 = Subtarget->isThumb2();
10791
10792 unsigned PCLabelId = AFI->createPICLabelUId();
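  // The PC reads as the address of the current instruction plus 8 in ARM state
  // and plus 4 in Thumb state, so the constant-pool value must compensate for
  // that offset when it is later added to the PC.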
10793 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10794 ARMConstantPoolValue *CPV =
10795 ARMConstantPoolMBB::Create(C&: F.getContext(), mbb: DispatchBB, ID: PCLabelId, PCAdj);
10796 unsigned CPI = MCP->getConstantPoolIndex(V: CPV, Alignment: Align(4));
10797
10798 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10799 : &ARM::GPRRegClass;
10800
10801 // Grab constant pool and fixed stack memory operands.
10802 MachineMemOperand *CPMMO =
10803 MF->getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF&: *MF),
10804 F: MachineMemOperand::MOLoad, Size: 4, BaseAlignment: Align(4));
10805
10806 MachineMemOperand *FIMMOSt =
10807 MF->getMachineMemOperand(PtrInfo: MachinePointerInfo::getFixedStack(MF&: *MF, FI),
10808 F: MachineMemOperand::MOStore, Size: 4, BaseAlignment: Align(4));
10809
10810 // Load the address of the dispatch MBB into the jump buffer.
10811 if (isThumb2) {
10812 // Incoming value: jbuf
10813 // ldr.n r5, LCPI1_1
10814 // orr r5, r5, #1
10815 // add r5, pc
10816 // str r5, [$jbuf, #+4] ; &jbuf[1]
10817 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
10818 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10819 .addConstantPoolIndex(CPI)
10820 .addMemOperand(CPMMO)
10821 .add(predOps(ARMCC::AL));
10822 // Set the low bit because of thumb mode.
10823 Register NewVReg2 = MRI->createVirtualRegister(RegClass: TRC);
10824 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10825 .addReg(NewVReg1, RegState::Kill)
10826 .addImm(0x01)
10827 .add(predOps(ARMCC::AL))
10828 .add(condCodeOp());
10829 Register NewVReg3 = MRI->createVirtualRegister(RegClass: TRC);
10830 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10831 .addReg(NewVReg2, RegState::Kill)
10832 .addImm(PCLabelId);
10833 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10834 .addReg(NewVReg3, RegState::Kill)
10835 .addFrameIndex(FI)
10836 .addImm(36) // &jbuf[1] :: pc
10837 .addMemOperand(FIMMOSt)
10838 .add(predOps(ARMCC::AL));
10839 } else if (isThumb) {
10840 // Incoming value: jbuf
10841 // ldr.n r1, LCPI1_4
10842 // add r1, pc
10843 // mov r2, #1
10844 // orrs r1, r2
10845 // add r2, $jbuf, #+4 ; &jbuf[1]
10846 // str r1, [r2]
10847 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
10848 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10849 .addConstantPoolIndex(CPI)
10850 .addMemOperand(CPMMO)
10851 .add(predOps(ARMCC::AL));
10852 Register NewVReg2 = MRI->createVirtualRegister(RegClass: TRC);
10853 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10854 .addReg(NewVReg1, RegState::Kill)
10855 .addImm(PCLabelId);
10856 // Set the low bit because of thumb mode.
10857 Register NewVReg3 = MRI->createVirtualRegister(RegClass: TRC);
10858 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10859 .addReg(ARM::CPSR, RegState::Define)
10860 .addImm(1)
10861 .add(predOps(ARMCC::AL));
10862 Register NewVReg4 = MRI->createVirtualRegister(RegClass: TRC);
10863 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10864 .addReg(ARM::CPSR, RegState::Define)
10865 .addReg(NewVReg2, RegState::Kill)
10866 .addReg(NewVReg3, RegState::Kill)
10867 .add(predOps(ARMCC::AL));
10868 Register NewVReg5 = MRI->createVirtualRegister(RegClass: TRC);
10869 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10870 .addFrameIndex(FI)
10871 .addImm(36); // &jbuf[1] :: pc
10872 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10873 .addReg(NewVReg4, RegState::Kill)
10874 .addReg(NewVReg5, RegState::Kill)
10875 .addImm(0)
10876 .addMemOperand(FIMMOSt)
10877 .add(predOps(ARMCC::AL));
10878 } else {
10879 // Incoming value: jbuf
10880 // ldr r1, LCPI1_1
10881 // add r1, pc, r1
10882 // str r1, [$jbuf, #+4] ; &jbuf[1]
10883 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
10884 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10885 .addConstantPoolIndex(CPI)
10886 .addImm(0)
10887 .addMemOperand(CPMMO)
10888 .add(predOps(ARMCC::AL));
10889 Register NewVReg2 = MRI->createVirtualRegister(RegClass: TRC);
10890 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10891 .addReg(NewVReg1, RegState::Kill)
10892 .addImm(PCLabelId)
10893 .add(predOps(ARMCC::AL));
10894 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10895 .addReg(NewVReg2, RegState::Kill)
10896 .addFrameIndex(FI)
10897 .addImm(36) // &jbuf[1] :: pc
10898 .addMemOperand(FIMMOSt)
10899 .add(predOps(ARMCC::AL));
10900 }
10901}
10902
10903void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10904 MachineBasicBlock *MBB) const {
10905 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10906 DebugLoc dl = MI.getDebugLoc();
10907 MachineFunction *MF = MBB->getParent();
10908 MachineRegisterInfo *MRI = &MF->getRegInfo();
10909 MachineFrameInfo &MFI = MF->getFrameInfo();
10910 int FI = MFI.getFunctionContextIndex();
10911
10912 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10913 : &ARM::GPRnopcRegClass;
10914
10915 // Get a mapping of the call site numbers to all of the landing pads they're
10916 // associated with.
10917 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10918 unsigned MaxCSNum = 0;
10919 for (MachineBasicBlock &BB : *MF) {
10920 if (!BB.isEHPad())
10921 continue;
10922
10923 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10924 // pad.
10925 for (MachineInstr &II : BB) {
10926 if (!II.isEHLabel())
10927 continue;
10928
10929 MCSymbol *Sym = II.getOperand(i: 0).getMCSymbol();
10930 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10931
10932 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10933 for (unsigned Idx : CallSiteIdxs) {
10934 CallSiteNumToLPad[Idx].push_back(Elt: &BB);
10935 MaxCSNum = std::max(a: MaxCSNum, b: Idx);
10936 }
10937 break;
10938 }
10939 }
10940
10941 // Get an ordered list of the machine basic blocks for the jump table.
10942 std::vector<MachineBasicBlock*> LPadList;
10943 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10944 LPadList.reserve(n: CallSiteNumToLPad.size());
10945 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10946 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10947 for (MachineBasicBlock *MBB : MBBList) {
10948 LPadList.push_back(x: MBB);
10949 InvokeBBs.insert(I: MBB->pred_begin(), E: MBB->pred_end());
10950 }
10951 }
10952
10953 assert(!LPadList.empty() &&
10954 "No landing pad destinations for the dispatch jump table!");
10955
10956 // Create the jump table and associated information.
10957 MachineJumpTableInfo *JTI =
10958 MF->getOrCreateJumpTableInfo(JTEntryKind: MachineJumpTableInfo::EK_Inline);
10959 unsigned MJTI = JTI->createJumpTableIndex(DestBBs: LPadList);
10960
10961 // Create the MBBs for the dispatch code.
10962
10963 // Shove the dispatch's address into the return slot in the function context.
10964 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10965 DispatchBB->setIsEHPad();
10966
10967 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10968 unsigned trap_opcode;
10969 if (Subtarget->isThumb())
10970 trap_opcode = ARM::tTRAP;
10971 else
10972 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
10973
10974 BuildMI(BB: TrapBB, MIMD: dl, MCID: TII->get(Opcode: trap_opcode));
10975 DispatchBB->addSuccessor(Succ: TrapBB);
10976
10977 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10978 DispatchBB->addSuccessor(Succ: DispContBB);
10979
10980  // Insert the MBBs into the function.
10981 MF->insert(MBBI: MF->end(), MBB: DispatchBB);
10982 MF->insert(MBBI: MF->end(), MBB: DispContBB);
10983 MF->insert(MBBI: MF->end(), MBB: TrapBB);
10984
10985 // Insert code into the entry block that creates and registers the function
10986 // context.
10987 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10988
10989 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10990 PtrInfo: MachinePointerInfo::getFixedStack(MF&: *MF, FI),
10991 F: MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, Size: 4, BaseAlignment: Align(4));
10992
10993 MachineInstrBuilder MIB;
10994 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10995
10996 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10997 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10998
10999 // Add a register mask with no preserved registers. This results in all
11000 // registers being marked as clobbered. This can't work if the dispatch block
11001 // is in a Thumb1 function and is linked with ARM code which uses the FP
11002 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
11003 MIB.addRegMask(Mask: RI.getSjLjDispatchPreservedMask(MF: *MF));
11004
11005 bool IsPositionIndependent = isPositionIndependent();
11006 unsigned NumLPads = LPadList.size();
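  // The dispatch block loads the call-site index stored in the function
  // context (at offset 4), bounds-checks it against the number of landing pads
  // (branching to the trap block if it is out of range), and then indexes the
  // inline jump table to reach the selected landing pad.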
11007 if (Subtarget->isThumb2()) {
11008 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
11009 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
11010 .addFrameIndex(FI)
11011 .addImm(4)
11012 .addMemOperand(FIMMOLd)
11013 .add(predOps(ARMCC::AL));
11014
11015 if (NumLPads < 256) {
11016 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
11017 .addReg(NewVReg1)
11018 .addImm(LPadList.size())
11019 .add(predOps(ARMCC::AL));
11020 } else {
11021 Register VReg1 = MRI->createVirtualRegister(RegClass: TRC);
11022 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
11023 .addImm(NumLPads & 0xFFFF)
11024 .add(predOps(ARMCC::AL));
11025
11026 unsigned VReg2 = VReg1;
11027 if ((NumLPads & 0xFFFF0000) != 0) {
11028 VReg2 = MRI->createVirtualRegister(RegClass: TRC);
11029 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
11030 .addReg(VReg1)
11031 .addImm(NumLPads >> 16)
11032 .add(predOps(ARMCC::AL));
11033 }
11034
11035 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
11036 .addReg(NewVReg1)
11037 .addReg(VReg2)
11038 .add(predOps(ARMCC::AL));
11039 }
11040
11041 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11042 .addMBB(TrapBB)
11043 .addImm(ARMCC::HI)
11044 .addReg(ARM::CPSR);
11045
11046 Register NewVReg3 = MRI->createVirtualRegister(RegClass: TRC);
11047 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11048 .addJumpTableIndex(MJTI)
11049 .add(predOps(ARMCC::AL));
11050
11051 Register NewVReg4 = MRI->createVirtualRegister(RegClass: TRC);
11052 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11053 .addReg(NewVReg3, RegState::Kill)
11054 .addReg(NewVReg1)
11055 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
11056 .add(predOps(ARMCC::AL))
11057 .add(condCodeOp());
11058
11059 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11060 .addReg(NewVReg4, RegState::Kill)
11061 .addReg(NewVReg1)
11062 .addJumpTableIndex(MJTI);
11063 } else if (Subtarget->isThumb()) {
11064 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
11065 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11066 .addFrameIndex(FI)
11067 .addImm(1)
11068 .addMemOperand(FIMMOLd)
11069 .add(predOps(ARMCC::AL));
11070
11071 if (NumLPads < 256) {
11072 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11073 .addReg(NewVReg1)
11074 .addImm(NumLPads)
11075 .add(predOps(ARMCC::AL));
11076 } else {
11077 MachineConstantPool *ConstantPool = MF->getConstantPool();
11078 Type *Int32Ty = Type::getInt32Ty(C&: MF->getFunction().getContext());
11079 const Constant *C = ConstantInt::get(Ty: Int32Ty, V: NumLPads);
11080
11081 // MachineConstantPool wants an explicit alignment.
11082 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Ty: Int32Ty);
11083 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11084
11085 Register VReg1 = MRI->createVirtualRegister(RegClass: TRC);
11086 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11087 .addReg(VReg1, RegState::Define)
11088 .addConstantPoolIndex(Idx)
11089 .add(predOps(ARMCC::AL));
11090 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11091 .addReg(NewVReg1)
11092 .addReg(VReg1)
11093 .add(predOps(ARMCC::AL));
11094 }
11095
11096 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11097 .addMBB(TrapBB)
11098 .addImm(ARMCC::HI)
11099 .addReg(ARM::CPSR);
11100
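    // Scale the call-site index by 4 (each inline jump-table entry is a 4-byte
    // word), add the jump-table base, and load the entry. For position
    // independent code the loaded entry is relative to the table, so the base
    // is added back in before the indirect branch.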
11101 Register NewVReg2 = MRI->createVirtualRegister(RegClass: TRC);
11102 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11103 .addReg(ARM::CPSR, RegState::Define)
11104 .addReg(NewVReg1)
11105 .addImm(2)
11106 .add(predOps(ARMCC::AL));
11107
11108 Register NewVReg3 = MRI->createVirtualRegister(RegClass: TRC);
11109 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11110 .addJumpTableIndex(MJTI)
11111 .add(predOps(ARMCC::AL));
11112
11113 Register NewVReg4 = MRI->createVirtualRegister(RegClass: TRC);
11114 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11115 .addReg(ARM::CPSR, RegState::Define)
11116 .addReg(NewVReg2, RegState::Kill)
11117 .addReg(NewVReg3)
11118 .add(predOps(ARMCC::AL));
11119
11120 MachineMemOperand *JTMMOLd =
11121 MF->getMachineMemOperand(PtrInfo: MachinePointerInfo::getJumpTable(MF&: *MF),
11122 F: MachineMemOperand::MOLoad, Size: 4, BaseAlignment: Align(4));
11123
11124 Register NewVReg5 = MRI->createVirtualRegister(RegClass: TRC);
11125 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11126 .addReg(NewVReg4, RegState::Kill)
11127 .addImm(0)
11128 .addMemOperand(JTMMOLd)
11129 .add(predOps(ARMCC::AL));
11130
11131 unsigned NewVReg6 = NewVReg5;
11132 if (IsPositionIndependent) {
11133 NewVReg6 = MRI->createVirtualRegister(RegClass: TRC);
11134 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11135 .addReg(ARM::CPSR, RegState::Define)
11136 .addReg(NewVReg5, RegState::Kill)
11137 .addReg(NewVReg3)
11138 .add(predOps(ARMCC::AL));
11139 }
11140
11141 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11142 .addReg(NewVReg6, RegState::Kill)
11143 .addJumpTableIndex(MJTI);
11144 } else {
11145 Register NewVReg1 = MRI->createVirtualRegister(RegClass: TRC);
11146 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11147 .addFrameIndex(FI)
11148 .addImm(4)
11149 .addMemOperand(FIMMOLd)
11150 .add(predOps(ARMCC::AL));
11151
11152 if (NumLPads < 256) {
11153 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11154 .addReg(NewVReg1)
11155 .addImm(NumLPads)
11156 .add(predOps(ARMCC::AL));
11157 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(x: NumLPads)) {
11158 Register VReg1 = MRI->createVirtualRegister(RegClass: TRC);
11159 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11160 .addImm(NumLPads & 0xFFFF)
11161 .add(predOps(ARMCC::AL));
11162
11163 unsigned VReg2 = VReg1;
11164 if ((NumLPads & 0xFFFF0000) != 0) {
11165 VReg2 = MRI->createVirtualRegister(RegClass: TRC);
11166 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11167 .addReg(VReg1)
11168 .addImm(NumLPads >> 16)
11169 .add(predOps(ARMCC::AL));
11170 }
11171
11172 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11173 .addReg(NewVReg1)
11174 .addReg(VReg2)
11175 .add(predOps(ARMCC::AL));
11176 } else {
11177 MachineConstantPool *ConstantPool = MF->getConstantPool();
11178 Type *Int32Ty = Type::getInt32Ty(C&: MF->getFunction().getContext());
11179 const Constant *C = ConstantInt::get(Ty: Int32Ty, V: NumLPads);
11180
11181 // MachineConstantPool wants an explicit alignment.
11182 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Ty: Int32Ty);
11183 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11184
11185 Register VReg1 = MRI->createVirtualRegister(RegClass: TRC);
11186 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11187 .addReg(VReg1, RegState::Define)
11188 .addConstantPoolIndex(Idx)
11189 .addImm(0)
11190 .add(predOps(ARMCC::AL));
11191 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11192 .addReg(NewVReg1)
11193 .addReg(VReg1, RegState::Kill)
11194 .add(predOps(ARMCC::AL));
11195 }
11196
11197 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11198 .addMBB(TrapBB)
11199 .addImm(ARMCC::HI)
11200 .addReg(ARM::CPSR);
11201
11202 Register NewVReg3 = MRI->createVirtualRegister(RegClass: TRC);
11203 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11204 .addReg(NewVReg1)
11205 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
11206 .add(predOps(ARMCC::AL))
11207 .add(condCodeOp());
11208 Register NewVReg4 = MRI->createVirtualRegister(RegClass: TRC);
11209 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11210 .addJumpTableIndex(MJTI)
11211 .add(predOps(ARMCC::AL));
11212
11213 MachineMemOperand *JTMMOLd =
11214 MF->getMachineMemOperand(PtrInfo: MachinePointerInfo::getJumpTable(MF&: *MF),
11215 F: MachineMemOperand::MOLoad, Size: 4, BaseAlignment: Align(4));
11216 Register NewVReg5 = MRI->createVirtualRegister(RegClass: TRC);
11217 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11218 .addReg(NewVReg3, RegState::Kill)
11219 .addReg(NewVReg4)
11220 .addImm(0)
11221 .addMemOperand(JTMMOLd)
11222 .add(predOps(ARMCC::AL));
11223
11224 if (IsPositionIndependent) {
11225 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11226 .addReg(NewVReg5, RegState::Kill)
11227 .addReg(NewVReg4)
11228 .addJumpTableIndex(MJTI);
11229 } else {
11230 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11231 .addReg(NewVReg5, RegState::Kill)
11232 .addJumpTableIndex(MJTI);
11233 }
11234 }
11235
11236 // Add the jump table entries as successors to the MBB.
11237 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11238 for (MachineBasicBlock *CurMBB : LPadList) {
11239 if (SeenMBBs.insert(Ptr: CurMBB).second)
11240 DispContBB->addSuccessor(Succ: CurMBB);
11241 }
11242
11243 // N.B. the order the invoke BBs are processed in doesn't matter here.
11244 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11245 SmallVector<MachineBasicBlock*, 64> MBBLPads;
11246 for (MachineBasicBlock *BB : InvokeBBs) {
11247
11248 // Remove the landing pad successor from the invoke block and replace it
11249 // with the new dispatch block.
11250 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11251 while (!Successors.empty()) {
11252 MachineBasicBlock *SMBB = Successors.pop_back_val();
11253 if (SMBB->isEHPad()) {
11254 BB->removeSuccessor(Succ: SMBB);
11255 MBBLPads.push_back(Elt: SMBB);
11256 }
11257 }
11258
11259 BB->addSuccessor(Succ: DispatchBB, Prob: BranchProbability::getZero());
11260 BB->normalizeSuccProbs();
11261
11262 // Find the invoke call and mark all of the callee-saved registers as
11263 // 'implicit defined' so that they're spilled. This prevents code from
11264 // moving instructions to before the EH block, where they will never be
11265 // executed.
11266 for (MachineBasicBlock::reverse_iterator
11267 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11268 if (!II->isCall()) continue;
11269
11270 DenseMap<unsigned, bool> DefRegs;
11271 for (MachineInstr::mop_iterator
11272 OI = II->operands_begin(), OE = II->operands_end();
11273 OI != OE; ++OI) {
11274 if (!OI->isReg()) continue;
11275 DefRegs[OI->getReg()] = true;
11276 }
11277
11278 MachineInstrBuilder MIB(*MF, &*II);
11279
11280 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11281 unsigned Reg = SavedRegs[i];
11282 if (Subtarget->isThumb2() &&
11283 !ARM::tGPRRegClass.contains(Reg) &&
11284 !ARM::hGPRRegClass.contains(Reg))
11285 continue;
11286 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11287 continue;
11288 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11289 continue;
11290 if (!DefRegs[Reg])
11291 MIB.addReg(RegNo: Reg, flags: RegState::ImplicitDefine | RegState::Dead);
11292 }
11293
11294 break;
11295 }
11296 }
11297
11298 // Mark all former landing pads as non-landing pads. The dispatch is the only
11299 // landing pad now.
11300 for (MachineBasicBlock *MBBLPad : MBBLPads)
11301 MBBLPad->setIsEHPad(false);
11302
11303 // The instruction is gone now.
11304 MI.eraseFromParent();
11305}
11306
11307static
11308MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11309 for (MachineBasicBlock *S : MBB->successors())
11310 if (S != Succ)
11311 return S;
11312 llvm_unreachable("Expecting a BB with two successors!");
11313}
11314
11315/// Return the load opcode for a given load size. If the load size is >= 8, a
11316/// NEON opcode will be returned.
11317static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11318 if (LdSize >= 8)
11319 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11320 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11321 if (IsThumb1)
11322 return LdSize == 4 ? ARM::tLDRi
11323 : LdSize == 2 ? ARM::tLDRHi
11324 : LdSize == 1 ? ARM::tLDRBi : 0;
11325 if (IsThumb2)
11326 return LdSize == 4 ? ARM::t2LDR_POST
11327 : LdSize == 2 ? ARM::t2LDRH_POST
11328 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11329 return LdSize == 4 ? ARM::LDR_POST_IMM
11330 : LdSize == 2 ? ARM::LDRH_POST
11331 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11332}
11333
11334/// Return the store opcode for a given store size. If the store size is >= 8, a
11335/// NEON opcode will be returned.
11336static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11337 if (StSize >= 8)
11338 return StSize == 16 ? ARM::VST1q32wb_fixed
11339 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11340 if (IsThumb1)
11341 return StSize == 4 ? ARM::tSTRi
11342 : StSize == 2 ? ARM::tSTRHi
11343 : StSize == 1 ? ARM::tSTRBi : 0;
11344 if (IsThumb2)
11345 return StSize == 4 ? ARM::t2STR_POST
11346 : StSize == 2 ? ARM::t2STRH_POST
11347 : StSize == 1 ? ARM::t2STRB_POST : 0;
11348 return StSize == 4 ? ARM::STR_POST_IMM
11349 : StSize == 2 ? ARM::STRH_POST
11350 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11351}
11352
11353/// Emit a post-increment load operation with given size. The instructions
11354/// will be added to BB at Pos.
11355static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11356 const TargetInstrInfo *TII, const DebugLoc &dl,
11357 unsigned LdSize, unsigned Data, unsigned AddrIn,
11358 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11359 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11360 assert(LdOpc != 0 && "Should have a load opcode");
11361 if (LdSize >= 8) {
11362 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: LdOpc), DestReg: Data)
11363 .addReg(RegNo: AddrOut, flags: RegState::Define)
11364 .addReg(RegNo: AddrIn)
11365 .addImm(Val: 0)
11366 .add(MOs: predOps(Pred: ARMCC::AL));
11367 } else if (IsThumb1) {
11368 // load + update AddrIn
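    // (Thumb1 has no post-indexed load forms, so the address update is a
    // separate tADDi8.)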
11369 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: LdOpc), DestReg: Data)
11370 .addReg(RegNo: AddrIn)
11371 .addImm(Val: 0)
11372 .add(MOs: predOps(Pred: ARMCC::AL));
11373 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11374 .add(t1CondCodeOp())
11375 .addReg(AddrIn)
11376 .addImm(LdSize)
11377 .add(predOps(ARMCC::AL));
11378 } else if (IsThumb2) {
11379 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: LdOpc), DestReg: Data)
11380 .addReg(RegNo: AddrOut, flags: RegState::Define)
11381 .addReg(RegNo: AddrIn)
11382 .addImm(Val: LdSize)
11383 .add(MOs: predOps(Pred: ARMCC::AL));
11384 } else { // arm
11385 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: LdOpc), DestReg: Data)
11386 .addReg(RegNo: AddrOut, flags: RegState::Define)
11387 .addReg(RegNo: AddrIn)
11388 .addReg(RegNo: 0)
11389 .addImm(Val: LdSize)
11390 .add(MOs: predOps(Pred: ARMCC::AL));
11391 }
11392}
11393
11394/// Emit a post-increment store operation with given size. The instructions
11395/// will be added to BB at Pos.
11396static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11397 const TargetInstrInfo *TII, const DebugLoc &dl,
11398 unsigned StSize, unsigned Data, unsigned AddrIn,
11399 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11400 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11401 assert(StOpc != 0 && "Should have a store opcode");
11402 if (StSize >= 8) {
11403 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: StOpc), DestReg: AddrOut)
11404 .addReg(RegNo: AddrIn)
11405 .addImm(Val: 0)
11406 .addReg(RegNo: Data)
11407 .add(MOs: predOps(Pred: ARMCC::AL));
11408 } else if (IsThumb1) {
11409 // store + update AddrIn
11410 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: StOpc))
11411 .addReg(RegNo: Data)
11412 .addReg(RegNo: AddrIn)
11413 .addImm(Val: 0)
11414 .add(MOs: predOps(Pred: ARMCC::AL));
11415 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11416 .add(t1CondCodeOp())
11417 .addReg(AddrIn)
11418 .addImm(StSize)
11419 .add(predOps(ARMCC::AL));
11420 } else if (IsThumb2) {
11421 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: StOpc), DestReg: AddrOut)
11422 .addReg(RegNo: Data)
11423 .addReg(RegNo: AddrIn)
11424 .addImm(Val: StSize)
11425 .add(MOs: predOps(Pred: ARMCC::AL));
11426 } else { // arm
11427 BuildMI(BB&: *BB, I: Pos, MIMD: dl, MCID: TII->get(Opcode: StOpc), DestReg: AddrOut)
11428 .addReg(RegNo: Data)
11429 .addReg(RegNo: AddrIn)
11430 .addReg(RegNo: 0)
11431 .addImm(Val: StSize)
11432 .add(MOs: predOps(Pred: ARMCC::AL));
11433 }
11434}
11435
11436MachineBasicBlock *
11437ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11438 MachineBasicBlock *BB) const {
11439  // This pseudo instruction has 4 operands: dst, src, size, alignment.
11440  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold();
11441  // otherwise we generate unrolled scalar copies.
11442 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11443 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11444 MachineFunction::iterator It = ++BB->getIterator();
11445
11446 Register dest = MI.getOperand(i: 0).getReg();
11447 Register src = MI.getOperand(i: 1).getReg();
11448 unsigned SizeVal = MI.getOperand(i: 2).getImm();
11449 unsigned Alignment = MI.getOperand(i: 3).getImm();
11450 DebugLoc dl = MI.getDebugLoc();
11451
11452 MachineFunction *MF = BB->getParent();
11453 MachineRegisterInfo &MRI = MF->getRegInfo();
11454 unsigned UnitSize = 0;
11455 const TargetRegisterClass *TRC = nullptr;
11456 const TargetRegisterClass *VecTRC = nullptr;
11457
11458 bool IsThumb1 = Subtarget->isThumb1Only();
11459 bool IsThumb2 = Subtarget->isThumb2();
11460 bool IsThumb = Subtarget->isThumb();
11461
11462 if (Alignment & 1) {
11463 UnitSize = 1;
11464 } else if (Alignment & 2) {
11465 UnitSize = 2;
11466 } else {
11467 // Check whether we can use NEON instructions.
11468 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11469 Subtarget->hasNEON()) {
11470 if ((Alignment % 16 == 0) && SizeVal >= 16)
11471 UnitSize = 16;
11472 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11473 UnitSize = 8;
11474 }
11475 // Can't use NEON instructions.
11476 if (UnitSize == 0)
11477 UnitSize = 4;
11478 }
11479
11480 // Select the correct opcode and register class for unit size load/store
11481 bool IsNeon = UnitSize >= 8;
11482 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11483 if (IsNeon)
11484 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11485 : UnitSize == 8 ? &ARM::DPRRegClass
11486 : nullptr;
11487
11488 unsigned BytesLeft = SizeVal % UnitSize;
11489 unsigned LoopSize = SizeVal - BytesLeft;
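  // LoopSize is SizeVal rounded down to a multiple of UnitSize; the remaining
  // BytesLeft (< UnitSize) bytes are copied one byte at a time afterwards.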
11490
11491 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11492 // Use LDR and STR to copy.
11493 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11494 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11495 unsigned srcIn = src;
11496 unsigned destIn = dest;
11497 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11498 Register srcOut = MRI.createVirtualRegister(RegClass: TRC);
11499 Register destOut = MRI.createVirtualRegister(RegClass: TRC);
11500 Register scratch = MRI.createVirtualRegister(RegClass: IsNeon ? VecTRC : TRC);
11501 emitPostLd(BB, Pos: MI, TII, dl, LdSize: UnitSize, Data: scratch, AddrIn: srcIn, AddrOut: srcOut,
11502 IsThumb1, IsThumb2);
11503 emitPostSt(BB, Pos: MI, TII, dl, StSize: UnitSize, Data: scratch, AddrIn: destIn, AddrOut: destOut,
11504 IsThumb1, IsThumb2);
11505 srcIn = srcOut;
11506 destIn = destOut;
11507 }
11508
11509 // Handle the leftover bytes with LDRB and STRB.
11510 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11511 // [destOut] = STRB_POST(scratch, destIn, 1)
11512 for (unsigned i = 0; i < BytesLeft; i++) {
11513 Register srcOut = MRI.createVirtualRegister(RegClass: TRC);
11514 Register destOut = MRI.createVirtualRegister(RegClass: TRC);
11515 Register scratch = MRI.createVirtualRegister(RegClass: TRC);
11516 emitPostLd(BB, Pos: MI, TII, dl, LdSize: 1, Data: scratch, AddrIn: srcIn, AddrOut: srcOut,
11517 IsThumb1, IsThumb2);
11518 emitPostSt(BB, Pos: MI, TII, dl, StSize: 1, Data: scratch, AddrIn: destIn, AddrOut: destOut,
11519 IsThumb1, IsThumb2);
11520 srcIn = srcOut;
11521 destIn = destOut;
11522 }
11523 MI.eraseFromParent(); // The instruction is gone now.
11524 return BB;
11525 }
11526
11527 // Expand the pseudo op to a loop.
11528 // thisMBB:
11529 // ...
11530 // movw varEnd, # --> with thumb2
11531 // movt varEnd, #
11532 // ldrcp varEnd, idx --> without thumb2
11533 // fallthrough --> loopMBB
11534 // loopMBB:
11535 // PHI varPhi, varEnd, varLoop
11536 // PHI srcPhi, src, srcLoop
11537 // PHI destPhi, dst, destLoop
11538 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11539 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11540 // subs varLoop, varPhi, #UnitSize
11541 // bne loopMBB
11542 // fallthrough --> exitMBB
11543 // exitMBB:
11544 // epilogue to handle left-over bytes
11545 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11546 // [destOut] = STRB_POST(scratch, destLoop, 1)
11547 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
11548 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
11549 MF->insert(MBBI: It, MBB: loopMBB);
11550 MF->insert(MBBI: It, MBB: exitMBB);
11551
11552 // Set the call frame size on entry to the new basic blocks.
11553 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11554 loopMBB->setCallFrameSize(CallFrameSize);
11555 exitMBB->setCallFrameSize(CallFrameSize);
11556
11557 // Transfer the remainder of BB and its successor edges to exitMBB.
11558 exitMBB->splice(Where: exitMBB->begin(), Other: BB,
11559 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
11560 exitMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
11561
11562 // Load an immediate to varEnd.
11563 Register varEnd = MRI.createVirtualRegister(RegClass: TRC);
11564 if (Subtarget->useMovt()) {
11565 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11566 varEnd)
11567 .addImm(LoopSize);
11568 } else if (Subtarget->genExecuteOnly()) {
11569 assert(IsThumb && "Non-thumb expected to have used movt");
11570 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11571 } else {
11572 MachineConstantPool *ConstantPool = MF->getConstantPool();
11573 Type *Int32Ty = Type::getInt32Ty(C&: MF->getFunction().getContext());
11574 const Constant *C = ConstantInt::get(Ty: Int32Ty, V: LoopSize);
11575
11576 // MachineConstantPool wants an explicit alignment.
11577 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Ty: Int32Ty);
11578 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11579 MachineMemOperand *CPMMO =
11580 MF->getMachineMemOperand(PtrInfo: MachinePointerInfo::getConstantPool(MF&: *MF),
11581 F: MachineMemOperand::MOLoad, Size: 4, BaseAlignment: Align(4));
11582
11583 if (IsThumb)
11584 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11585 .addReg(varEnd, RegState::Define)
11586 .addConstantPoolIndex(Idx)
11587 .add(predOps(ARMCC::AL))
11588 .addMemOperand(CPMMO);
11589 else
11590 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11591 .addReg(varEnd, RegState::Define)
11592 .addConstantPoolIndex(Idx)
11593 .addImm(0)
11594 .add(predOps(ARMCC::AL))
11595 .addMemOperand(CPMMO);
11596 }
11597 BB->addSuccessor(Succ: loopMBB);
11598
11599 // Generate the loop body:
11600 // varPhi = PHI(varLoop, varEnd)
11601 // srcPhi = PHI(srcLoop, src)
11602 // destPhi = PHI(destLoop, dst)
11603 MachineBasicBlock *entryBB = BB;
11604 BB = loopMBB;
11605 Register varLoop = MRI.createVirtualRegister(RegClass: TRC);
11606 Register varPhi = MRI.createVirtualRegister(RegClass: TRC);
11607 Register srcLoop = MRI.createVirtualRegister(RegClass: TRC);
11608 Register srcPhi = MRI.createVirtualRegister(RegClass: TRC);
11609 Register destLoop = MRI.createVirtualRegister(RegClass: TRC);
11610 Register destPhi = MRI.createVirtualRegister(RegClass: TRC);
11611
11612 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11613 .addReg(varLoop).addMBB(loopMBB)
11614 .addReg(varEnd).addMBB(entryBB);
11615 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11616 .addReg(srcLoop).addMBB(loopMBB)
11617 .addReg(src).addMBB(entryBB);
11618 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11619 .addReg(destLoop).addMBB(loopMBB)
11620 .addReg(dest).addMBB(entryBB);
11621
11622 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11623  // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11624 Register scratch = MRI.createVirtualRegister(RegClass: IsNeon ? VecTRC : TRC);
11625 emitPostLd(BB, Pos: BB->end(), TII, dl, LdSize: UnitSize, Data: scratch, AddrIn: srcPhi, AddrOut: srcLoop,
11626 IsThumb1, IsThumb2);
11627 emitPostSt(BB, Pos: BB->end(), TII, dl, StSize: UnitSize, Data: scratch, AddrIn: destPhi, AddrOut: destLoop,
11628 IsThumb1, IsThumb2);
11629
11630 // Decrement loop variable by UnitSize.
11631 if (IsThumb1) {
11632 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11633 .add(t1CondCodeOp())
11634 .addReg(varPhi)
11635 .addImm(UnitSize)
11636 .add(predOps(ARMCC::AL));
11637 } else {
11638 MachineInstrBuilder MIB =
11639 BuildMI(*BB, BB->end(), dl,
11640 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11641 MIB.addReg(RegNo: varPhi)
11642 .addImm(Val: UnitSize)
11643 .add(MOs: predOps(Pred: ARMCC::AL))
11644 .add(MO: condCodeOp());
11645 MIB->getOperand(5).setReg(ARM::CPSR);
11646 MIB->getOperand(i: 5).setIsDef(true);
11647 }
11648 BuildMI(*BB, BB->end(), dl,
11649 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11650 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11651
11652 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11653 BB->addSuccessor(Succ: loopMBB);
11654 BB->addSuccessor(Succ: exitMBB);
11655
11656 // Add epilogue to handle BytesLeft.
11657 BB = exitMBB;
11658 auto StartOfExit = exitMBB->begin();
11659
11660 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11661 // [destOut] = STRB_POST(scratch, destLoop, 1)
11662 unsigned srcIn = srcLoop;
11663 unsigned destIn = destLoop;
11664 for (unsigned i = 0; i < BytesLeft; i++) {
11665 Register srcOut = MRI.createVirtualRegister(RegClass: TRC);
11666 Register destOut = MRI.createVirtualRegister(RegClass: TRC);
11667 Register scratch = MRI.createVirtualRegister(RegClass: TRC);
11668 emitPostLd(BB, Pos: StartOfExit, TII, dl, LdSize: 1, Data: scratch, AddrIn: srcIn, AddrOut: srcOut,
11669 IsThumb1, IsThumb2);
11670 emitPostSt(BB, Pos: StartOfExit, TII, dl, StSize: 1, Data: scratch, AddrIn: destIn, AddrOut: destOut,
11671 IsThumb1, IsThumb2);
11672 srcIn = srcOut;
11673 destIn = destOut;
11674 }
11675
11676 MI.eraseFromParent(); // The instruction is gone now.
11677 return BB;
11678}
11679
11680MachineBasicBlock *
11681ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11682 MachineBasicBlock *MBB) const {
11683 const TargetMachine &TM = getTargetMachine();
11684 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11685 DebugLoc DL = MI.getDebugLoc();
11686
11687 assert(Subtarget->isTargetWindows() &&
11688 "__chkstk is only supported on Windows");
11689 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11690
11691 // __chkstk takes the number of words to allocate on the stack in R4, and
11692  // returns the stack adjustment in bytes in R4. This will not
11693  // clobber any other registers (other than the obvious lr).
11694 //
11695 // Although, technically, IP should be considered a register which may be
11696 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11697 // thumb-2 environment, so there is no interworking required. As a result, we
11698 // do not expect a veneer to be emitted by the linker, clobbering IP.
11699 //
11700 // Each module receives its own copy of __chkstk, so no import thunk is
11701 // required, again, ensuring that IP is not clobbered.
11702 //
11703 // Finally, although some linkers may theoretically provide a trampoline for
11704 // out of range calls (which is quite common due to a 32M range limitation of
11705 // branches for Thumb), we can generate the long-call version via
11706 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11707 // IP.
11708
11709 switch (TM.getCodeModel()) {
11710 case CodeModel::Tiny:
11711 llvm_unreachable("Tiny code model not available on ARM.");
11712 case CodeModel::Small:
11713 case CodeModel::Medium:
11714 case CodeModel::Kernel:
11715 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11716 .add(predOps(ARMCC::AL))
11717 .addExternalSymbol("__chkstk")
11718 .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
11719 .addReg(ARM::R4, RegState::Implicit | RegState::Define)
11720 .addReg(ARM::R12,
11721 RegState::Implicit | RegState::Define | RegState::Dead)
11722 .addReg(ARM::CPSR,
11723 RegState::Implicit | RegState::Define | RegState::Dead);
11724 break;
11725 case CodeModel::Large: {
11726 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11727 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11728
11729 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11730 .addExternalSymbol("__chkstk");
11731 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent())))
11732 .add(predOps(ARMCC::AL))
11733 .addReg(Reg, RegState::Kill)
11734 .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
11735 .addReg(ARM::R4, RegState::Implicit | RegState::Define)
11736 .addReg(ARM::R12,
11737 RegState::Implicit | RegState::Define | RegState::Dead)
11738 .addReg(ARM::CPSR,
11739 RegState::Implicit | RegState::Define | RegState::Dead);
11740 break;
11741 }
11742 }
11743
11744 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11745 .addReg(ARM::SP, RegState::Kill)
11746 .addReg(ARM::R4, RegState::Kill)
11747 .setMIFlags(MachineInstr::FrameSetup)
11748 .add(predOps(ARMCC::AL))
11749 .add(condCodeOp());
11750
11751 MI.eraseFromParent();
11752 return MBB;
11753}
11754
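/// Expand the WIN__DBZCHK pseudo: compare the divisor against zero and branch
/// to a trap block that executes __brkdiv0 when it is zero; otherwise continue
/// in the new continuation block.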
11755MachineBasicBlock *
11756ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11757 MachineBasicBlock *MBB) const {
11758 DebugLoc DL = MI.getDebugLoc();
11759 MachineFunction *MF = MBB->getParent();
11760 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11761
11762 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11763 MF->insert(MBBI: ++MBB->getIterator(), MBB: ContBB);
11764 ContBB->splice(Where: ContBB->begin(), Other: MBB,
11765 From: std::next(x: MachineBasicBlock::iterator(MI)), To: MBB->end());
11766 ContBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
11767 MBB->addSuccessor(Succ: ContBB);
11768
11769 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11770 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11771 MF->push_back(MBB: TrapBB);
11772 MBB->addSuccessor(Succ: TrapBB);
11773
11774 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11775 .addReg(MI.getOperand(0).getReg())
11776 .addImm(0)
11777 .add(predOps(ARMCC::AL));
11778 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11779 .addMBB(TrapBB)
11780 .addImm(ARMCC::EQ)
11781 .addReg(ARM::CPSR);
11782
11783 MI.eraseFromParent();
11784 return ContBB;
11785}
11786
11787// The CPSR operand of SelectItr might be missing a kill marker
11788// because there were multiple uses of CPSR, and ISel didn't know
11789// which to mark. Figure out whether SelectItr should have had a
11790// kill marker, and set it if it should. Returns the correct kill
11791// marker value.
11792static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11793 MachineBasicBlock* BB,
11794 const TargetRegisterInfo* TRI) {
11795 // Scan forward through BB for a use/def of CPSR.
11796 MachineBasicBlock::iterator miI(std::next(x: SelectItr));
11797 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11798 const MachineInstr& mi = *miI;
11799 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11800 return false;
11801 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11802 break; // Should have kill-flag - update below.
11803 }
11804
11805 // If we hit the end of the block, check whether CPSR is live into a
11806 // successor.
11807 if (miI == BB->end()) {
11808 for (MachineBasicBlock *Succ : BB->successors())
11809 if (Succ->isLiveIn(ARM::CPSR))
11810 return false;
11811 }
11812
11813 // We found a def, or hit the end of the basic block and CPSR wasn't live
11814 // out. SelectMI should have a kill flag on CPSR.
11815 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11816 return true;
11817}
11818
11819/// Adds logic in the loop entry MBB to calculate the loop iteration count and
11820/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11821static Register genTPEntry(MachineBasicBlock *TpEntry,
11822 MachineBasicBlock *TpLoopBody,
11823 MachineBasicBlock *TpExit, Register OpSizeReg,
11824 const TargetInstrInfo *TII, DebugLoc Dl,
11825 MachineRegisterInfo &MRI) {
11826 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
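  // For example, n = 100 gives (100 + 15) >> 4 = 7 iterations; the final
  // iteration is tail-predicated down to the 4 remaining bytes.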
11827 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11828 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11829 .addUse(OpSizeReg)
11830 .addImm(15)
11831 .add(predOps(ARMCC::AL))
11832 .addReg(0);
11833
11834 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11835 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11836 .addUse(AddDestReg, RegState::Kill)
11837 .addImm(4)
11838 .add(predOps(ARMCC::AL))
11839 .addReg(0);
11840
11841 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11842 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11843 .addUse(LsrDestReg, RegState::Kill);
11844
11845 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11846 .addUse(TotalIterationsReg)
11847 .addMBB(TpExit);
11848
11849 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11850 .addMBB(TpLoopBody)
11851 .add(predOps(ARMCC::AL));
11852
11853 return TotalIterationsReg;
11854}
11855
11856/// Adds logic in the loop body MBB to generate MVE_VCTP, t2LoopDec and
11857/// t2LoopEnd. These are used by later passes to generate tail-predicated
11858/// loops.
11859static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11860 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11861 const TargetInstrInfo *TII, DebugLoc Dl,
11862 MachineRegisterInfo &MRI, Register OpSrcReg,
11863 Register OpDestReg, Register ElementCountReg,
11864 Register TotalIterationsReg, bool IsMemcpy) {
11865  // First insert 4 PHI nodes: the current Src pointer (if memcpy), the current
11866  // Dest pointer, the loop iteration counter, and the predication counter.
11867
11868 Register SrcPhiReg, CurrSrcReg;
11869 if (IsMemcpy) {
11870 // Current position in the src array
11871 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11872 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11873 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11874 .addUse(OpSrcReg)
11875 .addMBB(TpEntry)
11876 .addUse(CurrSrcReg)
11877 .addMBB(TpLoopBody);
11878 }
11879
11880 // Current position in the dest array
11881 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11882 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11883 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11884 .addUse(OpDestReg)
11885 .addMBB(TpEntry)
11886 .addUse(CurrDestReg)
11887 .addMBB(TpLoopBody);
11888
11889 // Current loop counter
11890 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11891 Register RemainingLoopIterationsReg =
11892 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11893 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11894 .addUse(TotalIterationsReg)
11895 .addMBB(TpEntry)
11896 .addUse(RemainingLoopIterationsReg)
11897 .addMBB(TpLoopBody);
11898
11899 // Predication counter
11900 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11901 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11902 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11903 .addUse(ElementCountReg)
11904 .addMBB(TpEntry)
11905 .addUse(RemainingElementsReg)
11906 .addMBB(TpLoopBody);
11907
11908 // Pass predication counter to VCTP
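  // MVE_VCTP8 produces a predicate with the low min(remaining elements, 16)
  // byte lanes enabled, which the predicated VLDRB/VSTRB below use to mask the
  // final, partial iteration.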
11909 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11910 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11911 .addUse(PredCounterPhiReg)
11912 .addImm(ARMVCC::None)
11913 .addReg(0)
11914 .addReg(0);
11915
11916 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11917 .addUse(PredCounterPhiReg)
11918 .addImm(16)
11919 .add(predOps(ARMCC::AL))
11920 .addReg(0);
11921
11922 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11923 Register SrcValueReg;
11924 if (IsMemcpy) {
11925 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11926 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11927 .addDef(CurrSrcReg)
11928 .addDef(SrcValueReg)
11929 .addReg(SrcPhiReg)
11930 .addImm(16)
11931 .addImm(ARMVCC::Then)
11932 .addUse(VccrReg)
11933 .addReg(0);
11934 } else
11935 SrcValueReg = OpSrcReg;
11936
11937 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11938 .addDef(CurrDestReg)
11939 .addUse(SrcValueReg)
11940 .addReg(DestPhiReg)
11941 .addImm(16)
11942 .addImm(ARMVCC::Then)
11943 .addUse(VccrReg)
11944 .addReg(0);
11945
11946  // Add the pseudo instructions for decrementing the loop counter and marking
11947  // the end: t2LoopDec and t2LoopEnd.
11948 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11949 .addUse(LoopCounterPhiReg)
11950 .addImm(1);
11951
11952 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11953 .addUse(RemainingLoopIterationsReg)
11954 .addMBB(TpLoopBody);
11955
11956 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11957 .addMBB(TpExit)
11958 .add(predOps(ARMCC::AL));
11959}
11960
11961MachineBasicBlock *
11962ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
11963 MachineBasicBlock *BB) const {
11964 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11965 DebugLoc dl = MI.getDebugLoc();
11966 bool isThumb2 = Subtarget->isThumb2();
11967 switch (MI.getOpcode()) {
11968 default: {
11969 MI.print(OS&: errs());
11970 llvm_unreachable("Unexpected instr type to insert");
11971 }
11972
11973 // Thumb1 post-indexed loads are really just single-register LDMs.
11974 case ARM::tLDR_postidx: {
11975 MachineOperand Def(MI.getOperand(i: 1));
11976 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11977 .add(Def) // Rn_wb
11978 .add(MI.getOperand(2)) // Rn
11979 .add(MI.getOperand(3)) // PredImm
11980 .add(MI.getOperand(4)) // PredReg
11981 .add(MI.getOperand(0)) // Rt
11982 .cloneMemRefs(MI);
11983 MI.eraseFromParent();
11984 return BB;
11985 }
11986
11987 case ARM::MVE_MEMCPYLOOPINST:
11988 case ARM::MVE_MEMSETLOOPINST: {
11989
11990    // The transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST
11991    // pseudos into a Tail Predicated (TP) loop. It adds the instructions to
11992    // calculate the iteration count (= ceil(size_in_bytes / 16)) in the TP
11993    // entry block and adds the relevant instructions in the TP loop body for
11994    // generation of a WLSTP loop.
11995
11996 // Below is relevant portion of the CFG after the transformation.
11997 // The Machine Basic Blocks are shown along with branch conditions (in
11998 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11999 // portion of the CFG and may not necessarily be the entry/exit of the
12000 // function.
12001
12002 // (Relevant) CFG after transformation:
12003 // TP entry MBB
12004 // |
12005 // |-----------------|
12006 // (n <= 0) (n > 0)
12007 // | |
12008 // | TP loop Body MBB<--|
12009 // | | |
12010 // \ |___________|
12011 // \ /
12012 // TP exit MBB
12013
12014 MachineFunction *MF = BB->getParent();
12015 MachineFunctionProperties &Properties = MF->getProperties();
12016 MachineRegisterInfo &MRI = MF->getRegInfo();
12017
12018 Register OpDestReg = MI.getOperand(i: 0).getReg();
12019 Register OpSrcReg = MI.getOperand(i: 1).getReg();
12020 Register OpSizeReg = MI.getOperand(i: 2).getReg();
12021
12022 // Allocate the required MBBs and add to parent function.
12023 MachineBasicBlock *TpEntry = BB;
12024 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12025 MachineBasicBlock *TpExit;
12026
12027 MF->push_back(MBB: TpLoopBody);
12028
12029 // If any instructions are present in the current block after
12030 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12031 // move the instructions into the newly created exit block. If there are no
12032 // instructions add an explicit branch to the FallThrough block and then
12033 // split.
12034 //
12035 // The split is required for two reasons:
12036    // 1) A terminator (t2WhileLoopStart) will be placed at that site.
12037 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12038 // need to be updated. splitAt() already handles this.
12039 TpExit = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false);
12040 if (TpExit == BB) {
12041 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12042 "block containing memcpy/memset Pseudo");
12043 TpExit = BB->getFallThrough();
12044 BuildMI(BB, dl, TII->get(ARM::t2B))
12045 .addMBB(TpExit)
12046 .add(predOps(ARMCC::AL));
12047 TpExit = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false);
12048 }
12049
12050 // Add logic for iteration count
12051 Register TotalIterationsReg =
12052 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, Dl: dl, MRI);
12053
12054 // Add the vectorized (and predicated) loads/store instructions
12055 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12056 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, Dl: dl, MRI, OpSrcReg,
12057 OpDestReg, ElementCountReg: OpSizeReg, TotalIterationsReg, IsMemcpy);
12058
12059 // Required to avoid conflict with the MachineVerifier during testing.
12060 Properties.reset(P: MachineFunctionProperties::Property::NoPHIs);
12061
12062 // Connect the blocks
12063 TpEntry->addSuccessor(Succ: TpLoopBody);
12064 TpLoopBody->addSuccessor(Succ: TpLoopBody);
12065 TpLoopBody->addSuccessor(Succ: TpExit);
12066
12067 // Reorder for a more natural layout
12068 TpLoopBody->moveAfter(NewBefore: TpEntry);
12069 TpExit->moveAfter(NewBefore: TpLoopBody);
12070
12071    // Finally, remove the memcpy/memset pseudo instruction.
12072 MI.eraseFromParent();
12073
12074 // Return the exit block as it may contain other instructions requiring a
12075 // custom inserter
12076 return TpExit;
12077 }
12078
12079  // The Thumb2 pre-indexed stores have the same MI operands; they are just
12080  // defined differently in the .td files than in the isel patterns, so
12081  // they need pseudos.
12082 case ARM::t2STR_preidx:
12083 MI.setDesc(TII->get(ARM::t2STR_PRE));
12084 return BB;
12085 case ARM::t2STRB_preidx:
12086 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12087 return BB;
12088 case ARM::t2STRH_preidx:
12089 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12090 return BB;
12091
12092 case ARM::STRi_preidx:
12093 case ARM::STRBi_preidx: {
12094 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12095 : ARM::STRB_PRE_IMM;
12096 // Decode the offset.
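    // Addrmode2 packs the add/sub flag and the offset magnitude into a single
    // immediate; unpack them and negate the offset for a subtract.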
12097 unsigned Offset = MI.getOperand(i: 4).getImm();
12098 bool isSub = ARM_AM::getAM2Op(AM2Opc: Offset) == ARM_AM::sub;
12099 Offset = ARM_AM::getAM2Offset(AM2Opc: Offset);
12100 if (isSub)
12101 Offset = -Offset;
12102
12103 MachineMemOperand *MMO = *MI.memoperands_begin();
12104 BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: NewOpc))
12105 .add(MO: MI.getOperand(i: 0)) // Rn_wb
12106 .add(MO: MI.getOperand(i: 1)) // Rt
12107 .add(MO: MI.getOperand(i: 2)) // Rn
12108 .addImm(Val: Offset) // offset (skip GPR==zero_reg)
12109 .add(MO: MI.getOperand(i: 5)) // pred
12110 .add(MO: MI.getOperand(i: 6))
12111 .addMemOperand(MMO);
12112 MI.eraseFromParent();
12113 return BB;
12114 }
12115 case ARM::STRr_preidx:
12116 case ARM::STRBr_preidx:
12117 case ARM::STRH_preidx: {
12118 unsigned NewOpc;
12119 switch (MI.getOpcode()) {
12120 default: llvm_unreachable("unexpected opcode!");
12121 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12122 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12123 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12124 }
12125 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: dl, MCID: TII->get(Opcode: NewOpc));
12126 for (const MachineOperand &MO : MI.operands())
12127 MIB.add(MO);
12128 MI.eraseFromParent();
12129 return BB;
12130 }
12131
12132 case ARM::tMOVCCr_pseudo: {
12133 // To "insert" a SELECT_CC instruction, we actually have to insert the
12134 // diamond control-flow pattern. The incoming instruction knows the
12135 // destination vreg to set, the condition code register to branch on, the
12136 // true/false values to select between, and a branch opcode to use.
12137 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12138 MachineFunction::iterator It = ++BB->getIterator();
12139
12140 // thisMBB:
12141 // ...
12142 // TrueVal = ...
12143 // cmpTY ccX, r1, r2
12144 // bCC copy1MBB
12145 // fallthrough --> copy0MBB
12146 MachineBasicBlock *thisMBB = BB;
12147 MachineFunction *F = BB->getParent();
12148 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
12149 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(BB: LLVM_BB);
12150 F->insert(MBBI: It, MBB: copy0MBB);
12151 F->insert(MBBI: It, MBB: sinkMBB);
12152
12153 // Set the call frame size on entry to the new basic blocks.
12154 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12155 copy0MBB->setCallFrameSize(CallFrameSize);
12156 sinkMBB->setCallFrameSize(CallFrameSize);
12157
12158 // Check whether CPSR is live past the tMOVCCr_pseudo.
12159 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12160 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12161 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12162 copy0MBB->addLiveIn(ARM::CPSR);
12163 sinkMBB->addLiveIn(ARM::CPSR);
12164 }
12165
12166 // Transfer the remainder of BB and its successor edges to sinkMBB.
12167 sinkMBB->splice(Where: sinkMBB->begin(), Other: BB,
12168 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
12169 sinkMBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
12170
12171 BB->addSuccessor(Succ: copy0MBB);
12172 BB->addSuccessor(Succ: sinkMBB);
12173
12174 BuildMI(BB, dl, TII->get(ARM::tBcc))
12175 .addMBB(sinkMBB)
12176 .addImm(MI.getOperand(3).getImm())
12177 .addReg(MI.getOperand(4).getReg());
12178
12179 // copy0MBB:
12180 // %FalseValue = ...
12181 // # fallthrough to sinkMBB
12182 BB = copy0MBB;
12183
12184 // Update machine-CFG edges
12185 BB->addSuccessor(Succ: sinkMBB);
12186
12187 // sinkMBB:
12188 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12189 // ...
12190 BB = sinkMBB;
12191 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12192 .addReg(MI.getOperand(1).getReg())
12193 .addMBB(copy0MBB)
12194 .addReg(MI.getOperand(2).getReg())
12195 .addMBB(thisMBB);
12196
12197 MI.eraseFromParent(); // The pseudo instruction is gone now.
12198 return BB;
12199 }
12200
12201 case ARM::BCCi64:
12202 case ARM::BCCZi64: {
12203 // If there is an unconditional branch to the other successor, remove it.
12204 BB->erase(I: std::next(x: MachineBasicBlock::iterator(MI)), E: BB->end());
12205
12206 // Compare both parts that make up the double comparison separately for
12207 // equality.
12208 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12209
12210 Register LHS1 = MI.getOperand(i: 1).getReg();
12211 Register LHS2 = MI.getOperand(i: 2).getReg();
12212 if (RHSisZero) {
12213 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12214 .addReg(LHS1)
12215 .addImm(0)
12216 .add(predOps(ARMCC::AL));
12217 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12218 .addReg(LHS2).addImm(0)
12219 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12220 } else {
12221 Register RHS1 = MI.getOperand(i: 3).getReg();
12222 Register RHS2 = MI.getOperand(i: 4).getReg();
12223 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12224 .addReg(LHS1)
12225 .addReg(RHS1)
12226 .add(predOps(ARMCC::AL));
12227 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12228 .addReg(LHS2).addReg(RHS2)
12229 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12230 }
12231
12232 MachineBasicBlock *destMBB = MI.getOperand(i: RHSisZero ? 3 : 5).getMBB();
12233 MachineBasicBlock *exitMBB = OtherSucc(MBB: BB, Succ: destMBB);
12234 if (MI.getOperand(i: 0).getImm() == ARMCC::NE)
12235 std::swap(a&: destMBB, b&: exitMBB);
12236
12237 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12238 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12239 if (isThumb2)
12240 BuildMI(BB, dl, TII->get(ARM::t2B))
12241 .addMBB(exitMBB)
12242 .add(predOps(ARMCC::AL));
12243 else
12244 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12245
12246 MI.eraseFromParent(); // The pseudo instruction is gone now.
12247 return BB;
12248 }
12249
12250 case ARM::Int_eh_sjlj_setjmp:
12251 case ARM::Int_eh_sjlj_setjmp_nofp:
12252 case ARM::tInt_eh_sjlj_setjmp:
12253 case ARM::t2Int_eh_sjlj_setjmp:
12254 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12255 return BB;
12256
12257 case ARM::Int_eh_sjlj_setup_dispatch:
12258 EmitSjLjDispatchBlock(MI, MBB: BB);
12259 return BB;
12260
12261 case ARM::ABS:
12262 case ARM::t2ABS: {
12263 // To insert an ABS instruction, we have to insert the
12264 // diamond control-flow pattern. The incoming instruction knows the
12265 // source vreg to test against 0, the destination vreg to set,
12266 // the condition code register to branch on, the
12267 // true/false values to select between, and a branch opcode to use.
12268 // It transforms
12269 // V1 = ABS V0
12270 // into
12271 // V2 = MOVS V0
12272 // BCC (branch to SinkBB if V0 >= 0)
12273 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
12274 // SinkBB: V1 = PHI(V2, V3)
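// After if-conversion this typically ends up as the usual ARM abs idiom,
// roughly:  cmp rX, #0 ; rsbmi rX, rX, #0   (a sketch, not the exact encoding).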
12275 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12276 MachineFunction::iterator BBI = ++BB->getIterator();
12277 MachineFunction *Fn = BB->getParent();
12278 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(BB: LLVM_BB);
12279 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(BB: LLVM_BB);
12280 Fn->insert(MBBI: BBI, MBB: RSBBB);
12281 Fn->insert(MBBI: BBI, MBB: SinkBB);
12282
12283 Register ABSSrcReg = MI.getOperand(i: 1).getReg();
12284 Register ABSDstReg = MI.getOperand(i: 0).getReg();
12285 bool ABSSrcKill = MI.getOperand(i: 1).isKill();
12286 bool isThumb2 = Subtarget->isThumb2();
12287 MachineRegisterInfo &MRI = Fn->getRegInfo();
12288 // In Thumb mode, S must not be specified if the source register is the SP or
12289 // PC, or if the destination register is the SP, so restrict the register class.
12290 Register NewRsbDstReg = MRI.createVirtualRegister(
12291 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
12292
12293 // Transfer the remainder of BB and its successor edges to sinkMBB.
12294 SinkBB->splice(Where: SinkBB->begin(), Other: BB,
12295 From: std::next(x: MachineBasicBlock::iterator(MI)), To: BB->end());
12296 SinkBB->transferSuccessorsAndUpdatePHIs(FromMBB: BB);
12297
12298 BB->addSuccessor(Succ: RSBBB);
12299 BB->addSuccessor(Succ: SinkBB);
12300
12301 // fall through to SinkMBB
12302 RSBBB->addSuccessor(Succ: SinkBB);
12303
12304 // insert a cmp at the end of BB
12305 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12306 .addReg(ABSSrcReg)
12307 .addImm(0)
12308 .add(predOps(ARMCC::AL));
12309
12310 // insert a bcc with opposite CC to ARMCC::MI at the end of BB
12311 BuildMI(BB, dl,
12312 TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
12313 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
12314
12315 // insert rsbri in RSBBB
12316 // Note: the BCC and rsbri will be converted into a predicated rsbmi
12317 // by the if-conversion pass.
12318 BuildMI(*RSBBB, RSBBB->begin(), dl,
12319 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
12320 .addReg(ABSSrcReg, ABSSrcKill ? RegState::Kill : 0)
12321 .addImm(0)
12322 .add(predOps(ARMCC::AL))
12323 .add(condCodeOp());
12324
12325 // insert PHI in SinkBB,
12326 // reuse ABSDstReg to not change uses of ABS instruction
12327 BuildMI(*SinkBB, SinkBB->begin(), dl,
12328 TII->get(ARM::PHI), ABSDstReg)
12329 .addReg(NewRsbDstReg).addMBB(RSBBB)
12330 .addReg(ABSSrcReg).addMBB(BB);
12331
12332 // remove ABS instruction
12333 MI.eraseFromParent();
12334
12335 // return last added BB
12336 return SinkBB;
12337 }
12338 case ARM::COPY_STRUCT_BYVAL_I32:
12339 ++NumLoopByVals;
12340 return EmitStructByval(MI, BB);
12341 case ARM::WIN__CHKSTK:
12342 return EmitLowered__chkstk(MI, MBB: BB);
12343 case ARM::WIN__DBZCHK:
12344 return EmitLowered__dbzchk(MI, MBB: BB);
12345 }
12346}
12347
12348/// Attaches vregs to MEMCPY that it will use as scratch registers
12349/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12350/// instead of as a custom inserter because we need the use list from the SDNode.
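/// The scratch registers are used to hold the copied data between the LDM and
/// the STM once the MEMCPY pseudo is expanded; creating them as virtual
/// registers here lets the register allocator pick suitable GPRs.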
12351static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12352 MachineInstr &MI, const SDNode *Node) {
12353 bool isThumb1 = Subtarget->isThumb1Only();
12354
12355 DebugLoc DL = MI.getDebugLoc();
12356 MachineFunction *MF = MI.getParent()->getParent();
12357 MachineRegisterInfo &MRI = MF->getRegInfo();
12358 MachineInstrBuilder MIB(*MF, MI);
12359
12360 // If the new dst/src is unused mark it as dead.
12361 if (!Node->hasAnyUseOfValue(Value: 0)) {
12362 MI.getOperand(i: 0).setIsDead(true);
12363 }
12364 if (!Node->hasAnyUseOfValue(Value: 1)) {
12365 MI.getOperand(i: 1).setIsDead(true);
12366 }
12367
12368 // The MEMCPY both defines and kills the scratch registers.
12369 for (unsigned I = 0; I != MI.getOperand(i: 4).getImm(); ++I) {
12370 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12371 : &ARM::GPRRegClass);
12372 MIB.addReg(RegNo: TmpReg, flags: RegState::Define|RegState::Dead);
12373 }
12374}
12375
12376void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12377 SDNode *Node) const {
12378 if (MI.getOpcode() == ARM::MEMCPY) {
12379 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12380 return;
12381 }
12382
12383 const MCInstrDesc *MCID = &MI.getDesc();
12384 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12385 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12386 // operand is still set to noreg. If needed, set the optional operand's
12387 // register to CPSR, and remove the redundant implicit def.
12388 //
12389 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12390
12391 // Rename pseudo opcodes.
12392 unsigned NewOpc = convertAddSubFlagsOpcode(OldOpc: MI.getOpcode());
12393 unsigned ccOutIdx;
12394 if (NewOpc) {
12395 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12396 MCID = &TII->get(NewOpc);
12397
12398 assert(MCID->getNumOperands() ==
12399 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12400 && "converted opcode should be the same except for cc_out"
12401 " (and, on Thumb1, pred)");
12402
12403 MI.setDesc(*MCID);
12404
12405 // Add the optional cc_out operand
12406 MI.addOperand(Op: MachineOperand::CreateReg(Reg: 0, /*isDef=*/true));
12407
12408 // On Thumb1, move all input operands to the end, then add the predicate
12409 if (Subtarget->isThumb1Only()) {
12410 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12411 MI.addOperand(Op: MI.getOperand(i: 1));
12412 MI.removeOperand(OpNo: 1);
12413 }
12414
12415 // Restore the ties
12416 for (unsigned i = MI.getNumOperands(); i--;) {
12417 const MachineOperand& op = MI.getOperand(i);
12418 if (op.isReg() && op.isUse()) {
12419 int DefIdx = MCID->getOperandConstraint(OpNum: i, Constraint: MCOI::TIED_TO);
12420 if (DefIdx != -1)
12421 MI.tieOperands(DefIdx, UseIdx: i);
12422 }
12423 }
12424
12425 MI.addOperand(Op: MachineOperand::CreateImm(Val: ARMCC::AL));
12426 MI.addOperand(Op: MachineOperand::CreateReg(Reg: 0, /*isDef=*/false));
12427 ccOutIdx = 1;
12428 } else
12429 ccOutIdx = MCID->getNumOperands() - 1;
12430 } else
12431 ccOutIdx = MCID->getNumOperands() - 1;
12432
12433 // Any ARM instruction that sets the 's' bit should specify an optional
12434 // "cc_out" operand in the last operand position.
12435 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12436 assert(!NewOpc && "Optional cc_out operand required");
12437 return;
12438 }
12439 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12440 // since we already have an optional CPSR def.
12441 bool definesCPSR = false;
12442 bool deadCPSR = false;
12443 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12444 ++i) {
12445 const MachineOperand &MO = MI.getOperand(i);
12446 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12447 definesCPSR = true;
12448 if (MO.isDead())
12449 deadCPSR = true;
12450 MI.removeOperand(OpNo: i);
12451 break;
12452 }
12453 }
12454 if (!definesCPSR) {
12455 assert(!NewOpc && "Optional cc_out operand required");
12456 return;
12457 }
12458 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12459 if (deadCPSR) {
12460 assert(!MI.getOperand(ccOutIdx).getReg() &&
12461 "expect uninitialized optional cc_out operand");
12462 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12463 if (!Subtarget->isThumb1Only())
12464 return;
12465 }
12466
12467 // If this instruction was defined with an optional CPSR def and its dag node
12468 // had a live implicit CPSR def, then activate the optional CPSR def.
12469 MachineOperand &MO = MI.getOperand(i: ccOutIdx);
12470 MO.setReg(ARM::CPSR);
12471 MO.setIsDef(true);
12472}
12473
12474//===----------------------------------------------------------------------===//
12475// ARM Optimization Hooks
12476//===----------------------------------------------------------------------===//
12477
12478// Helper function that checks if N is a null or all ones constant.
12479static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12480 return AllOnes ? isAllOnesConstant(V: N) : isNullConstant(V: N);
12481}
12482
12483// Return true if N is conditionally 0 or all ones.
12484// Detects these expressions where cc is an i1 value:
12485//
12486// (select cc 0, y) [AllOnes=0]
12487// (select cc y, 0) [AllOnes=0]
12488// (zext cc) [AllOnes=0]
12489// (sext cc) [AllOnes=0/1]
12490// (select cc -1, y) [AllOnes=1]
12491// (select cc y, -1) [AllOnes=1]
12492//
12493// Invert is set when N is the null/all ones constant when CC is false.
12494// OtherOp is set to the alternative value of N.
12495static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12496 SDValue &CC, bool &Invert,
12497 SDValue &OtherOp,
12498 SelectionDAG &DAG) {
12499 switch (N->getOpcode()) {
12500 default: return false;
12501 case ISD::SELECT: {
12502 CC = N->getOperand(Num: 0);
12503 SDValue N1 = N->getOperand(Num: 1);
12504 SDValue N2 = N->getOperand(Num: 2);
12505 if (isZeroOrAllOnes(N: N1, AllOnes)) {
12506 Invert = false;
12507 OtherOp = N2;
12508 return true;
12509 }
12510 if (isZeroOrAllOnes(N: N2, AllOnes)) {
12511 Invert = true;
12512 OtherOp = N1;
12513 return true;
12514 }
12515 return false;
12516 }
12517 case ISD::ZERO_EXTEND:
12518 // (zext cc) can never be the all ones value.
12519 if (AllOnes)
12520 return false;
12521 [[fallthrough]];
12522 case ISD::SIGN_EXTEND: {
12523 SDLoc dl(N);
12524 EVT VT = N->getValueType(ResNo: 0);
12525 CC = N->getOperand(Num: 0);
12526 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12527 return false;
12528 Invert = !AllOnes;
12529 if (AllOnes)
12530 // When looking for an AllOnes constant, N is an sext, and the 'other'
12531 // value is 0.
12532 OtherOp = DAG.getConstant(Val: 0, DL: dl, VT);
12533 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12534 // When looking for a 0 constant, N can be zext or sext.
12535 OtherOp = DAG.getConstant(Val: 1, DL: dl, VT);
12536 else
12537 OtherOp = DAG.getAllOnesConstant(DL: dl, VT);
12538 return true;
12539 }
12540 }
12541}
12542
12543// Combine a constant select operand into its use:
12544//
12545// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12546// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12547// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12548// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12549// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12550//
12551// The transform is rejected if the select doesn't have a constant operand that
12552// is null, or all ones when AllOnes is set.
12553//
12554// Also recognize sext/zext from i1:
12555//
12556// (add (zext cc), x) -> (select cc (add x, 1), x)
12557// (add (sext cc), x) -> (select cc (add x, -1), x)
12558//
12559// These transformations eventually create predicated instructions.
12560//
12561// @param N The node to transform.
12562// @param Slct The N operand that is a select.
12563// @param OtherOp The other N operand (x above).
12564// @param DCI Context.
12565// @param AllOnes Require the select constant to be all ones instead of null.
12566// @returns The new node, or SDValue() on failure.
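// As a worked example of the zext case: (add (zext cc), x) is x+1 when cc is
// true and x otherwise, which is exactly (select cc, (add x, 1), x).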
12567static
12568SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12569 TargetLowering::DAGCombinerInfo &DCI,
12570 bool AllOnes = false) {
12571 SelectionDAG &DAG = DCI.DAG;
12572 EVT VT = N->getValueType(ResNo: 0);
12573 SDValue NonConstantVal;
12574 SDValue CCOp;
12575 bool SwapSelectOps;
12576 if (!isConditionalZeroOrAllOnes(N: Slct.getNode(), AllOnes, CC&: CCOp, Invert&: SwapSelectOps,
12577 OtherOp&: NonConstantVal, DAG))
12578 return SDValue();
12579
12580 // Slct is now known to be the desired identity constant when CC is true.
12581 SDValue TrueVal = OtherOp;
12582 SDValue FalseVal = DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT,
12583 N1: OtherOp, N2: NonConstantVal);
12584 // Unless SwapSelectOps says CC should be false.
12585 if (SwapSelectOps)
12586 std::swap(a&: TrueVal, b&: FalseVal);
12587
12588 return DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc(N), VT,
12589 N1: CCOp, N2: TrueVal, N3: FalseVal);
12590}
12591
12592// Attempt combineSelectAndUse on each operand of a commutative operator N.
12593static
12594SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12595 TargetLowering::DAGCombinerInfo &DCI) {
12596 SDValue N0 = N->getOperand(Num: 0);
12597 SDValue N1 = N->getOperand(Num: 1);
12598 if (N0.getNode()->hasOneUse())
12599 if (SDValue Result = combineSelectAndUse(N, Slct: N0, OtherOp: N1, DCI, AllOnes))
12600 return Result;
12601 if (N1.getNode()->hasOneUse())
12602 if (SDValue Result = combineSelectAndUse(N, Slct: N1, OtherOp: N0, DCI, AllOnes))
12603 return Result;
12604 return SDValue();
12605}
12606
12607static bool IsVUZPShuffleNode(SDNode *N) {
12608 // VUZP shuffle node.
12609 if (N->getOpcode() == ARMISD::VUZP)
12610 return true;
12611
12612 // "VUZP" on i32 is an alias for VTRN.
12613 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12614 return true;
12615
12616 return false;
12617}
12618
12619static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
12620 TargetLowering::DAGCombinerInfo &DCI,
12621 const ARMSubtarget *Subtarget) {
12622 // Look for ADD(VUZP.0, VUZP.1).
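// VUZP de-interleaves its inputs, so .0 holds the even-indexed elements and
// .1 the odd-indexed elements of the concatenated sources; adding the two
// results therefore sums adjacent pairs, which is what VPADD computes.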
12623 if (!IsVUZPShuffleNode(N: N0.getNode()) || N0.getNode() != N1.getNode() ||
12624 N0 == N1)
12625 return SDValue();
12626
12627 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12628 if (!N->getValueType(ResNo: 0).is64BitVector())
12629 return SDValue();
12630
12631 // Generate vpadd.
12632 SelectionDAG &DAG = DCI.DAG;
12633 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12634 SDLoc dl(N);
12635 SDNode *Unzip = N0.getNode();
12636 EVT VT = N->getValueType(ResNo: 0);
12637
12638 SmallVector<SDValue, 8> Ops;
12639 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12640 TLI.getPointerTy(DAG.getDataLayout())));
12641 Ops.push_back(Elt: Unzip->getOperand(Num: 0));
12642 Ops.push_back(Elt: Unzip->getOperand(Num: 1));
12643
12644 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, Ops);
12645}
12646
12647static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12648 TargetLowering::DAGCombinerInfo &DCI,
12649 const ARMSubtarget *Subtarget) {
12650 // Check for two extended operands.
12651 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12652 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12653 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12654 N1.getOpcode() == ISD::ZERO_EXTEND))
12655 return SDValue();
12656
12657 SDValue N00 = N0.getOperand(i: 0);
12658 SDValue N10 = N1.getOperand(i: 0);
12659
12660 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
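// As with the VPADD case above, adding the extended even and odd halves is a
// widening pairwise add, i.e. VPADDL of the concatenated inputs.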
12661 if (!IsVUZPShuffleNode(N: N00.getNode()) || N00.getNode() != N10.getNode() ||
12662 N00 == N10)
12663 return SDValue();
12664
12665 // We only recognize Q register paddl here; this can't be reached until
12666 // after type legalization.
12667 if (!N00.getValueType().is64BitVector() ||
12668 !N0.getValueType().is128BitVector())
12669 return SDValue();
12670
12671 // Generate vpaddl.
12672 SelectionDAG &DAG = DCI.DAG;
12673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12674 SDLoc dl(N);
12675 EVT VT = N->getValueType(ResNo: 0);
12676
12677 SmallVector<SDValue, 8> Ops;
12678 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12679 unsigned Opcode;
12680 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12681 Opcode = Intrinsic::arm_neon_vpaddls;
12682 else
12683 Opcode = Intrinsic::arm_neon_vpaddlu;
12684 Ops.push_back(Elt: DAG.getConstant(Val: Opcode, DL: dl,
12685 VT: TLI.getPointerTy(DL: DAG.getDataLayout())));
12686 EVT ElemTy = N00.getValueType().getVectorElementType();
12687 unsigned NumElts = VT.getVectorNumElements();
12688 EVT ConcatVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: ElemTy, NumElements: NumElts * 2);
12689 SDValue Concat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: ConcatVT,
12690 N1: N00.getOperand(i: 0), N2: N00.getOperand(i: 1));
12691 Ops.push_back(Elt: Concat);
12692
12693 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, Ops);
12694}
12695
12696// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12697// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12698// much easier to match.
12699static SDValue
12700AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12701 TargetLowering::DAGCombinerInfo &DCI,
12702 const ARMSubtarget *Subtarget) {
12703 // Only perform this optimization after legalization and if NEON is available. We
12704 // also expect both operands to be BUILD_VECTORs.
12705 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12706 || N0.getOpcode() != ISD::BUILD_VECTOR
12707 || N1.getOpcode() != ISD::BUILD_VECTOR)
12708 return SDValue();
12709
12710 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12711 EVT VT = N->getValueType(ResNo: 0);
12712 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12713 return SDValue();
12714
12715 // Check that the vector operands are of the right form.
12716 // N0 and N1 are BUILD_VECTOR nodes with N EXTRACT_VECTOR_ELT
12717 // operands, where N is the number of elements in the formed vector.
12718 // Each EXTRACT_VECTOR_ELT should reference the same input vector and use an
12719 // odd or even index such that we have a pairwise add pattern.
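// For example:
//   N0 = BUILD_VECTOR(extractelt(v, 0), extractelt(v, 2), ...)
//   N1 = BUILD_VECTOR(extractelt(v, 1), extractelt(v, 3), ...)
// so that N0 + N1 is the pairwise sum of v, i.e. vpaddl(v).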
12720
12721 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12722 if (N0->getOperand(Num: 0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12723 return SDValue();
12724 SDValue Vec = N0->getOperand(Num: 0)->getOperand(Num: 0);
12725 SDNode *V = Vec.getNode();
12726 unsigned nextIndex = 0;
12727
12728 // For each operands to the ADD which are BUILD_VECTORs,
12729 // check to see if each of their operands are an EXTRACT_VECTOR with
12730 // the same vector and appropriate index.
12731 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12732 if (N0->getOperand(Num: i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
12733 && N1->getOperand(Num: i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12734
12735 SDValue ExtVec0 = N0->getOperand(Num: i);
12736 SDValue ExtVec1 = N1->getOperand(Num: i);
12737
12738 // First operand is the vector; verify it's the same.
12739 if (V != ExtVec0->getOperand(Num: 0).getNode() ||
12740 V != ExtVec1->getOperand(Num: 0).getNode())
12741 return SDValue();
12742
12743 // Second is the constant; verify it's correct.
12744 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(Val: ExtVec0->getOperand(Num: 1));
12745 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Val: ExtVec1->getOperand(Num: 1));
12746
12747 // For the constants, we want all the even indices in N0 and all the odd indices in N1.
12748 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12749 || C1->getZExtValue() != nextIndex+1)
12750 return SDValue();
12751
12752 // Increment index.
12753 nextIndex+=2;
12754 } else
12755 return SDValue();
12756 }
12757
12758 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12759 // we're using the entire input vector, otherwise there's a size/legality
12760 // mismatch somewhere.
12761 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12762 Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12763 return SDValue();
12764
12765 // Create VPADDL node.
12766 SelectionDAG &DAG = DCI.DAG;
12767 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12768
12769 SDLoc dl(N);
12770
12771 // Build operand list.
12772 SmallVector<SDValue, 8> Ops;
12773 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12774 TLI.getPointerTy(DAG.getDataLayout())));
12775
12776 // Input is the vector.
12777 Ops.push_back(Elt: Vec);
12778
12779 // Get widened type and narrowed type.
12780 MVT widenType;
12781 unsigned numElem = VT.getVectorNumElements();
12782
12783 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12784 switch (inputLaneType.getSimpleVT().SimpleTy) {
12785 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12786 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12787 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12788 default:
12789 llvm_unreachable("Invalid vector element type for padd optimization.");
12790 }
12791
12792 SDValue tmp = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: widenType, Ops);
12793 unsigned ExtOp = VT.bitsGT(VT: tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12794 return DAG.getNode(Opcode: ExtOp, DL: dl, VT, Operand: tmp);
12795}
12796
12797static SDValue findMUL_LOHI(SDValue V) {
12798 if (V->getOpcode() == ISD::UMUL_LOHI ||
12799 V->getOpcode() == ISD::SMUL_LOHI)
12800 return V;
12801 return SDValue();
12802}
12803
12804static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12805 TargetLowering::DAGCombinerInfo &DCI,
12806 const ARMSubtarget *Subtarget) {
12807 if (!Subtarget->hasBaseDSP())
12808 return SDValue();
12809
12810 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12811 // accumulate the product into a 64-bit value. The 16-bit values will
12812 // be sign extended somehow or SRA'd into 32-bit values
12813 // (addc (adde (mul 16bit, 16bit), lo), hi)
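// Concretely, with a and b the (already sign-extended) 16-bit inputs:
//   lo = ADDC(mul(a, b), Lo)
//   hi = ADDE(sra(mul(a, b), 31), Hi, carry)
// together compute Hi:Lo += a * b, which is SMLALBB a, b, Lo, Hi (or one of
// the B/T variants when an operand is taken from the top half via an SRA #16).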
12814 SDValue Mul = AddcNode->getOperand(Num: 0);
12815 SDValue Lo = AddcNode->getOperand(Num: 1);
12816 if (Mul.getOpcode() != ISD::MUL) {
12817 Lo = AddcNode->getOperand(Num: 0);
12818 Mul = AddcNode->getOperand(Num: 1);
12819 if (Mul.getOpcode() != ISD::MUL)
12820 return SDValue();
12821 }
12822
12823 SDValue SRA = AddeNode->getOperand(Num: 0);
12824 SDValue Hi = AddeNode->getOperand(Num: 1);
12825 if (SRA.getOpcode() != ISD::SRA) {
12826 SRA = AddeNode->getOperand(Num: 1);
12827 Hi = AddeNode->getOperand(Num: 0);
12828 if (SRA.getOpcode() != ISD::SRA)
12829 return SDValue();
12830 }
12831 if (auto Const = dyn_cast<ConstantSDNode>(Val: SRA.getOperand(i: 1))) {
12832 if (Const->getZExtValue() != 31)
12833 return SDValue();
12834 } else
12835 return SDValue();
12836
12837 if (SRA.getOperand(i: 0) != Mul)
12838 return SDValue();
12839
12840 SelectionDAG &DAG = DCI.DAG;
12841 SDLoc dl(AddcNode);
12842 unsigned Opcode = 0;
12843 SDValue Op0;
12844 SDValue Op1;
12845
12846 if (isS16(Op: Mul.getOperand(i: 0), DAG) && isS16(Op: Mul.getOperand(i: 1), DAG)) {
12847 Opcode = ARMISD::SMLALBB;
12848 Op0 = Mul.getOperand(i: 0);
12849 Op1 = Mul.getOperand(i: 1);
12850 } else if (isS16(Op: Mul.getOperand(i: 0), DAG) && isSRA16(Op: Mul.getOperand(i: 1))) {
12851 Opcode = ARMISD::SMLALBT;
12852 Op0 = Mul.getOperand(i: 0);
12853 Op1 = Mul.getOperand(i: 1).getOperand(i: 0);
12854 } else if (isSRA16(Op: Mul.getOperand(i: 0)) && isS16(Op: Mul.getOperand(i: 1), DAG)) {
12855 Opcode = ARMISD::SMLALTB;
12856 Op0 = Mul.getOperand(i: 0).getOperand(i: 0);
12857 Op1 = Mul.getOperand(i: 1);
12858 } else if (isSRA16(Op: Mul.getOperand(i: 0)) && isSRA16(Op: Mul.getOperand(i: 1))) {
12859 Opcode = ARMISD::SMLALTT;
12860 Op0 = Mul->getOperand(Num: 0).getOperand(i: 0);
12861 Op1 = Mul->getOperand(Num: 1).getOperand(i: 0);
12862 }
12863
12864 if (!Op0 || !Op1)
12865 return SDValue();
12866
12867 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12868 Op0, Op1, Lo, Hi);
12869 // Replace the ADD nodes' uses with the SMLAL node's values.
12870 SDValue HiMLALResult(SMLAL.getNode(), 1);
12871 SDValue LoMLALResult(SMLAL.getNode(), 0);
12872
12873 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddcNode, 0), To: LoMLALResult);
12874 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddeNode, 0), To: HiMLALResult);
12875
12876 // Return original node to notify the driver to stop replacing.
12877 SDValue resNode(AddcNode, 0);
12878 return resNode;
12879}
12880
12881static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12882 TargetLowering::DAGCombinerInfo &DCI,
12883 const ARMSubtarget *Subtarget) {
12884 // Look for multiply add opportunities.
12885 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12886 // each add node consumes a value from ISD::UMUL_LOHI and there is
12887 // a glue link from the first add to the second add.
12888 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE
12889 // with an S/UMLAL instruction.
12890 //            UMUL_LOHI
12891 //           / :lo     \ :hi
12892 //          V           \          [no multiline comment]
12893 //  loAdd -> ADDC        |
12894 //              \ :carry /
12895 //               V      V
12896 //                ADDE <- hiAdd
12897 //
12898 // In the special case where only the higher part of a signed result is used
12899 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12900 // a constant with the exact value of 0x80000000, we recognize we are dealing
12901 // with a "rounded multiply and add" (or subtract) and transform it into
12902 // either an ARMISD::SMMLAR or an ARMISD::SMMLSR, respectively.
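// (For reference, SMMLAR computes Rd = Ra + (((Rn * Rm) + 0x80000000) >> 32),
// i.e. the rounded high word of the product accumulated into Ra; SMMLSR is
// the subtracting form.)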
12903
12904 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12905 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12906 "Expect an ADDE or SUBE");
12907
12908 assert(AddeSubeNode->getNumOperands() == 3 &&
12909 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12910 "ADDE node has the wrong inputs");
12911
12912 // Check that we are chained to the right ADDC or SUBC node.
12913 SDNode *AddcSubcNode = AddeSubeNode->getOperand(Num: 2).getNode();
12914 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12915 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12916 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12917 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12918 return SDValue();
12919
12920 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(Num: 0);
12921 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(Num: 1);
12922
12923 // Check if the two operands are from the same mul_lohi node.
12924 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12925 return SDValue();
12926
12927 assert(AddcSubcNode->getNumValues() == 2 &&
12928 AddcSubcNode->getValueType(0) == MVT::i32 &&
12929 "Expect ADDC with two result values. First: i32");
12930
12931 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12932 // may be an SMLAL which multiplies two 16-bit values.
12933 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12934 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12935 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12936 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12937 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12938 return AddCombineTo64BitSMLAL16(AddcNode: AddcSubcNode, AddeNode: AddeSubeNode, DCI, Subtarget);
12939
12940 // Check for the triangle shape.
12941 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(Num: 0);
12942 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(Num: 1);
12943
12944 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12945 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12946 return SDValue();
12947
12948 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12949 bool IsLeftOperandMUL = false;
12950 SDValue MULOp = findMUL_LOHI(V: AddeSubeOp0);
12951 if (MULOp == SDValue())
12952 MULOp = findMUL_LOHI(V: AddeSubeOp1);
12953 else
12954 IsLeftOperandMUL = true;
12955 if (MULOp == SDValue())
12956 return SDValue();
12957
12958 // Figure out the right opcode.
12959 unsigned Opc = MULOp->getOpcode();
12960 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12961
12962 // Figure out the high and low input values to the MLAL node.
12963 SDValue *HiAddSub = nullptr;
12964 SDValue *LoMul = nullptr;
12965 SDValue *LowAddSub = nullptr;
12966
12967 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12968 if ((AddeSubeOp0 != MULOp.getValue(R: 1)) && (AddeSubeOp1 != MULOp.getValue(R: 1)))
12969 return SDValue();
12970
12971 if (IsLeftOperandMUL)
12972 HiAddSub = &AddeSubeOp1;
12973 else
12974 HiAddSub = &AddeSubeOp0;
12975
12976 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12977 // whose low result is fed to the ADDC/SUBC we are checking.
12978
12979 if (AddcSubcOp0 == MULOp.getValue(R: 0)) {
12980 LoMul = &AddcSubcOp0;
12981 LowAddSub = &AddcSubcOp1;
12982 }
12983 if (AddcSubcOp1 == MULOp.getValue(R: 0)) {
12984 LoMul = &AddcSubcOp1;
12985 LowAddSub = &AddcSubcOp0;
12986 }
12987
12988 if (!LoMul)
12989 return SDValue();
12990
12991 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12992 // the replacement below will create a cycle.
12993 if (AddcSubcNode == HiAddSub->getNode() ||
12994 AddcSubcNode->isPredecessorOf(N: HiAddSub->getNode()))
12995 return SDValue();
12996
12997 // Create the merged node.
12998 SelectionDAG &DAG = DCI.DAG;
12999
13000 // Start building operand list.
13001 SmallVector<SDValue, 8> Ops;
13002 Ops.push_back(Elt: LoMul->getOperand(i: 0));
13003 Ops.push_back(Elt: LoMul->getOperand(i: 1));
13004
13005 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
13006 // the case, we must be doing a signed multiplication and only use the higher
13007 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
13008 // addition or subtraction with the value 0x80000000.
13009 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
13010 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(Value: 1) &&
13011 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
13012 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
13013 0x80000000) {
13014 Ops.push_back(Elt: *HiAddSub);
13015 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
13016 FinalOpc = ARMISD::SMMLSR;
13017 } else {
13018 FinalOpc = ARMISD::SMMLAR;
13019 }
13020 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
13021 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddeSubeNode, 0), To: NewNode);
13022
13023 return SDValue(AddeSubeNode, 0);
13024 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
13025 // SMMLS is generated during instruction selection and the rest of this
13026 // function can not handle the case where AddcSubcNode is a SUBC.
13027 return SDValue();
13028
13029 // Finish building the operand list for {U/S}MLAL
13030 Ops.push_back(Elt: *LowAddSub);
13031 Ops.push_back(Elt: *HiAddSub);
13032
13033 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
13034 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13035
13036 // Replace the ADD nodes' uses with the MLAL node's values.
13037 SDValue HiMLALResult(MLALNode.getNode(), 1);
13038 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddeSubeNode, 0), To: HiMLALResult);
13039
13040 SDValue LoMLALResult(MLALNode.getNode(), 0);
13041 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddcSubcNode, 0), To: LoMLALResult);
13042
13043 // Return original node to notify the driver to stop replacing.
13044 return SDValue(AddeSubeNode, 0);
13045}
13046
13047static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
13048 TargetLowering::DAGCombinerInfo &DCI,
13049 const ARMSubtarget *Subtarget) {
13050 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13051 // While trying to combine for the other MLAL nodes, first search for the
13052 // chance to use UMAAL. Check if Addc uses a node which has already
13053 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13054 // as the addend, and it's handled in PerformUMLALCombine.
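// UMAAL computes RdHi:RdLo = Rn * Rm + RdHi + RdLo (all unsigned), so a UMLAL
// whose high-part addend is zero, followed by a 64-bit add of one more 32-bit
// value, folds into a single UMAAL.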
13055
13056 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13057 return AddCombineTo64bitMLAL(AddeSubeNode: AddeNode, DCI, Subtarget);
13058
13059 // Check that we have a glued ADDC node.
13060 SDNode* AddcNode = AddeNode->getOperand(Num: 2).getNode();
13061 if (AddcNode->getOpcode() != ARMISD::ADDC)
13062 return SDValue();
13063
13064 // Find the converted UMAAL or quit if it doesn't exist.
13065 SDNode *UmlalNode = nullptr;
13066 SDValue AddHi;
13067 if (AddcNode->getOperand(Num: 0).getOpcode() == ARMISD::UMLAL) {
13068 UmlalNode = AddcNode->getOperand(Num: 0).getNode();
13069 AddHi = AddcNode->getOperand(Num: 1);
13070 } else if (AddcNode->getOperand(Num: 1).getOpcode() == ARMISD::UMLAL) {
13071 UmlalNode = AddcNode->getOperand(Num: 1).getNode();
13072 AddHi = AddcNode->getOperand(Num: 0);
13073 } else {
13074 return AddCombineTo64bitMLAL(AddeSubeNode: AddeNode, DCI, Subtarget);
13075 }
13076
13077 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
13078 // the ADDC as well as Zero.
13079 if (!isNullConstant(V: UmlalNode->getOperand(Num: 3)))
13080 return SDValue();
13081
13082 if ((isNullConstant(V: AddeNode->getOperand(Num: 0)) &&
13083 AddeNode->getOperand(Num: 1).getNode() == UmlalNode) ||
13084 (AddeNode->getOperand(Num: 0).getNode() == UmlalNode &&
13085 isNullConstant(V: AddeNode->getOperand(Num: 1)))) {
13086 SelectionDAG &DAG = DCI.DAG;
13087 SDValue Ops[] = { UmlalNode->getOperand(Num: 0), UmlalNode->getOperand(Num: 1),
13088 UmlalNode->getOperand(Num: 2), AddHi };
13089 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13090 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13091
13092 // Replace the ADD nodes' uses with the UMAAL node's values.
13093 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddeNode, 0), To: SDValue(UMAAL.getNode(), 1));
13094 DAG.ReplaceAllUsesOfValueWith(From: SDValue(AddcNode, 0), To: SDValue(UMAAL.getNode(), 0));
13095
13096 // Return original node to notify the driver to stop replacing.
13097 return SDValue(AddeNode, 0);
13098 }
13099 return SDValue();
13100}
13101
13102static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
13103 const ARMSubtarget *Subtarget) {
13104 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13105 return SDValue();
13106
13107 // Check that we have a pair of ADDC and ADDE as operands.
13108 // Both addends of the ADDE must be zero.
13109 SDNode* AddcNode = N->getOperand(Num: 2).getNode();
13110 SDNode* AddeNode = N->getOperand(Num: 3).getNode();
13111 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13112 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13113 isNullConstant(V: AddeNode->getOperand(Num: 0)) &&
13114 isNullConstant(V: AddeNode->getOperand(Num: 1)) &&
13115 (AddeNode->getOperand(Num: 2).getNode() == AddcNode))
13116 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13117 DAG.getVTList(MVT::i32, MVT::i32),
13118 {N->getOperand(0), N->getOperand(1),
13119 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13120 else
13121 return SDValue();
13122}
13123
13124static SDValue PerformAddcSubcCombine(SDNode *N,
13125 TargetLowering::DAGCombinerInfo &DCI,
13126 const ARMSubtarget *Subtarget) {
13127 SelectionDAG &DAG(DCI.DAG);
13128
13129 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(Value: 1)) {
13130 // (SUBC (ADDE 0, 0, C), 1) -> C
13131 SDValue LHS = N->getOperand(Num: 0);
13132 SDValue RHS = N->getOperand(Num: 1);
13133 if (LHS->getOpcode() == ARMISD::ADDE &&
13134 isNullConstant(V: LHS->getOperand(Num: 0)) &&
13135 isNullConstant(V: LHS->getOperand(Num: 1)) && isOneConstant(V: RHS)) {
13136 return DCI.CombineTo(N, Res0: SDValue(N, 0), Res1: LHS->getOperand(Num: 2));
13137 }
13138 }
13139
13140 if (Subtarget->isThumb1Only()) {
13141 SDValue RHS = N->getOperand(Num: 1);
13142 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: RHS)) {
13143 int32_t imm = C->getSExtValue();
13144 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13145 SDLoc DL(N);
13146 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13147 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13148 : ARMISD::ADDC;
13149 return DAG.getNode(Opcode, DL, VTList: N->getVTList(), N1: N->getOperand(Num: 0), N2: RHS);
13150 }
13151 }
13152 }
13153
13154 return SDValue();
13155}
13156
13157static SDValue PerformAddeSubeCombine(SDNode *N,
13158 TargetLowering::DAGCombinerInfo &DCI,
13159 const ARMSubtarget *Subtarget) {
13160 if (Subtarget->isThumb1Only()) {
13161 SelectionDAG &DAG = DCI.DAG;
13162 SDValue RHS = N->getOperand(Num: 1);
13163 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: RHS)) {
13164 int64_t imm = C->getSExtValue();
13165 if (imm < 0) {
13166 SDLoc DL(N);
13167
13168 // The with-carry-in form matches bitwise not instead of the negation.
13169 // Effectively, the inverse interpretation of the carry flag already
13170 // accounts for part of the negation.
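// Concretely: ADDE(x, imm, carry) = x + imm + carry
//                                 = x - ~imm - 1 + carry
//                                 = SUBE(x, ~imm, carry),
// using the ARM convention that SBC computes Rn - Op2 - 1 + C.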
13171 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13172
13173 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13174 : ARMISD::ADDE;
13175 return DAG.getNode(Opcode, DL, VTList: N->getVTList(),
13176 N1: N->getOperand(Num: 0), N2: RHS, N3: N->getOperand(Num: 2));
13177 }
13178 }
13179 } else if (N->getOperand(Num: 1)->getOpcode() == ISD::SMUL_LOHI) {
13180 return AddCombineTo64bitMLAL(AddeSubeNode: N, DCI, Subtarget);
13181 }
13182 return SDValue();
13183}
13184
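// Match select/select_cc patterns of the form
//   x < vecreduce_min(v) ? x : vecreduce_min(v)
// (and the smin/smax/umin/umax variants, where the compared values are also
// the selected values) and turn them into the MVE VMINV/VMAXV family, which
// folds the scalar operand into the cross-lane reduction:
// VMINV Rda, Qm computes Rda = min(Rda, all lanes of Qm).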
13185static SDValue PerformSELECTCombine(SDNode *N,
13186 TargetLowering::DAGCombinerInfo &DCI,
13187 const ARMSubtarget *Subtarget) {
13188 if (!Subtarget->hasMVEIntegerOps())
13189 return SDValue();
13190
13191 SDLoc dl(N);
13192 SDValue SetCC;
13193 SDValue LHS;
13194 SDValue RHS;
13195 ISD::CondCode CC;
13196 SDValue TrueVal;
13197 SDValue FalseVal;
13198
13199 if (N->getOpcode() == ISD::SELECT &&
13200 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC) {
13201 SetCC = N->getOperand(Num: 0);
13202 LHS = SetCC->getOperand(Num: 0);
13203 RHS = SetCC->getOperand(Num: 1);
13204 CC = cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2))->get();
13205 TrueVal = N->getOperand(Num: 1);
13206 FalseVal = N->getOperand(Num: 2);
13207 } else if (N->getOpcode() == ISD::SELECT_CC) {
13208 LHS = N->getOperand(Num: 0);
13209 RHS = N->getOperand(Num: 1);
13210 CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 4))->get();
13211 TrueVal = N->getOperand(Num: 2);
13212 FalseVal = N->getOperand(Num: 3);
13213 } else {
13214 return SDValue();
13215 }
13216
13217 unsigned int Opcode = 0;
13218 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13219 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13220 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13221 Opcode = ARMISD::VMINVu;
13222 if (CC == ISD::SETUGT)
13223 std::swap(a&: TrueVal, b&: FalseVal);
13224 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13225 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13226 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13227 Opcode = ARMISD::VMINVs;
13228 if (CC == ISD::SETGT)
13229 std::swap(a&: TrueVal, b&: FalseVal);
13230 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13231 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13232 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13233 Opcode = ARMISD::VMAXVu;
13234 if (CC == ISD::SETULT)
13235 std::swap(a&: TrueVal, b&: FalseVal);
13236 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13237 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13238 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13239 Opcode = ARMISD::VMAXVs;
13240 if (CC == ISD::SETLT)
13241 std::swap(a&: TrueVal, b&: FalseVal);
13242 } else
13243 return SDValue();
13244
13245 // Normalise to the right hand side being the vector reduction
13246 switch (TrueVal->getOpcode()) {
13247 case ISD::VECREDUCE_UMIN:
13248 case ISD::VECREDUCE_SMIN:
13249 case ISD::VECREDUCE_UMAX:
13250 case ISD::VECREDUCE_SMAX:
13251 std::swap(a&: LHS, b&: RHS);
13252 std::swap(a&: TrueVal, b&: FalseVal);
13253 break;
13254 }
13255
13256 EVT VectorType = FalseVal->getOperand(Num: 0).getValueType();
13257
13258 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13259 VectorType != MVT::v4i32)
13260 return SDValue();
13261
13262 EVT VectorScalarType = VectorType.getVectorElementType();
13263
13264 // The values being selected must also be the ones being compared
13265 if (TrueVal != LHS || FalseVal != RHS)
13266 return SDValue();
13267
13268 EVT LeftType = LHS->getValueType(ResNo: 0);
13269 EVT RightType = RHS->getValueType(ResNo: 0);
13270
13271 // The types must match the reduced type too
13272 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13273 return SDValue();
13274
13275 // Legalise the scalar to an i32
13276 if (VectorScalarType != MVT::i32)
13277 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13278
13279 // Generate the reduction as an i32 for legalisation purposes
13280 auto Reduction =
13281 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13282
13283 // The result isn't actually an i32 so truncate it back to its original type
13284 if (VectorScalarType != MVT::i32)
13285 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13286
13287 return Reduction;
13288}
13289
13290 // A special combine for the vqdmulh family of instructions. This is one of
13291 // several potential patterns that this instruction could be matched from. The base
13292 // pattern you would expect to see is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13293 // This code matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13294 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13295 // the max is unnecessary.
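// (VQDMULH returns the saturated high half of 2*a*b, so for 16-bit lanes it is
// sat((2*a*b) >> 16) == sat((a*b) >> 15); the min against the saturation bound
// in the patterns above is what implements that clamp.)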
13296static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13297 EVT VT = N->getValueType(ResNo: 0);
13298 SDValue Shft;
13299 ConstantSDNode *Clamp;
13300
13301 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13302 return SDValue();
13303
13304 if (N->getOpcode() == ISD::SMIN) {
13305 Shft = N->getOperand(Num: 0);
13306 Clamp = isConstOrConstSplat(N: N->getOperand(Num: 1));
13307 } else if (N->getOpcode() == ISD::VSELECT) {
13308 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13309 SDValue Cmp = N->getOperand(Num: 0);
13310 if (Cmp.getOpcode() != ISD::SETCC ||
13311 cast<CondCodeSDNode>(Val: Cmp.getOperand(i: 2))->get() != ISD::SETLT ||
13312 Cmp.getOperand(i: 0) != N->getOperand(Num: 1) ||
13313 Cmp.getOperand(i: 1) != N->getOperand(Num: 2))
13314 return SDValue();
13315 Shft = N->getOperand(Num: 1);
13316 Clamp = isConstOrConstSplat(N: N->getOperand(Num: 2));
13317 } else
13318 return SDValue();
13319
13320 if (!Clamp)
13321 return SDValue();
13322
13323 MVT ScalarType;
13324 int ShftAmt = 0;
13325 switch (Clamp->getSExtValue()) {
13326 case (1 << 7) - 1:
13327 ScalarType = MVT::i8;
13328 ShftAmt = 7;
13329 break;
13330 case (1 << 15) - 1:
13331 ScalarType = MVT::i16;
13332 ShftAmt = 15;
13333 break;
13334 case (1ULL << 31) - 1:
13335 ScalarType = MVT::i32;
13336 ShftAmt = 31;
13337 break;
13338 default:
13339 return SDValue();
13340 }
13341
13342 if (Shft.getOpcode() != ISD::SRA)
13343 return SDValue();
13344 ConstantSDNode *N1 = isConstOrConstSplat(N: Shft.getOperand(i: 1));
13345 if (!N1 || N1->getSExtValue() != ShftAmt)
13346 return SDValue();
13347
13348 SDValue Mul = Shft.getOperand(i: 0);
13349 if (Mul.getOpcode() != ISD::MUL)
13350 return SDValue();
13351
13352 SDValue Ext0 = Mul.getOperand(i: 0);
13353 SDValue Ext1 = Mul.getOperand(i: 1);
13354 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13355 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13356 return SDValue();
13357 EVT VecVT = Ext0.getOperand(i: 0).getValueType();
13358 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13359 return SDValue();
13360 if (Ext1.getOperand(i: 0).getValueType() != VecVT ||
13361 VecVT.getScalarType() != ScalarType ||
13362 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13363 return SDValue();
13364
13365 SDLoc DL(Mul);
13366 unsigned LegalLanes = 128 / (ShftAmt + 1);
13367 EVT LegalVecVT = MVT::getVectorVT(VT: ScalarType, NumElements: LegalLanes);
13368 // For types smaller than legal vectors, extend them to a legal type and only
13369 // use the needed lanes.
13370 if (VecVT.getSizeInBits() < 128) {
13371 EVT ExtVecVT =
13372 MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: 128 / VecVT.getVectorNumElements()),
13373 NumElements: VecVT.getVectorNumElements());
13374 SDValue Inp0 =
13375 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVecVT, Operand: Ext0.getOperand(i: 0));
13376 SDValue Inp1 =
13377 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVecVT, Operand: Ext1.getOperand(i: 0));
13378 Inp0 = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT: LegalVecVT, Operand: Inp0);
13379 Inp1 = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT: LegalVecVT, Operand: Inp1);
13380 SDValue VQDMULH = DAG.getNode(Opcode: ARMISD::VQDMULH, DL, VT: LegalVecVT, N1: Inp0, N2: Inp1);
13381 SDValue Trunc = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT: ExtVecVT, Operand: VQDMULH);
13382 Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VecVT, Operand: Trunc);
13383 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: Trunc);
13384 }
13385
13386 // For larger types, split into legal sized chunks.
13387 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13388 unsigned NumParts = VecVT.getSizeInBits() / 128;
13389 SmallVector<SDValue> Parts;
13390 for (unsigned I = 0; I < NumParts; ++I) {
13391 SDValue Inp0 =
13392 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LegalVecVT, N1: Ext0.getOperand(i: 0),
13393 N2: DAG.getVectorIdxConstant(Val: I * LegalLanes, DL));
13394 SDValue Inp1 =
13395 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LegalVecVT, N1: Ext1.getOperand(i: 0),
13396 N2: DAG.getVectorIdxConstant(Val: I * LegalLanes, DL));
13397 SDValue VQDMULH = DAG.getNode(Opcode: ARMISD::VQDMULH, DL, VT: LegalVecVT, N1: Inp0, N2: Inp1);
13398 Parts.push_back(Elt: VQDMULH);
13399 }
13400 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT,
13401 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, Ops: Parts));
13402}
13403
13404static SDValue PerformVSELECTCombine(SDNode *N,
13405 TargetLowering::DAGCombinerInfo &DCI,
13406 const ARMSubtarget *Subtarget) {
13407 if (!Subtarget->hasMVEIntegerOps())
13408 return SDValue();
13409
13410 if (SDValue V = PerformVQDMULHCombine(N, DAG&: DCI.DAG))
13411 return V;
13412
13413 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13414 //
13415 // We need to re-implement this optimization here as the implementation in the
13416 // Target-Independent DAGCombiner does not handle the kind of constant we make
13417 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13418 // good reason, allowing truncation there would break other targets).
13419 //
13420 // Currently, this is only done for MVE, as it's the only target that benefits
13421 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13422 if (N->getOperand(Num: 0).getOpcode() != ISD::XOR)
13423 return SDValue();
13424 SDValue XOR = N->getOperand(Num: 0);
13425
13426 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13427 // It is important to check with truncation allowed as the BUILD_VECTORs we
13428 // generate in those situations will truncate their operands.
13429 ConstantSDNode *Const =
13430 isConstOrConstSplat(N: XOR->getOperand(Num: 1), /*AllowUndefs*/ false,
13431 /*AllowTruncation*/ true);
13432 if (!Const || !Const->isOne())
13433 return SDValue();
13434
13435 // Rewrite into vselect(cond, rhs, lhs).
13436 SDValue Cond = XOR->getOperand(Num: 0);
13437 SDValue LHS = N->getOperand(Num: 1);
13438 SDValue RHS = N->getOperand(Num: 2);
13439 EVT Type = N->getValueType(ResNo: 0);
13440 return DCI.DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: Type, N1: Cond, N2: RHS, N3: LHS);
13441}
13442
13443// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
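// (vctp n produces a predicate whose first n lanes are true and the rest
// false, which is exactly the result of the unsigned-less-than compare of the
// lane indices against a splat of n.)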
13444static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13445 TargetLowering::DAGCombinerInfo &DCI,
13446 const ARMSubtarget *Subtarget) {
13447 SDValue Op0 = N->getOperand(Num: 0);
13448 SDValue Op1 = N->getOperand(Num: 1);
13449 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
13450 EVT VT = N->getValueType(ResNo: 0);
13451
13452 if (!Subtarget->hasMVEIntegerOps() ||
13453 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13454 return SDValue();
13455
13456 if (CC == ISD::SETUGE) {
13457 std::swap(a&: Op0, b&: Op1);
13458 CC = ISD::SETULT;
13459 }
13460
13461 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13462 Op0.getOpcode() != ISD::BUILD_VECTOR)
13463 return SDValue();
13464
13465 // Check first operand is BuildVector of 0,1,2,...
13466 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13467 if (!Op0.getOperand(i: I).isUndef() &&
13468 !(isa<ConstantSDNode>(Val: Op0.getOperand(i: I)) &&
13469 Op0.getConstantOperandVal(i: I) == I))
13470 return SDValue();
13471 }
13472
13473 // The second operand must be a splat of some scalar value Op1S.
13474 SDValue Op1S = DCI.DAG.getSplatValue(V: Op1);
13475 if (!Op1S)
13476 return SDValue();
13477
13478 unsigned Opc;
13479 switch (VT.getVectorNumElements()) {
13480 case 2:
13481 Opc = Intrinsic::arm_mve_vctp64;
13482 break;
13483 case 4:
13484 Opc = Intrinsic::arm_mve_vctp32;
13485 break;
13486 case 8:
13487 Opc = Intrinsic::arm_mve_vctp16;
13488 break;
13489 case 16:
13490 Opc = Intrinsic::arm_mve_vctp8;
13491 break;
13492 default:
13493 return SDValue();
13494 }
13495
13496 SDLoc DL(N);
13497 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13498 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13499 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13500}
13501
13502static SDValue PerformABSCombine(SDNode *N,
13503 TargetLowering::DAGCombinerInfo &DCI,
13504 const ARMSubtarget *Subtarget) {
13505 SelectionDAG &DAG = DCI.DAG;
13506 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13507
13508 if (TLI.isOperationLegal(Op: N->getOpcode(), VT: N->getValueType(ResNo: 0)))
13509 return SDValue();
13510
13511 return TLI.expandABS(N, DAG);
13512}
13513
13514/// PerformADDECombine - Target-specific dag combine transform from
13515/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13516/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13517static SDValue PerformADDECombine(SDNode *N,
13518 TargetLowering::DAGCombinerInfo &DCI,
13519 const ARMSubtarget *Subtarget) {
13520 // Only ARM and Thumb2 support UMLAL/SMLAL.
13521 if (Subtarget->isThumb1Only())
13522 return PerformAddeSubeCombine(N, DCI, Subtarget);
13523
13524 // Only perform the checks after legalize when the pattern is available.
13525 if (DCI.isBeforeLegalize()) return SDValue();
13526
13527 return AddCombineTo64bitUMAAL(AddeNode: N, DCI, Subtarget);
13528}
13529
13530/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13531/// operands N0 and N1. This is a helper for PerformADDCombine that is
13532/// called with the default operands, and if that fails, with commuted
13533/// operands.
13534static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13535 TargetLowering::DAGCombinerInfo &DCI,
13536 const ARMSubtarget *Subtarget){
13537 // Attempt to create vpadd for this add.
13538 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13539 return Result;
13540
13541 // Attempt to create vpaddl for this add.
13542 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13543 return Result;
13544 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13545 Subtarget))
13546 return Result;
13547
13548 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13549 if (N0.getNode()->hasOneUse())
13550 if (SDValue Result = combineSelectAndUse(N, Slct: N0, OtherOp: N1, DCI))
13551 return Result;
13552 return SDValue();
13553}
13554
13555static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13556 EVT VT = N->getValueType(ResNo: 0);
13557 SDValue N0 = N->getOperand(Num: 0);
13558 SDValue N1 = N->getOperand(Num: 1);
13559 SDLoc dl(N);
13560
13561 auto IsVecReduce = [](SDValue Op) {
13562 switch (Op.getOpcode()) {
13563 case ISD::VECREDUCE_ADD:
13564 case ARMISD::VADDVs:
13565 case ARMISD::VADDVu:
13566 case ARMISD::VMLAVs:
13567 case ARMISD::VMLAVu:
13568 return true;
13569 }
13570 return false;
13571 };
13572
13573 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13574 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13575 // add(add(X, vecreduce(Y)), vecreduce(Z))
13576 // to make better use of vaddva style instructions.
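// (VADDVA accumulates a vector reduction into an existing scalar register, so
// keeping one scalar operand next to each reduction lets each reduction become
// a single VADDVA.)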
13577 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13578 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13579 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13580 SDValue Add0 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: N0, N2: N1.getOperand(i: 0));
13581 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add0, N2: N1.getOperand(i: 1));
13582 }
13583 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13584 // add(add(add(A, C), reduce(B)), reduce(D))
13585 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13586 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13587 unsigned N0RedOp = 0;
13588 if (!IsVecReduce(N0.getOperand(i: N0RedOp))) {
13589 N0RedOp = 1;
13590 if (!IsVecReduce(N0.getOperand(i: N0RedOp)))
13591 return SDValue();
13592 }
13593
13594 unsigned N1RedOp = 0;
13595 if (!IsVecReduce(N1.getOperand(i: N1RedOp)))
13596 N1RedOp = 1;
13597 if (!IsVecReduce(N1.getOperand(i: N1RedOp)))
13598 return SDValue();
13599
13600 SDValue Add0 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: N0.getOperand(i: 1 - N0RedOp),
13601 N2: N1.getOperand(i: 1 - N1RedOp));
13602 SDValue Add1 =
13603 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add0, N2: N0.getOperand(i: N0RedOp));
13604 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add1, N2: N1.getOperand(i: N1RedOp));
13605 }
13606 return SDValue();
13607 };
13608 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13609 return R;
13610 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13611 return R;
13612
13613 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13614 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13615 // by ascending load offsets. This can help cores prefetch if the order of
13616 // loads is more predictable.
13617 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13618 // Check if two reductions are known to load data where one is before/after
13619 // another. Return negative if N0 loads data before N1, positive if N1 is
13620 // before N0, and 0 if nothing is known.
13621 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13622 // Look through to the first operand of a MUL, for the VMLA case.
13623 // Currently only looks at the first operand, in the hope they are equal.
13624 if (N0.getOpcode() == ISD::MUL)
13625 N0 = N0.getOperand(i: 0);
13626 if (N1.getOpcode() == ISD::MUL)
13627 N1 = N1.getOperand(i: 0);
13628
13629 // Return true if the two operands are loads to the same object and the
13630 // offset of the first is known to be less than the offset of the second.
13631 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(Val&: N0);
13632 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(Val&: N1);
13633 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13634 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13635 Load1->isIndexed())
13636 return 0;
13637
13638 auto BaseLocDecomp0 = BaseIndexOffset::match(N: Load0, DAG);
13639 auto BaseLocDecomp1 = BaseIndexOffset::match(N: Load1, DAG);
13640
13641 if (!BaseLocDecomp0.getBase() ||
13642 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13643 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13644 return 0;
13645 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13646 return -1;
13647 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13648 return 1;
13649 return 0;
13650 };
13651
13652 SDValue X;
13653 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13654 if (IsVecReduce(N0.getOperand(i: 0)) && IsVecReduce(N0.getOperand(i: 1))) {
13655 int IsBefore = IsKnownOrderedLoad(N0.getOperand(i: 0).getOperand(i: 0),
13656 N0.getOperand(i: 1).getOperand(i: 0));
13657 if (IsBefore < 0) {
13658 X = N0.getOperand(i: 0);
13659 N0 = N0.getOperand(i: 1);
13660 } else if (IsBefore > 0) {
13661 X = N0.getOperand(i: 1);
13662 N0 = N0.getOperand(i: 0);
13663 } else
13664 return SDValue();
13665 } else if (IsVecReduce(N0.getOperand(i: 0))) {
13666 X = N0.getOperand(i: 1);
13667 N0 = N0.getOperand(i: 0);
13668 } else if (IsVecReduce(N0.getOperand(i: 1))) {
13669 X = N0.getOperand(i: 0);
13670 N0 = N0.getOperand(i: 1);
13671 } else
13672 return SDValue();
13673 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13674 IsKnownOrderedLoad(N0.getOperand(i: 0), N1.getOperand(i: 0)) < 0) {
13675 // Note this is backwards from what you might expect. We create
13676 // add(reduce(load + 16), reduce(load + 0)) so that the
13677 // add(reduce(load + 16), X) is combined into VADDVA(X, load + 16),
13678 // leaving the X as VADDV(load + 0).
13679 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1, N2: N0);
13680 } else
13681 return SDValue();
13682
13683 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13684 return SDValue();
13685
13686 if (IsKnownOrderedLoad(N1.getOperand(i: 0), N0.getOperand(i: 0)) >= 0)
13687 return SDValue();
13688
13689 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13690 SDValue Add0 = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: X, N2: N1);
13691 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: Add0, N2: N0);
13692 };
13693 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13694 return R;
13695 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13696 return R;
13697 return SDValue();
13698}
13699
13700static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13701 const ARMSubtarget *Subtarget) {
13702 if (!Subtarget->hasMVEIntegerOps())
13703 return SDValue();
13704
13705 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13706 return R;
13707
13708 EVT VT = N->getValueType(ResNo: 0);
13709 SDValue N0 = N->getOperand(Num: 0);
13710 SDValue N1 = N->getOperand(Num: 1);
13711 SDLoc dl(N);
13712
13713 if (VT != MVT::i64)
13714 return SDValue();
13715
13716 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13717 // will look like:
13718 // t1: i32,i32 = ARMISD::VADDLVs x
13719 // t2: i64 = build_pair t1, t1:1
13720 // t3: i64 = add t2, y
13721 // For the accumulating VADDLVAx forms we also try to push the add up above
13722 // the reduction, to potentially allow the add to be simplified separately.
13723 // We also need to check for sext / zext and commutative adds.
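  // For example, the pattern above is rewritten (roughly) into:
  //   ylo, yhi = the two i32 halves of y
  //   t4: i32,i32 = ARMISD::VADDLVAs ylo, yhi, x
  //   t5: i64 = build_pair t4, t4:1
  // so that the scalar add is folded into the accumulating reduction.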
13724 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13725 SDValue NB) {
13726 if (NB->getOpcode() != ISD::BUILD_PAIR)
13727 return SDValue();
13728 SDValue VecRed = NB->getOperand(Num: 0);
13729 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13730 VecRed.getResNo() != 0 ||
13731 NB->getOperand(Num: 1) != SDValue(VecRed.getNode(), 1))
13732 return SDValue();
13733
13734 if (VecRed->getOpcode() == OpcodeA) {
13735 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13736 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13737 VecRed.getOperand(0), VecRed.getOperand(1));
13738 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13739 }
13740
13741 SmallVector<SDValue, 4> Ops(2);
13742 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13743
13744 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13745 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13746 Ops.push_back(Elt: VecRed->getOperand(Num: I));
13747 SDValue Red =
13748 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13749 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13750 SDValue(Red.getNode(), 1));
13751 };
13752
13753 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13754 return M;
13755 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13756 return M;
13757 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13758 return M;
13759 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13760 return M;
13761 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13762 return M;
13763 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13764 return M;
13765 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13766 return M;
13767 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13768 return M;
13769 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13770 return M;
13771 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13772 return M;
13773 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13774 return M;
13775 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13776 return M;
13777 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13778 return M;
13779 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13780 return M;
13781 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13782 return M;
13783 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13784 return M;
13785 return SDValue();
13786}
13787
13788bool
13789ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13790 CombineLevel Level) const {
13791 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13792 N->getOpcode() == ISD::SRL) &&
13793 "Expected shift op");
13794
13795 if (Level == BeforeLegalizeTypes)
13796 return true;
13797
13798 if (N->getOpcode() != ISD::SHL)
13799 return true;
13800
13801 if (Subtarget->isThumb1Only()) {
13802 // Avoid making expensive immediates by commuting shifts. (This logic
13803 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13804 // for free.)
13805 if (N->getOpcode() != ISD::SHL)
13806 return true;
13807 SDValue N1 = N->getOperand(Num: 0);
13808 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13809 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13810 return true;
13811 if (auto *Const = dyn_cast<ConstantSDNode>(Val: N1->getOperand(Num: 1))) {
13812 if (Const->getAPIntValue().ult(RHS: 256))
13813 return false;
13814 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(RHS: 0) &&
13815 Const->getAPIntValue().sgt(RHS: -256))
13816 return false;
13817 }
13818 return true;
13819 }
13820
13821 // Turn off commute-with-shift transform after legalization, so it doesn't
13822 // conflict with PerformSHLSimplify. (We could try to detect when
13823 // PerformSHLSimplify would trigger more precisely, but it isn't
13824 // really necessary.)
13825 return false;
13826}
13827
13828bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13829 const SDNode *N) const {
13830 assert(N->getOpcode() == ISD::XOR &&
13831 (N->getOperand(0).getOpcode() == ISD::SHL ||
13832 N->getOperand(0).getOpcode() == ISD::SRL) &&
13833 "Expected XOR(SHIFT) pattern");
13834
13835 // Only commute if the entire NOT mask is a hidden shifted mask.
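  // For example, with (xor (shl X, 8), 0xffffff00) on i32, the NOT mask
  // 0xffffff00 is a shifted mask of 24 ones starting at bit 8, which satisfies
  // the SHL case below, so the commute is considered desirable.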
13836 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13837 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
13838 if (XorC && ShiftC) {
13839 unsigned MaskIdx, MaskLen;
13840 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13841 unsigned ShiftAmt = ShiftC->getZExtValue();
13842 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
13843 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
13844 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13845 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13846 }
13847 }
13848
13849 return false;
13850}
13851
13852bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13853 const SDNode *N, CombineLevel Level) const {
13854 assert(((N->getOpcode() == ISD::SHL &&
13855 N->getOperand(0).getOpcode() == ISD::SRL) ||
13856 (N->getOpcode() == ISD::SRL &&
13857 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13858 "Expected shift-shift mask");
13859
13860 if (!Subtarget->isThumb1Only())
13861 return true;
13862
13863 if (Level == BeforeLegalizeTypes)
13864 return true;
13865
13866 return false;
13867}
13868
13869bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
13870 EVT VT) const {
13871 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT);
13872}
13873
13874bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13875 if (!Subtarget->hasNEON()) {
13876 if (Subtarget->isThumb1Only())
13877 return VT.getScalarSizeInBits() <= 32;
13878 return true;
13879 }
13880 return VT.isScalarInteger();
13881}
13882
13883bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13884 EVT VT) const {
13885 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13886 return false;
13887
13888 switch (FPVT.getSimpleVT().SimpleTy) {
13889 case MVT::f16:
13890 return Subtarget->hasVFP2Base();
13891 case MVT::f32:
13892 return Subtarget->hasVFP2Base();
13893 case MVT::f64:
13894 return Subtarget->hasFP64();
13895 case MVT::v4f32:
13896 case MVT::v8f16:
13897 return Subtarget->hasMVEFloatOps();
13898 default:
13899 return false;
13900 }
13901}
13902
13903static SDValue PerformSHLSimplify(SDNode *N,
13904 TargetLowering::DAGCombinerInfo &DCI,
13905 const ARMSubtarget *ST) {
13906 // Allow the generic combiner to identify potential bswaps.
13907 if (DCI.isBeforeLegalize())
13908 return SDValue();
13909
13910 // DAG combiner will fold:
13911 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13912 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13913 // Other code patterns that can also be modified have the following form:
13914 // b + ((a << 1) | 510)
13915 // b + ((a << 1) & 510)
13916 // b + ((a << 1) ^ 510)
13917 // b + ((a << 1) + 510)
13918
13919 // Many instructions can perform the shift for free, but it requires both
13920 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13921 // instruction will be needed. So, unfold back to the original pattern if:
13922 // - c1 and c2 are small enough that they don't require mov imms.
13923 // - the user(s) of the node can perform an shl
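  // For example, add(b, (or (shl a, 1), 510)) can be unfolded back into
  // add(b, (shl (or a, 255), 1)): 255 is still a small immediate and the user
  // add can apply the final shl for free as a shifted operand.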
13924
13925 // No shifted operands for 16-bit instructions.
13926 if (ST->isThumb() && ST->isThumb1Only())
13927 return SDValue();
13928
13929 // Check that all the users could perform the shl themselves.
13930 for (auto *U : N->uses()) {
13931 switch(U->getOpcode()) {
13932 default:
13933 return SDValue();
13934 case ISD::SUB:
13935 case ISD::ADD:
13936 case ISD::AND:
13937 case ISD::OR:
13938 case ISD::XOR:
13939 case ISD::SETCC:
13940 case ARMISD::CMP:
13941 // Check that the user isn't already using a constant because there
13942 // aren't any instructions that support an immediate operand and a
13943 // shifted operand.
13944 if (isa<ConstantSDNode>(Val: U->getOperand(Num: 0)) ||
13945 isa<ConstantSDNode>(Val: U->getOperand(Num: 1)))
13946 return SDValue();
13947
13948 // Check that it's not already using a shift.
13949 if (U->getOperand(Num: 0).getOpcode() == ISD::SHL ||
13950 U->getOperand(Num: 1).getOpcode() == ISD::SHL)
13951 return SDValue();
13952 break;
13953 }
13954 }
13955
13956 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13957 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13958 return SDValue();
13959
13960 if (N->getOperand(Num: 0).getOpcode() != ISD::SHL)
13961 return SDValue();
13962
13963 SDValue SHL = N->getOperand(Num: 0);
13964
13965 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13966 auto *C2 = dyn_cast<ConstantSDNode>(Val: SHL.getOperand(i: 1));
13967 if (!C1ShlC2 || !C2)
13968 return SDValue();
13969
13970 APInt C2Int = C2->getAPIntValue();
13971 APInt C1Int = C1ShlC2->getAPIntValue();
13972 unsigned C2Width = C2Int.getBitWidth();
13973 if (C2Int.uge(RHS: C2Width))
13974 return SDValue();
13975 uint64_t C2Value = C2Int.getZExtValue();
13976
13977 // Check that performing a lshr will not lose any information.
13978 APInt Mask = APInt::getHighBitsSet(numBits: C2Width, hiBitsSet: C2Width - C2Value);
13979 if ((C1Int & Mask) != C1Int)
13980 return SDValue();
13981
13982 // Shift the first constant.
13983 C1Int.lshrInPlace(ShiftAmt: C2Int);
13984
13985 // The immediates are encoded as an 8-bit value that can be rotated.
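  // (For example, 0xff00 spans 8 significant bits and is acceptable, while
  // 0x101 spans 9 bits from lowest to highest set bit and is rejected.)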
13986 auto LargeImm = [](const APInt &Imm) {
13987 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13988 return Imm.getBitWidth() - Zeros > 8;
13989 };
13990
13991 if (LargeImm(C1Int) || LargeImm(C2Int))
13992 return SDValue();
13993
13994 SelectionDAG &DAG = DCI.DAG;
13995 SDLoc dl(N);
13996 SDValue X = SHL.getOperand(i: 0);
13997 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13998 DAG.getConstant(C1Int, dl, MVT::i32));
13999 // Shift left to compensate for the lshr of C1Int.
14000 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
14001
14002 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
14003 SHL.dump(); N->dump());
14004 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
14005 return Res;
14006}
14007
14008
14009/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
14010///
14011static SDValue PerformADDCombine(SDNode *N,
14012 TargetLowering::DAGCombinerInfo &DCI,
14013 const ARMSubtarget *Subtarget) {
14014 SDValue N0 = N->getOperand(Num: 0);
14015 SDValue N1 = N->getOperand(Num: 1);
14016
14017 // Only works one way, because it needs an immediate operand.
14018 if (SDValue Result = PerformSHLSimplify(N, DCI, ST: Subtarget))
14019 return Result;
14020
14021 if (SDValue Result = PerformADDVecReduce(N, DAG&: DCI.DAG, Subtarget))
14022 return Result;
14023
14024 // First try with the default operand order.
14025 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
14026 return Result;
14027
14028 // If that didn't work, try again with the operands commuted.
14029 return PerformADDCombineWithOperands(N, N0: N1, N1: N0, DCI, Subtarget);
14030}
14031
14032// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
14033// providing -X is as cheap as X (currently, just a constant).
14034static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
14035 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
14036 return SDValue();
14037 SDValue CSINC = N->getOperand(Num: 1);
14038 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14039 return SDValue();
14040
14041 ConstantSDNode *X = dyn_cast<ConstantSDNode>(Val: CSINC.getOperand(i: 0));
14042 if (!X)
14043 return SDValue();
14044
14045 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14046 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14047 CSINC.getOperand(0)),
14048 CSINC.getOperand(1), CSINC.getOperand(2),
14049 CSINC.getOperand(3));
14050}
14051
14052/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14053///
14054static SDValue PerformSUBCombine(SDNode *N,
14055 TargetLowering::DAGCombinerInfo &DCI,
14056 const ARMSubtarget *Subtarget) {
14057 SDValue N0 = N->getOperand(Num: 0);
14058 SDValue N1 = N->getOperand(Num: 1);
14059
14060 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub x, c))
14061 if (N1.getNode()->hasOneUse())
14062 if (SDValue Result = combineSelectAndUse(N, Slct: N1, OtherOp: N0, DCI))
14063 return Result;
14064
14065 if (SDValue R = PerformSubCSINCCombine(N, DAG&: DCI.DAG))
14066 return R;
14067
14068 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(ResNo: 0).isVector())
14069 return SDValue();
14070
14071 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14072 // so that we can readily pattern match more mve instructions which can use
14073 // a scalar operand.
14074 SDValue VDup = N->getOperand(Num: 1);
14075 if (VDup->getOpcode() != ARMISD::VDUP)
14076 return SDValue();
14077
14078 SDValue VMov = N->getOperand(Num: 0);
14079 if (VMov->getOpcode() == ISD::BITCAST)
14080 VMov = VMov->getOperand(Num: 0);
14081
14082 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(N: VMov))
14083 return SDValue();
14084
14085 SDLoc dl(N);
14086 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14087 DCI.DAG.getConstant(0, dl, MVT::i32),
14088 VDup->getOperand(0));
14089 return DCI.DAG.getNode(Opcode: ARMISD::VDUP, DL: dl, VT: N->getValueType(ResNo: 0), Operand: Negate);
14090}
14091
14092/// PerformVMULCombine
14093/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14094/// special multiplier accumulator forwarding.
14095/// vmul d3, d0, d2
14096/// vmla d3, d1, d2
14097/// is faster than
14098/// vadd d3, d0, d1
14099/// vmul d3, d3, d2
14100// However, for (A + B) * (A + B),
14101// vadd d2, d0, d1
14102// vmul d3, d0, d2
14103// vmla d3, d1, d2
14104// is slower than
14105// vadd d2, d0, d1
14106// vmul d3, d2, d2
14107static SDValue PerformVMULCombine(SDNode *N,
14108 TargetLowering::DAGCombinerInfo &DCI,
14109 const ARMSubtarget *Subtarget) {
14110 if (!Subtarget->hasVMLxForwarding())
14111 return SDValue();
14112
14113 SelectionDAG &DAG = DCI.DAG;
14114 SDValue N0 = N->getOperand(Num: 0);
14115 SDValue N1 = N->getOperand(Num: 1);
14116 unsigned Opcode = N0.getOpcode();
14117 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14118 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14119 Opcode = N1.getOpcode();
14120 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14121 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14122 return SDValue();
14123 std::swap(a&: N0, b&: N1);
14124 }
14125
14126 if (N0 == N1)
14127 return SDValue();
14128
14129 EVT VT = N->getValueType(ResNo: 0);
14130 SDLoc DL(N);
14131 SDValue N00 = N0->getOperand(Num: 0);
14132 SDValue N01 = N0->getOperand(Num: 1);
14133 return DAG.getNode(Opcode, DL, VT,
14134 N1: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N00, N2: N1),
14135 N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N01, N2: N1));
14136}
14137
14138static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14139 const ARMSubtarget *Subtarget) {
14140 EVT VT = N->getValueType(ResNo: 0);
14141 if (VT != MVT::v2i64)
14142 return SDValue();
14143
14144 SDValue N0 = N->getOperand(Num: 0);
14145 SDValue N1 = N->getOperand(Num: 1);
14146
14147 auto IsSignExt = [&](SDValue Op) {
14148 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14149 return SDValue();
14150 EVT VT = cast<VTSDNode>(Val: Op->getOperand(Num: 1))->getVT();
14151 if (VT.getScalarSizeInBits() == 32)
14152 return Op->getOperand(Num: 0);
14153 return SDValue();
14154 };
14155 auto IsZeroExt = [&](SDValue Op) {
14156 // Zero extends are a little more awkward. At the point we are matching
14157 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14158 // That might be before or after a bitcast depending on how the AND is
14159 // placed. Because this has to look through bitcasts, it is currently only
14160 // supported on LE.
14161 if (!Subtarget->isLittle())
14162 return SDValue();
14163
14164 SDValue And = Op;
14165 if (And->getOpcode() == ISD::BITCAST)
14166 And = And->getOperand(Num: 0);
14167 if (And->getOpcode() != ISD::AND)
14168 return SDValue();
14169 SDValue Mask = And->getOperand(Num: 1);
14170 if (Mask->getOpcode() == ISD::BITCAST)
14171 Mask = Mask->getOperand(Num: 0);
14172
14173 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14174 Mask.getValueType() != MVT::v4i32)
14175 return SDValue();
14176 if (isAllOnesConstant(V: Mask->getOperand(Num: 0)) &&
14177 isNullConstant(V: Mask->getOperand(Num: 1)) &&
14178 isAllOnesConstant(V: Mask->getOperand(Num: 2)) &&
14179 isNullConstant(V: Mask->getOperand(Num: 3)))
14180 return And->getOperand(Num: 0);
14181 return SDValue();
14182 };
14183
14184 SDLoc dl(N);
14185 if (SDValue Op0 = IsSignExt(N0)) {
14186 if (SDValue Op1 = IsSignExt(N1)) {
14187 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14188 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14189 return DAG.getNode(Opcode: ARMISD::VMULLs, DL: dl, VT, N1: New0a, N2: New1a);
14190 }
14191 }
14192 if (SDValue Op0 = IsZeroExt(N0)) {
14193 if (SDValue Op1 = IsZeroExt(N1)) {
14194 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14195 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14196 return DAG.getNode(Opcode: ARMISD::VMULLu, DL: dl, VT, N1: New0a, N2: New1a);
14197 }
14198 }
14199
14200 return SDValue();
14201}
14202
14203static SDValue PerformMULCombine(SDNode *N,
14204 TargetLowering::DAGCombinerInfo &DCI,
14205 const ARMSubtarget *Subtarget) {
14206 SelectionDAG &DAG = DCI.DAG;
14207
14208 EVT VT = N->getValueType(ResNo: 0);
14209 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14210 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14211
14212 if (Subtarget->isThumb1Only())
14213 return SDValue();
14214
14215 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14216 return SDValue();
14217
14218 if (VT.is64BitVector() || VT.is128BitVector())
14219 return PerformVMULCombine(N, DCI, Subtarget);
14220 if (VT != MVT::i32)
14221 return SDValue();
14222
14223 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
14224 if (!C)
14225 return SDValue();
14226
14227 int64_t MulAmt = C->getSExtValue();
14228 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(Val: MulAmt);
14229
14230 ShiftAmt = ShiftAmt & (32 - 1);
14231 SDValue V = N->getOperand(Num: 0);
14232 SDLoc DL(N);
14233
14234 SDValue Res;
14235 MulAmt >>= ShiftAmt;
14236
14237 if (MulAmt >= 0) {
14238 if (llvm::has_single_bit<uint32_t>(Value: MulAmt - 1)) {
14239 // (mul x, 2^N + 1) => (add (shl x, N), x)
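      // e.g. (mul x, 9) => (add (shl x, 3), x)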
14240 Res = DAG.getNode(ISD::ADD, DL, VT,
14241 V,
14242 DAG.getNode(ISD::SHL, DL, VT,
14243 V,
14244 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14245 MVT::i32)));
14246 } else if (llvm::has_single_bit<uint32_t>(Value: MulAmt + 1)) {
14247 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14248 Res = DAG.getNode(ISD::SUB, DL, VT,
14249 DAG.getNode(ISD::SHL, DL, VT,
14250 V,
14251 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14252 MVT::i32)),
14253 V);
14254 } else
14255 return SDValue();
14256 } else {
14257 uint64_t MulAmtAbs = -MulAmt;
14258 if (llvm::has_single_bit<uint32_t>(Value: MulAmtAbs + 1)) {
14259 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
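      // e.g. (mul x, -7) => (sub x, (shl x, 3))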
14260 Res = DAG.getNode(ISD::SUB, DL, VT,
14261 V,
14262 DAG.getNode(ISD::SHL, DL, VT,
14263 V,
14264 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14265 MVT::i32)));
14266 } else if (llvm::has_single_bit<uint32_t>(Value: MulAmtAbs - 1)) {
14267 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14268 Res = DAG.getNode(ISD::ADD, DL, VT,
14269 V,
14270 DAG.getNode(ISD::SHL, DL, VT,
14271 V,
14272 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14273 MVT::i32)));
14274 Res = DAG.getNode(ISD::SUB, DL, VT,
14275 DAG.getConstant(0, DL, MVT::i32), Res);
14276 } else
14277 return SDValue();
14278 }
14279
14280 if (ShiftAmt != 0)
14281 Res = DAG.getNode(ISD::SHL, DL, VT,
14282 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14283
14284 // Do not add new nodes to DAG combiner worklist.
14285 DCI.CombineTo(N, Res, AddTo: false);
14286 return SDValue();
14287}
14288
14289static SDValue CombineANDShift(SDNode *N,
14290 TargetLowering::DAGCombinerInfo &DCI,
14291 const ARMSubtarget *Subtarget) {
14292 // Allow DAGCombine to pattern-match before we touch the canonical form.
14293 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14294 return SDValue();
14295
14296 if (N->getValueType(0) != MVT::i32)
14297 return SDValue();
14298
14299 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
14300 if (!N1C)
14301 return SDValue();
14302
14303 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14304 // Don't transform uxtb/uxth.
14305 if (C1 == 255 || C1 == 65535)
14306 return SDValue();
14307
14308 SDNode *N0 = N->getOperand(Num: 0).getNode();
14309 if (!N0->hasOneUse())
14310 return SDValue();
14311
14312 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14313 return SDValue();
14314
14315 bool LeftShift = N0->getOpcode() == ISD::SHL;
14316
14317 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
14318 if (!N01C)
14319 return SDValue();
14320
14321 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14322 if (!C2 || C2 >= 32)
14323 return SDValue();
14324
14325 // Clear irrelevant bits in the mask.
14326 if (LeftShift)
14327 C1 &= (-1U << C2);
14328 else
14329 C1 &= (-1U >> C2);
14330
14331 SelectionDAG &DAG = DCI.DAG;
14332 SDLoc DL(N);
14333
14334 // We have a pattern of the form "(and (shl x, c2) c1)" or
14335 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14336 // transform to a pair of shifts, to save materializing c1.
14337
14338 // First pattern: right shift, then mask off leading bits.
14339 // FIXME: Use demanded bits?
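  // For example, (and (srl x, 3), 0x1f) becomes (srl (shl x, 24), 27), which
  // avoids materializing the 0x1f mask in a register.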
14340 if (!LeftShift && isMask_32(Value: C1)) {
14341 uint32_t C3 = llvm::countl_zero(Val: C1);
14342 if (C2 < C3) {
14343 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14344 DAG.getConstant(C3 - C2, DL, MVT::i32));
14345 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14346 DAG.getConstant(C3, DL, MVT::i32));
14347 }
14348 }
14349
14350 // First pattern, reversed: left shift, then mask off trailing bits.
14351 if (LeftShift && isMask_32(Value: ~C1)) {
14352 uint32_t C3 = llvm::countr_zero(Val: C1);
14353 if (C2 < C3) {
14354 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14355 DAG.getConstant(C3 - C2, DL, MVT::i32));
14356 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14357 DAG.getConstant(C3, DL, MVT::i32));
14358 }
14359 }
14360
14361 // Second pattern: left shift, then mask off leading bits.
14362 // FIXME: Use demanded bits?
14363 if (LeftShift && isShiftedMask_32(Value: C1)) {
14364 uint32_t Trailing = llvm::countr_zero(Val: C1);
14365 uint32_t C3 = llvm::countl_zero(Val: C1);
14366 if (Trailing == C2 && C2 + C3 < 32) {
14367 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14368 DAG.getConstant(C2 + C3, DL, MVT::i32));
14369 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14370 DAG.getConstant(C3, DL, MVT::i32));
14371 }
14372 }
14373
14374 // Second pattern, reversed: right shift, then mask off trailing bits.
14375 // FIXME: Handle other patterns of known/demanded bits.
14376 if (!LeftShift && isShiftedMask_32(Value: C1)) {
14377 uint32_t Leading = llvm::countl_zero(Val: C1);
14378 uint32_t C3 = llvm::countr_zero(Val: C1);
14379 if (Leading == C2 && C2 + C3 < 32) {
14380 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14381 DAG.getConstant(C2 + C3, DL, MVT::i32));
14382 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14383 DAG.getConstant(C3, DL, MVT::i32));
14384 }
14385 }
14386
14387 // FIXME: Transform "(and (shl x, c2) c1)" ->
14388 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
14389 // c1.
14390 return SDValue();
14391}
14392
14393static SDValue PerformANDCombine(SDNode *N,
14394 TargetLowering::DAGCombinerInfo &DCI,
14395 const ARMSubtarget *Subtarget) {
14396 // Attempt to use immediate-form VBIC
14397 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 1));
14398 SDLoc dl(N);
14399 EVT VT = N->getValueType(ResNo: 0);
14400 SelectionDAG &DAG = DCI.DAG;
14401
14402 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14403 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14404 return SDValue();
14405
14406 APInt SplatBits, SplatUndef;
14407 unsigned SplatBitSize;
14408 bool HasAnyUndefs;
14409 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14410 BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14411 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14412 SplatBitSize == 64) {
14413 EVT VbicVT;
14414 SDValue Val = isVMOVModifiedImm(SplatBits: (~SplatBits).getZExtValue(),
14415 SplatUndef: SplatUndef.getZExtValue(), SplatBitSize,
14416 DAG, dl, VT&: VbicVT, VectorVT: VT, type: OtherModImm);
14417 if (Val.getNode()) {
14418 SDValue Input =
14419 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VbicVT, Operand: N->getOperand(Num: 0));
14420 SDValue Vbic = DAG.getNode(Opcode: ARMISD::VBICIMM, DL: dl, VT: VbicVT, N1: Input, N2: Val);
14421 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vbic);
14422 }
14423 }
14424 }
14425
14426 if (!Subtarget->isThumb1Only()) {
14427 // fold (and (select cc, -1, c), x) -> (select cc, x, (and x, c))
14428 if (SDValue Result = combineSelectAndUseCommutative(N, AllOnes: true, DCI))
14429 return Result;
14430
14431 if (SDValue Result = PerformSHLSimplify(N, DCI, ST: Subtarget))
14432 return Result;
14433 }
14434
14435 if (Subtarget->isThumb1Only())
14436 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14437 return Result;
14438
14439 return SDValue();
14440}
14441
14442// Try combining OR nodes to SMULWB, SMULWT.
14443static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14444 TargetLowering::DAGCombinerInfo &DCI,
14445 const ARMSubtarget *Subtarget) {
14446 if (!Subtarget->hasV6Ops() ||
14447 (Subtarget->isThumb() &&
14448 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14449 return SDValue();
14450
14451 SDValue SRL = OR->getOperand(Num: 0);
14452 SDValue SHL = OR->getOperand(Num: 1);
14453
14454 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14455 SRL = OR->getOperand(Num: 1);
14456 SHL = OR->getOperand(Num: 0);
14457 }
14458 if (!isSRL16(Op: SRL) || !isSHL16(Op: SHL))
14459 return SDValue();
14460
14461 // The first operands to the shifts need to be the two results from the
14462 // same smul_lohi node.
14463 if ((SRL.getOperand(i: 0).getNode() != SHL.getOperand(i: 0).getNode()) ||
14464 SRL.getOperand(i: 0).getOpcode() != ISD::SMUL_LOHI)
14465 return SDValue();
14466
14467 SDNode *SMULLOHI = SRL.getOperand(i: 0).getNode();
14468 if (SRL.getOperand(i: 0) != SDValue(SMULLOHI, 0) ||
14469 SHL.getOperand(i: 0) != SDValue(SMULLOHI, 1))
14470 return SDValue();
14471
14472 // Now we have:
14473 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14474 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14475 // For SMULWB the 16-bit value will be sign-extended somehow.
14476 // For SMULWT only the SRA is required.
14477 // Check both sides of SMUL_LOHI
14478 SDValue OpS16 = SMULLOHI->getOperand(Num: 0);
14479 SDValue OpS32 = SMULLOHI->getOperand(Num: 1);
14480
14481 SelectionDAG &DAG = DCI.DAG;
14482 if (!isS16(Op: OpS16, DAG) && !isSRA16(Op: OpS16)) {
14483 OpS16 = OpS32;
14484 OpS32 = SMULLOHI->getOperand(Num: 0);
14485 }
14486
14487 SDLoc dl(OR);
14488 unsigned Opcode = 0;
14489 if (isS16(Op: OpS16, DAG))
14490 Opcode = ARMISD::SMULWB;
14491 else if (isSRA16(Op: OpS16)) {
14492 Opcode = ARMISD::SMULWT;
14493 OpS16 = OpS16->getOperand(Num: 0);
14494 }
14495 else
14496 return SDValue();
14497
14498 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14499 DAG.ReplaceAllUsesOfValueWith(From: SDValue(OR, 0), To: Res);
14500 return SDValue(OR, 0);
14501}
14502
14503static SDValue PerformORCombineToBFI(SDNode *N,
14504 TargetLowering::DAGCombinerInfo &DCI,
14505 const ARMSubtarget *Subtarget) {
14506 // BFI is only available on V6T2+
14507 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14508 return SDValue();
14509
14510 EVT VT = N->getValueType(ResNo: 0);
14511 SDValue N0 = N->getOperand(Num: 0);
14512 SDValue N1 = N->getOperand(Num: 1);
14513 SelectionDAG &DAG = DCI.DAG;
14514 SDLoc DL(N);
14515 // 1) or (and A, mask), val => ARMbfi A, val, mask
14516 // iff (val & mask) == val
14517 //
14518 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14519 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14520 // && mask == ~mask2
14521 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14522 // && ~mask == mask2
14523 // (i.e., copy a bitfield value into another bitfield of the same width)
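  // For example, for case (1):
  //   (or (and A, 0xffff00ff), 0x3400) => ARMbfi A, 0x34, 0xffff00ff
  // which inserts the value 0x34 into bits [15:8] of A.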
14524
14525 if (VT != MVT::i32)
14526 return SDValue();
14527
14528 SDValue N00 = N0.getOperand(i: 0);
14529
14530 // The value and the mask need to be constants so we can verify this is
14531 // actually a bitfield set. If the mask is 0xffff, we can do better
14532 // via a movt instruction, so don't use BFI in that case.
14533 SDValue MaskOp = N0.getOperand(i: 1);
14534 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Val&: MaskOp);
14535 if (!MaskC)
14536 return SDValue();
14537 unsigned Mask = MaskC->getZExtValue();
14538 if (Mask == 0xffff)
14539 return SDValue();
14540 SDValue Res;
14541 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14542 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Val&: N1);
14543 if (N1C) {
14544 unsigned Val = N1C->getZExtValue();
14545 if ((Val & ~Mask) != Val)
14546 return SDValue();
14547
14548 if (ARM::isBitFieldInvertedMask(v: Mask)) {
14549 Val >>= llvm::countr_zero(Val: ~Mask);
14550
14551 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14552 DAG.getConstant(Val, DL, MVT::i32),
14553 DAG.getConstant(Mask, DL, MVT::i32));
14554
14555 DCI.CombineTo(N, Res, AddTo: false);
14556 // Return value from the original node to inform the combiner that N is
14557 // now dead.
14558 return SDValue(N, 0);
14559 }
14560 } else if (N1.getOpcode() == ISD::AND) {
14561 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14562 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(Val: N1.getOperand(i: 1));
14563 if (!N11C)
14564 return SDValue();
14565 unsigned Mask2 = N11C->getZExtValue();
14566
14567 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14568 // as is to match.
14569 if (ARM::isBitFieldInvertedMask(v: Mask) &&
14570 (Mask == ~Mask2)) {
14571 // The pack halfword instruction works better for masks that fit it,
14572 // so use that when it's available.
14573 if (Subtarget->hasDSP() &&
14574 (Mask == 0xffff || Mask == 0xffff0000))
14575 return SDValue();
14576 // 2a
14577 unsigned amt = llvm::countr_zero(Val: Mask2);
14578 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14579 DAG.getConstant(amt, DL, MVT::i32));
14580 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14581 DAG.getConstant(Mask, DL, MVT::i32));
14582 DCI.CombineTo(N, Res, AddTo: false);
14583 // Return value from the original node to inform the combiner that N is
14584 // now dead.
14585 return SDValue(N, 0);
14586 } else if (ARM::isBitFieldInvertedMask(v: ~Mask) &&
14587 (~Mask == Mask2)) {
14588 // The pack halfword instruction works better for masks that fit it,
14589 // so use that when it's available.
14590 if (Subtarget->hasDSP() &&
14591 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14592 return SDValue();
14593 // 2b
14594 unsigned lsb = llvm::countr_zero(Val: Mask);
14595 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14596 DAG.getConstant(lsb, DL, MVT::i32));
14597 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14598 DAG.getConstant(Mask2, DL, MVT::i32));
14599 DCI.CombineTo(N, Res, AddTo: false);
14600 // Return value from the original node to inform the combiner that N is
14601 // now dead.
14602 return SDValue(N, 0);
14603 }
14604 }
14605
14606 if (DAG.MaskedValueIsZero(Op: N1, Mask: MaskC->getAPIntValue()) &&
14607 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(Val: N00.getOperand(i: 1)) &&
14608 ARM::isBitFieldInvertedMask(v: ~Mask)) {
14609 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14610 // where lsb(mask) == #shamt and masked bits of B are known zero.
14611 SDValue ShAmt = N00.getOperand(i: 1);
14612 unsigned ShAmtC = ShAmt->getAsZExtVal();
14613 unsigned LSB = llvm::countr_zero(Val: Mask);
14614 if (ShAmtC != LSB)
14615 return SDValue();
14616
14617 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14618 DAG.getConstant(~Mask, DL, MVT::i32));
14619
14620 DCI.CombineTo(N, Res, AddTo: false);
14621 // Return value from the original node to inform the combiner that N is
14622 // now dead.
14623 return SDValue(N, 0);
14624 }
14625
14626 return SDValue();
14627}
14628
14629static bool isValidMVECond(unsigned CC, bool IsFloat) {
14630 switch (CC) {
14631 case ARMCC::EQ:
14632 case ARMCC::NE:
14633 case ARMCC::LE:
14634 case ARMCC::GT:
14635 case ARMCC::GE:
14636 case ARMCC::LT:
14637 return true;
14638 case ARMCC::HS:
14639 case ARMCC::HI:
14640 return !IsFloat;
14641 default:
14642 return false;
14643 };
14644}
14645
14646static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14647 if (N->getOpcode() == ARMISD::VCMP)
14648 return (ARMCC::CondCodes)N->getConstantOperandVal(Num: 2);
14649 else if (N->getOpcode() == ARMISD::VCMPZ)
14650 return (ARMCC::CondCodes)N->getConstantOperandVal(Num: 1);
14651 else
14652 llvm_unreachable("Not a VCMP/VCMPZ!");
14653}
14654
14655static bool CanInvertMVEVCMP(SDValue N) {
14656 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(CC: getVCMPCondCode(N));
14657 return isValidMVECond(CC, IsFloat: N->getOperand(Num: 0).getValueType().isFloatingPoint());
14658}
14659
14660static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14661 const ARMSubtarget *Subtarget) {
14662 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14663 // together with predicates
14664 EVT VT = N->getValueType(ResNo: 0);
14665 SDLoc DL(N);
14666 SDValue N0 = N->getOperand(Num: 0);
14667 SDValue N1 = N->getOperand(Num: 1);
14668
14669 auto IsFreelyInvertable = [&](SDValue V) {
14670 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14671 return CanInvertMVEVCMP(N: V);
14672 return false;
14673 };
14674
14675 // At least one operand must be freely invertible.
14676 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14677 return SDValue();
14678
14679 SDValue NewN0 = DAG.getLogicalNOT(DL, Val: N0, VT);
14680 SDValue NewN1 = DAG.getLogicalNOT(DL, Val: N1, VT);
14681 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: NewN0, N2: NewN1);
14682 return DAG.getLogicalNOT(DL, Val: And, VT);
14683}
14684
14685/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14686static SDValue PerformORCombine(SDNode *N,
14687 TargetLowering::DAGCombinerInfo &DCI,
14688 const ARMSubtarget *Subtarget) {
14689 // Attempt to use immediate-form VORR
14690 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 1));
14691 SDLoc dl(N);
14692 EVT VT = N->getValueType(ResNo: 0);
14693 SelectionDAG &DAG = DCI.DAG;
14694
14695 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14696 return SDValue();
14697
14698 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14699 VT == MVT::v8i1 || VT == MVT::v16i1))
14700 return PerformORCombine_i1(N, DAG, Subtarget);
14701
14702 APInt SplatBits, SplatUndef;
14703 unsigned SplatBitSize;
14704 bool HasAnyUndefs;
14705 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14706 BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14707 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14708 SplatBitSize == 64) {
14709 EVT VorrVT;
14710 SDValue Val =
14711 isVMOVModifiedImm(SplatBits: SplatBits.getZExtValue(), SplatUndef: SplatUndef.getZExtValue(),
14712 SplatBitSize, DAG, dl, VT&: VorrVT, VectorVT: VT, type: OtherModImm);
14713 if (Val.getNode()) {
14714 SDValue Input =
14715 DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VorrVT, Operand: N->getOperand(Num: 0));
14716 SDValue Vorr = DAG.getNode(Opcode: ARMISD::VORRIMM, DL: dl, VT: VorrVT, N1: Input, N2: Val);
14717 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vorr);
14718 }
14719 }
14720 }
14721
14722 if (!Subtarget->isThumb1Only()) {
14723 // fold (or (select cc, 0, c), x) -> (select cc, x, (or x, c))
14724 if (SDValue Result = combineSelectAndUseCommutative(N, AllOnes: false, DCI))
14725 return Result;
14726 if (SDValue Result = PerformORCombineToSMULWBT(OR: N, DCI, Subtarget))
14727 return Result;
14728 }
14729
14730 SDValue N0 = N->getOperand(Num: 0);
14731 SDValue N1 = N->getOperand(Num: 1);
14732
14733 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14734 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14735 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14736
14737 // The code below optimizes (or (and X, Y), Z).
14738 // The AND operand needs to have a single user to make these optimizations
14739 // profitable.
14740 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14741 return SDValue();
14742
14743 APInt SplatUndef;
14744 unsigned SplatBitSize;
14745 bool HasAnyUndefs;
14746
14747 APInt SplatBits0, SplatBits1;
14748 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: 1));
14749 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: 1));
14750 // Ensure that the second operands of both ANDs are constants.
14751 if (BVN0 && BVN0->isConstantSplat(SplatValue&: SplatBits0, SplatUndef, SplatBitSize,
14752 HasAnyUndefs) && !HasAnyUndefs) {
14753 if (BVN1 && BVN1->isConstantSplat(SplatValue&: SplatBits1, SplatUndef, SplatBitSize,
14754 HasAnyUndefs) && !HasAnyUndefs) {
14755 // Ensure that the bit width of the constants are the same and that
14756 // the splat arguments are logical inverses as per the pattern we
14757 // are trying to simplify.
14758 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14759 SplatBits0 == ~SplatBits1) {
14760 // Canonicalize the vector type to make instruction selection
14761 // simpler.
14762 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14763 SDValue Result = DAG.getNode(Opcode: ARMISD::VBSP, DL: dl, VT: CanonicalVT,
14764 N1: N0->getOperand(Num: 1),
14765 N2: N0->getOperand(Num: 0),
14766 N3: N1->getOperand(Num: 0));
14767 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Result);
14768 }
14769 }
14770 }
14771 }
14772
14773 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14774 // reasonable.
14775 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14776 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14777 return Res;
14778 }
14779
14780 if (SDValue Result = PerformSHLSimplify(N, DCI, ST: Subtarget))
14781 return Result;
14782
14783 return SDValue();
14784}
14785
14786static SDValue PerformXORCombine(SDNode *N,
14787 TargetLowering::DAGCombinerInfo &DCI,
14788 const ARMSubtarget *Subtarget) {
14789 EVT VT = N->getValueType(ResNo: 0);
14790 SelectionDAG &DAG = DCI.DAG;
14791
14792 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14793 return SDValue();
14794
14795 if (!Subtarget->isThumb1Only()) {
14796 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor x, c))
14797 if (SDValue Result = combineSelectAndUseCommutative(N, AllOnes: false, DCI))
14798 return Result;
14799
14800 if (SDValue Result = PerformSHLSimplify(N, DCI, ST: Subtarget))
14801 return Result;
14802 }
14803
14804 if (Subtarget->hasMVEIntegerOps()) {
14805 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14806 SDValue N0 = N->getOperand(Num: 0);
14807 SDValue N1 = N->getOperand(Num: 1);
14808 const TargetLowering *TLI = Subtarget->getTargetLowering();
14809 if (TLI->isConstTrueVal(N: N1) &&
14810 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14811 if (CanInvertMVEVCMP(N: N0)) {
14812 SDLoc DL(N0);
14813 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(CC: getVCMPCondCode(N: N0));
14814
14815 SmallVector<SDValue, 4> Ops;
14816 Ops.push_back(Elt: N0->getOperand(Num: 0));
14817 if (N0->getOpcode() == ARMISD::VCMP)
14818 Ops.push_back(Elt: N0->getOperand(Num: 1));
14819 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14820 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT: N0->getValueType(ResNo: 0), Ops);
14821 }
14822 }
14823 }
14824
14825 return SDValue();
14826}
14827
14828 // ParseBFI - Given a BFI instruction in N, extract the "from" value (Rn)
14829 // and return it, and fill in FromMask and ToMask with the (consecutive) bits
14830 // in "from" to be extracted and their position in "to" (Rd).
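// For example, for (ARMISD::BFI A, B, 0xffff00ff) this returns B with
// ToMask = 0x0000ff00 and FromMask = 0x000000ff: the low 8 bits of B are
// written into bits [15:8] of A.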
14831static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14832 assert(N->getOpcode() == ARMISD::BFI);
14833
14834 SDValue From = N->getOperand(Num: 1);
14835 ToMask = ~N->getConstantOperandAPInt(Num: 2);
14836 FromMask = APInt::getLowBitsSet(numBits: ToMask.getBitWidth(), loBitsSet: ToMask.popcount());
14837
14838 // If From came from a SRL #C, the bits being copied really come from bit #C
14839 // upwards in the base of the shift.
14840 if (From->getOpcode() == ISD::SRL &&
14841 isa<ConstantSDNode>(Val: From->getOperand(Num: 1))) {
14842 APInt Shift = From->getConstantOperandAPInt(Num: 1);
14843 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14844 FromMask <<= Shift.getLimitedValue(Limit: 31);
14845 From = From->getOperand(Num: 0);
14846 }
14847
14848 return From;
14849}
14850
14851 // If A and B each contain one contiguous set of bits, is A | B their
14852 // concatenation, i.e. does A sit directly above B with no gap or overlap?
14853 // Neither A nor B may be zero.
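// For example, A = 0x0f00 and B = 0x00ff concatenate properly: A's lowest set
// bit (bit 8) sits directly above B's highest set bit (bit 7).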
14854static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14855 unsigned LastActiveBitInA = A.countr_zero();
14856 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14857 return LastActiveBitInA - 1 == FirstActiveBitInB;
14858}
14859
14860static SDValue FindBFIToCombineWith(SDNode *N) {
14861 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14862 APInt ToMask, FromMask;
14863 SDValue From = ParseBFI(N, ToMask, FromMask);
14864 SDValue To = N->getOperand(Num: 0);
14865
14866 SDValue V = To;
14867 if (V.getOpcode() != ARMISD::BFI)
14868 return SDValue();
14869
14870 APInt NewToMask, NewFromMask;
14871 SDValue NewFrom = ParseBFI(N: V.getNode(), ToMask&: NewToMask, FromMask&: NewFromMask);
14872 if (NewFrom != From)
14873 return SDValue();
14874
14875 // Do the written bits conflict with any we've seen so far?
14876 if ((NewToMask & ToMask).getBoolValue())
14877 // Conflicting bits.
14878 return SDValue();
14879
14880 // Are the new bits contiguous when combined with the old bits?
14881 if (BitsProperlyConcatenate(A: ToMask, B: NewToMask) &&
14882 BitsProperlyConcatenate(A: FromMask, B: NewFromMask))
14883 return V;
14884 if (BitsProperlyConcatenate(A: NewToMask, B: ToMask) &&
14885 BitsProperlyConcatenate(A: NewFromMask, B: FromMask))
14886 return V;
14887
14888 return SDValue();
14889}
14890
14891static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14892 SDValue N0 = N->getOperand(Num: 0);
14893 SDValue N1 = N->getOperand(Num: 1);
14894
14895 if (N1.getOpcode() == ISD::AND) {
14896 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14897 // the bits being cleared by the AND are not demanded by the BFI.
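    // For example, (bfi A, (and B, 0xff), 0xffff00ff) -> (bfi A, B, 0xffff00ff),
    // since only the low 8 bits of B are inserted anyway.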
14898 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(Val: N1.getOperand(i: 1));
14899 if (!N11C)
14900 return SDValue();
14901 unsigned InvMask = N->getConstantOperandVal(Num: 2);
14902 unsigned LSB = llvm::countr_zero(Val: ~InvMask);
14903 unsigned Width = llvm::bit_width<unsigned>(Value: ~InvMask) - LSB;
14904 assert(Width <
14905 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14906 "undefined behavior");
14907 unsigned Mask = (1u << Width) - 1;
14908 unsigned Mask2 = N11C->getZExtValue();
14909 if ((Mask & (~Mask2)) == 0)
14910 return DAG.getNode(Opcode: ARMISD::BFI, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
14911 N1: N->getOperand(Num: 0), N2: N1.getOperand(i: 0), N3: N->getOperand(Num: 2));
14912 return SDValue();
14913 }
14914
14915 // Look for another BFI to combine with.
14916 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14917 // We've found a BFI.
14918 APInt ToMask1, FromMask1;
14919 SDValue From1 = ParseBFI(N, ToMask&: ToMask1, FromMask&: FromMask1);
14920
14921 APInt ToMask2, FromMask2;
14922 SDValue From2 = ParseBFI(N: CombineBFI.getNode(), ToMask&: ToMask2, FromMask&: FromMask2);
14923 assert(From1 == From2);
14924 (void)From2;
14925
14926 // Create a new BFI, combining the two together.
14927 APInt NewFromMask = FromMask1 | FromMask2;
14928 APInt NewToMask = ToMask1 | ToMask2;
14929
14930 EVT VT = N->getValueType(ResNo: 0);
14931 SDLoc dl(N);
14932
14933 if (NewFromMask[0] == 0)
14934 From1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: From1,
14935 N2: DAG.getConstant(Val: NewFromMask.countr_zero(), DL: dl, VT));
14936 return DAG.getNode(Opcode: ARMISD::BFI, DL: dl, VT, N1: CombineBFI.getOperand(i: 0), N2: From1,
14937 N3: DAG.getConstant(Val: ~NewToMask, DL: dl, VT));
14938 }
14939
14940 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14941 // that lower bit insertions are performed first, providing that M1 and M2
14942 // do not overlap. This can allow multiple BFI instructions to be combined
14943 // together by the other folds above.
14944 if (N->getOperand(Num: 0).getOpcode() == ARMISD::BFI) {
14945 APInt ToMask1 = ~N->getConstantOperandAPInt(Num: 2);
14946 APInt ToMask2 = ~N0.getConstantOperandAPInt(i: 2);
14947
14948 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14949 ToMask1.countl_zero() < ToMask2.countl_zero())
14950 return SDValue();
14951
14952 EVT VT = N->getValueType(ResNo: 0);
14953 SDLoc dl(N);
14954 SDValue BFI1 = DAG.getNode(Opcode: ARMISD::BFI, DL: dl, VT, N1: N0.getOperand(i: 0),
14955 N2: N->getOperand(Num: 1), N3: N->getOperand(Num: 2));
14956 return DAG.getNode(Opcode: ARMISD::BFI, DL: dl, VT, N1: BFI1, N2: N0.getOperand(i: 1),
14957 N3: N0.getOperand(i: 2));
14958 }
14959
14960 return SDValue();
14961}
14962
14963// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14964// or CMPZ(CMOV(1, 0, CC, $cpsr, X))
14965 // and return X if valid.
14966static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14967 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(V: Cmp->getOperand(Num: 1)))
14968 return SDValue();
14969 SDValue CSInc = Cmp->getOperand(Num: 0);
14970
14971 // Ignore any `And 1` nodes that may not yet have been removed. We are
14972 // looking for a value that produces 1/0, so these have no effect on the
14973 // code.
14974 while (CSInc.getOpcode() == ISD::AND &&
14975 isa<ConstantSDNode>(Val: CSInc.getOperand(i: 1)) &&
14976 CSInc.getConstantOperandVal(i: 1) == 1 && CSInc->hasOneUse())
14977 CSInc = CSInc.getOperand(i: 0);
14978
14979 if (CSInc.getOpcode() == ARMISD::CSINC &&
14980 isNullConstant(V: CSInc.getOperand(i: 0)) &&
14981 isNullConstant(V: CSInc.getOperand(i: 1)) && CSInc->hasOneUse()) {
14982 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(i: 2);
14983 return CSInc.getOperand(i: 3);
14984 }
14985 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(V: CSInc.getOperand(i: 0)) &&
14986 isNullConstant(V: CSInc.getOperand(i: 1)) && CSInc->hasOneUse()) {
14987 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(i: 2);
14988 return CSInc.getOperand(i: 4);
14989 }
14990 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(V: CSInc.getOperand(i: 1)) &&
14991 isNullConstant(V: CSInc.getOperand(i: 0)) && CSInc->hasOneUse()) {
14992 CC = ARMCC::getOppositeCondition(
14993 CC: (ARMCC::CondCodes)CSInc.getConstantOperandVal(i: 2));
14994 return CSInc.getOperand(i: 4);
14995 }
14996 return SDValue();
14997}
14998
14999static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
15000 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15001 // t92: glue = ARMISD::CMPZ t74, 0
15002 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15003 // t96: glue = ARMISD::CMPZ t93, 0
15004 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15005 ARMCC::CondCodes Cond;
15006 if (SDValue C = IsCMPZCSINC(Cmp: N, CC&: Cond))
15007 if (Cond == ARMCC::EQ)
15008 return C;
15009 return SDValue();
15010}
15011
15012static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
15013 // Fold away an unnecessary CMPZ/CSINC
15014 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15015 // if C1==EQ -> CSXYZ A, B, C2, D
15016 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15017 ARMCC::CondCodes Cond;
15018 if (SDValue C = IsCMPZCSINC(Cmp: N->getOperand(Num: 3).getNode(), CC&: Cond)) {
15019 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15020 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15021 N->getOperand(1),
15022 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15023 if (N->getConstantOperandVal(2) == ARMCC::NE)
15024 return DAG.getNode(
15025 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15026 N->getOperand(1),
15027 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
15028 }
15029 return SDValue();
15030}
15031
15032/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15033/// ARMISD::VMOVRRD.
15034static SDValue PerformVMOVRRDCombine(SDNode *N,
15035 TargetLowering::DAGCombinerInfo &DCI,
15036 const ARMSubtarget *Subtarget) {
15037 // vmovrrd(vmovdrr x, y) -> x,y
15038 SDValue InDouble = N->getOperand(Num: 0);
15039 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15040 return DCI.CombineTo(N, Res0: InDouble.getOperand(i: 0), Res1: InDouble.getOperand(i: 1));
15041
15042 // vmovrrd(load f64) -> (load i32), (load i32)
15043 SDNode *InNode = InDouble.getNode();
15044 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15045 InNode->getValueType(0) == MVT::f64 &&
15046 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15047 !cast<LoadSDNode>(InNode)->isVolatile()) {
15048 // TODO: Should this be done for non-FrameIndex operands?
15049 LoadSDNode *LD = cast<LoadSDNode>(Val: InNode);
15050
15051 SelectionDAG &DAG = DCI.DAG;
15052 SDLoc DL(LD);
15053 SDValue BasePtr = LD->getBasePtr();
15054 SDValue NewLD1 =
15055 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15056 LD->getAlign(), LD->getMemOperand()->getFlags());
15057
15058 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15059 DAG.getConstant(4, DL, MVT::i32));
15060
15061 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15062 LD->getPointerInfo().getWithOffset(4),
15063 commonAlignment(LD->getAlign(), 4),
15064 LD->getMemOperand()->getFlags());
15065
15066 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLD2.getValue(R: 1));
15067 if (DCI.DAG.getDataLayout().isBigEndian())
15068 std::swap (a&: NewLD1, b&: NewLD2);
15069 SDValue Result = DCI.CombineTo(N, Res0: NewLD1, Res1: NewLD2);
15070 return Result;
15071 }
15072
15073 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15074 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15075 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15076 isa<ConstantSDNode>(Val: InDouble.getOperand(i: 1))) {
15077 SDValue BV = InDouble.getOperand(i: 0);
15078    // Look up through any nop bitcasts and vector_reg_casts. Bitcasts may
15079    // change the lane order under big endian.
15080 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15081 while (
15082 (BV.getOpcode() == ISD::BITCAST ||
15083 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15084 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15085 BVSwap = BV.getOpcode() == ISD::BITCAST;
15086 BV = BV.getOperand(i: 0);
15087 }
15088 if (BV.getValueType() != MVT::v4i32)
15089 return SDValue();
15090
15091 // Handle buildvectors, pulling out the correct lane depending on
15092 // endianness.
15093 unsigned Offset = InDouble.getConstantOperandVal(i: 1) == 1 ? 2 : 0;
15094 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15095 SDValue Op0 = BV.getOperand(i: Offset);
15096 SDValue Op1 = BV.getOperand(i: Offset + 1);
15097 if (!Subtarget->isLittle() && BVSwap)
15098 std::swap(a&: Op0, b&: Op1);
15099
15100 return DCI.DAG.getMergeValues(Ops: {Op0, Op1}, dl: SDLoc(N));
15101 }
15102
15103    // A chain of insert_vector_elts; grab the correct values out of the
15104    // chain of inserts.
15105 SDValue Op0, Op1;
15106 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15107 if (isa<ConstantSDNode>(Val: BV.getOperand(i: 2))) {
15108 if (BV.getConstantOperandVal(i: 2) == Offset)
15109 Op0 = BV.getOperand(i: 1);
15110 if (BV.getConstantOperandVal(i: 2) == Offset + 1)
15111 Op1 = BV.getOperand(i: 1);
15112 }
15113 BV = BV.getOperand(i: 0);
15114 }
15115 if (!Subtarget->isLittle() && BVSwap)
15116 std::swap(a&: Op0, b&: Op1);
15117 if (Op0 && Op1)
15118 return DCI.DAG.getMergeValues(Ops: {Op0, Op1}, dl: SDLoc(N));
15119 }
15120
15121 return SDValue();
15122}
15123
15124/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15125/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
15126static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15127 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15128 SDValue Op0 = N->getOperand(Num: 0);
15129 SDValue Op1 = N->getOperand(Num: 1);
15130 if (Op0.getOpcode() == ISD::BITCAST)
15131 Op0 = Op0.getOperand(i: 0);
15132 if (Op1.getOpcode() == ISD::BITCAST)
15133 Op1 = Op1.getOperand(i: 0);
15134 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15135 Op0.getNode() == Op1.getNode() &&
15136 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15137 return DAG.getNode(Opcode: ISD::BITCAST, DL: SDLoc(N),
15138 VT: N->getValueType(ResNo: 0), Operand: Op0.getOperand(i: 0));
15139 return SDValue();
15140}
15141
15142static SDValue PerformVMOVhrCombine(SDNode *N,
15143 TargetLowering::DAGCombinerInfo &DCI) {
15144 SDValue Op0 = N->getOperand(Num: 0);
15145
15146 // VMOVhr (VMOVrh (X)) -> X
15147 if (Op0->getOpcode() == ARMISD::VMOVrh)
15148 return Op0->getOperand(Num: 0);
15149
15150 // FullFP16: half values are passed in S-registers, and we don't
15151  // need any of the bitcasts and moves:
15152 //
15153 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15154 // t5: i32 = bitcast t2
15155 // t18: f16 = ARMISD::VMOVhr t5
15156 // =>
15157  //    tN: f16,ch2,gl2? = CopyFromReg ch, Register:f32 %0, gl?
15158 if (Op0->getOpcode() == ISD::BITCAST) {
15159 SDValue Copy = Op0->getOperand(Num: 0);
15160 if (Copy.getValueType() == MVT::f32 &&
15161 Copy->getOpcode() == ISD::CopyFromReg) {
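      // The CopyFromReg may carry an incoming glue operand (3 operands in
      // total); if so, preserve both the glue input and the glue result when
      // the copy is rebuilt below with the new result type.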
15162 bool HasGlue = Copy->getNumOperands() == 3;
15163 SDValue Ops[] = {Copy->getOperand(Num: 0), Copy->getOperand(Num: 1),
15164 HasGlue ? Copy->getOperand(Num: 2) : SDValue()};
15165 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15166 SDValue NewCopy =
15167 DCI.DAG.getNode(Opcode: ISD::CopyFromReg, DL: SDLoc(N),
15168 VTList: DCI.DAG.getVTList(VTs: ArrayRef(OutTys, HasGlue ? 3 : 2)),
15169 Ops: ArrayRef(Ops, HasGlue ? 3 : 2));
15170
15171 // Update Users, Chains, and Potential Glue.
15172 DCI.DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: NewCopy.getValue(R: 0));
15173 DCI.DAG.ReplaceAllUsesOfValueWith(From: Copy.getValue(R: 1), To: NewCopy.getValue(R: 1));
15174 if (HasGlue)
15175 DCI.DAG.ReplaceAllUsesOfValueWith(From: Copy.getValue(R: 2),
15176 To: NewCopy.getValue(R: 2));
15177
15178 return NewCopy;
15179 }
15180 }
15181
15182 // fold (VMOVhr (load x)) -> (load (f16*)x)
15183 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Val&: Op0)) {
15184 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15185 LN0->getMemoryVT() == MVT::i16) {
15186 SDValue Load =
15187 DCI.DAG.getLoad(VT: N->getValueType(ResNo: 0), dl: SDLoc(N), Chain: LN0->getChain(),
15188 Ptr: LN0->getBasePtr(), MMO: LN0->getMemOperand());
15189 DCI.DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Load.getValue(R: 0));
15190 DCI.DAG.ReplaceAllUsesOfValueWith(From: Op0.getValue(R: 1), To: Load.getValue(R: 1));
15191 return Load;
15192 }
15193 }
15194
15195 // Only the bottom 16 bits of the source register are used.
15196 APInt DemandedMask = APInt::getLowBitsSet(numBits: 32, loBitsSet: 16);
15197 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15198 if (TLI.SimplifyDemandedBits(Op: Op0, DemandedBits: DemandedMask, DCI))
15199 return SDValue(N, 0);
15200
15201 return SDValue();
15202}
15203
15204static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15205 SDValue N0 = N->getOperand(Num: 0);
15206 EVT VT = N->getValueType(ResNo: 0);
15207
15208 // fold (VMOVrh (fpconst x)) -> const x
15209 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: N0)) {
15210 APFloat V = C->getValueAPF();
15211 return DAG.getConstant(Val: V.bitcastToAPInt().getZExtValue(), DL: SDLoc(N), VT);
15212 }
15213
15214 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15215 if (ISD::isNormalLoad(N: N0.getNode()) && N0.hasOneUse()) {
15216 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
15217
15218 SDValue Load =
15219 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15220 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15221 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Load.getValue(R: 0));
15222 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Load.getValue(R: 1));
15223 return Load;
15224 }
15225
15226 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15227 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15228 isa<ConstantSDNode>(Val: N0->getOperand(Num: 1)))
15229 return DAG.getNode(Opcode: ARMISD::VGETLANEu, DL: SDLoc(N), VT, N1: N0->getOperand(Num: 0),
15230 N2: N0->getOperand(Num: 1));
15231
15232 return SDValue();
15233}
15234
15235/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15236/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15237/// i64 vector to have f64 elements, since the value can then be loaded
15238/// directly into a VFP register.
15239static bool hasNormalLoadOperand(SDNode *N) {
15240 unsigned NumElts = N->getValueType(ResNo: 0).getVectorNumElements();
15241 for (unsigned i = 0; i < NumElts; ++i) {
15242 SDNode *Elt = N->getOperand(Num: i).getNode();
15243 if (ISD::isNormalLoad(N: Elt) && !cast<LoadSDNode>(Val: Elt)->isVolatile())
15244 return true;
15245 }
15246 return false;
15247}
15248
15249/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15250/// ISD::BUILD_VECTOR.
15251static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15252 TargetLowering::DAGCombinerInfo &DCI,
15253 const ARMSubtarget *Subtarget) {
15254 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15255 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15256 // into a pair of GPRs, which is fine when the value is used as a scalar,
15257 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15258 SelectionDAG &DAG = DCI.DAG;
15259 if (N->getNumOperands() == 2)
15260 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15261 return RV;
15262
15263 // Load i64 elements as f64 values so that type legalization does not split
15264 // them up into i32 values.
15265 EVT VT = N->getValueType(ResNo: 0);
15266 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15267 return SDValue();
15268 SDLoc dl(N);
15269 SmallVector<SDValue, 8> Ops;
15270 unsigned NumElts = VT.getVectorNumElements();
15271 for (unsigned i = 0; i < NumElts; ++i) {
15272 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15273 Ops.push_back(Elt: V);
15274 // Make the DAGCombiner fold the bitcast.
15275 DCI.AddToWorklist(N: V.getNode());
15276 }
15277 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15278 SDValue BV = DAG.getBuildVector(VT: FloatVT, DL: dl, Ops);
15279 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: BV);
15280}
15281
15282/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15283static SDValue
15284PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15285 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15286 // At that time, we may have inserted bitcasts from integer to float.
15287 // If these bitcasts have survived DAGCombine, change the lowering of this
15288  // BUILD_VECTOR into something more vector friendly, i.e., something that
15289  // does not force the use of floating point types.
15290
15291 // Make sure we can change the type of the vector.
15292 // This is possible iff:
15293  // 1. The vector is only used in a bitcast to an integer type. I.e.,
15294 // 1.1. Vector is used only once.
15295 // 1.2. Use is a bit convert to an integer type.
15296  // 2. Each of its operands is 32 bits wide (64-bit operands are not legal).
15297 EVT VT = N->getValueType(ResNo: 0);
15298 EVT EltVT = VT.getVectorElementType();
15299
15300 // Check 1.1. and 2.
15301 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15302 return SDValue();
15303
15304 // By construction, the input type must be float.
15305 assert(EltVT == MVT::f32 && "Unexpected type!");
15306
15307 // Check 1.2.
15308 SDNode *Use = *N->use_begin();
15309 if (Use->getOpcode() != ISD::BITCAST ||
15310 Use->getValueType(ResNo: 0).isFloatingPoint())
15311 return SDValue();
15312
15313 // Check profitability.
15314  // The model is: if more than half of the relevant operands are bitcast from
15315 // i32, turn the build_vector into a sequence of insert_vector_elt.
15316 // Relevant operands are everything that is not statically
15317 // (i.e., at compile time) bitcasted.
15318 unsigned NumOfBitCastedElts = 0;
15319 unsigned NumElts = VT.getVectorNumElements();
15320 unsigned NumOfRelevantElts = NumElts;
15321 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15322 SDValue Elt = N->getOperand(Num: Idx);
15323 if (Elt->getOpcode() == ISD::BITCAST) {
15324 // Assume only bit cast to i32 will go away.
15325 if (Elt->getOperand(0).getValueType() == MVT::i32)
15326 ++NumOfBitCastedElts;
15327 } else if (Elt.isUndef() || isa<ConstantSDNode>(Val: Elt))
15328 // Constants are statically casted, thus do not count them as
15329 // relevant operands.
15330 --NumOfRelevantElts;
15331 }
15332
15333 // Check if more than half of the elements require a non-free bitcast.
15334 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15335 return SDValue();
15336
15337 SelectionDAG &DAG = DCI.DAG;
15338 // Create the new vector type.
15339 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15340 // Check if the type is legal.
15341 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15342 if (!TLI.isTypeLegal(VT: VecVT))
15343 return SDValue();
15344
15345 // Combine:
15346 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15347 // => BITCAST INSERT_VECTOR_ELT
15348 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15349 // (BITCAST EN), N.
15350 SDValue Vec = DAG.getUNDEF(VT: VecVT);
15351 SDLoc dl(N);
15352 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15353 SDValue V = N->getOperand(Num: Idx);
15354 if (V.isUndef())
15355 continue;
15356 if (V.getOpcode() == ISD::BITCAST &&
15357 V->getOperand(0).getValueType() == MVT::i32)
15358 // Fold obvious case.
15359 V = V.getOperand(i: 0);
15360 else {
15361 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15362 // Make the DAGCombiner fold the bitcasts.
15363 DCI.AddToWorklist(N: V.getNode());
15364 }
15365 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15366 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: VecVT, N1: Vec, N2: V, N3: LaneIdx);
15367 }
15368 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Vec);
15369 // Make the DAGCombiner fold the bitcasts.
15370 DCI.AddToWorklist(N: Vec.getNode());
15371 return Vec;
15372}
15373
15374static SDValue
15375PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15376 EVT VT = N->getValueType(ResNo: 0);
15377 SDValue Op = N->getOperand(Num: 0);
15378 SDLoc dl(N);
15379
15380 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15381 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15382 // If the valuetypes are the same, we can remove the cast entirely.
15383 if (Op->getOperand(Num: 0).getValueType() == VT)
15384 return Op->getOperand(Num: 0);
15385 return DCI.DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: dl, VT, Operand: Op->getOperand(Num: 0));
15386 }
15387
15388 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15389 // more VPNOT which might get folded as else predicates.
15390 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15391 SDValue X =
15392 DCI.DAG.getNode(Opcode: ARMISD::PREDICATE_CAST, DL: dl, VT, Operand: Op->getOperand(Num: 0));
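    // An MVE predicate register has 16 bits (one per byte lane), so casting
    // i32 0xffff gives an all-ones predicate; XORing with it below implements
    // the NOT.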
15393 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15394 DCI.DAG.getConstant(65535, dl, MVT::i32));
15395 return DCI.DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: X, N2: C);
15396 }
15397
15398 // Only the bottom 16 bits of the source register are used.
15399 if (Op.getValueType() == MVT::i32) {
15400 APInt DemandedMask = APInt::getLowBitsSet(numBits: 32, loBitsSet: 16);
15401 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15402 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
15403 return SDValue(N, 0);
15404 }
15405 return SDValue();
15406}
15407
15408static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15409 const ARMSubtarget *ST) {
15410 EVT VT = N->getValueType(ResNo: 0);
15411 SDValue Op = N->getOperand(Num: 0);
15412 SDLoc dl(N);
15413
15414 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15415 if (ST->isLittle())
15416 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Op);
15417
15418 // VECTOR_REG_CAST undef -> undef
15419 if (Op.isUndef())
15420 return DAG.getUNDEF(VT);
15421
15422 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15423 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15424 // If the valuetypes are the same, we can remove the cast entirely.
15425 if (Op->getOperand(Num: 0).getValueType() == VT)
15426 return Op->getOperand(Num: 0);
15427 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: dl, VT, Operand: Op->getOperand(Num: 0));
15428 }
15429
15430 return SDValue();
15431}
15432
15433static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15434 const ARMSubtarget *Subtarget) {
15435 if (!Subtarget->hasMVEIntegerOps())
15436 return SDValue();
15437
15438 EVT VT = N->getValueType(ResNo: 0);
15439 SDValue Op0 = N->getOperand(Num: 0);
15440 SDValue Op1 = N->getOperand(Num: 1);
15441 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(Num: 2);
15442 SDLoc dl(N);
15443
15444 // vcmp X, 0, cc -> vcmpz X, cc
15445 if (isZeroVector(N: Op1))
15446 return DAG.getNode(Opcode: ARMISD::VCMPZ, DL: dl, VT, N1: Op0, N2: N->getOperand(Num: 2));
15447
15448 unsigned SwappedCond = getSwappedCondition(CC: Cond);
15449 if (isValidMVECond(CC: SwappedCond, IsFloat: VT.isFloatingPoint())) {
15450 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15451 if (isZeroVector(Op0))
15452 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15453 DAG.getConstant(SwappedCond, dl, MVT::i32));
15454 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15455 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15456 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15457 DAG.getConstant(SwappedCond, dl, MVT::i32));
15458 }
15459
15460 return SDValue();
15461}
15462
15463/// PerformInsertEltCombine - Target-specific dag combine xforms for
15464/// ISD::INSERT_VECTOR_ELT.
15465static SDValue PerformInsertEltCombine(SDNode *N,
15466 TargetLowering::DAGCombinerInfo &DCI) {
15467 // Bitcast an i64 load inserted into a vector to f64.
15468 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15469 EVT VT = N->getValueType(ResNo: 0);
15470 SDNode *Elt = N->getOperand(Num: 1).getNode();
15471 if (VT.getVectorElementType() != MVT::i64 ||
15472 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15473 return SDValue();
15474
15475 SelectionDAG &DAG = DCI.DAG;
15476 SDLoc dl(N);
15477 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15478 VT.getVectorNumElements());
15479 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: FloatVT, Operand: N->getOperand(Num: 0));
15480 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15481 // Make the DAGCombiner fold the bitcasts.
15482 DCI.AddToWorklist(N: Vec.getNode());
15483 DCI.AddToWorklist(N: V.getNode());
15484 SDValue InsElt = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: FloatVT,
15485 N1: Vec, N2: V, N3: N->getOperand(Num: 2));
15486 return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: InsElt);
15487}
15488
15489// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15490// directly or bitcast to an integer if the original is a float vector.
15491// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15492// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15493static SDValue
15494PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15495 EVT VT = N->getValueType(ResNo: 0);
15496 SDLoc dl(N);
15497
15498 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15499 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15500 return SDValue();
15501
15502 SDValue Ext = SDValue(N, 0);
15503 if (Ext.getOpcode() == ISD::BITCAST &&
15504 Ext.getOperand(0).getValueType() == MVT::f32)
15505 Ext = Ext.getOperand(i: 0);
15506 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15507 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
15508 Ext.getConstantOperandVal(i: 1) % 2 != 0)
15509 return SDValue();
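  // If the extract's only user is an int-to-fp conversion, the value is
  // likely better left for that conversion than moved out through GPRs, so
  // don't form the VMOVRRD.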
15510 if (Ext->use_size() == 1 &&
15511 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP ||
15512 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP))
15513 return SDValue();
15514
15515 SDValue Op0 = Ext.getOperand(i: 0);
15516 EVT VecVT = Op0.getValueType();
15517 unsigned ResNo = Op0.getResNo();
15518 unsigned Lane = Ext.getConstantOperandVal(i: 1);
15519 if (VecVT.getVectorNumElements() != 4)
15520 return SDValue();
15521
15522  // Find another extract from the same vector, of Lane + 1.
15523 auto OtherIt = find_if(Range: Op0->uses(), P: [&](SDNode *V) {
15524 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15525 isa<ConstantSDNode>(Val: V->getOperand(Num: 1)) &&
15526 V->getConstantOperandVal(Num: 1) == Lane + 1 &&
15527 V->getOperand(Num: 0).getResNo() == ResNo;
15528 });
15529 if (OtherIt == Op0->uses().end())
15530 return SDValue();
15531
15532  // For float extracts, we need to be converting to an i32 for both vector
15533 // lanes.
15534 SDValue OtherExt(*OtherIt, 0);
15535 if (OtherExt.getValueType() != MVT::i32) {
15536 if (OtherExt->use_size() != 1 ||
15537 OtherExt->use_begin()->getOpcode() != ISD::BITCAST ||
15538 OtherExt->use_begin()->getValueType(0) != MVT::i32)
15539 return SDValue();
15540 OtherExt = SDValue(*OtherExt->use_begin(), 0);
15541 }
15542
15543 // Convert the type to a f64 and extract with a VMOVRRD.
15544 SDValue F64 = DCI.DAG.getNode(
15545 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15546 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15547 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15548 SDValue VMOVRRD =
15549 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15550
15551 DCI.CombineTo(N: OtherExt.getNode(), Res: SDValue(VMOVRRD.getNode(), 1));
15552 return VMOVRRD;
15553}
15554
15555static SDValue PerformExtractEltCombine(SDNode *N,
15556 TargetLowering::DAGCombinerInfo &DCI,
15557 const ARMSubtarget *ST) {
15558 SDValue Op0 = N->getOperand(Num: 0);
15559 EVT VT = N->getValueType(ResNo: 0);
15560 SDLoc dl(N);
15561
15562 // extract (vdup x) -> x
15563 if (Op0->getOpcode() == ARMISD::VDUP) {
15564 SDValue X = Op0->getOperand(Num: 0);
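  // For each original element, pick the narrow sub-element that holds the
  // truncated value: the lowest sub-element on little-endian, the highest on
  // big-endian.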
15565 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15566 return DCI.DAG.getNode(Opcode: ARMISD::VMOVhr, DL: dl, VT, Operand: X);
15567 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15568 return DCI.DAG.getNode(Opcode: ARMISD::VMOVrh, DL: dl, VT, Operand: X);
15569 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15570 return DCI.DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: X);
15571
15572 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15573 X = X->getOperand(Num: 0);
15574 if (X.getValueType() == VT)
15575 return X;
15576 }
15577
15578  // extract (ARMISD::BUILD_VECTOR a, b, ...), n -> nth operand
15579 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15580 isa<ConstantSDNode>(Val: N->getOperand(Num: 1)) &&
15581 N->getConstantOperandVal(Num: 1) < Op0.getNumOperands()) {
15582 return Op0.getOperand(i: N->getConstantOperandVal(Num: 1));
15583 }
15584
15585 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15586 if (Op0.getValueType() == MVT::v4i32 &&
15587 isa<ConstantSDNode>(N->getOperand(1)) &&
15588 Op0.getOpcode() == ISD::BITCAST &&
15589 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15590 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15591 SDValue BV = Op0.getOperand(i: 0);
15592 unsigned Offset = N->getConstantOperandVal(Num: 1);
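    // Each f64 element of the v2f64 build_vector covers two i32 lanes of the
    // bitcast v4i32, so pick the VMOVDRR that feeds the requested lane and
    // then the GPR half of it, flipping the half on big-endian.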
15593 SDValue MOV = BV.getOperand(i: Offset < 2 ? 0 : 1);
15594 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15595 return MOV.getOperand(i: ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15596 }
15597
15598 // extract x, n; extract x, n+1 -> VMOVRRD x
15599 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15600 return R;
15601
15602 // extract (MVETrunc(x)) -> extract x
15603 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
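    // MVETRUNC concatenates the truncated lanes of its operands, so map the
    // extract index onto the corresponding source operand and lane within it.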
15604 unsigned Idx = N->getConstantOperandVal(Num: 1);
15605 unsigned Vec =
15606 Idx / Op0->getOperand(Num: 0).getValueType().getVectorNumElements();
15607 unsigned SubIdx =
15608 Idx % Op0->getOperand(Num: 0).getValueType().getVectorNumElements();
15609 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15610 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15611 }
15612
15613 return SDValue();
15614}
15615
15616static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15617 SDValue Op = N->getOperand(Num: 0);
15618 EVT VT = N->getValueType(ResNo: 0);
15619
15620 // sext_inreg(VGETLANEu) -> VGETLANEs
15621 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15622 cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT() ==
15623 Op.getOperand(i: 0).getValueType().getScalarType())
15624 return DAG.getNode(Opcode: ARMISD::VGETLANEs, DL: SDLoc(N), VT, N1: Op.getOperand(i: 0),
15625 N2: Op.getOperand(i: 1));
15626
15627 return SDValue();
15628}
15629
15630static SDValue
15631PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15632 SDValue Vec = N->getOperand(Num: 0);
15633 SDValue SubVec = N->getOperand(Num: 1);
15634 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
15635 EVT VecVT = Vec.getValueType();
15636 EVT SubVT = SubVec.getValueType();
15637
15638 // Only do this for legal fixed vector types.
15639 if (!VecVT.isFixedLengthVector() ||
15640 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
15641 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
15642 return SDValue();
15643
15644 // Ignore widening patterns.
15645 if (IdxVal == 0 && Vec.isUndef())
15646 return SDValue();
15647
15648 // Subvector must be half the width and an "aligned" insertion.
15649 unsigned NumSubElts = SubVT.getVectorNumElements();
15650 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15651 (IdxVal != 0 && IdxVal != NumSubElts))
15652 return SDValue();
15653
15654 // Fold insert_subvector -> concat_vectors
15655 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15656 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15657 SDLoc DL(N);
15658 SDValue Lo, Hi;
15659 if (IdxVal == 0) {
15660 Lo = SubVec;
15661 Hi = DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
15662 N2: DCI.DAG.getVectorIdxConstant(Val: NumSubElts, DL));
15663 } else {
15664 Lo = DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
15665 N2: DCI.DAG.getVectorIdxConstant(Val: 0, DL));
15666 Hi = SubVec;
15667 }
15668 return DCI.DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
15669}
15670
15671// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15672static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15673 SelectionDAG &DAG) {
15674 SDValue Trunc = N->getOperand(Num: 0);
15675 EVT VT = Trunc.getValueType();
15676 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(Num: 1).isUndef())
15677 return SDValue();
15678
15679 SDLoc DL(Trunc);
15680 if (isVMOVNTruncMask(N->getMask(), VT, false))
15681 return DAG.getNode(
15682 ARMISD::VMOVN, DL, VT,
15683 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15684 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15685 DAG.getConstant(1, DL, MVT::i32));
15686 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15687 return DAG.getNode(
15688 ARMISD::VMOVN, DL, VT,
15689 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15690 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15691 DAG.getConstant(1, DL, MVT::i32));
15692 return SDValue();
15693}
15694
15695/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15696/// ISD::VECTOR_SHUFFLE.
15697static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15698 if (SDValue R = PerformShuffleVMOVNCombine(N: cast<ShuffleVectorSDNode>(Val: N), DAG))
15699 return R;
15700
15701 // The LLVM shufflevector instruction does not require the shuffle mask
15702 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15703 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15704 // operands do not match the mask length, they are extended by concatenating
15705 // them with undef vectors. That is probably the right thing for other
15706 // targets, but for NEON it is better to concatenate two double-register
15707 // size vector operands into a single quad-register size vector. Do that
15708 // transformation here:
15709 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15710 // shuffle(concat(v1, v2), undef)
15711 SDValue Op0 = N->getOperand(Num: 0);
15712 SDValue Op1 = N->getOperand(Num: 1);
15713 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15714 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15715 Op0.getNumOperands() != 2 ||
15716 Op1.getNumOperands() != 2)
15717 return SDValue();
15718 SDValue Concat0Op1 = Op0.getOperand(i: 1);
15719 SDValue Concat1Op1 = Op1.getOperand(i: 1);
15720 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15721 return SDValue();
15722 // Skip the transformation if any of the types are illegal.
15723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15724 EVT VT = N->getValueType(ResNo: 0);
15725 if (!TLI.isTypeLegal(VT) ||
15726 !TLI.isTypeLegal(VT: Concat0Op1.getValueType()) ||
15727 !TLI.isTypeLegal(VT: Concat1Op1.getValueType()))
15728 return SDValue();
15729
15730 SDValue NewConcat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT,
15731 N1: Op0.getOperand(i: 0), N2: Op1.getOperand(i: 0));
15732 // Translate the shuffle mask.
15733 SmallVector<int, 16> NewMask;
15734 unsigned NumElts = VT.getVectorNumElements();
15735 unsigned HalfElts = NumElts/2;
15736 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: N);
15737 for (unsigned n = 0; n < NumElts; ++n) {
15738 int MaskElt = SVN->getMaskElt(Idx: n);
15739 int NewElt = -1;
15740 if (MaskElt < (int)HalfElts)
15741 NewElt = MaskElt;
15742 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15743 NewElt = HalfElts + MaskElt - NumElts;
15744 NewMask.push_back(Elt: NewElt);
15745 }
15746 return DAG.getVectorShuffle(VT, dl: SDLoc(N), N1: NewConcat,
15747 N2: DAG.getUNDEF(VT), Mask: NewMask);
15748}
15749
15750/// Load/store instruction that can be merged with a base address
15751/// update
15752struct BaseUpdateTarget {
15753 SDNode *N;
15754 bool isIntrinsic;
15755 bool isStore;
15756 unsigned AddrOpIdx;
15757};
15758
15759struct BaseUpdateUser {
15760 /// Instruction that updates a pointer
15761 SDNode *N;
15762 /// Pointer increment operand
15763 SDValue Inc;
15764 /// Pointer increment value if it is a constant, or 0 otherwise
15765 unsigned ConstInc;
15766};
15767
15768static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15769 struct BaseUpdateUser &User,
15770 bool SimpleConstIncOnly,
15771 TargetLowering::DAGCombinerInfo &DCI) {
15772 SelectionDAG &DAG = DCI.DAG;
15773 SDNode *N = Target.N;
15774 MemSDNode *MemN = cast<MemSDNode>(Val: N);
15775 SDLoc dl(N);
15776
15777 // Find the new opcode for the updating load/store.
15778 bool isLoadOp = true;
15779 bool isLaneOp = false;
15780 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15781 // as an operand.
15782 bool hasAlignment = true;
15783 unsigned NewOpc = 0;
15784 unsigned NumVecs = 0;
15785 if (Target.isIntrinsic) {
15786 unsigned IntNo = N->getConstantOperandVal(Num: 1);
15787 switch (IntNo) {
15788 default:
15789 llvm_unreachable("unexpected intrinsic for Neon base update");
15790 case Intrinsic::arm_neon_vld1:
15791 NewOpc = ARMISD::VLD1_UPD;
15792 NumVecs = 1;
15793 break;
15794 case Intrinsic::arm_neon_vld2:
15795 NewOpc = ARMISD::VLD2_UPD;
15796 NumVecs = 2;
15797 break;
15798 case Intrinsic::arm_neon_vld3:
15799 NewOpc = ARMISD::VLD3_UPD;
15800 NumVecs = 3;
15801 break;
15802 case Intrinsic::arm_neon_vld4:
15803 NewOpc = ARMISD::VLD4_UPD;
15804 NumVecs = 4;
15805 break;
15806 case Intrinsic::arm_neon_vld1x2:
15807 NewOpc = ARMISD::VLD1x2_UPD;
15808 NumVecs = 2;
15809 hasAlignment = false;
15810 break;
15811 case Intrinsic::arm_neon_vld1x3:
15812 NewOpc = ARMISD::VLD1x3_UPD;
15813 NumVecs = 3;
15814 hasAlignment = false;
15815 break;
15816 case Intrinsic::arm_neon_vld1x4:
15817 NewOpc = ARMISD::VLD1x4_UPD;
15818 NumVecs = 4;
15819 hasAlignment = false;
15820 break;
15821 case Intrinsic::arm_neon_vld2dup:
15822 NewOpc = ARMISD::VLD2DUP_UPD;
15823 NumVecs = 2;
15824 break;
15825 case Intrinsic::arm_neon_vld3dup:
15826 NewOpc = ARMISD::VLD3DUP_UPD;
15827 NumVecs = 3;
15828 break;
15829 case Intrinsic::arm_neon_vld4dup:
15830 NewOpc = ARMISD::VLD4DUP_UPD;
15831 NumVecs = 4;
15832 break;
15833 case Intrinsic::arm_neon_vld2lane:
15834 NewOpc = ARMISD::VLD2LN_UPD;
15835 NumVecs = 2;
15836 isLaneOp = true;
15837 break;
15838 case Intrinsic::arm_neon_vld3lane:
15839 NewOpc = ARMISD::VLD3LN_UPD;
15840 NumVecs = 3;
15841 isLaneOp = true;
15842 break;
15843 case Intrinsic::arm_neon_vld4lane:
15844 NewOpc = ARMISD::VLD4LN_UPD;
15845 NumVecs = 4;
15846 isLaneOp = true;
15847 break;
15848 case Intrinsic::arm_neon_vst1:
15849 NewOpc = ARMISD::VST1_UPD;
15850 NumVecs = 1;
15851 isLoadOp = false;
15852 break;
15853 case Intrinsic::arm_neon_vst2:
15854 NewOpc = ARMISD::VST2_UPD;
15855 NumVecs = 2;
15856 isLoadOp = false;
15857 break;
15858 case Intrinsic::arm_neon_vst3:
15859 NewOpc = ARMISD::VST3_UPD;
15860 NumVecs = 3;
15861 isLoadOp = false;
15862 break;
15863 case Intrinsic::arm_neon_vst4:
15864 NewOpc = ARMISD::VST4_UPD;
15865 NumVecs = 4;
15866 isLoadOp = false;
15867 break;
15868 case Intrinsic::arm_neon_vst2lane:
15869 NewOpc = ARMISD::VST2LN_UPD;
15870 NumVecs = 2;
15871 isLoadOp = false;
15872 isLaneOp = true;
15873 break;
15874 case Intrinsic::arm_neon_vst3lane:
15875 NewOpc = ARMISD::VST3LN_UPD;
15876 NumVecs = 3;
15877 isLoadOp = false;
15878 isLaneOp = true;
15879 break;
15880 case Intrinsic::arm_neon_vst4lane:
15881 NewOpc = ARMISD::VST4LN_UPD;
15882 NumVecs = 4;
15883 isLoadOp = false;
15884 isLaneOp = true;
15885 break;
15886 case Intrinsic::arm_neon_vst1x2:
15887 NewOpc = ARMISD::VST1x2_UPD;
15888 NumVecs = 2;
15889 isLoadOp = false;
15890 hasAlignment = false;
15891 break;
15892 case Intrinsic::arm_neon_vst1x3:
15893 NewOpc = ARMISD::VST1x3_UPD;
15894 NumVecs = 3;
15895 isLoadOp = false;
15896 hasAlignment = false;
15897 break;
15898 case Intrinsic::arm_neon_vst1x4:
15899 NewOpc = ARMISD::VST1x4_UPD;
15900 NumVecs = 4;
15901 isLoadOp = false;
15902 hasAlignment = false;
15903 break;
15904 }
15905 } else {
15906 isLaneOp = true;
15907 switch (N->getOpcode()) {
15908 default:
15909 llvm_unreachable("unexpected opcode for Neon base update");
15910 case ARMISD::VLD1DUP:
15911 NewOpc = ARMISD::VLD1DUP_UPD;
15912 NumVecs = 1;
15913 break;
15914 case ARMISD::VLD2DUP:
15915 NewOpc = ARMISD::VLD2DUP_UPD;
15916 NumVecs = 2;
15917 break;
15918 case ARMISD::VLD3DUP:
15919 NewOpc = ARMISD::VLD3DUP_UPD;
15920 NumVecs = 3;
15921 break;
15922 case ARMISD::VLD4DUP:
15923 NewOpc = ARMISD::VLD4DUP_UPD;
15924 NumVecs = 4;
15925 break;
15926 case ISD::LOAD:
15927 NewOpc = ARMISD::VLD1_UPD;
15928 NumVecs = 1;
15929 isLaneOp = false;
15930 break;
15931 case ISD::STORE:
15932 NewOpc = ARMISD::VST1_UPD;
15933 NumVecs = 1;
15934 isLaneOp = false;
15935 isLoadOp = false;
15936 break;
15937 }
15938 }
15939
15940 // Find the size of memory referenced by the load/store.
15941 EVT VecTy;
15942 if (isLoadOp) {
15943 VecTy = N->getValueType(ResNo: 0);
15944 } else if (Target.isIntrinsic) {
15945 VecTy = N->getOperand(Num: Target.AddrOpIdx + 1).getValueType();
15946 } else {
15947 assert(Target.isStore &&
15948 "Node has to be a load, a store, or an intrinsic!");
15949 VecTy = N->getOperand(Num: 1).getValueType();
15950 }
15951
15952 bool isVLDDUPOp =
15953 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15954 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15955
15956 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15957 if (isLaneOp || isVLDDUPOp)
15958 NumBytes /= VecTy.getVectorNumElements();
15959
15960 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15961 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15962 // separate instructions that make it harder to use a non-constant update.
15963 return false;
15964 }
15965
15966 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15967 return false;
15968
15969 // OK, we found an ADD we can fold into the base update.
15970 // Now, create a _UPD node, taking care of not breaking alignment.
15971
15972 EVT AlignedVecTy = VecTy;
15973 Align Alignment = MemN->getAlign();
15974
15975 // If this is a less-than-standard-aligned load/store, change the type to
15976 // match the standard alignment.
15977 // The alignment is overlooked when selecting _UPD variants; and it's
15978 // easier to introduce bitcasts here than fix that.
15979 // There are 3 ways to get to this base-update combine:
15980 // - intrinsics: they are assumed to be properly aligned (to the standard
15981 // alignment of the memory type), so we don't need to do anything.
15982 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15983 // intrinsics, so, likewise, there's nothing to do.
15984 // - generic load/store instructions: the alignment is specified as an
15985 // explicit operand, rather than implicitly as the standard alignment
15986  //   of the memory type (like the intrinsics). We need to change the
15987 // memory type to match the explicit alignment. That way, we don't
15988 // generate non-standard-aligned ARMISD::VLDx nodes.
15989 if (isa<LSBaseSDNode>(Val: N)) {
15990 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
15991 MVT EltTy = MVT::getIntegerVT(BitWidth: Alignment.value() * 8);
15992 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15993 assert(!isLaneOp && "Unexpected generic load/store lane.");
15994 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15995 AlignedVecTy = MVT::getVectorVT(VT: EltTy, NumElements: NumElts);
15996 }
15997 // Don't set an explicit alignment on regular load/stores that we want
15998 // to transform to VLD/VST 1_UPD nodes.
15999 // This matches the behavior of regular load/stores, which only get an
16000 // explicit alignment if the MMO alignment is larger than the standard
16001 // alignment of the memory type.
16002 // Intrinsics, however, always get an explicit alignment, set to the
16003 // alignment of the MMO.
16004 Alignment = Align(1);
16005 }
16006
16007 // Create the new updating load/store node.
16008 // First, create an SDVTList for the new updating node's results.
16009 EVT Tys[6];
16010 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16011 unsigned n;
16012 for (n = 0; n < NumResultVecs; ++n)
16013 Tys[n] = AlignedVecTy;
16014 Tys[n++] = MVT::i32;
16015 Tys[n] = MVT::Other;
16016 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
16017
16018 // Then, gather the new node's operands.
16019 SmallVector<SDValue, 8> Ops;
16020 Ops.push_back(Elt: N->getOperand(Num: 0)); // incoming chain
16021 Ops.push_back(Elt: N->getOperand(Num: Target.AddrOpIdx));
16022 Ops.push_back(Elt: User.Inc);
16023
16024 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(Val: N)) {
16025 // Try to match the intrinsic's signature
16026 Ops.push_back(Elt: StN->getValue());
16027 } else {
16028 // Loads (and of course intrinsics) match the intrinsics' signature,
16029 // so just add all but the alignment operand.
16030 unsigned LastOperand =
16031 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16032 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16033 Ops.push_back(Elt: N->getOperand(Num: i));
16034 }
16035
16036 // For all node types, the alignment operand is always the last one.
16037 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16038
16039 // If this is a non-standard-aligned STORE, the penultimate operand is the
16040 // stored value. Bitcast it to the aligned type.
16041 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16042 SDValue &StVal = Ops[Ops.size() - 2];
16043 StVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: AlignedVecTy, Operand: StVal);
16044 }
16045
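  // For lane operations the memory VT is a single element; otherwise use the
  // (possibly re-aligned) vector type.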
16046 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16047 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl, VTList: SDTys, Ops, MemVT: LoadVT,
16048 MMO: MemN->getMemOperand());
16049
16050 // Update the uses.
16051 SmallVector<SDValue, 5> NewResults;
16052 for (unsigned i = 0; i < NumResultVecs; ++i)
16053 NewResults.push_back(Elt: SDValue(UpdN.getNode(), i));
16054
16055  // If this is a non-standard-aligned LOAD, the first result is the loaded
16056 // value. Bitcast it to the expected result type.
16057 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16058 SDValue &LdVal = NewResults[0];
16059 LdVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: VecTy, Operand: LdVal);
16060 }
16061
16062 NewResults.push_back(Elt: SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16063 DCI.CombineTo(N, To: NewResults);
16064 DCI.CombineTo(N: User.N, Res: SDValue(UpdN.getNode(), NumResultVecs));
16065
16066 return true;
16067}
16068
16069// If (opcode ptr inc) is an ADD-like instruction, return the
16070// increment value. Otherwise return 0.
16071static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16072 SDValue Inc, const SelectionDAG &DAG) {
16073 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode());
16074 if (!CInc)
16075 return 0;
16076
16077 switch (Opcode) {
16078 case ARMISD::VLD1_UPD:
16079 case ISD::ADD:
16080 return CInc->getZExtValue();
16081 case ISD::OR: {
16082 if (DAG.haveNoCommonBitsSet(A: Ptr, B: Inc)) {
16083 // (OR ptr inc) is the same as (ADD ptr inc)
16084 return CInc->getZExtValue();
16085 }
16086 return 0;
16087 }
16088 default:
16089 return 0;
16090 }
16091}
16092
16093static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
16094 switch (N->getOpcode()) {
16095 case ISD::ADD:
16096 case ISD::OR: {
16097 if (isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
16098 *Ptr = N->getOperand(Num: 0);
16099 *CInc = N->getOperand(Num: 1);
16100 return true;
16101 }
16102 return false;
16103 }
16104 case ARMISD::VLD1_UPD: {
16105 if (isa<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
16106 *Ptr = N->getOperand(Num: 1);
16107 *CInc = N->getOperand(Num: 2);
16108 return true;
16109 }
16110 return false;
16111 }
16112 default:
16113 return false;
16114 }
16115}
16116
16117static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
16118 // Check that the add is independent of the load/store.
16119 // Otherwise, folding it would create a cycle. Search through Addr
16120  // as well, since the User may not be a direct user of Addr and may
16121  // only share a base pointer.
16122 SmallPtrSet<const SDNode *, 32> Visited;
16123 SmallVector<const SDNode *, 16> Worklist;
16124 Worklist.push_back(Elt: N);
16125 Worklist.push_back(Elt: User);
16126 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16127 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
16128 return false;
16129 return true;
16130}
16131
16132/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16133/// NEON load/store intrinsics, and generic vector load/stores, to merge
16134/// base address updates.
16135/// For generic load/stores, the memory type is assumed to be a vector.
16136/// The caller is assumed to have checked legality.
16137static SDValue CombineBaseUpdate(SDNode *N,
16138 TargetLowering::DAGCombinerInfo &DCI) {
16139 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16140 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16141 const bool isStore = N->getOpcode() == ISD::STORE;
16142 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16143 BaseUpdateTarget Target = {.N: N, .isIntrinsic: isIntrinsic, .isStore: isStore, .AddrOpIdx: AddrOpIdx};
16144
16145 SDValue Addr = N->getOperand(Num: AddrOpIdx);
16146
16147 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16148
16149 // Search for a use of the address operand that is an increment.
16150 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16151 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
16152 SDNode *User = *UI;
16153 if (UI.getUse().getResNo() != Addr.getResNo() ||
16154 User->getNumOperands() != 2)
16155 continue;
16156
16157 SDValue Inc = User->getOperand(Num: UI.getOperandNo() == 1 ? 0 : 1);
16158 unsigned ConstInc =
16159 getPointerConstIncrement(Opcode: User->getOpcode(), Ptr: Addr, Inc, DAG: DCI.DAG);
16160
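    // Record the user as a candidate if it increments by a known constant, or
    // if it is a plain ADD whose non-constant increment may still be foldable.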
16161 if (ConstInc || User->getOpcode() == ISD::ADD)
16162 BaseUpdates.push_back(Elt: {.N: User, .Inc: Inc, .ConstInc: ConstInc});
16163 }
16164
16165 // If the address is a constant pointer increment itself, find
16166 // another constant increment that has the same base operand
16167 SDValue Base;
16168 SDValue CInc;
16169 if (findPointerConstIncrement(N: Addr.getNode(), Ptr: &Base, CInc: &CInc)) {
16170 unsigned Offset =
16171 getPointerConstIncrement(Opcode: Addr->getOpcode(), Ptr: Base, Inc: CInc, DAG: DCI.DAG);
16172 for (SDNode::use_iterator UI = Base->use_begin(), UE = Base->use_end();
16173 UI != UE; ++UI) {
16174
16175 SDNode *User = *UI;
16176 if (UI.getUse().getResNo() != Base.getResNo() || User == Addr.getNode() ||
16177 User->getNumOperands() != 2)
16178 continue;
16179
16180 SDValue UserInc = User->getOperand(Num: UI.getOperandNo() == 0 ? 1 : 0);
16181 unsigned UserOffset =
16182 getPointerConstIncrement(Opcode: User->getOpcode(), Ptr: Base, Inc: UserInc, DAG: DCI.DAG);
16183
16184 if (!UserOffset || UserOffset <= Offset)
16185 continue;
16186
16187 unsigned NewConstInc = UserOffset - Offset;
16188 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16189 BaseUpdates.push_back(Elt: {.N: User, .Inc: NewInc, .ConstInc: NewConstInc});
16190 }
16191 }
16192
16193 // Try to fold the load/store with an update that matches memory
16194 // access size. This should work well for sequential loads.
16195 //
16196 // Filter out invalid updates as well.
16197 unsigned NumValidUpd = BaseUpdates.size();
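  // Updates that would create a cycle are swapped to the end of the list and
  // dropped by the resize below; the remaining candidates are first tried
  // with an exact-size constant increment.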
16198 for (unsigned I = 0; I < NumValidUpd;) {
16199 BaseUpdateUser &User = BaseUpdates[I];
16200 if (!isValidBaseUpdate(N, User: User.N)) {
16201 --NumValidUpd;
16202 std::swap(a&: BaseUpdates[I], b&: BaseUpdates[NumValidUpd]);
16203 continue;
16204 }
16205
16206 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16207 return SDValue();
16208 ++I;
16209 }
16210 BaseUpdates.resize(N: NumValidUpd);
16211
16212 // Try to fold with other users. Non-constant updates are considered
16213 // first, and constant updates are sorted to not break a sequence of
16214 // strided accesses (if there is any).
16215 std::stable_sort(first: BaseUpdates.begin(), last: BaseUpdates.end(),
16216 comp: [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16217 return LHS.ConstInc < RHS.ConstInc;
16218 });
16219 for (BaseUpdateUser &User : BaseUpdates) {
16220 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16221 return SDValue();
16222 }
16223 return SDValue();
16224}
16225
16226static SDValue PerformVLDCombine(SDNode *N,
16227 TargetLowering::DAGCombinerInfo &DCI) {
16228 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16229 return SDValue();
16230
16231 return CombineBaseUpdate(N, DCI);
16232}
16233
16234static SDValue PerformMVEVLDCombine(SDNode *N,
16235 TargetLowering::DAGCombinerInfo &DCI) {
16236 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16237 return SDValue();
16238
16239 SelectionDAG &DAG = DCI.DAG;
16240 SDValue Addr = N->getOperand(Num: 2);
16241 MemSDNode *MemN = cast<MemSDNode>(Val: N);
16242 SDLoc dl(N);
16243
16244  // For the stores, where there are multiple intrinsics, we only actually want
16245  // to post-inc the last of them.
16246 unsigned IntNo = N->getConstantOperandVal(Num: 1);
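  // The trailing operand of arm_mve_vst2q/vst4q selects which part of the
  // interleaving store this node performs; only the final part (1 of 2, or
  // 3 of 4) should receive the post-increment.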
16247 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16248 return SDValue();
16249 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16250 return SDValue();
16251
16252 // Search for a use of the address operand that is an increment.
16253 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
16254 UE = Addr.getNode()->use_end();
16255 UI != UE; ++UI) {
16256 SDNode *User = *UI;
16257 if (User->getOpcode() != ISD::ADD ||
16258 UI.getUse().getResNo() != Addr.getResNo())
16259 continue;
16260
16261 // Check that the add is independent of the load/store. Otherwise, folding
16262 // it would create a cycle. We can avoid searching through Addr as it's a
16263 // predecessor to both.
16264 SmallPtrSet<const SDNode *, 32> Visited;
16265 SmallVector<const SDNode *, 16> Worklist;
16266 Visited.insert(Ptr: Addr.getNode());
16267 Worklist.push_back(Elt: N);
16268 Worklist.push_back(Elt: User);
16269 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16270 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
16271 continue;
16272
16273 // Find the new opcode for the updating load/store.
16274 bool isLoadOp = true;
16275 unsigned NewOpc = 0;
16276 unsigned NumVecs = 0;
16277 switch (IntNo) {
16278 default:
16279 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16280 case Intrinsic::arm_mve_vld2q:
16281 NewOpc = ARMISD::VLD2_UPD;
16282 NumVecs = 2;
16283 break;
16284 case Intrinsic::arm_mve_vld4q:
16285 NewOpc = ARMISD::VLD4_UPD;
16286 NumVecs = 4;
16287 break;
16288 case Intrinsic::arm_mve_vst2q:
16289 NewOpc = ARMISD::VST2_UPD;
16290 NumVecs = 2;
16291 isLoadOp = false;
16292 break;
16293 case Intrinsic::arm_mve_vst4q:
16294 NewOpc = ARMISD::VST4_UPD;
16295 NumVecs = 4;
16296 isLoadOp = false;
16297 break;
16298 }
16299
16300 // Find the size of memory referenced by the load/store.
16301 EVT VecTy;
16302 if (isLoadOp) {
16303 VecTy = N->getValueType(ResNo: 0);
16304 } else {
16305 VecTy = N->getOperand(Num: 3).getValueType();
16306 }
16307
16308 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16309
16310 // If the increment is a constant, it must match the memory ref size.
16311 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
16312 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode());
16313 if (!CInc || CInc->getZExtValue() != NumBytes)
16314 continue;
16315
16316 // Create the new updating load/store node.
16317 // First, create an SDVTList for the new updating node's results.
16318 EVT Tys[6];
16319 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16320 unsigned n;
16321 for (n = 0; n < NumResultVecs; ++n)
16322 Tys[n] = VecTy;
16323 Tys[n++] = MVT::i32;
16324 Tys[n] = MVT::Other;
16325 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
16326
16327 // Then, gather the new node's operands.
16328 SmallVector<SDValue, 8> Ops;
16329 Ops.push_back(Elt: N->getOperand(Num: 0)); // incoming chain
16330 Ops.push_back(Elt: N->getOperand(Num: 2)); // ptr
16331 Ops.push_back(Elt: Inc);
16332
16333 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16334 Ops.push_back(Elt: N->getOperand(Num: i));
16335
16336 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl, VTList: SDTys, Ops, MemVT: VecTy,
16337 MMO: MemN->getMemOperand());
16338
16339 // Update the uses.
16340 SmallVector<SDValue, 5> NewResults;
16341 for (unsigned i = 0; i < NumResultVecs; ++i)
16342 NewResults.push_back(Elt: SDValue(UpdN.getNode(), i));
16343
16344 NewResults.push_back(Elt: SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16345 DCI.CombineTo(N, To: NewResults);
16346 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
16347
16348 break;
16349 }
16350
16351 return SDValue();
16352}
16353
16354/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16355/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16356/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16357/// return true.
16358static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16359 SelectionDAG &DAG = DCI.DAG;
16360 EVT VT = N->getValueType(ResNo: 0);
16361 // vldN-dup instructions only support 64-bit vectors for N > 1.
16362 if (!VT.is64BitVector())
16363 return false;
16364
16365 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16366 SDNode *VLD = N->getOperand(Num: 0).getNode();
16367 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16368 return false;
16369 unsigned NumVecs = 0;
16370 unsigned NewOpc = 0;
16371 unsigned IntNo = VLD->getConstantOperandVal(Num: 1);
16372 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16373 NumVecs = 2;
16374 NewOpc = ARMISD::VLD2DUP;
16375 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16376 NumVecs = 3;
16377 NewOpc = ARMISD::VLD3DUP;
16378 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16379 NumVecs = 4;
16380 NewOpc = ARMISD::VLD4DUP;
16381 } else {
16382 return false;
16383 }
16384
16385 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16386 // numbers match the load.
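  // The intrinsic's operands are: chain, intrinsic ID, pointer, NumVecs
  // source vectors, the lane number and the alignment, so the lane number
  // sits at index NumVecs + 3.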
16387 unsigned VLDLaneNo = VLD->getConstantOperandVal(Num: NumVecs + 3);
16388 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16389 UI != UE; ++UI) {
16390 // Ignore uses of the chain result.
16391 if (UI.getUse().getResNo() == NumVecs)
16392 continue;
16393 SDNode *User = *UI;
16394 if (User->getOpcode() != ARMISD::VDUPLANE ||
16395 VLDLaneNo != User->getConstantOperandVal(Num: 1))
16396 return false;
16397 }
16398
16399 // Create the vldN-dup node.
16400 EVT Tys[5];
16401 unsigned n;
16402 for (n = 0; n < NumVecs; ++n)
16403 Tys[n] = VT;
16404 Tys[n] = MVT::Other;
16405 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumVecs + 1));
16406 SDValue Ops[] = { VLD->getOperand(Num: 0), VLD->getOperand(Num: 2) };
16407 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(Val: VLD);
16408 SDValue VLDDup = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(VLD), VTList: SDTys,
16409 Ops, MemVT: VLDMemInt->getMemoryVT(),
16410 MMO: VLDMemInt->getMemOperand());
16411
16412 // Update the uses.
16413 for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
16414 UI != UE; ++UI) {
16415 unsigned ResNo = UI.getUse().getResNo();
16416 // Ignore uses of the chain result.
16417 if (ResNo == NumVecs)
16418 continue;
16419 SDNode *User = *UI;
16420 DCI.CombineTo(N: User, Res: SDValue(VLDDup.getNode(), ResNo));
16421 }
16422
16423 // Now the vldN-lane intrinsic is dead except for its chain result.
16424 // Update uses of the chain.
16425 std::vector<SDValue> VLDDupResults;
16426 for (unsigned n = 0; n < NumVecs; ++n)
16427 VLDDupResults.push_back(x: SDValue(VLDDup.getNode(), n));
16428 VLDDupResults.push_back(x: SDValue(VLDDup.getNode(), NumVecs));
16429 DCI.CombineTo(N: VLD, To: VLDDupResults);
16430
16431 return true;
16432}
16433
16434/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16435/// ARMISD::VDUPLANE.
16436static SDValue PerformVDUPLANECombine(SDNode *N,
16437 TargetLowering::DAGCombinerInfo &DCI,
16438 const ARMSubtarget *Subtarget) {
16439 SDValue Op = N->getOperand(Num: 0);
16440 EVT VT = N->getValueType(ResNo: 0);
16441
16442 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16443 if (Subtarget->hasMVEIntegerOps()) {
16444 EVT ExtractVT = VT.getVectorElementType();
16445 // We need to ensure we are creating a legal type.
16446 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16447 ExtractVT = MVT::i32;
16448 SDValue Extract = DCI.DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SDLoc(N), VT: ExtractVT,
16449 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1));
16450 return DCI.DAG.getNode(Opcode: ARMISD::VDUP, DL: SDLoc(N), VT, Operand: Extract);
16451 }
16452
16453 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16454 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16455 if (CombineVLDDUP(N, DCI))
16456 return SDValue(N, 0);
16457
16458 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16459 // redundant. Ignore bit_converts for now; element sizes are checked below.
16460 while (Op.getOpcode() == ISD::BITCAST)
16461 Op = Op.getOperand(i: 0);
16462 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16463 return SDValue();
16464
16465 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16466 unsigned EltSize = Op.getScalarValueSizeInBits();
16467 // The canonical VMOV for a zero vector uses a 32-bit element size.
16468 unsigned Imm = Op.getConstantOperandVal(i: 0);
16469 unsigned EltBits;
16470 if (ARM_AM::decodeVMOVModImm(ModImm: Imm, EltBits) == 0)
16471 EltSize = 8;
16472 if (EltSize > VT.getScalarSizeInBits())
16473 return SDValue();
16474
16475 return DCI.DAG.getNode(Opcode: ISD::BITCAST, DL: SDLoc(N), VT, Operand: Op);
16476}
16477
16478/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16479static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16480 const ARMSubtarget *Subtarget) {
16481 SDValue Op = N->getOperand(Num: 0);
16482 SDLoc dl(N);
16483
16484 if (Subtarget->hasMVEIntegerOps()) {
16485 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16486 // need to come from a GPR.
16487 if (Op.getValueType() == MVT::f32)
16488 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16489 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16490 else if (Op.getValueType() == MVT::f16)
16491 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16492 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16493 }
16494
16495 if (!Subtarget->hasNEON())
16496 return SDValue();
16497
16498 // Match VDUP(LOAD) -> VLD1DUP.
16499 // We match this pattern here rather than waiting for isel because the
16500 // transform is only legal for unindexed loads.
16501 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: Op.getNode());
16502 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16503 LD->getMemoryVT() == N->getValueType(ResNo: 0).getVectorElementType()) {
16504 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16505 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16506 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16507 SDValue VLDDup =
16508 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16509 LD->getMemoryVT(), LD->getMemOperand());
16510 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: VLDDup.getValue(R: 1));
16511 return VLDDup;
16512 }
16513
16514 return SDValue();
16515}
16516
16517static SDValue PerformLOADCombine(SDNode *N,
16518 TargetLowering::DAGCombinerInfo &DCI,
16519 const ARMSubtarget *Subtarget) {
16520 EVT VT = N->getValueType(ResNo: 0);
16521
16522 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16523 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16524 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16525 return CombineBaseUpdate(N, DCI);
16526
16527 return SDValue();
16528}
16529
16530// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16531// pack all of the elements in one place. Next, store to memory in fewer
16532// chunks.
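// For example, a truncating store of v4i32 to v4i16 is rewritten as a shuffle
// that packs the four halfwords into the bottom of the register, followed by
// one or more wider integer stores of the packed data.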
16533static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16534 SelectionDAG &DAG) {
16535 SDValue StVal = St->getValue();
16536 EVT VT = StVal.getValueType();
16537 if (!St->isTruncatingStore() || !VT.isVector())
16538 return SDValue();
16539 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16540 EVT StVT = St->getMemoryVT();
16541 unsigned NumElems = VT.getVectorNumElements();
16542 assert(StVT != VT && "Cannot truncate to the same type");
16543 unsigned FromEltSz = VT.getScalarSizeInBits();
16544 unsigned ToEltSz = StVT.getScalarSizeInBits();
16545
16546  // From and To element sizes and the element count must be powers of two.
16547 if (!isPowerOf2_32(Value: NumElems * FromEltSz * ToEltSz))
16548 return SDValue();
16549
16550 // We are going to use the original vector elt for storing.
16551 // Accumulated smaller vector elements must be a multiple of the store size.
16552 if (0 != (NumElems * FromEltSz) % ToEltSz)
16553 return SDValue();
16554
16555 unsigned SizeRatio = FromEltSz / ToEltSz;
16556 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16557
16558 // Create a type on which we perform the shuffle.
16559 EVT WideVecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: StVT.getScalarType(),
16560 NumElements: NumElems * SizeRatio);
16561 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16562
16563 SDLoc DL(St);
16564 SDValue WideVec = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WideVecVT, Operand: StVal);
16565 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16566 for (unsigned i = 0; i < NumElems; ++i)
16567 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16568 : i * SizeRatio;
16569
16570 // Can't shuffle using an illegal type.
16571 if (!TLI.isTypeLegal(VT: WideVecVT))
16572 return SDValue();
16573
16574 SDValue Shuff = DAG.getVectorShuffle(
16575 VT: WideVecVT, dl: DL, N1: WideVec, N2: DAG.getUNDEF(VT: WideVec.getValueType()), Mask: ShuffleVec);
16576 // At this point all of the data is stored at the bottom of the
16577 // register. We now need to save it to mem.
16578
16579 // Find the largest store unit
16580 MVT StoreType = MVT::i8;
16581 for (MVT Tp : MVT::integer_valuetypes()) {
16582 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16583 StoreType = Tp;
16584 }
16585 // Didn't find a legal store type.
16586 if (!TLI.isTypeLegal(VT: StoreType))
16587 return SDValue();
16588
16589 // Bitcast the original vector into a vector of store-size units
16590 EVT StoreVecVT =
16591 EVT::getVectorVT(Context&: *DAG.getContext(), VT: StoreType,
16592 NumElements: VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16593 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16594 SDValue ShuffWide = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: StoreVecVT, Operand: Shuff);
16595 SmallVector<SDValue, 8> Chains;
16596 SDValue Increment = DAG.getConstant(Val: StoreType.getSizeInBits() / 8, DL,
16597 VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
16598 SDValue BasePtr = St->getBasePtr();
16599
16600 // Perform one or more big stores into memory.
16601 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16602 for (unsigned I = 0; I < E; I++) {
16603 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: StoreType,
16604 N1: ShuffWide, N2: DAG.getIntPtrConstant(Val: I, DL));
16605 SDValue Ch =
16606 DAG.getStore(Chain: St->getChain(), dl: DL, Val: SubVec, Ptr: BasePtr, PtrInfo: St->getPointerInfo(),
16607 Alignment: St->getAlign(), MMOFlags: St->getMemOperand()->getFlags());
16608 BasePtr =
16609 DAG.getNode(Opcode: ISD::ADD, DL, VT: BasePtr.getValueType(), N1: BasePtr, N2: Increment);
16610 Chains.push_back(Elt: Ch);
16611 }
16612 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16613}
16614
16615// Try taking a single vector store from an fpround (which would otherwise turn
16616// into an expensive buildvector) and splitting it into a series of narrowing
16617// stores.
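// For example, a v8f16 store of (fpround v8f32 x) is split into two slices:
// each v4f32 half is narrowed with a VCVTN and written out as a v4i32->v4i16
// truncating store.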
16618static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16619 SelectionDAG &DAG) {
16620 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16621 return SDValue();
16622 SDValue Trunc = St->getValue();
16623 if (Trunc->getOpcode() != ISD::FP_ROUND)
16624 return SDValue();
16625 EVT FromVT = Trunc->getOperand(Num: 0).getValueType();
16626 EVT ToVT = Trunc.getValueType();
16627 if (!ToVT.isVector())
16628 return SDValue();
16629 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
16630 EVT ToEltVT = ToVT.getVectorElementType();
16631 EVT FromEltVT = FromVT.getVectorElementType();
16632
16633 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16634 return SDValue();
16635
16636 unsigned NumElements = 4;
16637 if (FromVT.getVectorNumElements() % NumElements != 0)
16638 return SDValue();
16639
16640  // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16641 // use the VMOVN over splitting the store. We are looking for patterns of:
16642 // !rev: 0 N 1 N+1 2 N+2 ...
16643 // rev: N 0 N+1 1 N+2 2 ...
16644 // The shuffle may either be a single source (in which case N = NumElts/2) or
16645 // two inputs extended with concat to the same size (in which case N =
16646 // NumElts).
16647 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16648 ArrayRef<int> M = SVN->getMask();
16649 unsigned NumElts = ToVT.getVectorNumElements();
16650 if (SVN->getOperand(Num: 1).isUndef())
16651 NumElts /= 2;
16652
16653 unsigned Off0 = Rev ? NumElts : 0;
16654 unsigned Off1 = Rev ? 0 : NumElts;
16655
16656 for (unsigned I = 0; I < NumElts; I += 2) {
16657 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16658 return false;
16659 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16660 return false;
16661 }
16662
16663 return true;
16664 };
16665
16666 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val: Trunc.getOperand(i: 0)))
16667 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16668 return SDValue();
16669
16670 LLVMContext &C = *DAG.getContext();
16671 SDLoc DL(St);
16672 // Details about the old store
16673 SDValue Ch = St->getChain();
16674 SDValue BasePtr = St->getBasePtr();
16675 Align Alignment = St->getOriginalAlign();
16676 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16677 AAMDNodes AAInfo = St->getAAInfo();
16678
16679 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16680 // and then stored as truncating integer stores.
16681 EVT NewFromVT = EVT::getVectorVT(Context&: C, VT: FromEltVT, NumElements);
16682 EVT NewToVT = EVT::getVectorVT(
16683 Context&: C, VT: EVT::getIntegerVT(Context&: C, BitWidth: ToEltVT.getSizeInBits()), NumElements);
16684
16685 SmallVector<SDValue, 4> Stores;
16686 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16687 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16688 SDValue NewPtr =
16689 DAG.getObjectPtrOffset(SL: DL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: NewOffset));
16690
16691 SDValue Extract =
16692 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16693 DAG.getConstant(i * NumElements, DL, MVT::i32));
16694
16695 SDValue FPTrunc =
16696 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16697 Extract, DAG.getConstant(0, DL, MVT::i32));
16698 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16699
16700 SDValue Store = DAG.getTruncStore(
16701 Chain: Ch, dl: DL, Val: Extract, Ptr: NewPtr, PtrInfo: St->getPointerInfo().getWithOffset(O: NewOffset),
16702 SVT: NewToVT, Alignment, MMOFlags, AAInfo);
16703 Stores.push_back(Elt: Store);
16704 }
16705 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16706}
16707
16708// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16709// into an expensive buildvector) and splitting it into a series of narrowing
16710// stores.
16711static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16712 SelectionDAG &DAG) {
16713 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16714 return SDValue();
16715 SDValue Trunc = St->getValue();
16716 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16717 return SDValue();
16718 EVT FromVT = Trunc->getOperand(Num: 0).getValueType();
16719 EVT ToVT = Trunc.getValueType();
16720
16721 LLVMContext &C = *DAG.getContext();
16722 SDLoc DL(St);
16723 // Details about the old store
16724 SDValue Ch = St->getChain();
16725 SDValue BasePtr = St->getBasePtr();
16726 Align Alignment = St->getOriginalAlign();
16727 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16728 AAMDNodes AAInfo = St->getAAInfo();
16729
16730 EVT NewToVT = EVT::getVectorVT(Context&: C, VT: ToVT.getVectorElementType(),
16731 NumElements: FromVT.getVectorNumElements());
16732
16733 SmallVector<SDValue, 4> Stores;
16734 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16735 unsigned NewOffset =
16736 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16737 SDValue NewPtr =
16738 DAG.getObjectPtrOffset(SL: DL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: NewOffset));
16739
16740 SDValue Extract = Trunc.getOperand(i);
16741 SDValue Store = DAG.getTruncStore(
16742 Chain: Ch, dl: DL, Val: Extract, Ptr: NewPtr, PtrInfo: St->getPointerInfo().getWithOffset(O: NewOffset),
16743 SVT: NewToVT, Alignment, MMOFlags, AAInfo);
16744 Stores.push_back(Elt: Store);
16745 }
16746 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16747}
16748
16749// Given a floating point store from an extracted vector, with an integer
16750// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16751// help reduce fp register pressure, avoids the fp extract, and allows the use
16752// of integer post-inc stores that are not available with vstr.
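// For example, a store of (f16 extract_vector_elt x, lane) for which a
// matching (i32 VGETLANEu x, lane) node already exists is replaced by a
// truncating i16 store of that VGETLANEu result.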
16753static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16754 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16755 return SDValue();
16756 SDValue Extract = St->getValue();
16757 EVT VT = Extract.getValueType();
16758 // For now only uses f16. This may be useful for f32 too, but that will
16759 // be bitcast(extract), not the VGETLANEu we currently check here.
16760 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16761 return SDValue();
16762
16763 SDNode *GetLane =
16764 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16765 {Extract.getOperand(0), Extract.getOperand(1)});
16766 if (!GetLane)
16767 return SDValue();
16768
16769 LLVMContext &C = *DAG.getContext();
16770 SDLoc DL(St);
16771 // Create a new integer store to replace the existing floating point version.
16772 SDValue Ch = St->getChain();
16773 SDValue BasePtr = St->getBasePtr();
16774 Align Alignment = St->getOriginalAlign();
16775 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16776 AAMDNodes AAInfo = St->getAAInfo();
16777 EVT NewToVT = EVT::getIntegerVT(Context&: C, BitWidth: VT.getSizeInBits());
16778 SDValue Store = DAG.getTruncStore(Chain: Ch, dl: DL, Val: SDValue(GetLane, 0), Ptr: BasePtr,
16779 PtrInfo: St->getPointerInfo(), SVT: NewToVT, Alignment,
16780 MMOFlags, AAInfo);
16781
16782 return Store;
16783}
16784
16785/// PerformSTORECombine - Target-specific dag combine xforms for
16786/// ISD::STORE.
16787static SDValue PerformSTORECombine(SDNode *N,
16788 TargetLowering::DAGCombinerInfo &DCI,
16789 const ARMSubtarget *Subtarget) {
16790 StoreSDNode *St = cast<StoreSDNode>(Val: N);
16791 if (St->isVolatile())
16792 return SDValue();
16793 SDValue StVal = St->getValue();
16794 EVT VT = StVal.getValueType();
16795
16796 if (Subtarget->hasNEON())
16797 if (SDValue Store = PerformTruncatingStoreCombine(St, DAG&: DCI.DAG))
16798 return Store;
16799
16800 if (Subtarget->hasMVEFloatOps())
16801 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DAG&: DCI.DAG))
16802 return NewToken;
16803
16804 if (Subtarget->hasMVEIntegerOps()) {
16805 if (SDValue NewChain = PerformExtractFpToIntStores(St, DAG&: DCI.DAG))
16806 return NewChain;
16807 if (SDValue NewToken =
16808 PerformSplittingMVETruncToNarrowingStores(St, DAG&: DCI.DAG))
16809 return NewToken;
16810 }
16811
16812 if (!ISD::isNormalStore(N: St))
16813 return SDValue();
16814
16815 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16816 // ARM stores of arguments in the same cache line.
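  // For example, store (VMOVDRR lo, hi) becomes a store of lo at the base
  // address and a store of hi at base+4 (the operands swap on big-endian).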
16817 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16818 StVal.getNode()->hasOneUse()) {
16819 SelectionDAG &DAG = DCI.DAG;
16820 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16821 SDLoc DL(St);
16822 SDValue BasePtr = St->getBasePtr();
16823 SDValue NewST1 = DAG.getStore(
16824 Chain: St->getChain(), dl: DL, Val: StVal.getNode()->getOperand(Num: isBigEndian ? 1 : 0),
16825 Ptr: BasePtr, PtrInfo: St->getPointerInfo(), Alignment: St->getOriginalAlign(),
16826 MMOFlags: St->getMemOperand()->getFlags());
16827
16828 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16829 DAG.getConstant(4, DL, MVT::i32));
16830 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL,
16831 Val: StVal.getNode()->getOperand(Num: isBigEndian ? 0 : 1),
16832 Ptr: OffsetPtr, PtrInfo: St->getPointerInfo().getWithOffset(O: 4),
16833 Alignment: St->getOriginalAlign(),
16834 MMOFlags: St->getMemOperand()->getFlags());
16835 }
16836
16837 if (StVal.getValueType() == MVT::i64 &&
16838 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16839
16840 // Bitcast an i64 store extracted from a vector to f64.
16841 // Otherwise, the i64 value will be legalized to a pair of i32 values.
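    // For example:
    //   store (i64 (extract_vector_elt (v2i64 x), n))
    // becomes
    //   store (i64 (bitcast (f64 (extract_vector_elt (v2f64 (bitcast x)), n))))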
16842 SelectionDAG &DAG = DCI.DAG;
16843 SDLoc dl(StVal);
16844 SDValue IntVec = StVal.getOperand(i: 0);
16845 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16846 IntVec.getValueType().getVectorNumElements());
16847 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: FloatVT, Operand: IntVec);
16848 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16849 Vec, StVal.getOperand(1));
16850 dl = SDLoc(N);
16851 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16852 // Make the DAGCombiner fold the bitcasts.
16853 DCI.AddToWorklist(N: Vec.getNode());
16854 DCI.AddToWorklist(N: ExtElt.getNode());
16855 DCI.AddToWorklist(N: V.getNode());
16856 return DAG.getStore(Chain: St->getChain(), dl, Val: V, Ptr: St->getBasePtr(),
16857 PtrInfo: St->getPointerInfo(), Alignment: St->getAlign(),
16858 MMOFlags: St->getMemOperand()->getFlags(), AAInfo: St->getAAInfo());
16859 }
16860
16861 // If this is a legal vector store, try to combine it into a VST1_UPD.
16862 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16863 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16864 return CombineBaseUpdate(N, DCI);
16865
16866 return SDValue();
16867}
16868
16869/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16870/// can replace combinations of VMUL and VCVT (floating-point to integer)
16871/// when the VMUL has a constant operand that is a power of 2.
16872///
16873/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16874/// vmul.f32 d16, d17, d16
16875/// vcvt.s32.f32 d16, d16
16876/// becomes:
16877/// vcvt.s32.f32 d16, d16, #3
16878static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16879 const ARMSubtarget *Subtarget) {
16880 if (!Subtarget->hasNEON())
16881 return SDValue();
16882
16883 SDValue Op = N->getOperand(Num: 0);
16884 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16885 Op.getOpcode() != ISD::FMUL)
16886 return SDValue();
16887
16888 SDValue ConstVec = Op->getOperand(Num: 1);
16889 if (!isa<BuildVectorSDNode>(Val: ConstVec))
16890 return SDValue();
16891
16892 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16893 uint32_t FloatBits = FloatTy.getSizeInBits();
16894 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
16895 uint32_t IntBits = IntTy.getSizeInBits();
16896 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16897 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16898 // These instructions only exist converting from f32 to i32. We can handle
16899 // smaller integers by generating an extra truncate, but larger ones would
16900 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16901    // these instructions only support v2i32/v4i32 types.
16902 return SDValue();
16903 }
16904
16905 BitVector UndefElements;
16906 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
16907 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: 33);
16908 if (C == -1 || C == 0 || C > 32)
16909 return SDValue();
16910
16911 SDLoc dl(N);
16912 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16913 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16914 Intrinsic::arm_neon_vcvtfp2fxu;
16915 SDValue FixConv = DAG.getNode(
16916 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16917 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16918 DAG.getConstant(C, dl, MVT::i32));
16919
16920 if (IntBits < FloatBits)
16921 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N->getValueType(ResNo: 0), Operand: FixConv);
16922
16923 return FixConv;
16924}
16925
16926static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16927 const ARMSubtarget *Subtarget) {
16928 if (!Subtarget->hasMVEFloatOps())
16929 return SDValue();
16930
16931 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16932 // The second form can be more easily turned into a predicated vadd, and
16933 // possibly combined into a fma to become a predicated vfma.
16934 SDValue Op0 = N->getOperand(Num: 0);
16935 SDValue Op1 = N->getOperand(Num: 1);
16936 EVT VT = N->getValueType(ResNo: 0);
16937 SDLoc DL(N);
16938
16939  // The identity element for an fadd is -0.0, or also +0.0 when the nsz flag
16940  // is set, which is what these VMOVs represent.
16941 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16942 if (Op.getOpcode() != ISD::BITCAST ||
16943 Op.getOperand(i: 0).getOpcode() != ARMISD::VMOVIMM)
16944 return false;
16945 uint64_t ImmVal = Op.getOperand(i: 0).getConstantOperandVal(i: 0);
16946 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16947 return true;
16948 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16949 return true;
16950 return false;
16951 };
16952
16953 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16954 std::swap(a&: Op0, b&: Op1);
16955
16956 if (Op1.getOpcode() != ISD::VSELECT)
16957 return SDValue();
16958
16959 SDNodeFlags FaddFlags = N->getFlags();
16960 bool NSZ = FaddFlags.hasNoSignedZeros();
16961 if (!isIdentitySplat(Op1.getOperand(i: 2), NSZ))
16962 return SDValue();
16963
16964 SDValue FAdd =
16965 DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: Op0, N2: Op1.getOperand(i: 1), Flags: FaddFlags);
16966 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: Op1.getOperand(i: 0), N2: FAdd, N3: Op0, Flags: FaddFlags);
16967}
16968
16969static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16970 SDValue LHS = N->getOperand(Num: 0);
16971 SDValue RHS = N->getOperand(Num: 1);
16972 EVT VT = N->getValueType(ResNo: 0);
16973 SDLoc DL(N);
16974
16975 if (!N->getFlags().hasAllowReassociation())
16976 return SDValue();
16977
16978  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16979 auto ReassocComplex = [&](SDValue A, SDValue B) {
16980 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16981 return SDValue();
16982 unsigned Opc = A.getConstantOperandVal(i: 0);
16983 if (Opc != Intrinsic::arm_mve_vcmlaq)
16984 return SDValue();
16985 SDValue VCMLA = DAG.getNode(
16986 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0), N2: A.getOperand(i: 1),
16987 N3: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 2), N2: B, Flags: N->getFlags()),
16988 N4: A.getOperand(i: 3), N5: A.getOperand(i: 4));
16989 VCMLA->setFlags(A->getFlags());
16990 return VCMLA;
16991 };
16992 if (SDValue R = ReassocComplex(LHS, RHS))
16993 return R;
16994 if (SDValue R = ReassocComplex(RHS, LHS))
16995 return R;
16996
16997 return SDValue();
16998}
16999
17000static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
17001 const ARMSubtarget *Subtarget) {
17002 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17003 return S;
17004 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17005 return S;
17006 return SDValue();
17007}
17008
17009/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17010/// can replace combinations of VCVT (integer to floating-point) and VDIV
17011/// when the VDIV has a constant operand that is a power of 2.
17012///
17013/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
17014/// vcvt.f32.s32 d16, d16
17015/// vdiv.f32 d16, d17, d16
17016/// becomes:
17017/// vcvt.f32.s32 d16, d16, #3
17018static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
17019 const ARMSubtarget *Subtarget) {
17020 if (!Subtarget->hasNEON())
17021 return SDValue();
17022
17023 SDValue Op = N->getOperand(Num: 0);
17024 unsigned OpOpcode = Op.getNode()->getOpcode();
17025 if (!N->getValueType(ResNo: 0).isVector() || !N->getValueType(ResNo: 0).isSimple() ||
17026 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17027 return SDValue();
17028
17029 SDValue ConstVec = N->getOperand(Num: 1);
17030 if (!isa<BuildVectorSDNode>(Val: ConstVec))
17031 return SDValue();
17032
17033 MVT FloatTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
17034 uint32_t FloatBits = FloatTy.getSizeInBits();
17035 MVT IntTy = Op.getOperand(i: 0).getSimpleValueType().getVectorElementType();
17036 uint32_t IntBits = IntTy.getSizeInBits();
17037 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17038 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17039 // These instructions only exist converting from i32 to f32. We can handle
17040 // smaller integers by generating an extra extend, but larger ones would
17041 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17042    // these instructions only support v2i32/v4i32 types.
17043 return SDValue();
17044 }
17045
17046 BitVector UndefElements;
17047 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
17048 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: 33);
17049 if (C == -1 || C == 0 || C > 32)
17050 return SDValue();
17051
17052 SDLoc dl(N);
17053 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17054 SDValue ConvInput = Op.getOperand(i: 0);
17055 if (IntBits < FloatBits)
17056 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
17057 dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
17058 ConvInput);
17059
17060 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
17061 Intrinsic::arm_neon_vcvtfxu2fp;
17062 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
17063 Op.getValueType(),
17064 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
17065 ConvInput, DAG.getConstant(C, dl, MVT::i32));
17066}
17067
17068static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
17069 const ARMSubtarget *ST) {
17070 if (!ST->hasMVEIntegerOps())
17071 return SDValue();
17072
17073 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17074 EVT ResVT = N->getValueType(ResNo: 0);
17075 SDValue N0 = N->getOperand(Num: 0);
17076 SDLoc dl(N);
17077
17078 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17079 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17080 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17081 N0.getValueType() == MVT::v16i8)) {
17082 SDValue Red0 = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL: dl, VT: ResVT, Operand: N0.getOperand(i: 0));
17083 SDValue Red1 = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL: dl, VT: ResVT, Operand: N0.getOperand(i: 1));
17084 return DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: ResVT, N1: Red0, N2: Red1);
17085 }
17086
17087 // We are looking for something that will have illegal types if left alone,
17088 // but that we can convert to a single instruction under MVE. For example
17089 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17090 // or
17091 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17092
17093 // The legal cases are:
17094 // VADDV u/s 8/16/32
17095 // VMLAV u/s 8/16/32
17096 // VADDLV u/s 32
17097 // VMLALV u/s 16/32
17098
17099 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17100 // extend it and use v4i32 instead.
17101 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17102 EVT AVT = A.getValueType();
17103 return any_of(Range&: ExtTypes, P: [&](MVT Ty) {
17104 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17105 AVT.bitsLE(VT: Ty);
17106 });
17107 };
17108 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17109 EVT AVT = A.getValueType();
17110 if (!AVT.is128BitVector())
17111 A = DAG.getNode(Opcode: ExtendCode, DL: dl,
17112 VT: AVT.changeVectorElementType(EltVT: MVT::getIntegerVT(
17113 BitWidth: 128 / AVT.getVectorMinNumElements())),
17114 Operand: A);
17115 return A;
17116 };
17117 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17118 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17119 return SDValue();
17120 SDValue A = N0->getOperand(Num: 0);
17121 if (ExtTypeMatches(A, ExtTypes))
17122 return ExtendIfNeeded(A, ExtendCode);
17123 return SDValue();
17124 };
17125 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17126 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17127 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17128 !ISD::isBuildVectorAllZeros(N: N0->getOperand(Num: 2).getNode()))
17129 return SDValue();
17130 Mask = N0->getOperand(Num: 0);
17131 SDValue Ext = N0->getOperand(Num: 1);
17132 if (Ext->getOpcode() != ExtendCode)
17133 return SDValue();
17134 SDValue A = Ext->getOperand(Num: 0);
17135 if (ExtTypeMatches(A, ExtTypes))
17136 return ExtendIfNeeded(A, ExtendCode);
17137 return SDValue();
17138 };
17139 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17140 SDValue &A, SDValue &B) {
17141 // For a vmla we are trying to match a larger pattern:
17142 // ExtA = sext/zext A
17143 // ExtB = sext/zext B
17144 // Mul = mul ExtA, ExtB
17145 // vecreduce.add Mul
17146    // There might also be an extra extend between the mul and the addreduce, so
17147 // long as the bitwidth is high enough to make them equivalent (for example
17148 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17149 if (ResVT != RetTy)
17150 return false;
17151 SDValue Mul = N0;
17152 if (Mul->getOpcode() == ExtendCode &&
17153 Mul->getOperand(Num: 0).getScalarValueSizeInBits() * 2 >=
17154 ResVT.getScalarSizeInBits())
17155 Mul = Mul->getOperand(Num: 0);
17156 if (Mul->getOpcode() != ISD::MUL)
17157 return false;
17158 SDValue ExtA = Mul->getOperand(Num: 0);
17159 SDValue ExtB = Mul->getOperand(Num: 1);
17160 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17161 return false;
17162 A = ExtA->getOperand(Num: 0);
17163 B = ExtB->getOperand(Num: 0);
17164 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17165 A = ExtendIfNeeded(A, ExtendCode);
17166 B = ExtendIfNeeded(B, ExtendCode);
17167 return true;
17168 }
17169 return false;
17170 };
17171 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17172 SDValue &A, SDValue &B, SDValue &Mask) {
17173 // Same as the pattern above with a select for the zero predicated lanes
17174 // ExtA = sext/zext A
17175 // ExtB = sext/zext B
17176 // Mul = mul ExtA, ExtB
17177 // N0 = select Mask, Mul, 0
17178 // vecreduce.add N0
17179 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17180 !ISD::isBuildVectorAllZeros(N: N0->getOperand(Num: 2).getNode()))
17181 return false;
17182 Mask = N0->getOperand(Num: 0);
17183 SDValue Mul = N0->getOperand(Num: 1);
17184 if (Mul->getOpcode() == ExtendCode &&
17185 Mul->getOperand(Num: 0).getScalarValueSizeInBits() * 2 >=
17186 ResVT.getScalarSizeInBits())
17187 Mul = Mul->getOperand(Num: 0);
17188 if (Mul->getOpcode() != ISD::MUL)
17189 return false;
17190 SDValue ExtA = Mul->getOperand(Num: 0);
17191 SDValue ExtB = Mul->getOperand(Num: 1);
17192 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17193 return false;
17194 A = ExtA->getOperand(Num: 0);
17195 B = ExtB->getOperand(Num: 0);
17196 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17197 A = ExtendIfNeeded(A, ExtendCode);
17198 B = ExtendIfNeeded(B, ExtendCode);
17199 return true;
17200 }
17201 return false;
17202 };
17203 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17204 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17205 // reductions. The operands are extended with MVEEXT, but as they are
17206 // reductions the lane orders do not matter. MVEEXT may be combined with
17207 // loads to produce two extending loads, or else they will be expanded to
17208 // VREV/VMOVL.
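    // For example, VMLALV(v16i8 a, v16i8 b) is emitted as a VMLALV of the low
    // v8i16 halves followed by a VMLALVA that accumulates the high halves,
    // with the two i32 results combined into an i64 by a BUILD_PAIR.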
17209 EVT VT = Ops[0].getValueType();
17210 if (VT == MVT::v16i8) {
17211 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17212 "Unexpected illegal long reduction opcode");
17213 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17214
17215 SDValue Ext0 =
17216 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17217 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17218 SDValue Ext1 =
17219 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17220 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17221
17222 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17223 Ext0, Ext1);
17224 SDValue MLA1 =
17225 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17226 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17227 Ext0.getValue(1), Ext1.getValue(1));
17228 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17229 }
17230 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17231 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17232 SDValue(Node.getNode(), 1));
17233 };
17234
17235 SDValue A, B;
17236 SDValue Mask;
17237 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17238 return DAG.getNode(Opcode: ARMISD::VMLAVs, DL: dl, VT: ResVT, N1: A, N2: B);
17239 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17240 return DAG.getNode(Opcode: ARMISD::VMLAVu, DL: dl, VT: ResVT, N1: A, N2: B);
17241 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17242 A, B))
17243 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17244 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17245 A, B))
17246 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17247 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17248 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17249 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17250 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17251 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17252 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17253
17254 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17255 Mask))
17256 return DAG.getNode(Opcode: ARMISD::VMLAVps, DL: dl, VT: ResVT, N1: A, N2: B, N3: Mask);
17257 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17258 Mask))
17259 return DAG.getNode(Opcode: ARMISD::VMLAVpu, DL: dl, VT: ResVT, N1: A, N2: B, N3: Mask);
17260 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17261 Mask))
17262 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17263 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17264 Mask))
17265 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17266 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17267 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17268 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17269 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17270 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17271 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17272
17273 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17274 return DAG.getNode(Opcode: ARMISD::VADDVs, DL: dl, VT: ResVT, Operand: A);
17275 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17276 return DAG.getNode(Opcode: ARMISD::VADDVu, DL: dl, VT: ResVT, Operand: A);
17277 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17278 return Create64bitNode(ARMISD::VADDLVs, {A});
17279 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17280 return Create64bitNode(ARMISD::VADDLVu, {A});
17281 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17282 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17283 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17284 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17285 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17286 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17287
17288 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17289 return DAG.getNode(Opcode: ARMISD::VADDVps, DL: dl, VT: ResVT, N1: A, N2: Mask);
17290 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17291 return DAG.getNode(Opcode: ARMISD::VADDVpu, DL: dl, VT: ResVT, N1: A, N2: Mask);
17292 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17293 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17294 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17295 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17296 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17297 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17298 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17299 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17300 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17301 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17302
17303  // Some complications. We can get a case where the two inputs of the mul are
17304  // the same, in which case the sext of the mul will have been converted to a
17305  // zext (a square is never negative), so turn it back into a sext.
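  // For example:
  //   vecreduce_add(zext(mul(sext(a), sext(a))))
  // is rewritten to
  //   vecreduce_add(sext(mul(sext(a), sext(a))))
  // so that the signed VMLAV/VMLALV patterns above can match later.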
17306 SDValue Op = N0;
17307 if (Op->getOpcode() == ISD::VSELECT)
17308 Op = Op->getOperand(Num: 1);
17309 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17310 Op->getOperand(Num: 0)->getOpcode() == ISD::MUL) {
17311 SDValue Mul = Op->getOperand(Num: 0);
17312 if (Mul->getOperand(Num: 0) == Mul->getOperand(Num: 1) &&
17313 Mul->getOperand(Num: 0)->getOpcode() == ISD::SIGN_EXTEND) {
17314 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: N0->getValueType(ResNo: 0), Operand: Mul);
17315 if (Op != N0)
17316 Ext = DAG.getNode(Opcode: ISD::VSELECT, DL: dl, VT: N0->getValueType(ResNo: 0),
17317 N1: N0->getOperand(Num: 0), N2: Ext, N3: N0->getOperand(Num: 2));
17318 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL: dl, VT: ResVT, Operand: Ext);
17319 }
17320 }
17321
17322 return SDValue();
17323}
17324
17325// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17326// the lanes are used. Due to the reduction being commutative the shuffle can be
17327// removed.
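// For example, VADDV(vector_shuffle x, <3,2,1,0>) can become VADDV(x), since
// the reduction sums every lane regardless of order.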
17328static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17329 unsigned VecOp = N->getOperand(Num: 0).getValueType().isVector() ? 0 : 2;
17330 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: VecOp));
17331 if (!Shuf || !Shuf->getOperand(Num: 1).isUndef())
17332 return SDValue();
17333
17334 // Check all elements are used once in the mask.
17335 ArrayRef<int> Mask = Shuf->getMask();
17336 APInt SetElts(Mask.size(), 0);
17337 for (int E : Mask) {
17338 if (E < 0 || E >= (int)Mask.size())
17339 return SDValue();
17340 SetElts.setBit(E);
17341 }
17342 if (!SetElts.isAllOnes())
17343 return SDValue();
17344
17345 if (N->getNumOperands() != VecOp + 1) {
17346 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: VecOp + 1));
17347 if (!Shuf2 || !Shuf2->getOperand(Num: 1).isUndef() || Shuf2->getMask() != Mask)
17348 return SDValue();
17349 }
17350
17351 SmallVector<SDValue> Ops;
17352 for (SDValue Op : N->ops()) {
17353 if (Op.getValueType().isVector())
17354 Ops.push_back(Elt: Op.getOperand(i: 0));
17355 else
17356 Ops.push_back(Elt: Op);
17357 }
17358 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VTList: N->getVTList(), Ops);
17359}
17360
17361static SDValue PerformVMOVNCombine(SDNode *N,
17362 TargetLowering::DAGCombinerInfo &DCI) {
17363 SDValue Op0 = N->getOperand(Num: 0);
17364 SDValue Op1 = N->getOperand(Num: 1);
17365 unsigned IsTop = N->getConstantOperandVal(Num: 2);
17366
17367 // VMOVNT a undef -> a
17368 // VMOVNB a undef -> a
17369 // VMOVNB undef a -> a
17370 if (Op1->isUndef())
17371 return Op0;
17372 if (Op0->isUndef() && !IsTop)
17373 return Op1;
17374
17375 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17376 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17377 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17378 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17379 Op1->getConstantOperandVal(Num: 2) == 0)
17380 return DCI.DAG.getNode(Opcode: Op1->getOpcode(), DL: SDLoc(Op1), VT: N->getValueType(ResNo: 0),
17381 N1: Op0, N2: Op1->getOperand(Num: 1), N3: N->getOperand(Num: 2));
17382
17383 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17384 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17385 // into the top or bottom lanes.
17386 unsigned NumElts = N->getValueType(ResNo: 0).getVectorNumElements();
17387 APInt Op1DemandedElts = APInt::getSplat(NewLen: NumElts, V: APInt::getLowBitsSet(numBits: 2, loBitsSet: 1));
17388 APInt Op0DemandedElts =
17389 IsTop ? Op1DemandedElts
17390 : APInt::getSplat(NewLen: NumElts, V: APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
17391
17392 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17393 if (TLI.SimplifyDemandedVectorElts(Op: Op0, DemandedElts: Op0DemandedElts, DCI))
17394 return SDValue(N, 0);
17395 if (TLI.SimplifyDemandedVectorElts(Op: Op1, DemandedElts: Op1DemandedElts, DCI))
17396 return SDValue(N, 0);
17397
17398 return SDValue();
17399}
17400
17401static SDValue PerformVQMOVNCombine(SDNode *N,
17402 TargetLowering::DAGCombinerInfo &DCI) {
17403 SDValue Op0 = N->getOperand(Num: 0);
17404 unsigned IsTop = N->getConstantOperandVal(Num: 2);
17405
17406 unsigned NumElts = N->getValueType(ResNo: 0).getVectorNumElements();
17407 APInt Op0DemandedElts =
17408 APInt::getSplat(NewLen: NumElts, V: IsTop ? APInt::getLowBitsSet(numBits: 2, loBitsSet: 1)
17409 : APInt::getHighBitsSet(numBits: 2, hiBitsSet: 1));
17410
17411 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17412 if (TLI.SimplifyDemandedVectorElts(Op: Op0, DemandedElts: Op0DemandedElts, DCI))
17413 return SDValue(N, 0);
17414 return SDValue();
17415}
17416
17417static SDValue PerformVQDMULHCombine(SDNode *N,
17418 TargetLowering::DAGCombinerInfo &DCI) {
17419 EVT VT = N->getValueType(ResNo: 0);
17420 SDValue LHS = N->getOperand(Num: 0);
17421 SDValue RHS = N->getOperand(Num: 1);
17422
17423 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Val&: LHS);
17424 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(Val&: RHS);
17425 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17426 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(RHS: Shuf1->getMask()) &&
17427 LHS.getOperand(i: 1).isUndef() && RHS.getOperand(i: 1).isUndef() &&
17428 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17429 SDLoc DL(N);
17430 SDValue NewBinOp = DCI.DAG.getNode(Opcode: N->getOpcode(), DL, VT,
17431 N1: LHS.getOperand(i: 0), N2: RHS.getOperand(i: 0));
17432 SDValue UndefV = LHS.getOperand(i: 1);
17433 return DCI.DAG.getVectorShuffle(VT, dl: DL, N1: NewBinOp, N2: UndefV, Mask: Shuf0->getMask());
17434 }
17435 return SDValue();
17436}
17437
17438static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17439 SDLoc DL(N);
17440 SDValue Op0 = N->getOperand(Num: 0);
17441 SDValue Op1 = N->getOperand(Num: 1);
17442
17443  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17444 // uses of the intrinsics.
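  // For example, LSLL(lo, hi, -8) is rewritten as LSRL(lo, hi, 8), and a shift
  // by 0 is simply replaced by its inputs.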
17445 if (auto C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
17446 int ShiftAmt = C->getSExtValue();
17447 if (ShiftAmt == 0) {
17448 SDValue Merge = DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
17449 DAG.ReplaceAllUsesWith(From: N, To: Merge.getNode());
17450 return SDValue();
17451 }
17452
17453 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17454 unsigned NewOpcode =
17455 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17456 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17457 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17458 DAG.ReplaceAllUsesWith(From: N, To: NewShift.getNode());
17459 return NewShift;
17460 }
17461 }
17462
17463 return SDValue();
17464}
17465
17466/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17467SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17468 DAGCombinerInfo &DCI) const {
17469 SelectionDAG &DAG = DCI.DAG;
17470 unsigned IntNo = N->getConstantOperandVal(Num: 0);
17471 switch (IntNo) {
17472 default:
17473 // Don't do anything for most intrinsics.
17474 break;
17475
17476 // Vector shifts: check for immediate versions and lower them.
17477 // Note: This is done during DAG combining instead of DAG legalizing because
17478 // the build_vectors for 64-bit vector element shift counts are generally
17479 // not legal, and it is hard to see their values after they get legalized to
17480 // loads from a constant pool.
17481 case Intrinsic::arm_neon_vshifts:
17482 case Intrinsic::arm_neon_vshiftu:
17483 case Intrinsic::arm_neon_vrshifts:
17484 case Intrinsic::arm_neon_vrshiftu:
17485 case Intrinsic::arm_neon_vrshiftn:
17486 case Intrinsic::arm_neon_vqshifts:
17487 case Intrinsic::arm_neon_vqshiftu:
17488 case Intrinsic::arm_neon_vqshiftsu:
17489 case Intrinsic::arm_neon_vqshiftns:
17490 case Intrinsic::arm_neon_vqshiftnu:
17491 case Intrinsic::arm_neon_vqshiftnsu:
17492 case Intrinsic::arm_neon_vqrshiftns:
17493 case Intrinsic::arm_neon_vqrshiftnu:
17494 case Intrinsic::arm_neon_vqrshiftnsu: {
17495 EVT VT = N->getOperand(Num: 1).getValueType();
17496 int64_t Cnt;
17497 unsigned VShiftOpc = 0;
17498
17499 switch (IntNo) {
17500 case Intrinsic::arm_neon_vshifts:
17501 case Intrinsic::arm_neon_vshiftu:
17502 if (isVShiftLImm(Op: N->getOperand(Num: 2), VT, isLong: false, Cnt)) {
17503 VShiftOpc = ARMISD::VSHLIMM;
17504 break;
17505 }
17506 if (isVShiftRImm(Op: N->getOperand(Num: 2), VT, isNarrow: false, isIntrinsic: true, Cnt)) {
17507 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17508 : ARMISD::VSHRuIMM);
17509 break;
17510 }
17511 return SDValue();
17512
17513 case Intrinsic::arm_neon_vrshifts:
17514 case Intrinsic::arm_neon_vrshiftu:
17515 if (isVShiftRImm(Op: N->getOperand(Num: 2), VT, isNarrow: false, isIntrinsic: true, Cnt))
17516 break;
17517 return SDValue();
17518
17519 case Intrinsic::arm_neon_vqshifts:
17520 case Intrinsic::arm_neon_vqshiftu:
17521 if (isVShiftLImm(Op: N->getOperand(Num: 2), VT, isLong: false, Cnt))
17522 break;
17523 return SDValue();
17524
17525 case Intrinsic::arm_neon_vqshiftsu:
17526 if (isVShiftLImm(Op: N->getOperand(Num: 2), VT, isLong: false, Cnt))
17527 break;
17528 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17529
17530 case Intrinsic::arm_neon_vrshiftn:
17531 case Intrinsic::arm_neon_vqshiftns:
17532 case Intrinsic::arm_neon_vqshiftnu:
17533 case Intrinsic::arm_neon_vqshiftnsu:
17534 case Intrinsic::arm_neon_vqrshiftns:
17535 case Intrinsic::arm_neon_vqrshiftnu:
17536 case Intrinsic::arm_neon_vqrshiftnsu:
17537 // Narrowing shifts require an immediate right shift.
17538 if (isVShiftRImm(Op: N->getOperand(Num: 2), VT, isNarrow: true, isIntrinsic: true, Cnt))
17539 break;
17540 llvm_unreachable("invalid shift count for narrowing vector shift "
17541 "intrinsic");
17542
17543 default:
17544 llvm_unreachable("unhandled vector shift");
17545 }
17546
17547 switch (IntNo) {
17548 case Intrinsic::arm_neon_vshifts:
17549 case Intrinsic::arm_neon_vshiftu:
17550 // Opcode already set above.
17551 break;
17552 case Intrinsic::arm_neon_vrshifts:
17553 VShiftOpc = ARMISD::VRSHRsIMM;
17554 break;
17555 case Intrinsic::arm_neon_vrshiftu:
17556 VShiftOpc = ARMISD::VRSHRuIMM;
17557 break;
17558 case Intrinsic::arm_neon_vrshiftn:
17559 VShiftOpc = ARMISD::VRSHRNIMM;
17560 break;
17561 case Intrinsic::arm_neon_vqshifts:
17562 VShiftOpc = ARMISD::VQSHLsIMM;
17563 break;
17564 case Intrinsic::arm_neon_vqshiftu:
17565 VShiftOpc = ARMISD::VQSHLuIMM;
17566 break;
17567 case Intrinsic::arm_neon_vqshiftsu:
17568 VShiftOpc = ARMISD::VQSHLsuIMM;
17569 break;
17570 case Intrinsic::arm_neon_vqshiftns:
17571 VShiftOpc = ARMISD::VQSHRNsIMM;
17572 break;
17573 case Intrinsic::arm_neon_vqshiftnu:
17574 VShiftOpc = ARMISD::VQSHRNuIMM;
17575 break;
17576 case Intrinsic::arm_neon_vqshiftnsu:
17577 VShiftOpc = ARMISD::VQSHRNsuIMM;
17578 break;
17579 case Intrinsic::arm_neon_vqrshiftns:
17580 VShiftOpc = ARMISD::VQRSHRNsIMM;
17581 break;
17582 case Intrinsic::arm_neon_vqrshiftnu:
17583 VShiftOpc = ARMISD::VQRSHRNuIMM;
17584 break;
17585 case Intrinsic::arm_neon_vqrshiftnsu:
17586 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17587 break;
17588 }
17589
17590 SDLoc dl(N);
17591 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17592 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17593 }
17594
17595 case Intrinsic::arm_neon_vshiftins: {
17596 EVT VT = N->getOperand(Num: 1).getValueType();
17597 int64_t Cnt;
17598 unsigned VShiftOpc = 0;
17599
17600 if (isVShiftLImm(Op: N->getOperand(Num: 3), VT, isLong: false, Cnt))
17601 VShiftOpc = ARMISD::VSLIIMM;
17602 else if (isVShiftRImm(Op: N->getOperand(Num: 3), VT, isNarrow: false, isIntrinsic: true, Cnt))
17603 VShiftOpc = ARMISD::VSRIIMM;
17604 else {
17605 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17606 }
17607
17608 SDLoc dl(N);
17609 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17610 N->getOperand(1), N->getOperand(2),
17611 DAG.getConstant(Cnt, dl, MVT::i32));
17612 }
17613
17614 case Intrinsic::arm_neon_vqrshifts:
17615 case Intrinsic::arm_neon_vqrshiftu:
17616 // No immediate versions of these to check for.
17617 break;
17618
17619 case Intrinsic::arm_mve_vqdmlah:
17620 case Intrinsic::arm_mve_vqdmlash:
17621 case Intrinsic::arm_mve_vqrdmlah:
17622 case Intrinsic::arm_mve_vqrdmlash:
17623 case Intrinsic::arm_mve_vmla_n_predicated:
17624 case Intrinsic::arm_mve_vmlas_n_predicated:
17625 case Intrinsic::arm_mve_vqdmlah_predicated:
17626 case Intrinsic::arm_mve_vqdmlash_predicated:
17627 case Intrinsic::arm_mve_vqrdmlah_predicated:
17628 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17629 // These intrinsics all take an i32 scalar operand which is narrowed to the
17630 // size of a single lane of the vector type they return. So we don't need
17631 // any bits of that operand above that point, which allows us to eliminate
17632 // uxth/sxth.
17633 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
17634 APInt DemandedMask = APInt::getLowBitsSet(numBits: 32, loBitsSet: BitWidth);
17635 if (SimplifyDemandedBits(Op: N->getOperand(Num: 3), DemandedBits: DemandedMask, DCI))
17636 return SDValue();
17637 break;
17638 }
17639
17640 case Intrinsic::arm_mve_minv:
17641 case Intrinsic::arm_mve_maxv:
17642 case Intrinsic::arm_mve_minav:
17643 case Intrinsic::arm_mve_maxav:
17644 case Intrinsic::arm_mve_minv_predicated:
17645 case Intrinsic::arm_mve_maxv_predicated:
17646 case Intrinsic::arm_mve_minav_predicated:
17647 case Intrinsic::arm_mve_maxav_predicated: {
17648 // These intrinsics all take an i32 scalar operand which is narrowed to the
17649 // size of a single lane of the vector type they take as the other input.
17650 unsigned BitWidth = N->getOperand(Num: 2)->getValueType(ResNo: 0).getScalarSizeInBits();
17651 APInt DemandedMask = APInt::getLowBitsSet(numBits: 32, loBitsSet: BitWidth);
17652 if (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI))
17653 return SDValue();
17654 break;
17655 }
17656
17657 case Intrinsic::arm_mve_addv: {
17658 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17659    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17660 bool Unsigned = N->getConstantOperandVal(Num: 2);
17661 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17662 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), N: N->getOperand(Num: 1));
17663 }
17664
17665 case Intrinsic::arm_mve_addlv:
17666 case Intrinsic::arm_mve_addlv_predicated: {
17667 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17668 // which recombines the two outputs into an i64
17669 bool Unsigned = N->getConstantOperandVal(Num: 2);
17670 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17671 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17672 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17673
17674 SmallVector<SDValue, 4> Ops;
17675 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17676 if (i != 2) // skip the unsigned flag
17677 Ops.push_back(Elt: N->getOperand(Num: i));
17678
17679 SDLoc dl(N);
17680 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17681 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17682 val.getValue(1));
17683 }
17684 }
17685
17686 return SDValue();
17687}
17688
17689/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17690/// lowers them. As with the vector shift intrinsics, this is done during DAG
17691/// combining instead of DAG legalizing because the build_vectors for 64-bit
17692/// vector element shift counts are generally not legal, and it is hard to see
17693/// their values after they get legalized to loads from a constant pool.
17694static SDValue PerformShiftCombine(SDNode *N,
17695 TargetLowering::DAGCombinerInfo &DCI,
17696 const ARMSubtarget *ST) {
17697 SelectionDAG &DAG = DCI.DAG;
17698 EVT VT = N->getValueType(ResNo: 0);
17699
17700 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17701 N->getOperand(0)->getOpcode() == ISD::AND &&
17702 N->getOperand(0)->hasOneUse()) {
17703 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17704 return SDValue();
17705 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17706 // usually show up because instcombine prefers to canonicalize it to
17707 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17708 // out of GEP lowering in some cases.
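    // For example, (shl (and x, 0x3ffff), 2) is rewritten as
    // (srl (shl x, 14), 12), which avoids materializing the mask constant on
    // Thumb1.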
17709 SDValue N0 = N->getOperand(Num: 0);
17710 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17711 if (!ShiftAmtNode)
17712 return SDValue();
17713 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17714 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(Val: N0->getOperand(Num: 1));
17715 if (!AndMaskNode)
17716 return SDValue();
17717 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17718 // Don't transform uxtb/uxth.
17719 if (AndMask == 255 || AndMask == 65535)
17720 return SDValue();
17721 if (isMask_32(Value: AndMask)) {
17722 uint32_t MaskedBits = llvm::countl_zero(Val: AndMask);
17723 if (MaskedBits > ShiftAmt) {
17724 SDLoc DL(N);
17725 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17726 DAG.getConstant(MaskedBits, DL, MVT::i32));
17727 return DAG.getNode(
17728 ISD::SRL, DL, MVT::i32, SHL,
17729 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17730 }
17731 }
17732 }
17733
17734 // Nothing to be done for scalar shifts.
17735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17736 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17737 return SDValue();
17738 if (ST->hasMVEIntegerOps())
17739 return SDValue();
17740
17741 int64_t Cnt;
17742
17743 switch (N->getOpcode()) {
17744 default: llvm_unreachable("unexpected shift opcode");
17745
17746 case ISD::SHL:
17747 if (isVShiftLImm(Op: N->getOperand(Num: 1), VT, isLong: false, Cnt)) {
17748 SDLoc dl(N);
17749 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17750 DAG.getConstant(Cnt, dl, MVT::i32));
17751 }
17752 break;
17753
17754 case ISD::SRA:
17755 case ISD::SRL:
17756 if (isVShiftRImm(Op: N->getOperand(Num: 1), VT, isNarrow: false, isIntrinsic: false, Cnt)) {
17757 unsigned VShiftOpc =
17758 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17759 SDLoc dl(N);
17760 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17761 DAG.getConstant(Cnt, dl, MVT::i32));
17762 }
17763 }
17764 return SDValue();
17765}
17766
17767// Look for a sign, zero or fp extend of a larger than legal load. This can be
17768// split into multiple extending loads, which are simpler to deal with than an
17769// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17770// to convert the type to an f32.
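// For example, (v8i32 sext (v8i8 load)) becomes two v4i8->v4i32 extending
// loads, and (v8f32 fpext (v8f16 load)) becomes two v4i16->v4i32 extending
// loads whose results are converted up to f32 via ARMISD::VCVTL.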
17771static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17772 SDValue N0 = N->getOperand(Num: 0);
17773 if (N0.getOpcode() != ISD::LOAD)
17774 return SDValue();
17775 LoadSDNode *LD = cast<LoadSDNode>(Val: N0.getNode());
17776 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17777 LD->getExtensionType() != ISD::NON_EXTLOAD)
17778 return SDValue();
17779 EVT FromVT = LD->getValueType(ResNo: 0);
17780 EVT ToVT = N->getValueType(ResNo: 0);
17781 if (!ToVT.isVector())
17782 return SDValue();
17783 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
17784 EVT ToEltVT = ToVT.getVectorElementType();
17785 EVT FromEltVT = FromVT.getVectorElementType();
17786
17787 unsigned NumElements = 0;
17788 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17789 NumElements = 4;
17790 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17791 NumElements = 4;
17792 if (NumElements == 0 ||
17793 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17794 FromVT.getVectorNumElements() % NumElements != 0 ||
17795 !isPowerOf2_32(NumElements))
17796 return SDValue();
17797
17798 LLVMContext &C = *DAG.getContext();
17799 SDLoc DL(LD);
17800 // Details about the old load
17801 SDValue Ch = LD->getChain();
17802 SDValue BasePtr = LD->getBasePtr();
17803 Align Alignment = LD->getOriginalAlign();
17804 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17805 AAMDNodes AAInfo = LD->getAAInfo();
17806
17807 ISD::LoadExtType NewExtType =
17808 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17809 SDValue Offset = DAG.getUNDEF(VT: BasePtr.getValueType());
17810 EVT NewFromVT = EVT::getVectorVT(
17811 Context&: C, VT: EVT::getIntegerVT(Context&: C, BitWidth: FromEltVT.getScalarSizeInBits()), NumElements);
17812 EVT NewToVT = EVT::getVectorVT(
17813 Context&: C, VT: EVT::getIntegerVT(Context&: C, BitWidth: ToEltVT.getScalarSizeInBits()), NumElements);
17814
17815 SmallVector<SDValue, 4> Loads;
17816 SmallVector<SDValue, 4> Chains;
17817 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17818 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17819 SDValue NewPtr =
17820 DAG.getObjectPtrOffset(SL: DL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: NewOffset));
17821
17822 SDValue NewLoad =
17823 DAG.getLoad(AM: ISD::UNINDEXED, ExtType: NewExtType, VT: NewToVT, dl: DL, Chain: Ch, Ptr: NewPtr, Offset,
17824 PtrInfo: LD->getPointerInfo().getWithOffset(O: NewOffset), MemVT: NewFromVT,
17825 Alignment, MMOFlags, AAInfo);
17826 Loads.push_back(Elt: NewLoad);
17827 Chains.push_back(Elt: SDValue(NewLoad.getNode(), 1));
17828 }
17829
  // Float truncs need to be extended with VCVTB's into their floating point
  // types.
17831 if (FromEltVT == MVT::f16) {
17832 SmallVector<SDValue, 4> Extends;
17833
17834 for (unsigned i = 0; i < Loads.size(); i++) {
17835 SDValue LoadBC =
17836 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17837 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17838 DAG.getConstant(0, DL, MVT::i32));
17839 Extends.push_back(Elt: FPExt);
17840 }
17841
17842 Loads = Extends;
17843 }
17844
17845 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17846 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewChain);
17847 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ToVT, Ops: Loads);
17848}
17849
17850/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17851/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17852static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17853 const ARMSubtarget *ST) {
17854 SDValue N0 = N->getOperand(Num: 0);
17855
17856 // Check for sign- and zero-extensions of vector extract operations of 8- and
17857 // 16-bit vector elements. NEON and MVE support these directly. They are
17858 // handled during DAG combining because type legalization will promote them
17859 // to 32-bit types and it is messy to recognize the operations after that.
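  //
  // For example, roughly:
  //   (i32 sign_extend (i16 extract_vector_elt v8i16:V, imm:Lane))
  // becomes (ARMISD::VGETLANEs V, Lane); the zero/any-extend forms use
  // ARMISD::VGETLANEu instead.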
17860 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17861 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17862 SDValue Vec = N0.getOperand(i: 0);
17863 SDValue Lane = N0.getOperand(i: 1);
17864 EVT VT = N->getValueType(ResNo: 0);
17865 EVT EltVT = N0.getValueType();
17866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17867
17868 if (VT == MVT::i32 &&
17869 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17870 TLI.isTypeLegal(Vec.getValueType()) &&
17871 isa<ConstantSDNode>(Lane)) {
17872
17873 unsigned Opc = 0;
17874 switch (N->getOpcode()) {
17875 default: llvm_unreachable("unexpected opcode");
17876 case ISD::SIGN_EXTEND:
17877 Opc = ARMISD::VGETLANEs;
17878 break;
17879 case ISD::ZERO_EXTEND:
17880 case ISD::ANY_EXTEND:
17881 Opc = ARMISD::VGETLANEu;
17882 break;
17883 }
17884 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT, N1: Vec, N2: Lane);
17885 }
17886 }
17887
17888 if (ST->hasMVEIntegerOps())
17889 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17890 return NewLoad;
17891
17892 return SDValue();
17893}
17894
17895static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17896 const ARMSubtarget *ST) {
17897 if (ST->hasMVEFloatOps())
17898 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17899 return NewLoad;
17900
17901 return SDValue();
17902}
17903
17904// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17905// constant bounds.
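//
// For example, roughly:
//   smin(smax(x, -128), 127) -> (ARMISD::SSAT x, 7)
//   smin(smax(x, 0), 255)    -> (ARMISD::USAT x, 8)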
17906static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17907 const ARMSubtarget *Subtarget) {
17908 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17909 !Subtarget->isThumb2())
17910 return SDValue();
17911
17912 EVT VT = Op.getValueType();
17913 SDValue Op0 = Op.getOperand(i: 0);
17914
17915 if (VT != MVT::i32 ||
17916 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17917 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17918 !isa<ConstantSDNode>(Op0.getOperand(1)))
17919 return SDValue();
17920
17921 SDValue Min = Op;
17922 SDValue Max = Op0;
17923 SDValue Input = Op0.getOperand(i: 0);
17924 if (Min.getOpcode() == ISD::SMAX)
17925 std::swap(a&: Min, b&: Max);
17926
17927 APInt MinC = Min.getConstantOperandAPInt(i: 1);
17928 APInt MaxC = Max.getConstantOperandAPInt(i: 1);
17929
17930 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17931 !(MinC + 1).isPowerOf2())
17932 return SDValue();
17933
17934 SDLoc DL(Op);
17935 if (MinC == ~MaxC)
17936 return DAG.getNode(Opcode: ARMISD::SSAT, DL, VT, N1: Input,
17937 N2: DAG.getConstant(Val: MinC.countr_one(), DL, VT));
17938 if (MaxC == 0)
17939 return DAG.getNode(Opcode: ARMISD::USAT, DL, VT, N1: Input,
17940 N2: DAG.getConstant(Val: MinC.countr_one(), DL, VT));
17941
17942 return SDValue();
17943}
17944
17945/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17946/// saturates.
17947static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
17948 const ARMSubtarget *ST) {
17949 EVT VT = N->getValueType(ResNo: 0);
17950 SDValue N0 = N->getOperand(Num: 0);
17951
17952 if (VT == MVT::i32)
17953 return PerformMinMaxToSatCombine(Op: SDValue(N, 0), DAG, Subtarget: ST);
17954
17955 if (!ST->hasMVEIntegerOps())
17956 return SDValue();
17957
17958 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17959 return V;
17960
17961 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17962 return SDValue();
17963
17964 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17965 // Check one is a smin and the other is a smax
17966 if (Min->getOpcode() != ISD::SMIN)
17967 std::swap(a&: Min, b&: Max);
17968 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17969 return false;
17970
17971 APInt SaturateC;
17972 if (VT == MVT::v4i32)
17973 SaturateC = APInt(32, (1 << 15) - 1, true);
17974 else //if (VT == MVT::v8i16)
17975 SaturateC = APInt(16, (1 << 7) - 1, true);
17976
17977 APInt MinC, MaxC;
17978 if (!ISD::isConstantSplatVector(N: Min->getOperand(Num: 1).getNode(), SplatValue&: MinC) ||
17979 MinC != SaturateC)
17980 return false;
17981 if (!ISD::isConstantSplatVector(N: Max->getOperand(Num: 1).getNode(), SplatValue&: MaxC) ||
17982 MaxC != ~SaturateC)
17983 return false;
17984 return true;
17985 };
17986
17987 if (IsSignedSaturate(N, N0.getNode())) {
17988 SDLoc DL(N);
17989 MVT ExtVT, HalfVT;
17990 if (VT == MVT::v4i32) {
17991 HalfVT = MVT::v8i16;
17992 ExtVT = MVT::v4i16;
17993 } else { // if (VT == MVT::v8i16)
17994 HalfVT = MVT::v16i8;
17995 ExtVT = MVT::v8i8;
17996 }
17997
    // Create a VQMOVNB with undef top lanes, then sign extend into the top
    // half. That extend will hopefully be removed if only the bottom bits are
    // demanded (through a truncating store, for example).
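    //
    // For example for VT == v4i32, roughly:
    //   smin(smax(x, splat(-32768)), splat(32767))
    // -> (sign_extend_inreg (vector_reg_cast (VQMOVNs undef, x)), v4i16)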
18001 SDValue VQMOVN =
18002 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18003 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18004 SDValue Bitcast = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT, Operand: VQMOVN);
18005 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Bitcast,
18006 N2: DAG.getValueType(ExtVT));
18007 }
18008
18009 auto IsUnsignedSaturate = [&](SDNode *Min) {
    // For unsigned, we just need to check for <= 0xffff (or 0xff for v8i16)
18011 if (Min->getOpcode() != ISD::UMIN)
18012 return false;
18013
18014 APInt SaturateC;
18015 if (VT == MVT::v4i32)
18016 SaturateC = APInt(32, (1 << 16) - 1, true);
18017 else //if (VT == MVT::v8i16)
18018 SaturateC = APInt(16, (1 << 8) - 1, true);
18019
18020 APInt MinC;
18021 if (!ISD::isConstantSplatVector(N: Min->getOperand(Num: 1).getNode(), SplatValue&: MinC) ||
18022 MinC != SaturateC)
18023 return false;
18024 return true;
18025 };
18026
18027 if (IsUnsignedSaturate(N)) {
18028 SDLoc DL(N);
18029 MVT HalfVT;
18030 unsigned ExtConst;
18031 if (VT == MVT::v4i32) {
18032 HalfVT = MVT::v8i16;
18033 ExtConst = 0x0000FFFF;
18034 } else { //if (VT == MVT::v8i16)
18035 HalfVT = MVT::v16i8;
18036 ExtConst = 0x00FF;
18037 }
18038
    // Create a VQMOVNB with undef top lanes, then zero extend into the top
    // half with an AND. That extend will hopefully be removed if only the
    // bottom bits are demanded (through a truncating store, for example).
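    //
    // For example for VT == v4i32, roughly:
    //   umin(x, splat(0xffff))
    // -> (and (vector_reg_cast (VQMOVNu undef, x)), splat(0xffff))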
18042 SDValue VQMOVN =
18043 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18044 DAG.getConstant(0, DL, MVT::i32));
18045 SDValue Bitcast = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT, Operand: VQMOVN);
18046 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Bitcast,
18047 N2: DAG.getConstant(Val: ExtConst, DL, VT));
18048 }
18049
18050 return SDValue();
18051}
18052
18053static const APInt *isPowerOf2Constant(SDValue V) {
18054 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: V);
18055 if (!C)
18056 return nullptr;
18057 const APInt *CV = &C->getAPIntValue();
18058 return CV->isPowerOf2() ? CV : nullptr;
18059}
18060
18061SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
18062 // If we have a CMOV, OR and AND combination such as:
18063 // if (x & CN)
18064 // y |= CM;
18065 //
18066 // And:
18067 // * CN is a single bit;
18068 // * All bits covered by CM are known zero in y
18069 //
18070 // Then we can convert this into a sequence of BFI instructions. This will
18071 // always be a win if CM is a single bit, will always be no worse than the
18072 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18073 // three bits (due to the extra IT instruction).
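  //
  // For example, with CN == 0x4 and CM == 0x300, and assuming x is in r0 and
  // y is in r1, the result is roughly:
  //   lsr r2, r0, #2       ; move the tested bit of x down to bit 0
  //   bfi r1, r2, #8, #1   ; insert it at bit 8 of y
  //   bfi r1, r2, #9, #1   ; and at bit 9 of y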
18074
18075 SDValue Op0 = CMOV->getOperand(Num: 0);
18076 SDValue Op1 = CMOV->getOperand(Num: 1);
18077 auto CC = CMOV->getConstantOperandAPInt(Num: 2).getLimitedValue();
18078 SDValue CmpZ = CMOV->getOperand(Num: 4);
18079
18080 // The compare must be against zero.
18081 if (!isNullConstant(V: CmpZ->getOperand(Num: 1)))
18082 return SDValue();
18083
18084 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
18085 SDValue And = CmpZ->getOperand(Num: 0);
18086 if (And->getOpcode() != ISD::AND)
18087 return SDValue();
18088 const APInt *AndC = isPowerOf2Constant(V: And->getOperand(Num: 1));
18089 if (!AndC)
18090 return SDValue();
18091 SDValue X = And->getOperand(Num: 0);
18092
18093 if (CC == ARMCC::EQ) {
18094 // We're performing an "equal to zero" compare. Swap the operands so we
18095 // canonicalize on a "not equal to zero" compare.
18096 std::swap(a&: Op0, b&: Op1);
18097 } else {
18098 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18099 }
18100
18101 if (Op1->getOpcode() != ISD::OR)
18102 return SDValue();
18103
18104 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1));
18105 if (!OrC)
18106 return SDValue();
18107 SDValue Y = Op1->getOperand(Num: 0);
18108
18109 if (Op0 != Y)
18110 return SDValue();
18111
18112 // Now, is it profitable to continue?
18113 APInt OrCI = OrC->getAPIntValue();
18114 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18115 if (OrCI.popcount() > Heuristic)
18116 return SDValue();
18117
18118 // Lastly, can we determine that the bits defined by OrCI
18119 // are zero in Y?
18120 KnownBits Known = DAG.computeKnownBits(Op: Y);
18121 if ((OrCI & Known.Zero) != OrCI)
18122 return SDValue();
18123
18124 // OK, we can do the combine.
18125 SDValue V = Y;
18126 SDLoc dl(X);
18127 EVT VT = X.getValueType();
18128 unsigned BitInX = AndC->logBase2();
18129
18130 if (BitInX != 0) {
18131 // We must shift X first.
18132 X = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: X,
18133 N2: DAG.getConstant(Val: BitInX, DL: dl, VT));
18134 }
18135
18136 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18137 BitInY < NumActiveBits; ++BitInY) {
18138 if (OrCI[BitInY] == 0)
18139 continue;
18140 APInt Mask(VT.getSizeInBits(), 0);
18141 Mask.setBit(BitInY);
18142 V = DAG.getNode(Opcode: ARMISD::BFI, DL: dl, VT, N1: V, N2: X,
18143 // Confusingly, the operand is an *inverted* mask.
18144 N3: DAG.getConstant(Val: ~Mask, DL: dl, VT));
18145 }
18146
18147 return V;
18148}
18149
18150// Given N, the value controlling the conditional branch, search for the loop
18151// intrinsic, returning it, along with how the value is used. We need to handle
18152// patterns such as the following:
18153// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18154// (brcond (setcc (loop.decrement), 0, eq), exit)
18155// (brcond (setcc (loop.decrement), 0, ne), header)
18156static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
18157 bool &Negate) {
18158 switch (N->getOpcode()) {
18159 default:
18160 break;
18161 case ISD::XOR: {
18162 if (!isa<ConstantSDNode>(Val: N.getOperand(i: 1)))
18163 return SDValue();
18164 if (!cast<ConstantSDNode>(Val: N.getOperand(i: 1))->isOne())
18165 return SDValue();
18166 Negate = !Negate;
18167 return SearchLoopIntrinsic(N: N.getOperand(i: 0), CC, Imm, Negate);
18168 }
18169 case ISD::SETCC: {
18170 auto *Const = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
18171 if (!Const)
18172 return SDValue();
18173 if (Const->isZero())
18174 Imm = 0;
18175 else if (Const->isOne())
18176 Imm = 1;
18177 else
18178 return SDValue();
18179 CC = cast<CondCodeSDNode>(Val: N.getOperand(i: 2))->get();
18180 return SearchLoopIntrinsic(N: N->getOperand(Num: 0), CC, Imm, Negate);
18181 }
18182 case ISD::INTRINSIC_W_CHAIN: {
18183 unsigned IntOp = N.getConstantOperandVal(i: 1);
18184 if (IntOp != Intrinsic::test_start_loop_iterations &&
18185 IntOp != Intrinsic::loop_decrement_reg)
18186 return SDValue();
18187 return N;
18188 }
18189 }
18190 return SDValue();
18191}
18192
18193static SDValue PerformHWLoopCombine(SDNode *N,
18194 TargetLowering::DAGCombinerInfo &DCI,
18195 const ARMSubtarget *ST) {
18196
  // The hwloop intrinsics that we're interested in are used for control flow,
  // either for entering or exiting the loop:
  // - test.start.loop.iterations will test whether its operand is zero. If it
  //   is zero, the following branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the following branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each of
  // the intrinsics to ensure that we're branching to the right place at the
  // right time.
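  //
  // For example, roughly:
  //   (brcond (setcc (int_test_start_loop_iterations n), 0, eq), %exit)
  // becomes
  //   t = (ARMISD::WLSSETUP n)
  //   (ARMISD::WLS chain, t, %exit)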
18207
18208 ISD::CondCode CC;
18209 SDValue Cond;
18210 int Imm = 1;
18211 bool Negate = false;
18212 SDValue Chain = N->getOperand(Num: 0);
18213 SDValue Dest;
18214
18215 if (N->getOpcode() == ISD::BRCOND) {
18216 CC = ISD::SETEQ;
18217 Cond = N->getOperand(Num: 1);
18218 Dest = N->getOperand(Num: 2);
18219 } else {
18220 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18221 CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 1))->get();
18222 Cond = N->getOperand(Num: 2);
18223 Dest = N->getOperand(Num: 4);
18224 if (auto *Const = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 3))) {
18225 if (!Const->isOne() && !Const->isZero())
18226 return SDValue();
18227 Imm = Const->getZExtValue();
18228 } else
18229 return SDValue();
18230 }
18231
18232 SDValue Int = SearchLoopIntrinsic(N: Cond, CC, Imm, Negate);
18233 if (!Int)
18234 return SDValue();
18235
18236 if (Negate)
18237 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18238
18239 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18240 return (CC == ISD::SETEQ && Imm == 0) ||
18241 (CC == ISD::SETNE && Imm == 1) ||
18242 (CC == ISD::SETLT && Imm == 1) ||
18243 (CC == ISD::SETULT && Imm == 1);
18244 };
18245
18246 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18247 return (CC == ISD::SETEQ && Imm == 1) ||
18248 (CC == ISD::SETNE && Imm == 0) ||
18249 (CC == ISD::SETGT && Imm == 0) ||
18250 (CC == ISD::SETUGT && Imm == 0) ||
18251 (CC == ISD::SETGE && Imm == 1) ||
18252 (CC == ISD::SETUGE && Imm == 1);
18253 };
18254
18255 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18256 "unsupported condition");
18257
18258 SDLoc dl(Int);
18259 SelectionDAG &DAG = DCI.DAG;
18260 SDValue Elements = Int.getOperand(i: 2);
18261 unsigned IntOp = Int->getConstantOperandVal(Num: 1);
18262 assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
18263 && "expected single br user");
18264 SDNode *Br = *N->use_begin();
18265 SDValue OtherTarget = Br->getOperand(Num: 1);
18266
18267 // Update the unconditional branch to branch to the given Dest.
18268 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18269 SDValue NewBrOps[] = { Br->getOperand(Num: 0), Dest };
18270 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18271 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Br, 0), To: NewBr);
18272 };
18273
18274 if (IntOp == Intrinsic::test_start_loop_iterations) {
18275 SDValue Res;
18276 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18277 // We expect this 'instruction' to branch when the counter is zero.
18278 if (IsTrueIfZero(CC, Imm)) {
18279 SDValue Ops[] = {Chain, Setup, Dest};
18280 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18281 } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the following br.
18284 UpdateUncondBr(Br, Dest, DAG);
18285
18286 SDValue Ops[] = {Chain, Setup, OtherTarget};
18287 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18288 }
18289 // Update LR count to the new value
18290 DAG.ReplaceAllUsesOfValueWith(From: Int.getValue(R: 0), To: Setup);
18291 // Update chain
18292 DAG.ReplaceAllUsesOfValueWith(From: Int.getValue(R: 2), To: Int.getOperand(i: 0));
18293 return Res;
18294 } else {
18295 SDValue Size =
18296 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18297 SDValue Args[] = { Int.getOperand(i: 0), Elements, Size, };
18298 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18299 DAG.getVTList(MVT::i32, MVT::Other), Args);
18300 DAG.ReplaceAllUsesWith(From: Int.getNode(), To: LoopDec.getNode());
18301
18302 // We expect this instruction to branch when the count is not zero.
18303 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18304
18305 // Update the unconditional branch to target the loop preheader if we've
18306 // found the condition has been reversed.
18307 if (Target == OtherTarget)
18308 UpdateUncondBr(Br, Dest, DAG);
18309
18310 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18311 SDValue(LoopDec.getNode(), 1), Chain);
18312
18313 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18314 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18315 }
18316 return SDValue();
18317}
18318
18319/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18320SDValue
18321ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18322 SDValue Cmp = N->getOperand(Num: 4);
18323 if (Cmp.getOpcode() != ARMISD::CMPZ)
18324 // Only looking at NE cases.
18325 return SDValue();
18326
18327 EVT VT = N->getValueType(ResNo: 0);
18328 SDLoc dl(N);
18329 SDValue LHS = Cmp.getOperand(i: 0);
18330 SDValue RHS = Cmp.getOperand(i: 1);
18331 SDValue Chain = N->getOperand(Num: 0);
18332 SDValue BB = N->getOperand(Num: 1);
18333 SDValue ARMcc = N->getOperand(Num: 2);
18334 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18335
18336 // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
18337 // -> (brcond Chain BB CC CPSR Cmp)
18338 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18339 LHS->getOperand(Num: 0)->getOpcode() == ARMISD::CMOV &&
18340 LHS->getOperand(Num: 0)->hasOneUse() &&
18341 isNullConstant(V: LHS->getOperand(Num: 0)->getOperand(Num: 0)) &&
18342 isOneConstant(V: LHS->getOperand(Num: 0)->getOperand(Num: 1)) &&
18343 isOneConstant(V: LHS->getOperand(Num: 1)) && isNullConstant(V: RHS)) {
18344 return DAG.getNode(
18345 Opcode: ARMISD::BRCOND, DL: dl, VT, N1: Chain, N2: BB, N3: LHS->getOperand(Num: 0)->getOperand(Num: 2),
18346 N4: LHS->getOperand(Num: 0)->getOperand(Num: 3), N5: LHS->getOperand(Num: 0)->getOperand(Num: 4));
18347 }
18348
18349 return SDValue();
18350}
18351
18352/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18353SDValue
18354ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18355 SDValue Cmp = N->getOperand(Num: 4);
18356 if (Cmp.getOpcode() != ARMISD::CMPZ)
18357 // Only looking at EQ and NE cases.
18358 return SDValue();
18359
18360 EVT VT = N->getValueType(ResNo: 0);
18361 SDLoc dl(N);
18362 SDValue LHS = Cmp.getOperand(i: 0);
18363 SDValue RHS = Cmp.getOperand(i: 1);
18364 SDValue FalseVal = N->getOperand(Num: 0);
18365 SDValue TrueVal = N->getOperand(Num: 1);
18366 SDValue ARMcc = N->getOperand(Num: 2);
18367 ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18368
18369 // BFI is only available on V6T2+.
18370 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18371 SDValue R = PerformCMOVToBFICombine(CMOV: N, DAG);
18372 if (R)
18373 return R;
18374 }
18375
18376 // Simplify
18377 // mov r1, r0
18378 // cmp r1, x
18379 // mov r0, y
18380 // moveq r0, x
18381 // to
18382 // cmp r0, x
18383 // movne r0, y
18384 //
18385 // mov r1, r0
18386 // cmp r1, x
18387 // mov r0, x
18388 // movne r0, y
18389 // to
18390 // cmp r0, x
18391 // movne r0, y
18392 /// FIXME: Turn this into a target neutral optimization?
18393 SDValue Res;
18394 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18395 Res = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: LHS, N2: TrueVal, N3: ARMcc,
18396 N4: N->getOperand(Num: 3), N5: Cmp);
18397 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18398 SDValue ARMcc;
18399 SDValue NewCmp = getARMCmp(LHS, RHS, CC: ISD::SETNE, ARMcc, DAG, dl);
18400 Res = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: LHS, N2: FalseVal, N3: ARMcc,
18401 N4: N->getOperand(Num: 3), N5: NewCmp);
18402 }
18403
18404 // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
18405 // -> (cmov F T CC CPSR Cmp)
18406 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18407 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
18408 isNullConstant(V: RHS)) {
18409 return DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: FalseVal, N2: TrueVal,
18410 N3: LHS->getOperand(Num: 2), N4: LHS->getOperand(Num: 3),
18411 N5: LHS->getOperand(Num: 4));
18412 }
18413
18414 if (!VT.isInteger())
18415 return SDValue();
18416
  // Fold away an unnecessary CMPZ/CMOV
  // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) ->
  //    if C1==EQ -> CMOV A, B, C2, $cpsr, D
  //    if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D
18421 if (N->getConstantOperandVal(Num: 2) == ARMCC::EQ ||
18422 N->getConstantOperandVal(Num: 2) == ARMCC::NE) {
18423 ARMCC::CondCodes Cond;
18424 if (SDValue C = IsCMPZCSINC(Cmp: N->getOperand(Num: 4).getNode(), CC&: Cond)) {
18425 if (N->getConstantOperandVal(Num: 2) == ARMCC::NE)
18426 Cond = ARMCC::getOppositeCondition(CC: Cond);
18427 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18428 N->getOperand(1),
18429 DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32),
18430 N->getOperand(3), C);
18431 }
18432 }
18433
18434 // Materialize a boolean comparison for integers so we can avoid branching.
18435 if (isNullConstant(V: FalseVal)) {
18436 if (CC == ARMCC::EQ && isOneConstant(V: TrueVal)) {
18437 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting
        // that right by 5 bits turns the 32 into 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18441 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LHS, N2: RHS);
18442 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18443 DAG.getConstant(5, dl, MVT::i32));
18444 } else {
18445 // CMOV 0, 1, ==, (CMPZ x, y) ->
18446 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18447 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18448 //
18449 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18450 // x != y. In other words, a carry C == 1 when x == y, C == 0
18451 // otherwise.
18452 // The final UADDO_CARRY computes
18453 // x - y + (0 - (x - y)) + C == C
18454 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: LHS, N2: RHS);
18455 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18456 SDValue Neg = DAG.getNode(Opcode: ISD::USUBO, DL: dl, VTList: VTs, N1: FalseVal, N2: Sub);
        // ISD::USUBO returns a borrow, but we want the carry here, so
        // compute 1 - borrow.
18459 SDValue Carry =
18460 DAG.getNode(ISD::SUB, dl, MVT::i32,
18461 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18462 Res = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: dl, VTList: VTs, N1: Sub, N2: Neg, N3: Carry);
18463 }
18464 } else if (CC == ARMCC::NE && !isNullConstant(V: RHS) &&
18465 (!Subtarget->isThumb1Only() || isPowerOf2Constant(V: TrueVal))) {
18466 // This seems pointless but will allow us to combine it further below.
18467 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
18468 SDValue Sub =
18469 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18470 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18471 Sub.getValue(1), SDValue());
18472 Res = DAG.getNode(Opcode: ARMISD::CMOV, DL: dl, VT, N1: Sub, N2: TrueVal, N3: ARMcc,
18473 N4: N->getOperand(Num: 3), N5: CPSRGlue.getValue(R: 1));
18474 FalseVal = Sub;
18475 }
18476 } else if (isNullConstant(V: TrueVal)) {
18477 if (CC == ARMCC::EQ && !isNullConstant(V: RHS) &&
18478 (!Subtarget->isThumb1Only() || isPowerOf2Constant(V: FalseVal))) {
18479 // This seems pointless but will allow us to combine it further below
18480 // Note that we change == for != as this is the dual for the case above.
18481 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
18482 SDValue Sub =
18483 DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18484 SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
18485 Sub.getValue(1), SDValue());
18486 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18487 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18488 N->getOperand(3), CPSRGlue.getValue(1));
18489 FalseVal = Sub;
18490 }
18491 }
18492
18493 // On Thumb1, the DAG above may be further combined if z is a power of 2
18494 // (z == 2 ^ K).
18495 // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
18496 // t1 = (USUBO (SUB x, y), 1)
18497 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18498 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18499 //
18500 // This also handles the special case of comparing against zero; it's
18501 // essentially, the same pattern, except there's no SUBS:
18502 // CMOV x, z, !=, (CMPZ x, 0) ->
18503 // t1 = (USUBO x, 1)
18504 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18505 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18506 const APInt *TrueConst;
18507 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18508 ((FalseVal.getOpcode() == ARMISD::SUBS &&
18509 FalseVal.getOperand(i: 0) == LHS && FalseVal.getOperand(i: 1) == RHS) ||
18510 (FalseVal == LHS && isNullConstant(V: RHS))) &&
18511 (TrueConst = isPowerOf2Constant(V: TrueVal))) {
18512 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18513 unsigned ShiftAmount = TrueConst->logBase2();
18514 if (ShiftAmount)
18515 TrueVal = DAG.getConstant(Val: 1, DL: dl, VT);
18516 SDValue Subc = DAG.getNode(Opcode: ISD::USUBO, DL: dl, VTList: VTs, N1: FalseVal, N2: TrueVal);
18517 Res = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: dl, VTList: VTs, N1: FalseVal, N2: Subc,
18518 N3: Subc.getValue(R: 1));
18519
18520 if (ShiftAmount)
18521 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18522 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18523 }
18524
18525 if (Res.getNode()) {
18526 KnownBits Known = DAG.computeKnownBits(Op: SDValue(N,0));
18527 // Capture demanded bits information that would be otherwise lost.
18528 if (Known.Zero == 0xfffffffe)
18529 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18530 DAG.getValueType(MVT::i1));
18531 else if (Known.Zero == 0xffffff00)
18532 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18533 DAG.getValueType(MVT::i8));
18534 else if (Known.Zero == 0xffff0000)
18535 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18536 DAG.getValueType(MVT::i16));
18537 }
18538
18539 return Res;
18540}
18541
18542static SDValue PerformBITCASTCombine(SDNode *N,
18543 TargetLowering::DAGCombinerInfo &DCI,
18544 const ARMSubtarget *ST) {
18545 SelectionDAG &DAG = DCI.DAG;
18546 SDValue Src = N->getOperand(Num: 0);
18547 EVT DstVT = N->getValueType(ResNo: 0);
18548
18549 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18550 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18551 EVT SrcVT = Src.getValueType();
18552 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18553 return DAG.getNode(Opcode: ARMISD::VDUP, DL: SDLoc(N), VT: DstVT, Operand: Src.getOperand(i: 0));
18554 }
18555
18556 // We may have a bitcast of something that has already had this bitcast
18557 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18558 while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
18559 Src = Src.getOperand(i: 0);
18560
18561 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18562 // would be generated is at least the width of the element type.
18563 EVT SrcVT = Src.getValueType();
18564 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18565 Src.getOpcode() == ARMISD::VMVNIMM ||
18566 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18567 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18568 DAG.getDataLayout().isBigEndian())
18569 return DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL: SDLoc(N), VT: DstVT, Operand: Src);
18570
18571 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18572 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18573 return R;
18574
18575 return SDValue();
18576}
18577
18578// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18579// node into stack operations after legalizeOps.
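//
// An MVETRUNC node takes two (or four) legal vector operands and represents
// their truncation and concatenation, so (v8i16 MVETRUNC a:v4i32, b:v4i32) is
// roughly trunc(concat(a, b)) with the lanes kept in order.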
18580SDValue ARMTargetLowering::PerformMVETruncCombine(
18581 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
18582 SelectionDAG &DAG = DCI.DAG;
18583 EVT VT = N->getValueType(ResNo: 0);
18584 SDLoc DL(N);
18585
18586 // MVETrunc(Undef, Undef) -> Undef
18587 if (all_of(Range: N->ops(), P: [](SDValue Op) { return Op.isUndef(); }))
18588 return DAG.getUNDEF(VT);
18589
18590 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18591 if (N->getNumOperands() == 2 &&
18592 N->getOperand(Num: 0).getOpcode() == ARMISD::MVETRUNC &&
18593 N->getOperand(Num: 1).getOpcode() == ARMISD::MVETRUNC)
18594 return DAG.getNode(Opcode: ARMISD::MVETRUNC, DL, VT, N1: N->getOperand(Num: 0).getOperand(i: 0),
18595 N2: N->getOperand(Num: 0).getOperand(i: 1),
18596 N3: N->getOperand(Num: 1).getOperand(i: 0),
18597 N4: N->getOperand(Num: 1).getOperand(i: 1));
18598
18599 // MVETrunc(shuffle, shuffle) -> VMOVN
18600 if (N->getNumOperands() == 2 &&
18601 N->getOperand(Num: 0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18602 N->getOperand(Num: 1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18603 auto *S0 = cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0).getNode());
18604 auto *S1 = cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1).getNode());
18605
18606 if (S0->getOperand(Num: 0) == S1->getOperand(Num: 0) &&
18607 S0->getOperand(Num: 1) == S1->getOperand(Num: 1)) {
18608 // Construct complete shuffle mask
18609 SmallVector<int, 8> Mask(S0->getMask());
18610 Mask.append(in_start: S1->getMask().begin(), in_end: S1->getMask().end());
18611
18612 if (isVMOVNTruncMask(Mask, VT, false))
18613 return DAG.getNode(
18614 ARMISD::VMOVN, DL, VT,
18615 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18616 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18617 DAG.getConstant(1, DL, MVT::i32));
18618 if (isVMOVNTruncMask(Mask, VT, true))
18619 return DAG.getNode(
18620 ARMISD::VMOVN, DL, VT,
18621 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18622 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18623 DAG.getConstant(1, DL, MVT::i32));
18624 }
18625 }
18626
18627 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18628 // truncate to a buildvector to allow the generic optimisations to kick in.
18629 if (all_of(Range: N->ops(), P: [](SDValue Op) {
18630 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18631 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18632 (Op.getOpcode() == ISD::BITCAST &&
18633 Op.getOperand(i: 0).getOpcode() == ISD::BUILD_VECTOR);
18634 })) {
18635 SmallVector<SDValue, 8> Extracts;
18636 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18637 SDValue O = N->getOperand(Num: Op);
18638 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18639 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18640 DAG.getConstant(i, DL, MVT::i32));
18641 Extracts.push_back(Elt: Ext);
18642 }
18643 }
18644 return DAG.getBuildVector(VT, DL, Ops: Extracts);
18645 }
18646
18647 // If we are late in the legalization process and nothing has optimised
18648 // the trunc to anything better, lower it to a stack store and reload,
18649 // performing the truncation whilst keeping the lanes in the correct order:
18650 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18651 if (!DCI.isAfterLegalizeDAG())
18652 return SDValue();
18653
18654 SDValue StackPtr = DAG.CreateStackTemporary(Bytes: TypeSize::getFixed(ExactSize: 16), Alignment: Align(4));
18655 int SPFI = cast<FrameIndexSDNode>(Val: StackPtr.getNode())->getIndex();
18656 int NumIns = N->getNumOperands();
18657 assert((NumIns == 2 || NumIns == 4) &&
18658 "Expected 2 or 4 inputs to an MVETrunc");
18659 EVT StoreVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
18660 if (N->getNumOperands() == 4)
18661 StoreVT = StoreVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
18662
18663 SmallVector<SDValue> Chains;
18664 for (int I = 0; I < NumIns; I++) {
18665 SDValue Ptr = DAG.getNode(
18666 Opcode: ISD::ADD, DL, VT: StackPtr.getValueType(), N1: StackPtr,
18667 N2: DAG.getConstant(Val: I * 16 / NumIns, DL, VT: StackPtr.getValueType()));
18668 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18669 MF&: DAG.getMachineFunction(), FI: SPFI, Offset: I * 16 / NumIns);
18670 SDValue Ch = DAG.getTruncStore(Chain: DAG.getEntryNode(), dl: DL, Val: N->getOperand(Num: I),
18671 Ptr, PtrInfo: MPI, SVT: StoreVT, Alignment: Align(4));
18672 Chains.push_back(Elt: Ch);
18673 }
18674
18675 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18676 MachinePointerInfo MPI =
18677 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: SPFI, Offset: 0);
18678 return DAG.getLoad(VT, dl: DL, Chain, Ptr: StackPtr, PtrInfo: MPI, Alignment: Align(4));
18679}
18680
18681// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
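// For example, roughly:
//   (v4i32, v4i32) = (MVESEXT (v8i16 load ptr))
// becomes
//   (v4i32 sextload<v4i16> ptr), (v4i32 sextload<v4i16> ptr+8)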
18682static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18683 SelectionDAG &DAG) {
18684 SDValue N0 = N->getOperand(Num: 0);
18685 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N0.getNode());
18686 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18687 return SDValue();
18688
18689 EVT FromVT = LD->getMemoryVT();
18690 EVT ToVT = N->getValueType(ResNo: 0);
18691 if (!ToVT.isVector())
18692 return SDValue();
18693 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18694 EVT ToEltVT = ToVT.getVectorElementType();
18695 EVT FromEltVT = FromVT.getVectorElementType();
18696
18697 unsigned NumElements = 0;
18698 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18699 NumElements = 4;
18700 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18701 NumElements = 8;
18702 assert(NumElements != 0);
18703
18704 ISD::LoadExtType NewExtType =
18705 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18706 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18707 LD->getExtensionType() != ISD::EXTLOAD &&
18708 LD->getExtensionType() != NewExtType)
18709 return SDValue();
18710
18711 LLVMContext &C = *DAG.getContext();
18712 SDLoc DL(LD);
18713 // Details about the old load
18714 SDValue Ch = LD->getChain();
18715 SDValue BasePtr = LD->getBasePtr();
18716 Align Alignment = LD->getOriginalAlign();
18717 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18718 AAMDNodes AAInfo = LD->getAAInfo();
18719
18720 SDValue Offset = DAG.getUNDEF(VT: BasePtr.getValueType());
18721 EVT NewFromVT = EVT::getVectorVT(
18722 Context&: C, VT: EVT::getIntegerVT(Context&: C, BitWidth: FromEltVT.getScalarSizeInBits()), NumElements);
18723 EVT NewToVT = EVT::getVectorVT(
18724 Context&: C, VT: EVT::getIntegerVT(Context&: C, BitWidth: ToEltVT.getScalarSizeInBits()), NumElements);
18725
18726 SmallVector<SDValue, 4> Loads;
18727 SmallVector<SDValue, 4> Chains;
18728 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18729 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18730 SDValue NewPtr =
18731 DAG.getObjectPtrOffset(SL: DL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: NewOffset));
18732
18733 SDValue NewLoad =
18734 DAG.getLoad(AM: ISD::UNINDEXED, ExtType: NewExtType, VT: NewToVT, dl: DL, Chain: Ch, Ptr: NewPtr, Offset,
18735 PtrInfo: LD->getPointerInfo().getWithOffset(O: NewOffset), MemVT: NewFromVT,
18736 Alignment, MMOFlags, AAInfo);
18737 Loads.push_back(Elt: NewLoad);
18738 Chains.push_back(Elt: SDValue(NewLoad.getNode(), 1));
18739 }
18740
18741 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18742 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewChain);
18743 return DAG.getMergeValues(Ops: Loads, dl: DL);
18744}
18745
// Perform combines for MVEEXT. If it has not been optimized to anything better
// before lowering, it gets converted to a stack store and extloads performing
// the extend whilst still keeping the same lane ordering.
18749SDValue ARMTargetLowering::PerformMVEExtCombine(
18750 SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
18751 SelectionDAG &DAG = DCI.DAG;
18752 EVT VT = N->getValueType(ResNo: 0);
18753 SDLoc DL(N);
18754 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18755 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18756
18757 EVT ExtVT = N->getOperand(Num: 0).getValueType().getHalfNumVectorElementsVT(
18758 Context&: *DAG.getContext());
18759 auto Extend = [&](SDValue V) {
18760 SDValue VVT = DAG.getNode(Opcode: ARMISD::VECTOR_REG_CAST, DL, VT, Operand: V);
18761 return N->getOpcode() == ARMISD::MVESEXT
18762 ? DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: VVT,
18763 N2: DAG.getValueType(ExtVT))
18764 : DAG.getZeroExtendInReg(Op: VVT, DL, VT: ExtVT);
18765 };
18766
18767 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18768 if (N->getOperand(Num: 0).getOpcode() == ARMISD::VDUP) {
18769 SDValue Ext = Extend(N->getOperand(Num: 0));
18770 return DAG.getMergeValues(Ops: {Ext, Ext}, dl: DL);
18771 }
18772
18773 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18774 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0))) {
18775 ArrayRef<int> Mask = SVN->getMask();
18776 assert(Mask.size() == 2 * VT.getVectorNumElements());
18777 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18778 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18779 SDValue Op0 = SVN->getOperand(Num: 0);
18780 SDValue Op1 = SVN->getOperand(Num: 1);
18781
18782 auto CheckInregMask = [&](int Start, int Offset) {
18783 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18784 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18785 return false;
18786 return true;
18787 };
18788 SDValue V0 = SDValue(N, 0);
18789 SDValue V1 = SDValue(N, 1);
18790 if (CheckInregMask(0, 0))
18791 V0 = Extend(Op0);
18792 else if (CheckInregMask(0, 1))
18793 V0 = Extend(DAG.getNode(Opcode: Rev, DL, VT: SVN->getValueType(ResNo: 0), Operand: Op0));
18794 else if (CheckInregMask(0, Mask.size()))
18795 V0 = Extend(Op1);
18796 else if (CheckInregMask(0, Mask.size() + 1))
18797 V0 = Extend(DAG.getNode(Opcode: Rev, DL, VT: SVN->getValueType(ResNo: 0), Operand: Op1));
18798
18799 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18800 V1 = Extend(Op1);
18801 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18802 V1 = Extend(DAG.getNode(Opcode: Rev, DL, VT: SVN->getValueType(ResNo: 0), Operand: Op1));
18803 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18804 V1 = Extend(Op0);
18805 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18806 V1 = Extend(DAG.getNode(Opcode: Rev, DL, VT: SVN->getValueType(ResNo: 0), Operand: Op0));
18807
18808 if (V0.getNode() != N || V1.getNode() != N)
18809 return DAG.getMergeValues(Ops: {V0, V1}, dl: DL);
18810 }
18811
18812 // MVEEXT(load) -> extload, extload
18813 if (N->getOperand(Num: 0)->getOpcode() == ISD::LOAD)
18814 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18815 return L;
18816
18817 if (!DCI.isAfterLegalizeDAG())
18818 return SDValue();
18819
18820 // Lower to a stack store and reload:
18821 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
18822 SDValue StackPtr = DAG.CreateStackTemporary(Bytes: TypeSize::getFixed(ExactSize: 16), Alignment: Align(4));
18823 int SPFI = cast<FrameIndexSDNode>(Val: StackPtr.getNode())->getIndex();
18824 int NumOuts = N->getNumValues();
18825 assert((NumOuts == 2 || NumOuts == 4) &&
18826 "Expected 2 or 4 outputs to an MVEEXT");
18827 EVT LoadVT = N->getOperand(Num: 0).getValueType().getHalfNumVectorElementsVT(
18828 Context&: *DAG.getContext());
18829 if (N->getNumOperands() == 4)
18830 LoadVT = LoadVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
18831
18832 MachinePointerInfo MPI =
18833 MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI: SPFI, Offset: 0);
18834 SDValue Chain = DAG.getStore(Chain: DAG.getEntryNode(), dl: DL, Val: N->getOperand(Num: 0),
18835 Ptr: StackPtr, PtrInfo: MPI, Alignment: Align(4));
18836
18837 SmallVector<SDValue> Loads;
18838 for (int I = 0; I < NumOuts; I++) {
18839 SDValue Ptr = DAG.getNode(
18840 Opcode: ISD::ADD, DL, VT: StackPtr.getValueType(), N1: StackPtr,
18841 N2: DAG.getConstant(Val: I * 16 / NumOuts, DL, VT: StackPtr.getValueType()));
18842 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18843 MF&: DAG.getMachineFunction(), FI: SPFI, Offset: I * 16 / NumOuts);
18844 SDValue Load = DAG.getExtLoad(
18845 ExtType: N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, dl: DL,
18846 VT, Chain, Ptr, PtrInfo: MPI, MemVT: LoadVT, Alignment: Align(4));
18847 Loads.push_back(Elt: Load);
18848 }
18849
18850 return DAG.getMergeValues(Ops: Loads, dl: DL);
18851}
18852
18853SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18854 DAGCombinerInfo &DCI) const {
18855 switch (N->getOpcode()) {
18856 default: break;
18857 case ISD::SELECT_CC:
18858 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18859 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18860 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18861 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
18862 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18863 case ARMISD::UMLAL: return PerformUMLALCombine(N, DAG&: DCI.DAG, Subtarget);
18864 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18865 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18866 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18867 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18868 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18869 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18870 case ISD::BRCOND:
18871 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, ST: Subtarget);
18872 case ARMISD::ADDC:
18873 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18874 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18875 case ARMISD::BFI: return PerformBFICombine(N, DAG&: DCI.DAG);
18876 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18877 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DAG&: DCI.DAG);
18878 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18879 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DAG&: DCI.DAG);
18880 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18881 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18882 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
18883 case ISD::EXTRACT_VECTOR_ELT:
18884 return PerformExtractEltCombine(N, DCI, ST: Subtarget);
18885 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DAG&: DCI.DAG);
18886 case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
18887 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DAG&: DCI.DAG);
18888 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18889 case ARMISD::VDUP: return PerformVDUPCombine(N, DAG&: DCI.DAG, Subtarget);
18890 case ISD::FP_TO_SINT:
18891 case ISD::FP_TO_UINT:
18892 return PerformVCVTCombine(N, DAG&: DCI.DAG, Subtarget);
18893 case ISD::FADD:
18894 return PerformFADDCombine(N, DAG&: DCI.DAG, Subtarget);
18895 case ISD::FDIV:
18896 return PerformVDIVCombine(N, DAG&: DCI.DAG, Subtarget);
18897 case ISD::INTRINSIC_WO_CHAIN:
18898 return PerformIntrinsicCombine(N, DCI);
18899 case ISD::SHL:
18900 case ISD::SRA:
18901 case ISD::SRL:
18902 return PerformShiftCombine(N, DCI, ST: Subtarget);
18903 case ISD::SIGN_EXTEND:
18904 case ISD::ZERO_EXTEND:
18905 case ISD::ANY_EXTEND:
18906 return PerformExtendCombine(N, DAG&: DCI.DAG, ST: Subtarget);
18907 case ISD::FP_EXTEND:
18908 return PerformFPExtendCombine(N, DAG&: DCI.DAG, ST: Subtarget);
18909 case ISD::SMIN:
18910 case ISD::UMIN:
18911 case ISD::SMAX:
18912 case ISD::UMAX:
18913 return PerformMinMaxCombine(N, DAG&: DCI.DAG, ST: Subtarget);
18914 case ARMISD::CMOV:
18915 return PerformCMOVCombine(N, DAG&: DCI.DAG);
18916 case ARMISD::BRCOND:
18917 return PerformBRCONDCombine(N, DAG&: DCI.DAG);
18918 case ARMISD::CMPZ:
18919 return PerformCMPZCombine(N, DAG&: DCI.DAG);
18920 case ARMISD::CSINC:
18921 case ARMISD::CSINV:
18922 case ARMISD::CSNEG:
18923 return PerformCSETCombine(N, DAG&: DCI.DAG);
18924 case ISD::LOAD:
18925 return PerformLOADCombine(N, DCI, Subtarget);
18926 case ARMISD::VLD1DUP:
18927 case ARMISD::VLD2DUP:
18928 case ARMISD::VLD3DUP:
18929 case ARMISD::VLD4DUP:
18930 return PerformVLDCombine(N, DCI);
18931 case ARMISD::BUILD_VECTOR:
18932 return PerformARMBUILD_VECTORCombine(N, DCI);
18933 case ISD::BITCAST:
18934 return PerformBITCASTCombine(N, DCI, ST: Subtarget);
18935 case ARMISD::PREDICATE_CAST:
18936 return PerformPREDICATE_CASTCombine(N, DCI);
18937 case ARMISD::VECTOR_REG_CAST:
18938 return PerformVECTOR_REG_CASTCombine(N, DAG&: DCI.DAG, ST: Subtarget);
18939 case ARMISD::MVETRUNC:
18940 return PerformMVETruncCombine(N, DCI);
18941 case ARMISD::MVESEXT:
18942 case ARMISD::MVEZEXT:
18943 return PerformMVEExtCombine(N, DCI);
18944 case ARMISD::VCMP:
18945 return PerformVCMPCombine(N, DAG&: DCI.DAG, Subtarget);
18946 case ISD::VECREDUCE_ADD:
18947 return PerformVECREDUCE_ADDCombine(N, DAG&: DCI.DAG, ST: Subtarget);
18948 case ARMISD::VADDVs:
18949 case ARMISD::VADDVu:
18950 case ARMISD::VADDLVs:
18951 case ARMISD::VADDLVu:
18952 case ARMISD::VADDLVAs:
18953 case ARMISD::VADDLVAu:
18954 case ARMISD::VMLAVs:
18955 case ARMISD::VMLAVu:
18956 case ARMISD::VMLALVs:
18957 case ARMISD::VMLALVu:
18958 case ARMISD::VMLALVAs:
18959 case ARMISD::VMLALVAu:
18960 return PerformReduceShuffleCombine(N, DAG&: DCI.DAG);
18961 case ARMISD::VMOVN:
18962 return PerformVMOVNCombine(N, DCI);
18963 case ARMISD::VQMOVNs:
18964 case ARMISD::VQMOVNu:
18965 return PerformVQMOVNCombine(N, DCI);
18966 case ARMISD::VQDMULH:
18967 return PerformVQDMULHCombine(N, DCI);
18968 case ARMISD::ASRL:
18969 case ARMISD::LSRL:
18970 case ARMISD::LSLL:
18971 return PerformLongShiftCombine(N, DAG&: DCI.DAG);
18972 case ARMISD::SMULWB: {
18973 unsigned BitWidth = N->getValueType(ResNo: 0).getSizeInBits();
18974 APInt DemandedMask = APInt::getLowBitsSet(numBits: BitWidth, loBitsSet: 16);
18975 if (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI))
18976 return SDValue();
18977 break;
18978 }
18979 case ARMISD::SMULWT: {
18980 unsigned BitWidth = N->getValueType(ResNo: 0).getSizeInBits();
18981 APInt DemandedMask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: 16);
18982 if (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI))
18983 return SDValue();
18984 break;
18985 }
18986 case ARMISD::SMLALBB:
18987 case ARMISD::QADD16b:
18988 case ARMISD::QSUB16b:
18989 case ARMISD::UQADD16b:
18990 case ARMISD::UQSUB16b: {
18991 unsigned BitWidth = N->getValueType(ResNo: 0).getSizeInBits();
18992 APInt DemandedMask = APInt::getLowBitsSet(numBits: BitWidth, loBitsSet: 16);
18993 if ((SimplifyDemandedBits(Op: N->getOperand(Num: 0), DemandedBits: DemandedMask, DCI)) ||
18994 (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI)))
18995 return SDValue();
18996 break;
18997 }
18998 case ARMISD::SMLALBT: {
18999 unsigned LowWidth = N->getOperand(Num: 0).getValueType().getSizeInBits();
19000 APInt LowMask = APInt::getLowBitsSet(numBits: LowWidth, loBitsSet: 16);
19001 unsigned HighWidth = N->getOperand(Num: 1).getValueType().getSizeInBits();
19002 APInt HighMask = APInt::getHighBitsSet(numBits: HighWidth, hiBitsSet: 16);
19003 if ((SimplifyDemandedBits(Op: N->getOperand(Num: 0), DemandedBits: LowMask, DCI)) ||
19004 (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: HighMask, DCI)))
19005 return SDValue();
19006 break;
19007 }
19008 case ARMISD::SMLALTB: {
19009 unsigned HighWidth = N->getOperand(Num: 0).getValueType().getSizeInBits();
19010 APInt HighMask = APInt::getHighBitsSet(numBits: HighWidth, hiBitsSet: 16);
19011 unsigned LowWidth = N->getOperand(Num: 1).getValueType().getSizeInBits();
19012 APInt LowMask = APInt::getLowBitsSet(numBits: LowWidth, loBitsSet: 16);
19013 if ((SimplifyDemandedBits(Op: N->getOperand(Num: 0), DemandedBits: HighMask, DCI)) ||
19014 (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: LowMask, DCI)))
19015 return SDValue();
19016 break;
19017 }
19018 case ARMISD::SMLALTT: {
19019 unsigned BitWidth = N->getValueType(ResNo: 0).getSizeInBits();
19020 APInt DemandedMask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: 16);
19021 if ((SimplifyDemandedBits(Op: N->getOperand(Num: 0), DemandedBits: DemandedMask, DCI)) ||
19022 (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI)))
19023 return SDValue();
19024 break;
19025 }
19026 case ARMISD::QADD8b:
19027 case ARMISD::QSUB8b:
19028 case ARMISD::UQADD8b:
19029 case ARMISD::UQSUB8b: {
19030 unsigned BitWidth = N->getValueType(ResNo: 0).getSizeInBits();
19031 APInt DemandedMask = APInt::getLowBitsSet(numBits: BitWidth, loBitsSet: 8);
19032 if ((SimplifyDemandedBits(Op: N->getOperand(Num: 0), DemandedBits: DemandedMask, DCI)) ||
19033 (SimplifyDemandedBits(Op: N->getOperand(Num: 1), DemandedBits: DemandedMask, DCI)))
19034 return SDValue();
19035 break;
19036 }
19037 case ISD::INTRINSIC_VOID:
19038 case ISD::INTRINSIC_W_CHAIN:
19039 switch (N->getConstantOperandVal(Num: 1)) {
19040 case Intrinsic::arm_neon_vld1:
19041 case Intrinsic::arm_neon_vld1x2:
19042 case Intrinsic::arm_neon_vld1x3:
19043 case Intrinsic::arm_neon_vld1x4:
19044 case Intrinsic::arm_neon_vld2:
19045 case Intrinsic::arm_neon_vld3:
19046 case Intrinsic::arm_neon_vld4:
19047 case Intrinsic::arm_neon_vld2lane:
19048 case Intrinsic::arm_neon_vld3lane:
19049 case Intrinsic::arm_neon_vld4lane:
19050 case Intrinsic::arm_neon_vld2dup:
19051 case Intrinsic::arm_neon_vld3dup:
19052 case Intrinsic::arm_neon_vld4dup:
19053 case Intrinsic::arm_neon_vst1:
19054 case Intrinsic::arm_neon_vst1x2:
19055 case Intrinsic::arm_neon_vst1x3:
19056 case Intrinsic::arm_neon_vst1x4:
19057 case Intrinsic::arm_neon_vst2:
19058 case Intrinsic::arm_neon_vst3:
19059 case Intrinsic::arm_neon_vst4:
19060 case Intrinsic::arm_neon_vst2lane:
19061 case Intrinsic::arm_neon_vst3lane:
19062 case Intrinsic::arm_neon_vst4lane:
19063 return PerformVLDCombine(N, DCI);
19064 case Intrinsic::arm_mve_vld2q:
19065 case Intrinsic::arm_mve_vld4q:
19066 case Intrinsic::arm_mve_vst2q:
19067 case Intrinsic::arm_mve_vst4q:
19068 return PerformMVEVLDCombine(N, DCI);
19069 default: break;
19070 }
19071 break;
19072 }
19073 return SDValue();
19074}
19075
19076bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
19077 EVT VT) const {
19078 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19079}
19080
19081bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
19082 Align Alignment,
19083 MachineMemOperand::Flags,
19084 unsigned *Fast) const {
19085 // Depends what it gets converted into if the type is weird.
19086 if (!VT.isSimple())
19087 return false;
19088
19089 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19090 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19091 auto Ty = VT.getSimpleVT().SimpleTy;
19092
19093 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
19095 if (AllowsUnaligned) {
19096 if (Fast)
19097 *Fast = Subtarget->hasV7Ops();
19098 return true;
19099 }
19100 }
19101
19102 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
19106 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19107 if (Fast)
19108 *Fast = 1;
19109 return true;
19110 }
19111 }
19112
19113 if (!Subtarget->hasMVEIntegerOps())
19114 return false;
19115
19116 // These are for predicates
19117 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19118 Ty == MVT::v2i1)) {
19119 if (Fast)
19120 *Fast = 1;
19121 return true;
19122 }
19123
19124 // These are for truncated stores/narrowing loads. They are fine so long as
19125 // the alignment is at least the size of the item being loaded
19126 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19127 Alignment >= VT.getScalarSizeInBits() / 8) {
19128 if (Fast)
19129 *Fast = true;
19130 return true;
19131 }
19132
19133 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19134 // VSTRW.U32 all store the vector register in exactly the same format, and
19135 // differ only in the range of their immediate offset field and the required
19136 // alignment. So there is always a store that can be used, regardless of
19137 // actual type.
19138 //
  // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
  // VREV64.8) pair and get the same effect. This will likely be better than
  // aligning the vector through the stack.
19142 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19143 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19144 Ty == MVT::v2f64) {
19145 if (Fast)
19146 *Fast = 1;
19147 return true;
19148 }
19149
19150 return false;
19151}
19152
19153
19154EVT ARMTargetLowering::getOptimalMemOpType(
19155 const MemOp &Op, const AttributeList &FuncAttributes) const {
19156 // See if we can use NEON instructions for this...
19157 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19158 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19159 unsigned Fast;
19160 if (Op.size() >= 16 &&
19161 (Op.isAligned(Align(16)) ||
19162 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19163 MachineMemOperand::MONone, &Fast) &&
19164 Fast))) {
19165 return MVT::v2f64;
19166 } else if (Op.size() >= 8 &&
19167 (Op.isAligned(Align(8)) ||
19168 (allowsMisalignedMemoryAccesses(
19169 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19170 Fast))) {
19171 return MVT::f64;
19172 }
19173 }
19174
19175 // Let the target-independent logic figure it out.
19176 return MVT::Other;
19177}
19178
19179// 64-bit integers are split into their high and low parts and held in two
19180// different registers, so the trunc is free since the low register can just
19181// be used.
19182bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19183 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19184 return false;
19185 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19186 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19187 return (SrcBits == 64 && DestBits == 32);
19188}
19189
19190bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19191 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19192 !DstVT.isInteger())
19193 return false;
19194 unsigned SrcBits = SrcVT.getSizeInBits();
19195 unsigned DestBits = DstVT.getSizeInBits();
19196 return (SrcBits == 64 && DestBits == 32);
19197}
19198
19199bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19200 if (Val.getOpcode() != ISD::LOAD)
19201 return false;
19202
19203 EVT VT1 = Val.getValueType();
19204 if (!VT1.isSimple() || !VT1.isInteger() ||
19205 !VT2.isSimple() || !VT2.isInteger())
19206 return false;
19207
19208 switch (VT1.getSimpleVT().SimpleTy) {
19209 default: break;
19210 case MVT::i1:
19211 case MVT::i8:
19212 case MVT::i16:
    // 8-bit and 16-bit loads implicitly zero-extend to 32 bits.
19214 return true;
19215 }
19216
19217 return false;
19218}
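// For example, an LDRB/LDRH already writes a zero-extended 32-bit value into
// its destination register, so a subsequent (zext i8/i16 to i32) of the
// loaded value is free.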
19219
19220bool ARMTargetLowering::isFNegFree(EVT VT) const {
19221 if (!VT.isSimple())
19222 return false;
19223
  // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
  // negate values directly (fneg is free). So, we don't want to let the DAG
  // combiner rewrite fneg into xors and some other instructions. For f16 and
  // FullFP16 argument passing, some bitcast nodes may be introduced,
  // triggering this DAG combine rewrite, so we avoid that here.
19229 switch (VT.getSimpleVT().SimpleTy) {
19230 default: break;
19231 case MVT::f16:
19232 return Subtarget->hasFullFP16();
19233 }
19234
19235 return false;
19236}
19237
19238/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
19239/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}
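// For example (illustrative IR), both operands below are extends that double
// the element width, so they can be folded into a single NEON vsubl.s8:
//   %a = sext <8 x i8> %x to <8 x i16>
//   %b = sext <8 x i8> %y to <8 x i16>
//   %d = sub <8 x i16> %a, %b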
19254
19255/// Check if sinking \p I's operands to I's basic block is profitable, because
19256/// the operands can be folded into a target instruction, e.g.
19257/// sext/zext can be folded into vsubl.
19258bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
19259 SmallVectorImpl<Use *> &Ops) const {
19260 if (!I->getType()->isVectorTy())
19261 return false;
19262
19263 if (Subtarget->hasNEON()) {
19264 switch (I->getOpcode()) {
19265 case Instruction::Sub:
19266 case Instruction::Add: {
19267 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
19268 return false;
19269 Ops.push_back(Elt: &I->getOperandUse(i: 0));
19270 Ops.push_back(Elt: &I->getOperandUse(i: 1));
19271 return true;
19272 }
19273 default:
19274 return false;
19275 }
19276 }
19277
19278 if (!Subtarget->hasMVEIntegerOps())
19279 return false;
19280
19281 auto IsFMSMul = [&](Instruction *I) {
19282 if (!I->hasOneUse())
19283 return false;
19284 auto *Sub = cast<Instruction>(Val: *I->users().begin());
19285 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(i: 1) == I;
19286 };
19287 auto IsFMS = [&](Instruction *I) {
19288 if (match(V: I->getOperand(i: 0), P: m_FNeg(X: m_Value())) ||
19289 match(V: I->getOperand(i: 1), P: m_FNeg(X: m_Value())))
19290 return true;
19291 return false;
19292 };
19293
19294 auto IsSinker = [&](Instruction *I, int Operand) {
19295 switch (I->getOpcode()) {
19296 case Instruction::Add:
19297 case Instruction::Mul:
19298 case Instruction::FAdd:
19299 case Instruction::ICmp:
19300 case Instruction::FCmp:
19301 return true;
19302 case Instruction::FMul:
19303 return !IsFMSMul(I);
19304 case Instruction::Sub:
19305 case Instruction::FSub:
19306 case Instruction::Shl:
19307 case Instruction::LShr:
19308 case Instruction::AShr:
19309 return Operand == 1;
19310 case Instruction::Call:
19311 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
19312 switch (II->getIntrinsicID()) {
19313 case Intrinsic::fma:
19314 return !IsFMS(I);
19315 case Intrinsic::sadd_sat:
19316 case Intrinsic::uadd_sat:
19317 case Intrinsic::arm_mve_add_predicated:
19318 case Intrinsic::arm_mve_mul_predicated:
19319 case Intrinsic::arm_mve_qadd_predicated:
19320 case Intrinsic::arm_mve_vhadd:
19321 case Intrinsic::arm_mve_hadd_predicated:
19322 case Intrinsic::arm_mve_vqdmull:
19323 case Intrinsic::arm_mve_vqdmull_predicated:
19324 case Intrinsic::arm_mve_vqdmulh:
19325 case Intrinsic::arm_mve_qdmulh_predicated:
19326 case Intrinsic::arm_mve_vqrdmulh:
19327 case Intrinsic::arm_mve_qrdmulh_predicated:
19328 case Intrinsic::arm_mve_fma_predicated:
19329 return true;
19330 case Intrinsic::ssub_sat:
19331 case Intrinsic::usub_sat:
19332 case Intrinsic::arm_mve_sub_predicated:
19333 case Intrinsic::arm_mve_qsub_predicated:
19334 case Intrinsic::arm_mve_hsub_predicated:
19335 case Intrinsic::arm_mve_vhsub:
19336 return Operand == 1;
19337 default:
19338 return false;
19339 }
19340 }
19341 return false;
19342 default:
19343 return false;
19344 }
19345 };
19346
19347 for (auto OpIdx : enumerate(First: I->operands())) {
19348 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
19349 // Make sure we are not already sinking this operand
19350 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
19351 continue;
19352
19353 Instruction *Shuffle = Op;
19354 if (Shuffle->getOpcode() == Instruction::BitCast)
19355 Shuffle = dyn_cast<Instruction>(Val: Shuffle->getOperand(i: 0));
19356 // We are looking for a splat that can be sunk.
19357 if (!Shuffle ||
19358 !match(V: Shuffle, P: m_Shuffle(
19359 v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(), Idx: m_ZeroInt()),
19360 v2: m_Undef(), mask: m_ZeroMask())))
19361 continue;
19362 if (!IsSinker(I, OpIdx.index()))
19363 continue;
19364
    // All uses of the shuffle should be sunk to avoid duplicating it across
    // GPR and vector registers.
19367 for (Use &U : Op->uses()) {
19368 Instruction *Insn = cast<Instruction>(Val: U.getUser());
19369 if (!IsSinker(Insn, U.getOperandNo()))
19370 return false;
19371 }
19372
19373 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
19374 if (Shuffle != Op)
19375 Ops.push_back(Elt: &Op->getOperandUse(i: 0));
19376 Ops.push_back(Elt: &OpIdx.value());
19377 }
19378 return true;
19379}
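// For example (a sketch of the MVE splat case): in
//   %i = insertelement <4 x i32> undef, i32 %s, i32 0
//   %v = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer
//   %r = add <4 x i32> %x, %v
// sinking the splat next to the add lets instruction selection use the
// vector-by-scalar form (e.g. vadd.i32 q0, q1, r0) instead of materializing
// the splat in a q register.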
19380
19381Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19382 if (!Subtarget->hasMVEIntegerOps())
19383 return nullptr;
19384 Type *SVIType = SVI->getType();
19385 Type *ScalarType = SVIType->getScalarType();
19386
19387 if (ScalarType->isFloatTy())
19388 return Type::getInt32Ty(C&: SVIType->getContext());
19389 if (ScalarType->isHalfTy())
19390 return Type::getInt16Ty(C&: SVIType->getContext());
19391 return nullptr;
19392}
19393
19394bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19395 EVT VT = ExtVal.getValueType();
19396
19397 if (!isTypeLegal(VT))
19398 return false;
19399
19400 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal.getOperand(i: 0))) {
19401 if (Ld->isExpandingLoad())
19402 return false;
19403 }
19404
19405 if (Subtarget->hasMVEIntegerOps())
19406 return true;
19407
19408 // Don't create a loadext if we can fold the extension into a wide/long
19409 // instruction.
19410 // If there's more than one user instruction, the loadext is desirable no
19411 // matter what. There can be two uses by the same instruction.
19412 if (ExtVal->use_empty() ||
19413 !ExtVal->use_begin()->isOnlyUserOf(N: ExtVal.getNode()))
19414 return true;
19415
19416 SDNode *U = *ExtVal->use_begin();
19417 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19418 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19419 return false;
19420
19421 return true;
19422}
19423
19424bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19425 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19426 return false;
19427
19428 if (!isTypeLegal(VT: EVT::getEVT(Ty: Ty1)))
19429 return false;
19430
19431 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19432
19433 // Assuming the caller doesn't have a zeroext or signext return parameter,
19434 // truncation all the way down to i1 is valid.
19435 return true;
19436}
19437
19438/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19439/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19440/// expanded to FMAs when this method returns true, otherwise fmuladd is
19441/// expanded to fmul + fadd.
19442///
19443/// ARM supports both fused and unfused multiply-add operations; we already
19444/// lower a pair of fmul and fadd to the latter so it's not clear that there
19445/// would be a gain or that the gain would be worthwhile enough to risk
19446/// correctness bugs.
19447///
19448/// For MVE, we set this to true as it helps simplify the need for some
19449/// patterns (and we don't have the non-fused floating point instruction).
19450bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19451 EVT VT) const {
19452 if (!VT.isSimple())
19453 return false;
19454
19455 switch (VT.getSimpleVT().SimpleTy) {
19456 case MVT::v4f32:
19457 case MVT::v8f16:
19458 return Subtarget->hasMVEFloatOps();
19459 case MVT::f16:
19460 return Subtarget->useFPVFMx16();
19461 case MVT::f32:
19462 return Subtarget->useFPVFMx();
19463 case MVT::f64:
19464 return Subtarget->useFPVFMx64();
19465 default:
19466 break;
19467 }
19468
19469 return false;
19470}
19471
19472static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19473 if (V < 0)
19474 return false;
19475
19476 unsigned Scale = 1;
19477 switch (VT.getSimpleVT().SimpleTy) {
19478 case MVT::i1:
19479 case MVT::i8:
19480 // Scale == 1;
19481 break;
19482 case MVT::i16:
19483 // Scale == 2;
19484 Scale = 2;
19485 break;
19486 default:
    // On Thumb1 we load most things (i32, i64, floats, etc.) with an LDR.
19488 // Scale == 4;
19489 Scale = 4;
19490 break;
19491 }
19492
19493 if ((V & (Scale - 1)) != 0)
19494 return false;
19495 return isUInt<5>(x: V / Scale);
19496}
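// For example, for a Thumb1 load/store the immediate offset must be an
// unsigned multiple of the access size that fits in 5 bits after scaling:
//   i8  : 0..31        (LDRB/STRB)
//   i16 : 0..62  by 2  (LDRH/STRH)
//   i32 : 0..124 by 4  (LDR/STR)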
19497
19498static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19499 const ARMSubtarget *Subtarget) {
19500 if (!VT.isInteger() && !VT.isFloatingPoint())
19501 return false;
19502 if (VT.isVector() && Subtarget->hasNEON())
19503 return false;
19504 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19505 !Subtarget->hasMVEFloatOps())
19506 return false;
19507
19508 bool IsNeg = false;
19509 if (V < 0) {
19510 IsNeg = true;
19511 V = -V;
19512 }
19513
19514 unsigned NumBytes = std::max(a: (unsigned)VT.getSizeInBits() / 8, b: 1U);
19515
19516 // MVE: size * imm7
19517 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19518 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19519 case MVT::i32:
19520 case MVT::f32:
19521 return isShiftedUInt<7,2>(x: V);
19522 case MVT::i16:
19523 case MVT::f16:
19524 return isShiftedUInt<7,1>(x: V);
19525 case MVT::i8:
19526 return isUInt<7>(x: V);
19527 default:
19528 return false;
19529 }
19530 }
19531
19532 // half VLDR: 2 * imm8
19533 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19534 return isShiftedUInt<8, 1>(x: V);
19535 // VLDR and LDRD: 4 * imm8
19536 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19537 return isShiftedUInt<8, 2>(x: V);
19538
19539 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19540 // + imm12 or - imm8
19541 if (IsNeg)
19542 return isUInt<8>(x: V);
19543 return isUInt<12>(x: V);
19544 }
19545
19546 return false;
19547}
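// For example, under the rules above a Thumb2/MVE offset is accepted when:
//   MVE v4i32/v4f32 : multiple of 4 in [-508, 508]   (7-bit scaled imm)
//   MVE v8i16/v8f16 : multiple of 2 in [-254, 254]
//   MVE v16i8       : [-127, 127]
//   f16 VLDR/VSTR   : multiple of 2 in [-510, 510]
//   VLDR/LDRD       : multiple of 4 in [-1020, 1020]
//   i8/i16/i32      : [0, 4095] or [-255, -1]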
19548
19549/// isLegalAddressImmediate - Return true if the integer value can be used
19550/// as the offset of the target addressing mode for load / store of the
19551/// given type.
19552static bool isLegalAddressImmediate(int64_t V, EVT VT,
19553 const ARMSubtarget *Subtarget) {
19554 if (V == 0)
19555 return true;
19556
19557 if (!VT.isSimple())
19558 return false;
19559
19560 if (Subtarget->isThumb1Only())
19561 return isLegalT1AddressImmediate(V, VT);
19562 else if (Subtarget->isThumb2())
19563 return isLegalT2AddressImmediate(V, VT, Subtarget);
19564
19565 // ARM mode.
19566 if (V < 0)
19567 V = - V;
19568 switch (VT.getSimpleVT().SimpleTy) {
19569 default: return false;
19570 case MVT::i1:
19571 case MVT::i8:
19572 case MVT::i32:
19573 // +- imm12
19574 return isUInt<12>(x: V);
19575 case MVT::i16:
19576 // +- imm8
19577 return isUInt<8>(x: V);
19578 case MVT::f32:
19579 case MVT::f64:
19580 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19581 return false;
19582 return isShiftedUInt<8, 2>(x: V);
19583 }
19584}
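// For example, in ARM mode the accepted offsets are:
//   i1/i8/i32 : +/-4095                     (imm12)
//   i16       : +/-255                      (imm8, LDRH/LDRSH)
//   f32/f64   : multiples of 4 up to +/-1020 (VLDR/VSTR imm8 << 2)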
19585
19586bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19587 EVT VT) const {
19588 int Scale = AM.Scale;
19589 if (Scale < 0)
19590 return false;
19591
19592 switch (VT.getSimpleVT().SimpleTy) {
19593 default: return false;
19594 case MVT::i1:
19595 case MVT::i8:
19596 case MVT::i16:
19597 case MVT::i32:
19598 if (Scale == 1)
19599 return true;
19600 // r + r << imm
19601 Scale = Scale & ~1;
19602 return Scale == 2 || Scale == 4 || Scale == 8;
19603 case MVT::i64:
19604 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19605 // version in Thumb mode.
19606 // r + r
19607 if (Scale == 1)
19608 return true;
19609 // r * 2 (this can be lowered to r + r).
19610 if (!AM.HasBaseReg && Scale == 2)
19611 return true;
19612 return false;
19613 case MVT::isVoid:
19614 // Note, we allow "void" uses (basically, uses that aren't loads or
19615 // stores), because arm allows folding a scale into many arithmetic
19616 // operations. This should be made more precise and revisited later.
19617
19618 // Allow r << imm, but the imm has to be a multiple of two.
19619 if (Scale & 1) return false;
19620 return isPowerOf2_32(Value: Scale);
19621 }
19622}
19623
19624bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19625 EVT VT) const {
19626 const int Scale = AM.Scale;
19627
19628 // Negative scales are not supported in Thumb1.
19629 if (Scale < 0)
19630 return false;
19631
19632 // Thumb1 addressing modes do not support register scaling excepting the
19633 // following cases:
19634 // 1. Scale == 1 means no scaling.
19635 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19636 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19637}
19638
19639/// isLegalAddressingMode - Return true if the addressing mode represented
19640/// by AM is legal for this target, for a load/store of the specified type.
19641bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19642 const AddrMode &AM, Type *Ty,
19643 unsigned AS, Instruction *I) const {
19644 EVT VT = getValueType(DL, Ty, AllowUnknown: true);
19645 if (!isLegalAddressImmediate(V: AM.BaseOffs, VT, Subtarget))
19646 return false;
19647
19648 // Can never fold addr of global into load/store.
19649 if (AM.BaseGV)
19650 return false;
19651
19652 switch (AM.Scale) {
19653 case 0: // no scale reg, must be "r+i" or "r", or "i".
19654 break;
19655 default:
19656 // ARM doesn't support any R+R*scale+imm addr modes.
19657 if (AM.BaseOffs)
19658 return false;
19659
19660 if (!VT.isSimple())
19661 return false;
19662
19663 if (Subtarget->isThumb1Only())
19664 return isLegalT1ScaledAddressingMode(AM, VT);
19665
19666 if (Subtarget->isThumb2())
19667 return isLegalT2ScaledAddressingMode(AM, VT);
19668
19669 int Scale = AM.Scale;
19670 switch (VT.getSimpleVT().SimpleTy) {
19671 default: return false;
19672 case MVT::i1:
19673 case MVT::i8:
19674 case MVT::i32:
19675 if (Scale < 0) Scale = -Scale;
19676 if (Scale == 1)
19677 return true;
19678 // r + r << imm
19679 return isPowerOf2_32(Value: Scale & ~1);
19680 case MVT::i16:
19681 case MVT::i64:
19682 // r +/- r
19683 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19684 return true;
19685 // r * 2 (this can be lowered to r + r).
19686 if (!AM.HasBaseReg && Scale == 2)
19687 return true;
19688 return false;
19689
19690 case MVT::isVoid:
19691 // Note, we allow "void" uses (basically, uses that aren't loads or
19692 // stores), because arm allows folding a scale into many arithmetic
19693 // operations. This should be made more precise and revisited later.
19694
19695 // Allow r << imm, but the imm has to be a multiple of two.
19696 if (Scale & 1) return false;
19697 return isPowerOf2_32(Value: Scale);
19698 }
19699 }
19700 return true;
19701}
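// For example, "reg + reg << imm" forms such as
//   ldr r0, [r1, r2, lsl #2]
// correspond to AM.Scale == 4 with a base register, which is accepted for
// i8/i32 accesses above (and by the Thumb2 variant via
// isLegalT2ScaledAddressingMode).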
19702
/// isLegalICmpImmediate - Return true if the specified immediate is a legal
/// icmp immediate, that is, the target has icmp instructions which can compare
/// a register against the immediate without having to materialize the
/// immediate into a register.
19707bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19708 // Thumb2 and ARM modes can use cmn for negative immediates.
19709 if (!Subtarget->isThumb())
19710 return ARM_AM::getSOImmVal(Arg: (uint32_t)Imm) != -1 ||
19711 ARM_AM::getSOImmVal(Arg: -(uint32_t)Imm) != -1;
19712 if (Subtarget->isThumb2())
19713 return ARM_AM::getT2SOImmVal(Arg: (uint32_t)Imm) != -1 ||
19714 ARM_AM::getT2SOImmVal(Arg: -(uint32_t)Imm) != -1;
19715 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19716 return Imm >= 0 && Imm <= 255;
19717}
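// For example, "icmp eq i32 %x, -10" is still considered legal outside
// Thumb1, because it can be selected as "cmn r0, #10" rather than
// materializing -10 into a register first.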
19718
/// isLegalAddImmediate - Return true if the specified immediate is a legal add
/// *or sub* immediate, that is, the target has add or sub instructions which
/// can add the immediate to a register without having to materialize the
/// immediate into a register.
19723bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19724 // Same encoding for add/sub, just flip the sign.
19725 int64_t AbsImm = std::abs(i: Imm);
19726 if (!Subtarget->isThumb())
19727 return ARM_AM::getSOImmVal(Arg: AbsImm) != -1;
19728 if (Subtarget->isThumb2())
19729 return ARM_AM::getT2SOImmVal(Arg: AbsImm) != -1;
19730 // Thumb1 only has 8-bit unsigned immediate.
19731 return AbsImm >= 0 && AbsImm <= 255;
19732}
19733
19734// Return false to prevent folding
19735// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19736// if the folding leads to worse code.
19737bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19738 SDValue ConstNode) const {
19739 // Let the DAGCombiner decide for vector types and large types.
19740 const EVT VT = AddNode.getValueType();
19741 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19742 return true;
19743
  // It is worse if c0 is a legal add immediate while c1*c0 is not, and has to
  // be materialized with at least two instructions.
19746 const ConstantSDNode *C0Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
19747 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val&: ConstNode);
19748 const int64_t C0 = C0Node->getSExtValue();
19749 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19750 if (!isLegalAddImmediate(Imm: C0) || isLegalAddImmediate(Imm: CA.getSExtValue()))
19751 return true;
19752 if (ConstantMaterializationCost(Val: (unsigned)CA.getZExtValue(), Subtarget) > 1)
19753 return false;
19754
19755 // Default to true and let the DAGCombiner decide.
19756 return true;
19757}
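// For example, folding (mul (add r, c0), c1) into (add (mul r, c1), c0*c1) is
// rejected above when c0 is already a legal add immediate but c0*c1 would
// need more than one instruction to materialize (say a MOVW/MOVT pair),
// since that trades a cheap add for extra constant set-up code.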
19758
19759static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19760 bool isSEXTLoad, SDValue &Base,
19761 SDValue &Offset, bool &isInc,
19762 SelectionDAG &DAG) {
19763 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19764 return false;
19765
19766 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19767 // AddressingMode 3
19768 Base = Ptr->getOperand(Num: 0);
19769 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Ptr->getOperand(Num: 1))) {
19770 int RHSC = (int)RHS->getZExtValue();
19771 if (RHSC < 0 && RHSC > -256) {
19772 assert(Ptr->getOpcode() == ISD::ADD);
19773 isInc = false;
19774 Offset = DAG.getConstant(Val: -RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19775 return true;
19776 }
19777 }
19778 isInc = (Ptr->getOpcode() == ISD::ADD);
19779 Offset = Ptr->getOperand(Num: 1);
19780 return true;
19781 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19782 // AddressingMode 2
19783 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Ptr->getOperand(Num: 1))) {
19784 int RHSC = (int)RHS->getZExtValue();
19785 if (RHSC < 0 && RHSC > -0x1000) {
19786 assert(Ptr->getOpcode() == ISD::ADD);
19787 isInc = false;
19788 Offset = DAG.getConstant(Val: -RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19789 Base = Ptr->getOperand(Num: 0);
19790 return true;
19791 }
19792 }
19793
19794 if (Ptr->getOpcode() == ISD::ADD) {
19795 isInc = true;
      ARM_AM::ShiftOpc ShOpcVal =
          ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19798 if (ShOpcVal != ARM_AM::no_shift) {
19799 Base = Ptr->getOperand(Num: 1);
19800 Offset = Ptr->getOperand(Num: 0);
19801 } else {
19802 Base = Ptr->getOperand(Num: 0);
19803 Offset = Ptr->getOperand(Num: 1);
19804 }
19805 return true;
19806 }
19807
19808 isInc = (Ptr->getOpcode() == ISD::ADD);
19809 Base = Ptr->getOperand(Num: 0);
19810 Offset = Ptr->getOperand(Num: 1);
19811 return true;
19812 }
19813
19814 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19815 return false;
19816}
19817
19818static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19819 bool isSEXTLoad, SDValue &Base,
19820 SDValue &Offset, bool &isInc,
19821 SelectionDAG &DAG) {
19822 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19823 return false;
19824
19825 Base = Ptr->getOperand(Num: 0);
19826 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Ptr->getOperand(Num: 1))) {
19827 int RHSC = (int)RHS->getZExtValue();
19828 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19829 assert(Ptr->getOpcode() == ISD::ADD);
19830 isInc = false;
19831 Offset = DAG.getConstant(Val: -RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19832 return true;
19833 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19834 isInc = Ptr->getOpcode() == ISD::ADD;
19835 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19836 return true;
19837 }
19838 }
19839
19840 return false;
19841}
19842
19843static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19844 bool isSEXTLoad, bool IsMasked, bool isLE,
19845 SDValue &Base, SDValue &Offset,
19846 bool &isInc, SelectionDAG &DAG) {
19847 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19848 return false;
19849 if (!isa<ConstantSDNode>(Val: Ptr->getOperand(Num: 1)))
19850 return false;
19851
19852 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19853 // as opposed to a vldrw.32). This can allow extra addressing modes or
19854 // alignments for what is otherwise an equivalent instruction.
19855 bool CanChangeType = isLE && !IsMasked;
19856
19857 ConstantSDNode *RHS = cast<ConstantSDNode>(Val: Ptr->getOperand(Num: 1));
19858 int RHSC = (int)RHS->getZExtValue();
19859
19860 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19861 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19862 assert(Ptr->getOpcode() == ISD::ADD);
19863 isInc = false;
19864 Offset = DAG.getConstant(Val: -RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19865 return true;
19866 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19867 isInc = Ptr->getOpcode() == ISD::ADD;
19868 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(Ptr), VT: RHS->getValueType(ResNo: 0));
19869 return true;
19870 }
19871 return false;
19872 };
19873
19874 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19875 // (in BE/masked) type.
19876 Base = Ptr->getOperand(Num: 0);
19877 if (VT == MVT::v4i16) {
19878 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19879 return true;
19880 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19881 if (IsInRange(RHSC, 0x80, 1))
19882 return true;
19883 } else if (Alignment >= 4 &&
19884 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19885 IsInRange(RHSC, 0x80, 4))
19886 return true;
19887 else if (Alignment >= 2 &&
19888 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19889 IsInRange(RHSC, 0x80, 2))
19890 return true;
19891 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19892 return true;
19893 return false;
19894}
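// For example, a post-increment of 16 on a sufficiently aligned v4i32 load is
// accepted here (a multiple of 4 below 0x80 * 4) and can later be selected as
// something like "vldrw.u32 q0, [r0], #16"; an increment of 6 would be
// rejected because it is not a multiple of the element size.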
19895
19896/// getPreIndexedAddressParts - returns true by value, base pointer and
19897/// offset pointer and addressing mode by reference if the node's address
19898/// can be legally represented as pre-indexed load / store address.
19899bool
19900ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19901 SDValue &Offset,
19902 ISD::MemIndexedMode &AM,
19903 SelectionDAG &DAG) const {
19904 if (Subtarget->isThumb1Only())
19905 return false;
19906
19907 EVT VT;
19908 SDValue Ptr;
19909 Align Alignment;
19910 bool isSEXTLoad = false;
19911 bool IsMasked = false;
19912 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
19913 Ptr = LD->getBasePtr();
19914 VT = LD->getMemoryVT();
19915 Alignment = LD->getAlign();
19916 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19917 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
19918 Ptr = ST->getBasePtr();
19919 VT = ST->getMemoryVT();
19920 Alignment = ST->getAlign();
19921 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Val: N)) {
19922 Ptr = LD->getBasePtr();
19923 VT = LD->getMemoryVT();
19924 Alignment = LD->getAlign();
19925 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19926 IsMasked = true;
19927 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Val: N)) {
19928 Ptr = ST->getBasePtr();
19929 VT = ST->getMemoryVT();
19930 Alignment = ST->getAlign();
19931 IsMasked = true;
19932 } else
19933 return false;
19934
19935 bool isInc;
19936 bool isLegal = false;
19937 if (VT.isVector())
19938 isLegal = Subtarget->hasMVEIntegerOps() &&
19939 getMVEIndexedAddressParts(
19940 Ptr: Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19941 isLE: Subtarget->isLittle(), Base, Offset, isInc, DAG);
19942 else {
19943 if (Subtarget->isThumb2())
19944 isLegal = getT2IndexedAddressParts(Ptr: Ptr.getNode(), VT, isSEXTLoad, Base,
19945 Offset, isInc, DAG);
19946 else
19947 isLegal = getARMIndexedAddressParts(Ptr: Ptr.getNode(), VT, isSEXTLoad, Base,
19948 Offset, isInc, DAG);
19949 }
19950 if (!isLegal)
19951 return false;
19952
19953 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19954 return true;
19955}
19956
19957/// getPostIndexedAddressParts - returns true by value, base pointer and
19958/// offset pointer and addressing mode by reference if this node can be
19959/// combined with a load / store to form a post-indexed load / store.
19960bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19961 SDValue &Base,
19962 SDValue &Offset,
19963 ISD::MemIndexedMode &AM,
19964 SelectionDAG &DAG) const {
19965 EVT VT;
19966 SDValue Ptr;
19967 Align Alignment;
19968 bool isSEXTLoad = false, isNonExt;
19969 bool IsMasked = false;
19970 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
19971 VT = LD->getMemoryVT();
19972 Ptr = LD->getBasePtr();
19973 Alignment = LD->getAlign();
19974 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19975 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19976 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
19977 VT = ST->getMemoryVT();
19978 Ptr = ST->getBasePtr();
19979 Alignment = ST->getAlign();
19980 isNonExt = !ST->isTruncatingStore();
19981 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Val: N)) {
19982 VT = LD->getMemoryVT();
19983 Ptr = LD->getBasePtr();
19984 Alignment = LD->getAlign();
19985 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19986 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19987 IsMasked = true;
19988 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Val: N)) {
19989 VT = ST->getMemoryVT();
19990 Ptr = ST->getBasePtr();
19991 Alignment = ST->getAlign();
19992 isNonExt = !ST->isTruncatingStore();
19993 IsMasked = true;
19994 } else
19995 return false;
19996
19997 if (Subtarget->isThumb1Only()) {
19998 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19999 // must be non-extending/truncating, i32, with an offset of 4.
20000 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
20001 if (Op->getOpcode() != ISD::ADD || !isNonExt)
20002 return false;
20003 auto *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
20004 if (!RHS || RHS->getZExtValue() != 4)
20005 return false;
20006 if (Alignment < Align(4))
20007 return false;
20008
20009 Offset = Op->getOperand(Num: 1);
20010 Base = Op->getOperand(Num: 0);
20011 AM = ISD::POST_INC;
20012 return true;
20013 }
20014
20015 bool isInc;
20016 bool isLegal = false;
20017 if (VT.isVector())
20018 isLegal = Subtarget->hasMVEIntegerOps() &&
20019 getMVEIndexedAddressParts(Ptr: Op, VT, Alignment, isSEXTLoad, IsMasked,
20020 isLE: Subtarget->isLittle(), Base, Offset,
20021 isInc, DAG);
20022 else {
20023 if (Subtarget->isThumb2())
20024 isLegal = getT2IndexedAddressParts(Ptr: Op, VT, isSEXTLoad, Base, Offset,
20025 isInc, DAG);
20026 else
20027 isLegal = getARMIndexedAddressParts(Ptr: Op, VT, isSEXTLoad, Base, Offset,
20028 isInc, DAG);
20029 }
20030 if (!isLegal)
20031 return false;
20032
20033 if (Ptr != Base) {
20034 // Swap base ptr and offset to catch more post-index load / store when
20035 // it's legal. In Thumb2 mode, offset must be an immediate.
20036 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
20037 !Subtarget->isThumb2())
20038 std::swap(a&: Base, b&: Offset);
20039
20040 // Post-indexed load / store update the base pointer.
20041 if (Ptr != Base)
20042 return false;
20043 }
20044
20045 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
20046 return true;
20047}
20048
20049void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
20050 KnownBits &Known,
20051 const APInt &DemandedElts,
20052 const SelectionDAG &DAG,
20053 unsigned Depth) const {
20054 unsigned BitWidth = Known.getBitWidth();
20055 Known.resetAll();
20056 switch (Op.getOpcode()) {
20057 default: break;
20058 case ARMISD::ADDC:
20059 case ARMISD::ADDE:
20060 case ARMISD::SUBC:
20061 case ARMISD::SUBE:
20062 // Special cases when we convert a carry to a boolean.
20063 if (Op.getResNo() == 0) {
20064 SDValue LHS = Op.getOperand(i: 0);
20065 SDValue RHS = Op.getOperand(i: 1);
20066 // (ADDE 0, 0, C) will give us a single bit.
20067 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(V: LHS) &&
20068 isNullConstant(V: RHS)) {
20069 Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 1);
20070 return;
20071 }
20072 }
20073 break;
20074 case ARMISD::CMOV: {
20075 // Bits are known zero/one if known on the LHS and RHS.
20076 Known = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth+1);
20077 if (Known.isUnknown())
20078 return;
20079
20080 KnownBits KnownRHS = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth+1);
20081 Known = Known.intersectWith(RHS: KnownRHS);
20082 return;
20083 }
20084 case ISD::INTRINSIC_W_CHAIN: {
20085 Intrinsic::ID IntID =
20086 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(Num: 1));
20087 switch (IntID) {
20088 default: return;
20089 case Intrinsic::arm_ldaex:
20090 case Intrinsic::arm_ldrex: {
20091 EVT VT = cast<MemIntrinsicSDNode>(Val: Op)->getMemoryVT();
20092 unsigned MemBits = VT.getScalarSizeInBits();
20093 Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - MemBits);
20094 return;
20095 }
20096 }
20097 }
20098 case ARMISD::BFI: {
20099 // Conservatively, we can recurse down the first operand
20100 // and just mask out all affected bits.
20101 Known = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
20102
20103 // The operand to BFI is already a mask suitable for removing the bits it
20104 // sets.
20105 const APInt &Mask = Op.getConstantOperandAPInt(i: 2);
20106 Known.Zero &= Mask;
20107 Known.One &= Mask;
20108 return;
20109 }
20110 case ARMISD::VGETLANEs:
20111 case ARMISD::VGETLANEu: {
20112 const SDValue &SrcSV = Op.getOperand(i: 0);
20113 EVT VecVT = SrcSV.getValueType();
20114 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20115 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20116 ConstantSDNode *Pos = cast<ConstantSDNode>(Val: Op.getOperand(i: 1).getNode());
20117 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20118 "VGETLANE index out of bounds");
20119 unsigned Idx = Pos->getZExtValue();
20120 APInt DemandedElt = APInt::getOneBitSet(numBits: NumSrcElts, BitNo: Idx);
20121 Known = DAG.computeKnownBits(Op: SrcSV, DemandedElts: DemandedElt, Depth: Depth + 1);
20122
20123 EVT VT = Op.getValueType();
20124 const unsigned DstSz = VT.getScalarSizeInBits();
20125 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
20126 (void)SrcSz;
20127 assert(SrcSz == Known.getBitWidth());
20128 assert(DstSz > SrcSz);
    if (Op.getOpcode() == ARMISD::VGETLANEs)
      Known = Known.sext(DstSz);
    else
      Known = Known.zext(DstSz);
20134 assert(DstSz == Known.getBitWidth());
20135 break;
20136 }
20137 case ARMISD::VMOVrh: {
20138 KnownBits KnownOp = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
20139 assert(KnownOp.getBitWidth() == 16);
20140 Known = KnownOp.zext(BitWidth: 32);
20141 break;
20142 }
20143 case ARMISD::CSINC:
20144 case ARMISD::CSINV:
20145 case ARMISD::CSNEG: {
20146 KnownBits KnownOp0 = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
20147 KnownBits KnownOp1 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
20148
20149 // The result is either:
20150 // CSINC: KnownOp0 or KnownOp1 + 1
20151 // CSINV: KnownOp0 or ~KnownOp1
20152 // CSNEG: KnownOp0 or KnownOp1 * -1
20153 if (Op.getOpcode() == ARMISD::CSINC)
20154 KnownOp1 = KnownBits::computeForAddSub(
20155 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, LHS: KnownOp1,
20156 RHS: KnownBits::makeConstant(C: APInt(32, 1)));
20157 else if (Op.getOpcode() == ARMISD::CSINV)
20158 std::swap(a&: KnownOp1.Zero, b&: KnownOp1.One);
20159 else if (Op.getOpcode() == ARMISD::CSNEG)
20160 KnownOp1 = KnownBits::mul(
20161 LHS: KnownOp1, RHS: KnownBits::makeConstant(C: APInt(32, -1)));
20162
20163 Known = KnownOp0.intersectWith(RHS: KnownOp1);
20164 break;
20165 }
20166 }
20167}
20168
20169bool ARMTargetLowering::targetShrinkDemandedConstant(
20170 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20171 TargetLoweringOpt &TLO) const {
20172 // Delay optimization, so we don't have to deal with illegal types, or block
20173 // optimizations.
20174 if (!TLO.LegalOps)
20175 return false;
20176
20177 // Only optimize AND for now.
20178 if (Op.getOpcode() != ISD::AND)
20179 return false;
20180
20181 EVT VT = Op.getValueType();
20182
20183 // Ignore vectors.
20184 if (VT.isVector())
20185 return false;
20186
20187 assert(VT == MVT::i32 && "Unexpected integer type");
20188
20189 // Make sure the RHS really is a constant.
20190 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
20191 if (!C)
20192 return false;
20193
20194 unsigned Mask = C->getZExtValue();
20195
20196 unsigned Demanded = DemandedBits.getZExtValue();
20197 unsigned ShrunkMask = Mask & Demanded;
20198 unsigned ExpandedMask = Mask | ~Demanded;
20199
20200 // If the mask is all zeros, let the target-independent code replace the
20201 // result with zero.
20202 if (ShrunkMask == 0)
20203 return false;
20204
20205 // If the mask is all ones, erase the AND. (Currently, the target-independent
20206 // code won't do this, so we have to do it explicitly to avoid an infinite
20207 // loop in obscure cases.)
20208 if (ExpandedMask == ~0U)
20209 return TLO.CombineTo(O: Op, N: Op.getOperand(i: 0));
20210
20211 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
20212 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
20213 };
20214 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
20215 if (NewMask == Mask)
20216 return true;
20217 SDLoc DL(Op);
20218 SDValue NewC = TLO.DAG.getConstant(Val: NewMask, DL, VT);
20219 SDValue NewOp = TLO.DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Op.getOperand(i: 0), N2: NewC);
20220 return TLO.CombineTo(O: Op, N: NewOp);
20221 };
20222
20223 // Prefer uxtb mask.
20224 if (IsLegalMask(0xFF))
20225 return UseMask(0xFF);
20226
20227 // Prefer uxth mask.
20228 if (IsLegalMask(0xFFFF))
20229 return UseMask(0xFFFF);
20230
20231 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
20232 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20233 if (ShrunkMask < 256)
20234 return UseMask(ShrunkMask);
20235
20236 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
20237 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
20238 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
20239 return UseMask(ExpandedMask);
20240
20241 // Potential improvements:
20242 //
20243 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20244 // We could try to prefer Thumb1 immediates which can be lowered to a
20245 // two-instruction sequence.
20246 // We could try to recognize more legal ARM/Thumb2 immediates here.
20247
20248 return false;
20249}
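// For example, if only the low 8 bits of (and r0, #0x1ff) are demanded, the
// mask is shrunk to 0xff above, which allows the AND to be selected as, e.g.,
// a single UXTB instead of materializing a wider immediate.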
20250
20251bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20252 SDValue Op, const APInt &OriginalDemandedBits,
20253 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20254 unsigned Depth) const {
20255 unsigned Opc = Op.getOpcode();
20256
20257 switch (Opc) {
20258 case ARMISD::ASRL:
20259 case ARMISD::LSRL: {
20260 // If this is result 0 and the other result is unused, see if the demand
20261 // bits allow us to shrink this long shift into a standard small shift in
20262 // the opposite direction.
20263 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(Value: 1) &&
20264 isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
20265 unsigned ShAmt = Op->getConstantOperandVal(Num: 2);
20266 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20267 << (32 - ShAmt)))
20268 return TLO.CombineTo(
20269 Op, TLO.DAG.getNode(
20270 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20271 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20272 }
20273 break;
20274 }
20275 case ARMISD::VBICIMM: {
20276 SDValue Op0 = Op.getOperand(i: 0);
20277 unsigned ModImm = Op.getConstantOperandVal(i: 1);
20278 unsigned EltBits = 0;
20279 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20280 if ((OriginalDemandedBits & Mask) == 0)
20281 return TLO.CombineTo(O: Op, N: Op0);
20282 }
20283 }
20284
20285 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20286 Op, DemandedBits: OriginalDemandedBits, DemandedElts: OriginalDemandedElts, Known, TLO, Depth);
20287}
20288
20289//===----------------------------------------------------------------------===//
20290// ARM Inline Assembly Support
20291//===----------------------------------------------------------------------===//
20292
20293bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
20294 // Looking for "rev" which is V6+.
20295 if (!Subtarget->hasV6Ops())
20296 return false;
20297
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
  StringRef AsmStr = IA->getAsmString();
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t,");

    // rev $0, $1
    if (AsmPieces.size() == 3 &&
        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (Ty && Ty->getBitWidth() == 32)
        return IntrinsicLowering::LowerToByteSwap(CI);
20317 }
20318 break;
20319 }
20320
20321 return false;
20322}
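// For example, the hook above turns
//   %r = call i32 asm "rev $0, $1", "=l,l"(i32 %x)
// into a call to llvm.bswap.i32 (assuming ARMv6+), letting later passes
// optimize it like any other byte swap.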
20323
20324const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20325 // At this point, we have to lower this constraint to something else, so we
20326 // lower it to an "r" or "w". However, by doing this we will force the result
20327 // to be in register, while the X constraint is much more permissive.
20328 //
20329 // Although we are correct (we are free to emit anything, without
20330 // constraints), we might break use cases that would expect us to be more
20331 // efficient and emit something else.
20332 if (!Subtarget->hasVFP2Base())
20333 return "r";
20334 if (ConstraintVT.isFloatingPoint())
20335 return "w";
20336 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20337 (ConstraintVT.getSizeInBits() == 64 ||
20338 ConstraintVT.getSizeInBits() == 128))
20339 return "w";
20340
20341 return "r";
20342}
20343
20344/// getConstraintType - Given a constraint letter, return the type of
20345/// constraint it is for this target.
20346ARMTargetLowering::ConstraintType
20347ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20348 unsigned S = Constraint.size();
20349 if (S == 1) {
20350 switch (Constraint[0]) {
20351 default: break;
20352 case 'l': return C_RegisterClass;
20353 case 'w': return C_RegisterClass;
20354 case 'h': return C_RegisterClass;
20355 case 'x': return C_RegisterClass;
20356 case 't': return C_RegisterClass;
20357 case 'j': return C_Immediate; // Constant for movw.
20358 // An address with a single base register. Due to the way we
20359 // currently handle addresses it is the same as an 'r' memory constraint.
20360 case 'Q': return C_Memory;
20361 }
20362 } else if (S == 2) {
20363 switch (Constraint[0]) {
20364 default: break;
20365 case 'T': return C_RegisterClass;
20366 // All 'U+' constraints are addresses.
20367 case 'U': return C_Memory;
20368 }
20369 }
20370 return TargetLowering::getConstraintType(Constraint);
20371}
20372
20373/// Examine constraint type and operand type and determine a weight value.
20374/// This object must already have been set up with the operand type
20375/// and the current alternative constraint selected.
20376TargetLowering::ConstraintWeight
20377ARMTargetLowering::getSingleConstraintMatchWeight(
20378 AsmOperandInfo &info, const char *constraint) const {
20379 ConstraintWeight weight = CW_Invalid;
20380 Value *CallOperandVal = info.CallOperandVal;
20381 // If we don't have a value, we can't do a match,
20382 // but allow it at the lowest weight.
20383 if (!CallOperandVal)
20384 return CW_Default;
20385 Type *type = CallOperandVal->getType();
20386 // Look at the constraint type.
20387 switch (*constraint) {
20388 default:
20389 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20390 break;
20391 case 'l':
20392 if (type->isIntegerTy()) {
20393 if (Subtarget->isThumb())
20394 weight = CW_SpecificReg;
20395 else
20396 weight = CW_Register;
20397 }
20398 break;
20399 case 'w':
20400 if (type->isFloatingPointTy())
20401 weight = CW_Register;
20402 break;
20403 }
20404 return weight;
20405}
20406
20407using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20408
20409RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20410 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20411 switch (Constraint.size()) {
20412 case 1:
20413 // GCC ARM Constraint Letters
20414 switch (Constraint[0]) {
20415 case 'l': // Low regs or general regs.
20416 if (Subtarget->isThumb())
20417 return RCPair(0U, &ARM::tGPRRegClass);
20418 return RCPair(0U, &ARM::GPRRegClass);
20419 case 'h': // High regs or no regs.
20420 if (Subtarget->isThumb())
20421 return RCPair(0U, &ARM::hGPRRegClass);
20422 break;
20423 case 'r':
20424 if (Subtarget->isThumb1Only())
20425 return RCPair(0U, &ARM::tGPRRegClass);
20426 return RCPair(0U, &ARM::GPRRegClass);
20427 case 'w':
20428 if (VT == MVT::Other)
20429 break;
20430 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20431 return RCPair(0U, &ARM::SPRRegClass);
20432 if (VT.getSizeInBits() == 64)
20433 return RCPair(0U, &ARM::DPRRegClass);
20434 if (VT.getSizeInBits() == 128)
20435 return RCPair(0U, &ARM::QPRRegClass);
20436 break;
20437 case 'x':
20438 if (VT == MVT::Other)
20439 break;
20440 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20441 return RCPair(0U, &ARM::SPR_8RegClass);
20442 if (VT.getSizeInBits() == 64)
20443 return RCPair(0U, &ARM::DPR_8RegClass);
20444 if (VT.getSizeInBits() == 128)
20445 return RCPair(0U, &ARM::QPR_8RegClass);
20446 break;
20447 case 't':
20448 if (VT == MVT::Other)
20449 break;
20450 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20451 return RCPair(0U, &ARM::SPRRegClass);
20452 if (VT.getSizeInBits() == 64)
20453 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20454 if (VT.getSizeInBits() == 128)
20455 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20456 break;
20457 }
20458 break;
20459
20460 case 2:
20461 if (Constraint[0] == 'T') {
20462 switch (Constraint[1]) {
20463 default:
20464 break;
20465 case 'e':
20466 return RCPair(0U, &ARM::tGPREvenRegClass);
20467 case 'o':
20468 return RCPair(0U, &ARM::tGPROddRegClass);
20469 }
20470 }
20471 break;
20472
20473 default:
20474 break;
20475 }
20476
20477 if (StringRef("{cc}").equals_insensitive(Constraint))
20478 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20479
20480 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20481}
20482
20483/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20484/// vector. If it is invalid, don't add anything to Ops.
20485void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20486 StringRef Constraint,
20487 std::vector<SDValue> &Ops,
20488 SelectionDAG &DAG) const {
20489 SDValue Result;
20490
20491 // Currently only support length 1 constraints.
20492 if (Constraint.size() != 1)
20493 return;
20494
20495 char ConstraintLetter = Constraint[0];
20496 switch (ConstraintLetter) {
20497 default: break;
20498 case 'j':
20499 case 'I': case 'J': case 'K': case 'L':
20500 case 'M': case 'N': case 'O':
20501 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
20502 if (!C)
20503 return;
20504
20505 int64_t CVal64 = C->getSExtValue();
20506 int CVal = (int) CVal64;
20507 // None of these constraints allow values larger than 32 bits. Check
20508 // that the value fits in an int.
20509 if (CVal != CVal64)
20510 return;
20511
20512 switch (ConstraintLetter) {
20513 case 'j':
20514 // Constant suitable for movw, must be between 0 and
20515 // 65535.
20516 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20517 if (CVal >= 0 && CVal <= 65535)
20518 break;
20519 return;
20520 case 'I':
20521 if (Subtarget->isThumb1Only()) {
20522 // This must be a constant between 0 and 255, for ADD
20523 // immediates.
20524 if (CVal >= 0 && CVal <= 255)
20525 break;
20526 } else if (Subtarget->isThumb2()) {
20527 // A constant that can be used as an immediate value in a
20528 // data-processing instruction.
20529 if (ARM_AM::getT2SOImmVal(Arg: CVal) != -1)
20530 break;
20531 } else {
20532 // A constant that can be used as an immediate value in a
20533 // data-processing instruction.
20534 if (ARM_AM::getSOImmVal(Arg: CVal) != -1)
20535 break;
20536 }
20537 return;
20538
20539 case 'J':
20540 if (Subtarget->isThumb1Only()) {
20541 // This must be a constant between -255 and -1, for negated ADD
20542 // immediates. This can be used in GCC with an "n" modifier that
20543 // prints the negated value, for use with SUB instructions. It is
20544 // not useful otherwise but is implemented for compatibility.
20545 if (CVal >= -255 && CVal <= -1)
20546 break;
20547 } else {
20548 // This must be a constant between -4095 and 4095. It is not clear
20549 // what this constraint is intended for. Implemented for
20550 // compatibility with GCC.
20551 if (CVal >= -4095 && CVal <= 4095)
20552 break;
20553 }
20554 return;
20555
20556 case 'K':
20557 if (Subtarget->isThumb1Only()) {
20558 // A 32-bit value where only one byte has a nonzero value. Exclude
20559 // zero to match GCC. This constraint is used by GCC internally for
20560 // constants that can be loaded with a move/shift combination.
20561 // It is not useful otherwise but is implemented for compatibility.
20562 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(V: CVal))
20563 break;
20564 } else if (Subtarget->isThumb2()) {
20565 // A constant whose bitwise inverse can be used as an immediate
20566 // value in a data-processing instruction. This can be used in GCC
20567 // with a "B" modifier that prints the inverted value, for use with
20568 // BIC and MVN instructions. It is not useful otherwise but is
20569 // implemented for compatibility.
20570 if (ARM_AM::getT2SOImmVal(Arg: ~CVal) != -1)
20571 break;
20572 } else {
20573 // A constant whose bitwise inverse can be used as an immediate
20574 // value in a data-processing instruction. This can be used in GCC
20575 // with a "B" modifier that prints the inverted value, for use with
20576 // BIC and MVN instructions. It is not useful otherwise but is
20577 // implemented for compatibility.
20578 if (ARM_AM::getSOImmVal(Arg: ~CVal) != -1)
20579 break;
20580 }
20581 return;
20582
20583 case 'L':
20584 if (Subtarget->isThumb1Only()) {
20585 // This must be a constant between -7 and 7,
20586 // for 3-operand ADD/SUB immediate instructions.
20587 if (CVal >= -7 && CVal < 7)
20588 break;
20589 } else if (Subtarget->isThumb2()) {
20590 // A constant whose negation can be used as an immediate value in a
20591 // data-processing instruction. This can be used in GCC with an "n"
20592 // modifier that prints the negated value, for use with SUB
20593 // instructions. It is not useful otherwise but is implemented for
20594 // compatibility.
20595 if (ARM_AM::getT2SOImmVal(Arg: -CVal) != -1)
20596 break;
20597 } else {
20598 // A constant whose negation can be used as an immediate value in a
20599 // data-processing instruction. This can be used in GCC with an "n"
20600 // modifier that prints the negated value, for use with SUB
20601 // instructions. It is not useful otherwise but is implemented for
20602 // compatibility.
20603 if (ARM_AM::getSOImmVal(Arg: -CVal) != -1)
20604 break;
20605 }
20606 return;
20607
20608 case 'M':
20609 if (Subtarget->isThumb1Only()) {
20610 // This must be a multiple of 4 between 0 and 1020, for
20611 // ADD sp + immediate.
20612 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20613 break;
20614 } else {
20615 // A power of two or a constant between 0 and 32. This is used in
20616 // GCC for the shift amount on shifted register operands, but it is
20617 // useful in general for any shift amounts.
20618 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20619 break;
20620 }
20621 return;
20622
20623 case 'N':
20624 if (Subtarget->isThumb1Only()) {
20625 // This must be a constant between 0 and 31, for shift amounts.
20626 if (CVal >= 0 && CVal <= 31)
20627 break;
20628 }
20629 return;
20630
20631 case 'O':
20632 if (Subtarget->isThumb1Only()) {
20633 // This must be a multiple of 4 between -508 and 508, for
20634 // ADD/SUB sp = sp + immediate.
20635 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20636 break;
20637 }
20638 return;
20639 }
20640 Result = DAG.getTargetConstant(Val: CVal, DL: SDLoc(Op), VT: Op.getValueType());
20641 break;
20642 }
20643
20644 if (Result.getNode()) {
20645 Ops.push_back(x: Result);
20646 return;
20647 }
20648 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20649}
20650
20651static RTLIB::Libcall getDivRemLibcall(
20652 const SDNode *N, MVT::SimpleValueType SVT) {
20653 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20654 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20655 "Unhandled Opcode in getDivRemLibcall");
20656 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20657 N->getOpcode() == ISD::SREM;
20658 RTLIB::Libcall LC;
20659 switch (SVT) {
20660 default: llvm_unreachable("Unexpected request for libcall!");
20661 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20662 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20663 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20664 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20665 }
20666 return LC;
20667}
20668
20669static TargetLowering::ArgListTy getDivRemArgList(
20670 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20671 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20672 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20673 "Unhandled Opcode in getDivRemArgList");
20674 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20675 N->getOpcode() == ISD::SREM;
20676 TargetLowering::ArgListTy Args;
20677 TargetLowering::ArgListEntry Entry;
20678 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20679 EVT ArgVT = N->getOperand(Num: i).getValueType();
20680 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *Context);
20681 Entry.Node = N->getOperand(Num: i);
20682 Entry.Ty = ArgTy;
20683 Entry.IsSExt = isSigned;
20684 Entry.IsZExt = !isSigned;
20685 Args.push_back(x: Entry);
20686 }
20687 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20688 std::swap(a&: Args[0], b&: Args[1]);
20689 return Args;
20690}
20691
20692SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20693 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20694 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20695 Subtarget->isTargetWindows()) &&
20696 "Register-based DivRem lowering only");
20697 unsigned Opcode = Op->getOpcode();
20698 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20699 "Invalid opcode for Div/Rem lowering");
20700 bool isSigned = (Opcode == ISD::SDIVREM);
20701 EVT VT = Op->getValueType(ResNo: 0);
20702 SDLoc dl(Op);
20703
20704 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20705 SmallVector<SDValue> Result;
20706 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20707 SDValue Res0 =
20708 DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT, N1: Result[0], N2: Result[1]);
20709 SDValue Res1 =
20710 DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: dl, VT, N1: Result[2], N2: Result[3]);
20711 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: Op->getVTList(),
20712 Ops: {Res0, Res1});
20713 }
20714 }
20715
20716 Type *Ty = VT.getTypeForEVT(Context&: *DAG.getContext());
20717
20718 // If the target has hardware divide, use divide + multiply + subtract:
20719 // div = a / b
20720 // rem = a - b * div
20721 // return {div, rem}
20722 // This should be lowered into UDIV/SDIV + MLS later on.
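  //
  // For example (illustrative register assignment), a 32-bit signed divrem
  // then becomes roughly:
  //   sdiv r2, r0, r1
  //   mls  r3, r2, r1, r0    @ rem = a - b * div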
20723 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20724 : Subtarget->hasDivideInARMMode();
20725 if (hasDivide && Op->getValueType(0).isSimple() &&
20726 Op->getSimpleValueType(0) == MVT::i32) {
20727 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20728 const SDValue Dividend = Op->getOperand(Num: 0);
20729 const SDValue Divisor = Op->getOperand(Num: 1);
20730 SDValue Div = DAG.getNode(Opcode: DivOpcode, DL: dl, VT, N1: Dividend, N2: Divisor);
20731 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT, N1: Div, N2: Divisor);
20732 SDValue Rem = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: Dividend, N2: Mul);
20733
20734 SDValue Values[2] = {Div, Rem};
20735 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: VT), Ops: Values);
20736 }
20737
20738 RTLIB::Libcall LC = getDivRemLibcall(N: Op.getNode(),
20739 SVT: VT.getSimpleVT().SimpleTy);
20740 SDValue InChain = DAG.getEntryNode();
20741
20742 TargetLowering::ArgListTy Args = getDivRemArgList(N: Op.getNode(),
20743 Context: DAG.getContext(),
20744 Subtarget);
20745
20746 SDValue Callee = DAG.getExternalSymbol(Sym: getLibcallName(Call: LC),
20747 VT: getPointerTy(DL: DAG.getDataLayout()));
20748
20749 Type *RetTy = StructType::get(elt1: Ty, elts: Ty);
20750
20751 if (Subtarget->isTargetWindows())
20752 InChain = WinDBZCheckDenominator(DAG, N: Op.getNode(), InChain);
20753
20754 TargetLowering::CallLoweringInfo CLI(DAG);
20755 CLI.setDebugLoc(dl).setChain(InChain)
20756 .setCallee(CC: getLibcallCallingConv(Call: LC), ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
20757 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20758
20759 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20760 return CallInfo.first;
20761}
20762
20763// Lowers REM using divmod helpers
20764// see RTABI section 4.2/4.3
20765SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20766 EVT VT = N->getValueType(ResNo: 0);
20767
20768 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20769 SmallVector<SDValue> Result;
20770 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20771 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
20772 N1: Result[0], N2: Result[1]);
20773 }
20774
20775 // Build return types (div and rem)
20776 std::vector<Type*> RetTyParams;
20777 Type *RetTyElement;
20778
20779 switch (VT.getSimpleVT().SimpleTy) {
20780 default: llvm_unreachable("Unexpected request for libcall!");
20781 case MVT::i8: RetTyElement = Type::getInt8Ty(C&: *DAG.getContext()); break;
20782 case MVT::i16: RetTyElement = Type::getInt16Ty(C&: *DAG.getContext()); break;
20783 case MVT::i32: RetTyElement = Type::getInt32Ty(C&: *DAG.getContext()); break;
20784 case MVT::i64: RetTyElement = Type::getInt64Ty(C&: *DAG.getContext()); break;
20785 }
20786
20787 RetTyParams.push_back(x: RetTyElement);
20788 RetTyParams.push_back(x: RetTyElement);
20789 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20790 Type *RetTy = StructType::get(Context&: *DAG.getContext(), Elements: ret);
20791
20792 RTLIB::Libcall LC = getDivRemLibcall(N, SVT: N->getValueType(ResNo: 0).getSimpleVT().
20793 SimpleTy);
20794 SDValue InChain = DAG.getEntryNode();
20795 TargetLowering::ArgListTy Args = getDivRemArgList(N, Context: DAG.getContext(),
20796 Subtarget);
20797 bool isSigned = N->getOpcode() == ISD::SREM;
20798 SDValue Callee = DAG.getExternalSymbol(Sym: getLibcallName(Call: LC),
20799 VT: getPointerTy(DL: DAG.getDataLayout()));
20800
20801 if (Subtarget->isTargetWindows())
20802 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20803
20804 // Lower call
20805 CallLoweringInfo CLI(DAG);
20806 CLI.setChain(InChain)
20807 .setCallee(CC: CallingConv::ARM_AAPCS, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args))
20808 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
20809 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20810
20811 // Return second (rem) result operand (first contains div)
20812 SDNode *ResNode = CallResult.first.getNode();
20813 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20814 return ResNode->getOperand(Num: 1);
20815}
20816
20817SDValue
20818ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20819 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20820 SDLoc DL(Op);
20821
20822 // Get the inputs.
20823 SDValue Chain = Op.getOperand(i: 0);
20824 SDValue Size = Op.getOperand(i: 1);
20825
20826 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
20827 Kind: "no-stack-arg-probe")) {
20828 MaybeAlign Align =
20829 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
20830 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20831 Chain = SP.getValue(R: 1);
20832 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20833 if (Align)
20834 SP =
20835 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20836 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
20837 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20838 SDValue Ops[2] = { SP, Chain };
20839 return DAG.getMergeValues(Ops, dl: DL);
20840 }
20841
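  // The Windows __chkstk helper expects the number of 4-byte words to
  // allocate in r4, so scale the byte size down before calling it.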
20842 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20843 DAG.getConstant(2, DL, MVT::i32));
20844
20845 SDValue Glue;
20846 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20847 Glue = Chain.getValue(R: 1);
20848
20849 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20850 Chain = DAG.getNode(Opcode: ARMISD::WIN__CHKSTK, DL, VTList: NodeTys, N1: Chain, N2: Glue);
20851
20852 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20853 Chain = NewSP.getValue(R: 1);
20854
20855 SDValue Ops[2] = { NewSP, Chain };
20856 return DAG.getMergeValues(Ops, dl: DL);
20857}
20858
20859SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20860 bool IsStrict = Op->isStrictFPOpcode();
20861 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
20862 const unsigned DstSz = Op.getValueType().getSizeInBits();
20863 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20864 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20865 "Unexpected type for custom-lowering FP_EXTEND");
20866
20867 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20868 "With both FP DP and 16, any FP conversion is legal!");
20869
20870 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20871 "With FP16, 16 to 32 conversion is legal!");
20872
20873 // Converting from 32 -> 64 is valid if we have FP64.
20874 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20875 // FIXME: Remove this when we have strict fp instruction selection patterns
20876 if (IsStrict) {
20877 SDLoc Loc(Op);
20878 SDValue Result = DAG.getNode(Opcode: ISD::FP_EXTEND,
20879 DL: Loc, VT: Op.getValueType(), Operand: SrcVal);
20880 return DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl: Loc);
20881 }
20882 return Op;
20883 }
20884
20885 // Either we are converting from 16 -> 64 without FP16 and/or without
20886 // double-precision FP (or without Armv8 FP), in which case we must do it
20887 // in two steps. Or we are converting from 32 -> 64 without
20888 // double-precision FP, or from 16 -> 32 without FP16, in which case we
20889 // must make a libcall.
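  // For example, extending f16 -> f64 with neither FP16 nor FP64 support
  // takes two libcalls (f16 -> f32, then f32 -> f64), whereas with FP64
  // available only the first step needs a libcall.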
20890 SDLoc Loc(Op);
20891 RTLIB::Libcall LC;
20892 MakeLibCallOptions CallOptions;
20893 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
20894 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20895 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20896 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20897 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20898 if (Supported) {
20899 if (IsStrict) {
20900 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20901 {DstVT, MVT::Other}, {Chain, SrcVal});
20902 Chain = SrcVal.getValue(R: 1);
20903 } else {
20904 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: DstVT, Operand: SrcVal);
20905 }
20906 } else {
20907 LC = RTLIB::getFPEXT(OpVT: SrcVT, RetVT: DstVT);
20908 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20909 "Unexpected type for custom-lowering FP_EXTEND");
20910 std::tie(args&: SrcVal, args&: Chain) = makeLibCall(DAG, LC, RetVT: DstVT, Ops: SrcVal, CallOptions,
20911 dl: Loc, Chain);
20912 }
20913 }
20914
20915 return IsStrict ? DAG.getMergeValues(Ops: {SrcVal, Chain}, dl: Loc) : SrcVal;
20916}
20917
20918SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20919 bool IsStrict = Op->isStrictFPOpcode();
20920
20921 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
20922 EVT SrcVT = SrcVal.getValueType();
20923 EVT DstVT = Op.getValueType();
20924 const unsigned DstSz = Op.getValueType().getSizeInBits();
20925 const unsigned SrcSz = SrcVT.getSizeInBits();
20926 (void)DstSz;
20927 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20928 "Unexpected type for custom-lowering FP_ROUND");
20929
20930 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20931 "With both FP DP and 16, any FP conversion is legal!");
20932
20933 SDLoc Loc(Op);
20934
20935 // A 32 -> 16 round is a single legal instruction when the subtarget has FP16.
20936 if (SrcSz == 32 && Subtarget->hasFP16())
20937 return Op;
20938
20939 // Lib call from 32 -> 16 / 64 -> [32, 16]
20940 RTLIB::Libcall LC = RTLIB::getFPROUND(OpVT: SrcVT, RetVT: DstVT);
20941 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20942 "Unexpected type for custom-lowering FP_ROUND");
20943 MakeLibCallOptions CallOptions;
20944 SDValue Chain = IsStrict ? Op.getOperand(i: 0) : SDValue();
20945 SDValue Result;
20946 std::tie(args&: Result, args&: Chain) = makeLibCall(DAG, LC, RetVT: DstVT, Ops: SrcVal, CallOptions,
20947 dl: Loc, Chain);
20948 return IsStrict ? DAG.getMergeValues(Ops: {Result, Chain}, dl: Loc) : Result;
20949}
20950
20951bool
20952ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20953 // The ARM target isn't yet aware of offsets.
20954 return false;
20955}
20956
20957bool ARM::isBitFieldInvertedMask(unsigned v) {
20958 if (v == 0xffffffff)
20959 return false;
20960
20961 // There can be 1's on either or both "outsides"; all the "inside"
20962 // bits must be 0's.
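  // For example, 0xff0000ff is a valid inverted mask (~v == 0x00ffff00 is a
  // single shifted run of ones), whereas 0xff00ff00 is not.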
20963 return isShiftedMask_32(Value: ~v);
20964}
20965
20966/// isFPImmLegal - Returns true if the target can instruction select the
20967/// specified FP immediate natively. If false, the legalizer will
20968/// materialize the FP immediate as a load from a constant pool.
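/// For example, 1.0 and 0.5 fit the VFP modified-immediate encoding and can
/// be materialized with a single vmov, whereas 0.1 cannot.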
20969bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20970 bool ForCodeSize) const {
20971 if (!Subtarget->hasVFP3Base())
20972 return false;
20973 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20974 return ARM_AM::getFP16Imm(FPImm: Imm) != -1;
20975 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20976 ARM_AM::getFP32FP16Imm(Imm) != -1)
20977 return true;
20978 if (VT == MVT::f32)
20979 return ARM_AM::getFP32Imm(FPImm: Imm) != -1;
20980 if (VT == MVT::f64 && Subtarget->hasFP64())
20981 return ARM_AM::getFP64Imm(FPImm: Imm) != -1;
20982 return false;
20983}
20984
20985/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20986/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20987/// specified in the intrinsic calls.
20988bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20989 const CallInst &I,
20990 MachineFunction &MF,
20991 unsigned Intrinsic) const {
20992 switch (Intrinsic) {
20993 case Intrinsic::arm_neon_vld1:
20994 case Intrinsic::arm_neon_vld2:
20995 case Intrinsic::arm_neon_vld3:
20996 case Intrinsic::arm_neon_vld4:
20997 case Intrinsic::arm_neon_vld2lane:
20998 case Intrinsic::arm_neon_vld3lane:
20999 case Intrinsic::arm_neon_vld4lane:
21000 case Intrinsic::arm_neon_vld2dup:
21001 case Intrinsic::arm_neon_vld3dup:
21002 case Intrinsic::arm_neon_vld4dup: {
21003 Info.opc = ISD::INTRINSIC_W_CHAIN;
21004 // Conservatively set memVT to the entire set of vectors loaded.
21005 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21006 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
21007 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21008 Info.ptrVal = I.getArgOperand(i: 0);
21009 Info.offset = 0;
21010 Value *AlignArg = I.getArgOperand(i: I.arg_size() - 1);
21011 Info.align = cast<ConstantInt>(Val: AlignArg)->getMaybeAlignValue();
21012 // volatile loads with NEON intrinsics not supported
21013 Info.flags = MachineMemOperand::MOLoad;
21014 return true;
21015 }
21016 case Intrinsic::arm_neon_vld1x2:
21017 case Intrinsic::arm_neon_vld1x3:
21018 case Intrinsic::arm_neon_vld1x4: {
21019 Info.opc = ISD::INTRINSIC_W_CHAIN;
21020 // Conservatively set memVT to the entire set of vectors loaded.
21021 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21022 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
21023 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21024 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
21025 Info.offset = 0;
21026 Info.align.reset();
21027 // volatile loads with NEON intrinsics not supported
21028 Info.flags = MachineMemOperand::MOLoad;
21029 return true;
21030 }
21031 case Intrinsic::arm_neon_vst1:
21032 case Intrinsic::arm_neon_vst2:
21033 case Intrinsic::arm_neon_vst3:
21034 case Intrinsic::arm_neon_vst4:
21035 case Intrinsic::arm_neon_vst2lane:
21036 case Intrinsic::arm_neon_vst3lane:
21037 case Intrinsic::arm_neon_vst4lane: {
21038 Info.opc = ISD::INTRINSIC_VOID;
21039 // Conservatively set memVT to the entire set of vectors stored.
21040 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21041 unsigned NumElts = 0;
21042 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21043 Type *ArgTy = I.getArgOperand(i: ArgI)->getType();
21044 if (!ArgTy->isVectorTy())
21045 break;
21046 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
21047 }
21048 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21049 Info.ptrVal = I.getArgOperand(i: 0);
21050 Info.offset = 0;
21051 Value *AlignArg = I.getArgOperand(i: I.arg_size() - 1);
21052 Info.align = cast<ConstantInt>(Val: AlignArg)->getMaybeAlignValue();
21053 // volatile stores with NEON intrinsics not supported
21054 Info.flags = MachineMemOperand::MOStore;
21055 return true;
21056 }
21057 case Intrinsic::arm_neon_vst1x2:
21058 case Intrinsic::arm_neon_vst1x3:
21059 case Intrinsic::arm_neon_vst1x4: {
21060 Info.opc = ISD::INTRINSIC_VOID;
21061 // Conservatively set memVT to the entire set of vectors stored.
21062 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21063 unsigned NumElts = 0;
21064 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21065 Type *ArgTy = I.getArgOperand(i: ArgI)->getType();
21066 if (!ArgTy->isVectorTy())
21067 break;
21068 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
21069 }
21070 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21071 Info.ptrVal = I.getArgOperand(i: 0);
21072 Info.offset = 0;
21073 Info.align.reset();
21074 // volatile stores with NEON intrinsics not supported
21075 Info.flags = MachineMemOperand::MOStore;
21076 return true;
21077 }
21078 case Intrinsic::arm_mve_vld2q:
21079 case Intrinsic::arm_mve_vld4q: {
21080 Info.opc = ISD::INTRINSIC_W_CHAIN;
21081 // Conservatively set memVT to the entire set of vectors loaded.
21082 Type *VecTy = cast<StructType>(Val: I.getType())->getElementType(N: 1);
21083 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21084 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21085 Info.ptrVal = I.getArgOperand(i: 0);
21086 Info.offset = 0;
21087 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21088 // volatile loads with MVE intrinsics not supported
21089 Info.flags = MachineMemOperand::MOLoad;
21090 return true;
21091 }
21092 case Intrinsic::arm_mve_vst2q:
21093 case Intrinsic::arm_mve_vst4q: {
21094 Info.opc = ISD::INTRINSIC_VOID;
21095 // Conservatively set memVT to the entire set of vectors stored.
21096 Type *VecTy = I.getArgOperand(i: 1)->getType();
21097 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21098 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21099 Info.ptrVal = I.getArgOperand(i: 0);
21100 Info.offset = 0;
21101 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21102 // volatile stores with MVE intrinsics not supported
21103 Info.flags = MachineMemOperand::MOStore;
21104 return true;
21105 }
21106 case Intrinsic::arm_mve_vldr_gather_base:
21107 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21108 Info.opc = ISD::INTRINSIC_W_CHAIN;
21109 Info.ptrVal = nullptr;
21110 Info.memVT = MVT::getVT(Ty: I.getType());
21111 Info.align = Align(1);
21112 Info.flags |= MachineMemOperand::MOLoad;
21113 return true;
21114 }
21115 case Intrinsic::arm_mve_vldr_gather_base_wb:
21116 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21117 Info.opc = ISD::INTRINSIC_W_CHAIN;
21118 Info.ptrVal = nullptr;
21119 Info.memVT = MVT::getVT(Ty: I.getType()->getContainedType(i: 0));
21120 Info.align = Align(1);
21121 Info.flags |= MachineMemOperand::MOLoad;
21122 return true;
21123 }
21124 case Intrinsic::arm_mve_vldr_gather_offset:
21125 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21126 Info.opc = ISD::INTRINSIC_W_CHAIN;
21127 Info.ptrVal = nullptr;
21128 MVT DataVT = MVT::getVT(Ty: I.getType());
21129 unsigned MemSize = cast<ConstantInt>(Val: I.getArgOperand(i: 2))->getZExtValue();
21130 Info.memVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: MemSize),
21131 NumElements: DataVT.getVectorNumElements());
21132 Info.align = Align(1);
21133 Info.flags |= MachineMemOperand::MOLoad;
21134 return true;
21135 }
21136 case Intrinsic::arm_mve_vstr_scatter_base:
21137 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21138 Info.opc = ISD::INTRINSIC_VOID;
21139 Info.ptrVal = nullptr;
21140 Info.memVT = MVT::getVT(Ty: I.getArgOperand(i: 2)->getType());
21141 Info.align = Align(1);
21142 Info.flags |= MachineMemOperand::MOStore;
21143 return true;
21144 }
21145 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21146 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21147 Info.opc = ISD::INTRINSIC_W_CHAIN;
21148 Info.ptrVal = nullptr;
21149 Info.memVT = MVT::getVT(Ty: I.getArgOperand(i: 2)->getType());
21150 Info.align = Align(1);
21151 Info.flags |= MachineMemOperand::MOStore;
21152 return true;
21153 }
21154 case Intrinsic::arm_mve_vstr_scatter_offset:
21155 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21156 Info.opc = ISD::INTRINSIC_VOID;
21157 Info.ptrVal = nullptr;
21158 MVT DataVT = MVT::getVT(Ty: I.getArgOperand(i: 2)->getType());
21159 unsigned MemSize = cast<ConstantInt>(Val: I.getArgOperand(i: 3))->getZExtValue();
21160 Info.memVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: MemSize),
21161 NumElements: DataVT.getVectorNumElements());
21162 Info.align = Align(1);
21163 Info.flags |= MachineMemOperand::MOStore;
21164 return true;
21165 }
21166 case Intrinsic::arm_ldaex:
21167 case Intrinsic::arm_ldrex: {
21168 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21169 Type *ValTy = I.getParamElementType(ArgNo: 0);
21170 Info.opc = ISD::INTRINSIC_W_CHAIN;
21171 Info.memVT = MVT::getVT(Ty: ValTy);
21172 Info.ptrVal = I.getArgOperand(i: 0);
21173 Info.offset = 0;
21174 Info.align = DL.getABITypeAlign(Ty: ValTy);
21175 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21176 return true;
21177 }
21178 case Intrinsic::arm_stlex:
21179 case Intrinsic::arm_strex: {
21180 auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
21181 Type *ValTy = I.getParamElementType(ArgNo: 1);
21182 Info.opc = ISD::INTRINSIC_W_CHAIN;
21183 Info.memVT = MVT::getVT(Ty: ValTy);
21184 Info.ptrVal = I.getArgOperand(i: 1);
21185 Info.offset = 0;
21186 Info.align = DL.getABITypeAlign(Ty: ValTy);
21187 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21188 return true;
21189 }
21190 case Intrinsic::arm_stlexd:
21191 case Intrinsic::arm_strexd:
21192 Info.opc = ISD::INTRINSIC_W_CHAIN;
21193 Info.memVT = MVT::i64;
21194 Info.ptrVal = I.getArgOperand(i: 2);
21195 Info.offset = 0;
21196 Info.align = Align(8);
21197 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
21198 return true;
21199
21200 case Intrinsic::arm_ldaexd:
21201 case Intrinsic::arm_ldrexd:
21202 Info.opc = ISD::INTRINSIC_W_CHAIN;
21203 Info.memVT = MVT::i64;
21204 Info.ptrVal = I.getArgOperand(i: 0);
21205 Info.offset = 0;
21206 Info.align = Align(8);
21207 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
21208 return true;
21209
21210 default:
21211 break;
21212 }
21213
21214 return false;
21215}
21216
21217/// Returns true if it is beneficial to convert a load of a constant
21218/// to just the constant itself.
21219bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
21220 Type *Ty) const {
21221 assert(Ty->isIntegerTy());
21222
21223 unsigned Bits = Ty->getPrimitiveSizeInBits();
21224 if (Bits == 0 || Bits > 32)
21225 return false;
21226 return true;
21227}
21228
21229bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
21230 unsigned Index) const {
21231 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
21232 return false;
21233
21234 return (Index == 0 || Index == ResVT.getVectorNumElements());
21235}
21236
21237Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
21238 ARM_MB::MemBOpt Domain) const {
21239 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21240
21241 // First, if the target has no DMB, see what fallback we can use.
21242 if (!Subtarget->hasDataBarrier()) {
21243 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21244 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21245 // here.
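    // The call below encodes "mcr p15, 0, <Rt>, c7, c10, 5", the CP15 data
    // memory barrier operation available on ARMv6.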
21246 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21247 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
21248 Value* args[6] = {Builder.getInt32(C: 15), Builder.getInt32(C: 0),
21249 Builder.getInt32(C: 0), Builder.getInt32(C: 7),
21250 Builder.getInt32(C: 10), Builder.getInt32(C: 5)};
21251 return Builder.CreateCall(Callee: MCR, Args: args);
21252 } else {
21253 // Instead of using barriers, atomic accesses on these subtargets use
21254 // libcalls.
21255 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21256 }
21257 } else {
21258 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
21259 // Only a full system barrier exists in the M-class architectures.
21260 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21261 Constant *CDomain = Builder.getInt32(C: Domain);
21262 return Builder.CreateCall(Callee: DMB, Args: CDomain);
21263 }
21264}
21265
21266// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21267Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
21268 Instruction *Inst,
21269 AtomicOrdering Ord) const {
21270 switch (Ord) {
21271 case AtomicOrdering::NotAtomic:
21272 case AtomicOrdering::Unordered:
21273 llvm_unreachable("Invalid fence: unordered/non-atomic");
21274 case AtomicOrdering::Monotonic:
21275 case AtomicOrdering::Acquire:
21276 return nullptr; // Nothing to do
21277 case AtomicOrdering::SequentiallyConsistent:
21278 if (!Inst->hasAtomicStore())
21279 return nullptr; // Nothing to do
21280 [[fallthrough]];
21281 case AtomicOrdering::Release:
21282 case AtomicOrdering::AcquireRelease:
21283 if (Subtarget->preferISHSTBarriers())
21284 return makeDMB(Builder, Domain: ARM_MB::ISHST);
21285 // FIXME: add a comment with a link to documentation justifying this.
21286 else
21287 return makeDMB(Builder, Domain: ARM_MB::ISH);
21288 }
21289 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21290}
21291
21292Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
21293 Instruction *Inst,
21294 AtomicOrdering Ord) const {
21295 switch (Ord) {
21296 case AtomicOrdering::NotAtomic:
21297 case AtomicOrdering::Unordered:
21298 llvm_unreachable("Invalid fence: unordered/not-atomic");
21299 case AtomicOrdering::Monotonic:
21300 case AtomicOrdering::Release:
21301 return nullptr; // Nothing to do
21302 case AtomicOrdering::Acquire:
21303 case AtomicOrdering::AcquireRelease:
21304 case AtomicOrdering::SequentiallyConsistent:
21305 return makeDMB(Builder, Domain: ARM_MB::ISH);
21306 }
21307 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21308}
21309
21310 // Loads and stores less than 64 bits are already atomic; ones above that
21311 // are doomed anyway, so defer to the default libcall and blame the OS when
21312 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21313 // anything for those.
21314TargetLoweringBase::AtomicExpansionKind
21315ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21316 bool has64BitAtomicStore;
21317 if (Subtarget->isMClass())
21318 has64BitAtomicStore = false;
21319 else if (Subtarget->isThumb())
21320 has64BitAtomicStore = Subtarget->hasV7Ops();
21321 else
21322 has64BitAtomicStore = Subtarget->hasV6Ops();
21323
21324 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21325 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21326 : AtomicExpansionKind::None;
21327}
21328
21329 // Loads and stores less than 64 bits are already atomic; ones above that
21330 // are doomed anyway, so defer to the default libcall and blame the OS when
21331 // things go wrong. Cortex-M doesn't have ldrexd/strexd though, so don't emit
21332 // anything for those.
21333// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21334// guarantee, see DDI0406C ARM architecture reference manual,
21335// sections A8.8.72-74 LDRD)
21336TargetLowering::AtomicExpansionKind
21337ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21338 bool has64BitAtomicLoad;
21339 if (Subtarget->isMClass())
21340 has64BitAtomicLoad = false;
21341 else if (Subtarget->isThumb())
21342 has64BitAtomicLoad = Subtarget->hasV7Ops();
21343 else
21344 has64BitAtomicLoad = Subtarget->hasV6Ops();
21345
21346 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21347 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21348 : AtomicExpansionKind::None;
21349}
21350
21351// For the real atomic operations, we have ldrex/strex up to 32 bits,
21352// and up to 64 bits on the non-M profiles
21353TargetLowering::AtomicExpansionKind
21354ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21355 if (AI->isFloatingPointOperation())
21356 return AtomicExpansionKind::CmpXChg;
21357
21358 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21359 bool hasAtomicRMW;
21360 if (Subtarget->isMClass())
21361 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21362 else if (Subtarget->isThumb())
21363 hasAtomicRMW = Subtarget->hasV7Ops();
21364 else
21365 hasAtomicRMW = Subtarget->hasV6Ops();
21366 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21367 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21368 // implement atomicrmw without spilling. If the target address is also on
21369 // the stack and close enough to the spill slot, this can lead to a
21370 // situation where the monitor always gets cleared and the atomic operation
21371 // can never succeed. So at -O0 lower this operation to a CAS loop.
21372 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21373 return AtomicExpansionKind::CmpXChg;
21374 return AtomicExpansionKind::LLSC;
21375 }
21376 return AtomicExpansionKind::None;
21377}
21378
21379// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21380// bits, and up to 64 bits on the non-M profiles.
21381TargetLowering::AtomicExpansionKind
21382ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
21383 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21384 // implement cmpxchg without spilling. If the address being exchanged is also
21385 // on the stack and close enough to the spill slot, this can lead to a
21386 // situation where the monitor always gets cleared and the atomic operation
21387 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21388 unsigned Size = AI->getOperand(i_nocapture: 1)->getType()->getPrimitiveSizeInBits();
21389 bool HasAtomicCmpXchg;
21390 if (Subtarget->isMClass())
21391 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21392 else if (Subtarget->isThumb())
21393 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21394 else
21395 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21396 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21397 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21398 return AtomicExpansionKind::LLSC;
21399 return AtomicExpansionKind::None;
21400}
21401
21402bool ARMTargetLowering::shouldInsertFencesForAtomic(
21403 const Instruction *I) const {
21404 return InsertFencesForAtomic;
21405}
21406
21407bool ARMTargetLowering::useLoadStackGuardNode() const {
21408 // ROPI/RWPI are not supported currently.
21409 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21410}
21411
21412void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
21413 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21414 return TargetLowering::insertSSPDeclarations(M);
21415
21416 // MSVC CRT has a global variable holding security cookie.
21417 M.getOrInsertGlobal(Name: "__security_cookie",
21418 Ty: PointerType::getUnqual(C&: M.getContext()));
21419
21420 // MSVC CRT has a function to validate security cookie.
21421 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21422 Name: "__security_check_cookie", RetTy: Type::getVoidTy(C&: M.getContext()),
21423 Args: PointerType::getUnqual(C&: M.getContext()));
21424 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21425 F->addParamAttr(0, Attribute::AttrKind::InReg);
21426}
21427
21428Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
21429 // MSVC CRT has a global variable holding security cookie.
21430 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21431 return M.getGlobalVariable(Name: "__security_cookie");
21432 return TargetLowering::getSDagStackGuard(M);
21433}
21434
21435Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
21436 // MSVC CRT has a function to validate security cookie.
21437 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21438 return M.getFunction(Name: "__security_check_cookie");
21439 return TargetLowering::getSSPStackGuardCheck(M);
21440}
21441
21442bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
21443 unsigned &Cost) const {
21444 // If we do not have NEON, vector types are not natively supported.
21445 if (!Subtarget->hasNEON())
21446 return false;
21447
21448 // Floating point values and vector values map to the same register file.
21449 // Therefore, although we could do a store+extract on a vector type, it is
21450 // better to leave it as a float, since we have more freedom in the
21451 // addressing modes for those.
21452 if (VectorTy->isFPOrFPVectorTy())
21453 return false;
21454
21455 // If the index is unknown at compile time, this is very expensive to lower
21456 // and it is not possible to combine the store with the extract.
21457 if (!isa<ConstantInt>(Val: Idx))
21458 return false;
21459
21460 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21461 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21462 // We can do a store + vector extract on any vector that fits perfectly in a D
21463 // or Q register.
21464 if (BitWidth == 64 || BitWidth == 128) {
21465 Cost = 0;
21466 return true;
21467 }
21468 return false;
21469}
21470
21471bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
21472 return Subtarget->hasV6T2Ops();
21473}
21474
21475bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
21476 return Subtarget->hasV6T2Ops();
21477}
21478
21479bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
21480 const Instruction &AndI) const {
21481 if (!Subtarget->hasV7Ops())
21482 return false;
21483
21484 // Sink the `and` instruction only if the mask would fit into a modified
21485 // immediate operand.
21486 ConstantInt *Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
21487 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21488 return false;
21489 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21490 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(Arg: MaskVal)
21491 : ARM_AM::getSOImmVal(Arg: MaskVal)) != -1;
21492}
21493
21494TargetLowering::ShiftLegalizationStrategy
21495ARMTargetLowering::preferredShiftLegalizationStrategy(
21496 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21497 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21498 return ShiftLegalizationStrategy::LowerToLibcall;
21499 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
21500 ExpansionFactor);
21501}
21502
21503Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
21504 Value *Addr,
21505 AtomicOrdering Ord) const {
21506 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21507 bool IsAcquire = isAcquireOrStronger(AO: Ord);
21508
21509 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21510 // intrinsic must return {i32, i32} and we have to recombine them into a
21511 // single i64 here.
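  // Roughly (little-endian, illustrative IR shape only):
  //   %lohi = call { i32, i32 } @llvm.arm.ldrexd(ptr %addr)
  //   %val64 = or (zext %lo to i64), (shl (zext %hi to i64), 32)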
21512 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21513 Intrinsic::ID Int =
21514 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21515 Function *Ldrex = Intrinsic::getDeclaration(M, id: Int);
21516
21517 Value *LoHi = Builder.CreateCall(Callee: Ldrex, Args: Addr, Name: "lohi");
21518
21519 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
21520 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
21521 if (!Subtarget->isLittle())
21522 std::swap (a&: Lo, b&: Hi);
21523 Lo = Builder.CreateZExt(V: Lo, DestTy: ValueTy, Name: "lo64");
21524 Hi = Builder.CreateZExt(V: Hi, DestTy: ValueTy, Name: "hi64");
21525 return Builder.CreateOr(
21526 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValueTy, V: 32)), Name: "val64");
21527 }
21528
21529 Type *Tys[] = { Addr->getType() };
21530 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21531 Function *Ldrex = Intrinsic::getDeclaration(M, id: Int, Tys);
21532 CallInst *CI = Builder.CreateCall(Callee: Ldrex, Args: Addr);
21533
21534 CI->addParamAttr(
21535 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21536 return Builder.CreateTruncOrBitCast(V: CI, DestTy: ValueTy);
21537}
21538
21539void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21540 IRBuilderBase &Builder) const {
21541 if (!Subtarget->hasV7Ops())
21542 return;
21543 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21544 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
21545}
21546
21547Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21548 Value *Val, Value *Addr,
21549 AtomicOrdering Ord) const {
21550 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21551 bool IsRelease = isReleaseOrStronger(AO: Ord);
21552
21553 // Since the intrinsics must have legal type, the i64 intrinsics take two
21554 // parameters: "i32, i32". We must marshal Val into the appropriate form
21555 // before the call.
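  // Roughly (little-endian, illustrative IR shape only):
  //   %lo = trunc i64 %val to i32
  //   %hi = trunc i64 (lshr i64 %val, 32) to i32
  //   %ok = call i32 @llvm.arm.strexd(i32 %lo, i32 %hi, ptr %addr)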
21556 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21557 Intrinsic::ID Int =
21558 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21559 Function *Strex = Intrinsic::getDeclaration(M, id: Int);
21560 Type *Int32Ty = Type::getInt32Ty(C&: M->getContext());
21561
21562 Value *Lo = Builder.CreateTrunc(V: Val, DestTy: Int32Ty, Name: "lo");
21563 Value *Hi = Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Val, RHS: 32), DestTy: Int32Ty, Name: "hi");
21564 if (!Subtarget->isLittle())
21565 std::swap(a&: Lo, b&: Hi);
21566 return Builder.CreateCall(Callee: Strex, Args: {Lo, Hi, Addr});
21567 }
21568
21569 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21570 Type *Tys[] = { Addr->getType() };
21571 Function *Strex = Intrinsic::getDeclaration(M, id: Int, Tys);
21572
21573 CallInst *CI = Builder.CreateCall(
21574 Callee: Strex, Args: {Builder.CreateZExtOrBitCast(
21575 V: Val, DestTy: Strex->getFunctionType()->getParamType(i: 0)),
21576 Addr});
21577 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21578 Val->getType()));
21579 return CI;
21580}
21581
21582
21583bool ARMTargetLowering::alignLoopsWithOptSize() const {
21584 return Subtarget->isMClass();
21585}
21586
21587/// A helper function for determining the number of interleaved accesses we
21588/// will generate when lowering accesses of the given type.
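/// For example, a 256-bit <8 x i32> is lowered as (256 + 127) / 128 = 2
/// interleaved accesses, while a 64-bit <2 x i32> needs only one.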
21589unsigned
21590ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
21591 const DataLayout &DL) const {
21592 return (DL.getTypeSizeInBits(Ty: VecTy) + 127) / 128;
21593}
21594
21595bool ARMTargetLowering::isLegalInterleavedAccessType(
21596 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21597 const DataLayout &DL) const {
21598
21599 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
21600 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
21601
21602 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21603 return false;
21604
21605 // Ensure the vector doesn't have f16 elements. Even though we could do an
21606 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21607 // f32.
21608 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21609 return false;
21610 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21611 return false;
21612
21613 // Ensure the number of vector elements is greater than 1.
21614 if (VecTy->getNumElements() < 2)
21615 return false;
21616
21617 // Ensure the element type is legal.
21618 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21619 return false;
21620 // And that the alignment is high enough under MVE.
21621 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21622 return false;
21623
21624 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21625 // 128 will be split into multiple interleaved accesses.
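  // For example, a 256-bit group of sub-vectors passes this check and is later
  // split into two 128-bit accesses by getNumInterleavedAccesses.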
21626 if (Subtarget->hasNEON() && VecSize == 64)
21627 return true;
21628 return VecSize % 128 == 0;
21629}
21630
21631unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
21632 if (Subtarget->hasNEON())
21633 return 4;
21634 if (Subtarget->hasMVEIntegerOps())
21635 return MVEMaxSupportedInterleaveFactor;
21636 return TargetLoweringBase::getMaxSupportedInterleaveFactor();
21637}
21638
21639/// Lower an interleaved load into a vldN intrinsic.
21640///
21641/// E.g. Lower an interleaved load (Factor = 2):
21642/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21643/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21644/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21645///
21646/// Into:
21647/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21648/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21649/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21650bool ARMTargetLowering::lowerInterleavedLoad(
21651 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
21652 ArrayRef<unsigned> Indices, unsigned Factor) const {
21653 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21654 "Invalid interleave factor");
21655 assert(!Shuffles.empty() && "Empty shufflevector input");
21656 assert(Shuffles.size() == Indices.size() &&
21657 "Unmatched number of shufflevectors and indices");
21658
21659 auto *VecTy = cast<FixedVectorType>(Val: Shuffles[0]->getType());
21660 Type *EltTy = VecTy->getElementType();
21661
21662 const DataLayout &DL = LI->getModule()->getDataLayout();
21663 Align Alignment = LI->getAlign();
21664
21665 // Skip if we do not have NEON and skip illegal vector types. We can
21666 // "legalize" wide vector types into multiple interleaved accesses as long as
21667 // the vector types are divisible by 128.
21668 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21669 return false;
21670
21671 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21672
21673 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21674 // load integer vectors first and then convert to pointer vectors.
21675 if (EltTy->isPointerTy())
21676 VecTy = FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), FVTy: VecTy);
21677
21678 IRBuilder<> Builder(LI);
21679
21680 // The base address of the load.
21681 Value *BaseAddr = LI->getPointerOperand();
21682
21683 if (NumLoads > 1) {
21684 // If we're going to generate more than one load, reset the sub-vector type
21685 // to something legal.
21686 VecTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
21687 NumElts: VecTy->getNumElements() / NumLoads);
21688 }
21689
21690 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21691
21692 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21693 if (Subtarget->hasNEON()) {
21694 Type *PtrTy = Builder.getPtrTy(AddrSpace: LI->getPointerAddressSpace());
21695 Type *Tys[] = {VecTy, PtrTy};
21696 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21697 Intrinsic::arm_neon_vld3,
21698 Intrinsic::arm_neon_vld4};
21699 Function *VldnFunc =
21700 Intrinsic::getDeclaration(M: LI->getModule(), id: LoadInts[Factor - 2], Tys);
21701
21702 SmallVector<Value *, 2> Ops;
21703 Ops.push_back(Elt: BaseAddr);
21704 Ops.push_back(Elt: Builder.getInt32(C: LI->getAlign().value()));
21705
21706 return Builder.CreateCall(Callee: VldnFunc, Args: Ops, Name: "vldN");
21707 } else {
21708 assert((Factor == 2 || Factor == 4) &&
21709 "expected interleave factor of 2 or 4 for MVE");
21710 Intrinsic::ID LoadInts =
21711 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21712 Type *PtrTy = Builder.getPtrTy(AddrSpace: LI->getPointerAddressSpace());
21713 Type *Tys[] = {VecTy, PtrTy};
21714 Function *VldnFunc =
21715 Intrinsic::getDeclaration(M: LI->getModule(), id: LoadInts, Tys);
21716
21717 SmallVector<Value *, 2> Ops;
21718 Ops.push_back(Elt: BaseAddr);
21719 return Builder.CreateCall(Callee: VldnFunc, Args: Ops, Name: "vldN");
21720 }
21721 };
21722
21723 // Holds sub-vectors extracted from the load intrinsic return values. The
21724 // sub-vectors are associated with the shufflevector instructions they will
21725 // replace.
21726 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
21727
21728 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21729 // If we're generating more than one load, compute the base address of
21730 // subsequent loads as an offset from the previous.
21731 if (LoadCount > 0)
21732 BaseAddr = Builder.CreateConstGEP1_32(Ty: VecTy->getElementType(), Ptr: BaseAddr,
21733 Idx0: VecTy->getNumElements() * Factor);
21734
21735 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21736
21737 // Replace uses of each shufflevector with the corresponding vector loaded
21738 // by ldN.
21739 for (unsigned i = 0; i < Shuffles.size(); i++) {
21740 ShuffleVectorInst *SV = Shuffles[i];
21741 unsigned Index = Indices[i];
21742
21743 Value *SubVec = Builder.CreateExtractValue(Agg: VldN, Idxs: Index);
21744
21745 // Convert the integer vector to pointer vector if the element is pointer.
21746 if (EltTy->isPointerTy())
21747 SubVec = Builder.CreateIntToPtr(
21748 V: SubVec,
21749 DestTy: FixedVectorType::get(ElementType: SV->getType()->getElementType(), FVTy: VecTy));
21750
21751 SubVecs[SV].push_back(Elt: SubVec);
21752 }
21753 }
21754
21755 // Replace uses of the shufflevector instructions with the sub-vectors
21756 // returned by the load intrinsic. If a shufflevector instruction is
21757 // associated with more than one sub-vector, those sub-vectors will be
21758 // concatenated into a single wide vector.
21759 for (ShuffleVectorInst *SVI : Shuffles) {
21760 auto &SubVec = SubVecs[SVI];
21761 auto *WideVec =
21762 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
21763 SVI->replaceAllUsesWith(V: WideVec);
21764 }
21765
21766 return true;
21767}
21768
21769/// Lower an interleaved store into a vstN intrinsic.
21770///
21771/// E.g. Lower an interleaved store (Factor = 3):
21772/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21773/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21774/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21775///
21776/// Into:
21777/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21778/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21779/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21780/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21781///
21782/// Note that the new shufflevectors will be removed and we'll only generate one
21783/// vst3 instruction in CodeGen.
21784///
21785/// Example for a more general valid mask (Factor 3). Lower:
21786/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21787/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21788/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21789///
21790/// Into:
21791/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21792/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21793/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21794/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21795bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
21796 ShuffleVectorInst *SVI,
21797 unsigned Factor) const {
21798 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21799 "Invalid interleave factor");
21800
21801 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
21802 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21803
21804 unsigned LaneLen = VecTy->getNumElements() / Factor;
21805 Type *EltTy = VecTy->getElementType();
21806 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
21807
21808 const DataLayout &DL = SI->getModule()->getDataLayout();
21809 Align Alignment = SI->getAlign();
21810
21811 // Skip if we do not have NEON and skip illegal vector types. We can
21812 // "legalize" wide vector types into multiple interleaved accesses as long as
21813 // the vector types are divisible by 128.
21814 if (!isLegalInterleavedAccessType(Factor, VecTy: SubVecTy, Alignment, DL))
21815 return false;
21816
21817 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL);
21818
21819 Value *Op0 = SVI->getOperand(i_nocapture: 0);
21820 Value *Op1 = SVI->getOperand(i_nocapture: 1);
21821 IRBuilder<> Builder(SI);
21822
21823 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21824 // vectors to integer vectors.
21825 if (EltTy->isPointerTy()) {
21826 Type *IntTy = DL.getIntPtrType(EltTy);
21827
21828 // Convert to the corresponding integer vector.
21829 auto *IntVecTy =
21830 FixedVectorType::get(ElementType: IntTy, FVTy: cast<FixedVectorType>(Val: Op0->getType()));
21831 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
21832 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
21833
21834 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
21835 }
21836
21837 // The base address of the store.
21838 Value *BaseAddr = SI->getPointerOperand();
21839
21840 if (NumStores > 1) {
21841 // If we're going to generate more than one store, reset the lane length
21842 // and sub-vector type to something legal.
21843 LaneLen /= NumStores;
21844 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
21845 }
21846
21847 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21848
21849 auto Mask = SVI->getShuffleMask();
21850
21851 auto createStoreIntrinsic = [&](Value *BaseAddr,
21852 SmallVectorImpl<Value *> &Shuffles) {
21853 if (Subtarget->hasNEON()) {
21854 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21855 Intrinsic::arm_neon_vst3,
21856 Intrinsic::arm_neon_vst4};
21857 Type *PtrTy = Builder.getPtrTy(AddrSpace: SI->getPointerAddressSpace());
21858 Type *Tys[] = {PtrTy, SubVecTy};
21859
21860 Function *VstNFunc = Intrinsic::getDeclaration(
21861 M: SI->getModule(), id: StoreInts[Factor - 2], Tys);
21862
21863 SmallVector<Value *, 6> Ops;
21864 Ops.push_back(Elt: BaseAddr);
21865 append_range(C&: Ops, R&: Shuffles);
21866 Ops.push_back(Elt: Builder.getInt32(C: SI->getAlign().value()));
21867 Builder.CreateCall(Callee: VstNFunc, Args: Ops);
21868 } else {
21869 assert((Factor == 2 || Factor == 4) &&
21870 "expected interleave factor of 2 or 4 for MVE");
21871 Intrinsic::ID StoreInts =
21872 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21873 Type *PtrTy = Builder.getPtrTy(AddrSpace: SI->getPointerAddressSpace());
21874 Type *Tys[] = {PtrTy, SubVecTy};
21875 Function *VstNFunc =
21876 Intrinsic::getDeclaration(M: SI->getModule(), id: StoreInts, Tys);
21877
21878 SmallVector<Value *, 6> Ops;
21879 Ops.push_back(Elt: BaseAddr);
21880 append_range(C&: Ops, R&: Shuffles);
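      // The MVE vstNq intrinsics write one interleaving stage per call,
      // selected by the trailing immediate operand, so emit one call per stage.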
21881 for (unsigned F = 0; F < Factor; F++) {
21882 Ops.push_back(Elt: Builder.getInt32(C: F));
21883 Builder.CreateCall(Callee: VstNFunc, Args: Ops);
21884 Ops.pop_back();
21885 }
21886 }
21887 };
21888
21889 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21890 // If we're generating more than one store, compute the base address of
21891 // subsequent stores as an offset from the previous.
21892 if (StoreCount > 0)
21893 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
21894 Ptr: BaseAddr, Idx0: LaneLen * Factor);
21895
21896 SmallVector<Value *, 4> Shuffles;
21897
21898 // Split the shufflevector operands into sub vectors for the new vstN call.
21899 for (unsigned i = 0; i < Factor; i++) {
21900 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21901 if (Mask[IdxI] >= 0) {
21902 Shuffles.push_back(Elt: Builder.CreateShuffleVector(
21903 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0)));
21904 } else {
21905 unsigned StartMask = 0;
21906 for (unsigned j = 1; j < LaneLen; j++) {
21907 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21908 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21909 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21910 break;
21911 }
21912 }
21913 // Note: If all elements in a chunk are undefs, StartMask=0!
21914 // Note: Filling undef gaps with random elements is ok, since
21915 // those elements were being written anyway (with undefs).
21916 // In the case of all undefs we default to using elements from 0.
21917 // Note: StartMask cannot be negative; it's checked in
21918 // isReInterleaveMask.
21919 Shuffles.push_back(Elt: Builder.CreateShuffleVector(
21920 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0)));
21921 }
21922 }
21923
21924 createStoreIntrinsic(BaseAddr, Shuffles);
21925 }
21926 return true;
21927}
21928
21929enum HABaseType {
21930 HA_UNKNOWN = 0,
21931 HA_FLOAT,
21932 HA_DOUBLE,
21933 HA_VECT64,
21934 HA_VECT128
21935};
21936
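// Returns true if Ty is an AAPCS-VFP homogeneous aggregate: an aggregate of
// one to four members that all share the same base type (float, double, or a
// 64/128-bit vector). For example, struct { float x, y, z; } is an HA with
// Base == HA_FLOAT and Members == 3.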
21937static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21938 uint64_t &Members) {
21939 if (auto *ST = dyn_cast<StructType>(Val: Ty)) {
21940 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21941 uint64_t SubMembers = 0;
21942 if (!isHomogeneousAggregate(Ty: ST->getElementType(N: i), Base, Members&: SubMembers))
21943 return false;
21944 Members += SubMembers;
21945 }
21946 } else if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
21947 uint64_t SubMembers = 0;
21948 if (!isHomogeneousAggregate(Ty: AT->getElementType(), Base, Members&: SubMembers))
21949 return false;
21950 Members += SubMembers * AT->getNumElements();
21951 } else if (Ty->isFloatTy()) {
21952 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21953 return false;
21954 Members = 1;
21955 Base = HA_FLOAT;
21956 } else if (Ty->isDoubleTy()) {
21957 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21958 return false;
21959 Members = 1;
21960 Base = HA_DOUBLE;
21961 } else if (auto *VT = dyn_cast<VectorType>(Val: Ty)) {
21962 Members = 1;
21963 switch (Base) {
21964 case HA_FLOAT:
21965 case HA_DOUBLE:
21966 return false;
21967 case HA_VECT64:
21968 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21969 case HA_VECT128:
21970 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21971 case HA_UNKNOWN:
21972 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21973 case 64:
21974 Base = HA_VECT64;
21975 return true;
21976 case 128:
21977 Base = HA_VECT128;
21978 return true;
21979 default:
21980 return false;
21981 }
21982 }
21983 }
21984
21985 return (Members > 0 && Members <= 4);
21986}
21987
21988/// Return the correct alignment for the current calling convention.
21989Align ARMTargetLowering::getABIAlignmentForCallingConv(
21990 Type *ArgTy, const DataLayout &DL) const {
21991 const Align ABITypeAlign = DL.getABITypeAlign(Ty: ArgTy);
21992 if (!ArgTy->isVectorTy())
21993 return ABITypeAlign;
21994
21995 // Avoid over-aligning vector parameters. It would require realigning the
21996 // stack and waste space for no real benefit.
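  // For example, under AAPCS a <4 x i32> argument has 16-byte ABI alignment,
  // but the stack is only guaranteed 8-byte alignment, so it is passed with
  // 8-byte alignment.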
21997 return std::min(a: ABITypeAlign, b: DL.getStackAlignment());
21998}
21999
22000/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
22001/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
22002/// passing according to AAPCS rules.
22003bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
22004 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22005 const DataLayout &DL) const {
22006 if (getEffectiveCallingConv(CC: CallConv, isVarArg) !=
22007 CallingConv::ARM_AAPCS_VFP)
22008 return false;
22009
22010 HABaseType Base = HA_UNKNOWN;
22011 uint64_t Members = 0;
22012 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
22013 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
22014
22015 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22016 return IsHA || IsIntArray;
22017}
22018
22019Register ARMTargetLowering::getExceptionPointerRegister(
22020 const Constant *PersonalityFn) const {
22021 // Platforms which do not use SjLj EH may return values in these registers
22022 // via the personality function.
22023 return Subtarget->useSjLjEH() ? Register() : ARM::R0;
22024}
22025
22026Register ARMTargetLowering::getExceptionSelectorRegister(
22027 const Constant *PersonalityFn) const {
22028 // Platforms which do not use SjLj EH may return values in these registers
22029 // via the personality function.
22030 return Subtarget->useSjLjEH() ? Register() : ARM::R1;
22031}
22032
22033void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22034 // Update IsSplitCSR in ARMFunctionInfo.
22035 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
22036 AFI->setIsSplitCSR(true);
22037}
22038
22039void ARMTargetLowering::insertCopiesSplitCSR(
22040 MachineBasicBlock *Entry,
22041 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22042 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22043 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
22044 if (!IStart)
22045 return;
22046
22047 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22048 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
22049 MachineBasicBlock::iterator MBBI = Entry->begin();
22050 for (const MCPhysReg *I = IStart; *I; ++I) {
22051 const TargetRegisterClass *RC = nullptr;
22052 if (ARM::GPRRegClass.contains(*I))
22053 RC = &ARM::GPRRegClass;
22054 else if (ARM::DPRRegClass.contains(*I))
22055 RC = &ARM::DPRRegClass;
22056 else
22057 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22058
22059 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
22060 // Create copy from CSR to a virtual register.
22061 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22062 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22063 // nounwind. If we want to generalize this later, we may need to emit
22064 // CFI pseudo-instructions.
22065 assert(Entry->getParent()->getFunction().hasFnAttribute(
22066 Attribute::NoUnwind) &&
22067 "Function should be nounwind in insertCopiesSplitCSR!");
22068 Entry->addLiveIn(PhysReg: *I);
22069 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
22070 .addReg(RegNo: *I);
22071
22072 // Insert the copy-back instructions right before the terminator.
22073 for (auto *Exit : Exits)
22074 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
22075 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
22076 .addReg(RegNo: NewVR);
22077 }
22078}
22079
22080void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
22081 MF.getFrameInfo().computeMaxCallFrameSize(MF);
22082 TargetLoweringBase::finalizeLowering(MF);
22083}
22084
22085bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
22086 return Subtarget->hasMVEIntegerOps();
22087}
22088
22089bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
22090 ComplexDeinterleavingOperation Operation, Type *Ty) const {
22091 auto *VTy = dyn_cast<FixedVectorType>(Val: Ty);
22092 if (!VTy)
22093 return false;
22094
22095 auto *ScalarTy = VTy->getScalarType();
22096 unsigned NumElements = VTy->getNumElements();
22097
22098 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22099 if (VTyWidth < 128 || !llvm::isPowerOf2_32(Value: VTyWidth))
22100 return false;
22101
22102 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22103 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22104 return Subtarget->hasMVEFloatOps();
22105
22106 if (Operation != ComplexDeinterleavingOperation::CAdd)
22107 return false;
22108
22109 return Subtarget->hasMVEIntegerOps() &&
22110 (ScalarTy->isIntegerTy(Bitwidth: 8) || ScalarTy->isIntegerTy(Bitwidth: 16) ||
22111 ScalarTy->isIntegerTy(Bitwidth: 32));
22112}
22113
22114Value *ARMTargetLowering::createComplexDeinterleavingIR(
22115 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
22116 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22117 Value *Accumulator) const {
22118
22119 FixedVectorType *Ty = cast<FixedVectorType>(Val: InputA->getType());
22120
22121 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22122
22123 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22124
22125 if (TyWidth > 128) {
22126 int Stride = Ty->getNumElements() / 2;
22127 auto SplitSeq = llvm::seq<int>(Begin: 0, End: Ty->getNumElements());
22128 auto SplitSeqVec = llvm::to_vector(Range&: SplitSeq);
22129 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22130 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22131
22132 auto *LowerSplitA = B.CreateShuffleVector(V: InputA, Mask: LowerSplitMask);
22133 auto *LowerSplitB = B.CreateShuffleVector(V: InputB, Mask: LowerSplitMask);
22134 auto *UpperSplitA = B.CreateShuffleVector(V: InputA, Mask: UpperSplitMask);
22135 auto *UpperSplitB = B.CreateShuffleVector(V: InputB, Mask: UpperSplitMask);
22136 Value *LowerSplitAcc = nullptr;
22137 Value *UpperSplitAcc = nullptr;
22138
22139 if (Accumulator) {
22140 LowerSplitAcc = B.CreateShuffleVector(V: Accumulator, Mask: LowerSplitMask);
22141 UpperSplitAcc = B.CreateShuffleVector(V: Accumulator, Mask: UpperSplitMask);
22142 }
22143
22144 auto *LowerSplitInt = createComplexDeinterleavingIR(
22145 B, OperationType, Rotation, InputA: LowerSplitA, InputB: LowerSplitB, Accumulator: LowerSplitAcc);
22146 auto *UpperSplitInt = createComplexDeinterleavingIR(
22147 B, OperationType, Rotation, InputA: UpperSplitA, InputB: UpperSplitB, Accumulator: UpperSplitAcc);
22148
22149 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22150 return B.CreateShuffleVector(V1: LowerSplitInt, V2: UpperSplitInt, Mask: JoinMask);
22151 }
22152
22153 auto *IntTy = Type::getInt32Ty(C&: B.getContext());
22154
22155 ConstantInt *ConstRotation = nullptr;
22156 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22157 ConstRotation = ConstantInt::get(Ty: IntTy, V: (int)Rotation);
22158
22159 if (Accumulator)
22160 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22161 {ConstRotation, Accumulator, InputB, InputA});
22162 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22163 {ConstRotation, InputB, InputA});
22164 }
22165
22166 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22167 // 1 means the value is not halved.
22168 auto *ConstHalving = ConstantInt::get(Ty: IntTy, V: 1);
22169
22170 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
22171 ConstRotation = ConstantInt::get(Ty: IntTy, V: 0);
22172 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
22173 ConstRotation = ConstantInt::get(Ty: IntTy, V: 1);
22174
22175 if (!ConstRotation)
22176 return nullptr; // Invalid rotation for arm_mve_vcaddq
22177
22178 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22179 {ConstHalving, ConstRotation, InputA, InputB});
22180 }
22181
22182 return nullptr;
22183}
22184
