1//===---------- X86.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/IR/InlineAsm.h"
16#include "llvm/IR/IntrinsicsX86.h"
17#include "llvm/TargetParser/X86TargetParser.h"
18
19using namespace clang;
20using namespace CodeGen;
21using namespace llvm;
22
23static std::optional<CodeGenFunction::MSVCIntrin>
24translateX86ToMsvcIntrin(unsigned BuiltinID) {
25 using MSVCIntrin = CodeGenFunction::MSVCIntrin;
26 switch (BuiltinID) {
27 default:
28 return std::nullopt;
29 case clang::X86::BI_BitScanForward:
30 case clang::X86::BI_BitScanForward64:
31 return MSVCIntrin::_BitScanForward;
32 case clang::X86::BI_BitScanReverse:
33 case clang::X86::BI_BitScanReverse64:
34 return MSVCIntrin::_BitScanReverse;
35 case clang::X86::BI_InterlockedAnd64:
36 return MSVCIntrin::_InterlockedAnd;
37 case clang::X86::BI_InterlockedCompareExchange128:
38 return MSVCIntrin::_InterlockedCompareExchange128;
39 case clang::X86::BI_InterlockedExchange64:
40 return MSVCIntrin::_InterlockedExchange;
41 case clang::X86::BI_InterlockedExchangeAdd64:
42 return MSVCIntrin::_InterlockedExchangeAdd;
43 case clang::X86::BI_InterlockedExchangeSub64:
44 return MSVCIntrin::_InterlockedExchangeSub;
45 case clang::X86::BI_InterlockedOr64:
46 return MSVCIntrin::_InterlockedOr;
47 case clang::X86::BI_InterlockedXor64:
48 return MSVCIntrin::_InterlockedXor;
49 case clang::X86::BI_InterlockedDecrement64:
50 return MSVCIntrin::_InterlockedDecrement;
51 case clang::X86::BI_InterlockedIncrement64:
52 return MSVCIntrin::_InterlockedIncrement;
53 }
54 llvm_unreachable("must return from switch");
55}
56
57// Convert the mask from an integer type to a vector of i1.
58static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
59 unsigned NumElts) {
60
61 auto *MaskTy = llvm::FixedVectorType::get(
62 ElementType: CGF.Builder.getInt1Ty(),
63 NumElts: cast<IntegerType>(Val: Mask->getType())->getBitWidth());
64 Value *MaskVec = CGF.Builder.CreateBitCast(V: Mask, DestTy: MaskTy);
65
66 // If we have less than 8 elements, then the starting mask was an i8 and
67 // we need to extract down to the right number of elements.
68 if (NumElts < 8) {
69 int Indices[4];
70 for (unsigned i = 0; i != NumElts; ++i)
71 Indices[i] = i;
72 MaskVec = CGF.Builder.CreateShuffleVector(
73 V1: MaskVec, V2: MaskVec, Mask: ArrayRef(Indices, NumElts), Name: "extract");
74 }
75 return MaskVec;
76}
77
78static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
79 Align Alignment) {
80 Value *Ptr = Ops[0];
81
82 Value *MaskVec = getMaskVecValue(
83 CGF, Mask: Ops[2],
84 NumElts: cast<llvm::FixedVectorType>(Val: Ops[1]->getType())->getNumElements());
85
86 return CGF.Builder.CreateMaskedStore(Val: Ops[1], Ptr, Alignment, Mask: MaskVec);
87}
88
89static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
90 Align Alignment) {
91 llvm::Type *Ty = Ops[1]->getType();
92 Value *Ptr = Ops[0];
93
94 Value *MaskVec = getMaskVecValue(
95 CGF, Mask: Ops[2], NumElts: cast<llvm::FixedVectorType>(Val: Ty)->getNumElements());
96
97 return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask: MaskVec, PassThru: Ops[1]);
98}
99
100static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
101 ArrayRef<Value *> Ops) {
102 auto *ResultTy = cast<llvm::VectorType>(Val: Ops[1]->getType());
103 Value *Ptr = Ops[0];
104
105 Value *MaskVec = getMaskVecValue(
106 CGF, Mask: Ops[2], NumElts: cast<FixedVectorType>(Val: ResultTy)->getNumElements());
107
108 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
109 ResultTy);
110 return CGF.Builder.CreateCall(Callee: F, Args: { Ptr, MaskVec, Ops[1] });
111}
112
113static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
114 ArrayRef<Value *> Ops,
115 bool IsCompress) {
116 auto *ResultTy = cast<llvm::FixedVectorType>(Val: Ops[1]->getType());
117
118 Value *MaskVec = getMaskVecValue(CGF, Mask: Ops[2], NumElts: ResultTy->getNumElements());
119
120 Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
121 : Intrinsic::x86_avx512_mask_expand;
122 llvm::Function *F = CGF.CGM.getIntrinsic(IID, Tys: ResultTy);
123 return CGF.Builder.CreateCall(Callee: F, Args: { Ops[0], Ops[1], MaskVec });
124}
125
126static Value *EmitX86CompressStore(CodeGenFunction &CGF,
127 ArrayRef<Value *> Ops) {
128 auto *ResultTy = cast<llvm::FixedVectorType>(Val: Ops[1]->getType());
129 Value *Ptr = Ops[0];
130
131 Value *MaskVec = getMaskVecValue(CGF, Mask: Ops[2], NumElts: ResultTy->getNumElements());
132
133 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
134 ResultTy);
135 return CGF.Builder.CreateCall(Callee: F, Args: { Ops[1], Ptr, MaskVec });
136}
137
138static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
139 ArrayRef<Value *> Ops,
140 bool InvertLHS = false) {
141 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
142 Value *LHS = getMaskVecValue(CGF, Mask: Ops[0], NumElts);
143 Value *RHS = getMaskVecValue(CGF, Mask: Ops[1], NumElts);
144
145 if (InvertLHS)
146 LHS = CGF.Builder.CreateNot(V: LHS);
147
148 return CGF.Builder.CreateBitCast(V: CGF.Builder.CreateBinOp(Opc, LHS, RHS),
149 DestTy: Ops[0]->getType());
150}
151
152static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
153 Value *Amt, bool IsRight) {
154 llvm::Type *Ty = Op0->getType();
155
156 // Amount may be scalar immediate, in which case create a splat vector.
157 // Funnel shifts amounts are treated as modulo and types are all power-of-2 so
158 // we only care about the lowest log2 bits anyway.
159 if (Amt->getType() != Ty) {
160 unsigned NumElts = cast<llvm::FixedVectorType>(Val: Ty)->getNumElements();
161 Amt = CGF.Builder.CreateIntCast(V: Amt, DestTy: Ty->getScalarType(), isSigned: false);
162 Amt = CGF.Builder.CreateVectorSplat(NumElts, V: Amt);
163 }
164
165 unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
166 Function *F = CGF.CGM.getIntrinsic(IID, Tys: Ty);
167 return CGF.Builder.CreateCall(Callee: F, Args: {Op0, Op1, Amt});
168}
169
170static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
171 bool IsSigned) {
172 Value *Op0 = Ops[0];
173 Value *Op1 = Ops[1];
174 llvm::Type *Ty = Op0->getType();
175 uint64_t Imm = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0x7;
176
177 CmpInst::Predicate Pred;
178 switch (Imm) {
179 case 0x0:
180 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
181 break;
182 case 0x1:
183 Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
184 break;
185 case 0x2:
186 Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
187 break;
188 case 0x3:
189 Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
190 break;
191 case 0x4:
192 Pred = ICmpInst::ICMP_EQ;
193 break;
194 case 0x5:
195 Pred = ICmpInst::ICMP_NE;
196 break;
197 case 0x6:
198 return llvm::Constant::getNullValue(Ty); // FALSE
199 case 0x7:
200 return llvm::Constant::getAllOnesValue(Ty); // TRUE
201 default:
202 llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
203 }
204
205 Value *Cmp = CGF.Builder.CreateICmp(P: Pred, LHS: Op0, RHS: Op1);
206 Value *Res = CGF.Builder.CreateSExt(V: Cmp, DestTy: Ty);
207 return Res;
208}
209
210static Value *EmitX86Select(CodeGenFunction &CGF,
211 Value *Mask, Value *Op0, Value *Op1) {
212
213 // If the mask is all ones just return first argument.
214 if (const auto *C = dyn_cast<Constant>(Val: Mask))
215 if (C->isAllOnesValue())
216 return Op0;
217
218 Mask = getMaskVecValue(
219 CGF, Mask, NumElts: cast<llvm::FixedVectorType>(Val: Op0->getType())->getNumElements());
220
221 return CGF.Builder.CreateSelect(C: Mask, True: Op0, False: Op1);
222}
223
224static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
225 Value *Mask, Value *Op0, Value *Op1) {
226 // If the mask is all ones just return first argument.
227 if (const auto *C = dyn_cast<Constant>(Val: Mask))
228 if (C->isAllOnesValue())
229 return Op0;
230
231 auto *MaskTy = llvm::FixedVectorType::get(
232 ElementType: CGF.Builder.getInt1Ty(), NumElts: Mask->getType()->getIntegerBitWidth());
233 Mask = CGF.Builder.CreateBitCast(V: Mask, DestTy: MaskTy);
234 Mask = CGF.Builder.CreateExtractElement(Vec: Mask, Idx: (uint64_t)0);
235 return CGF.Builder.CreateSelect(C: Mask, True: Op0, False: Op1);
236}
237
238static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
239 unsigned NumElts, Value *MaskIn) {
240 if (MaskIn) {
241 const auto *C = dyn_cast<Constant>(Val: MaskIn);
242 if (!C || !C->isAllOnesValue())
243 Cmp = CGF.Builder.CreateAnd(LHS: Cmp, RHS: getMaskVecValue(CGF, Mask: MaskIn, NumElts));
244 }
245
246 if (NumElts < 8) {
247 int Indices[8];
248 for (unsigned i = 0; i != NumElts; ++i)
249 Indices[i] = i;
250 for (unsigned i = NumElts; i != 8; ++i)
251 Indices[i] = i % NumElts + NumElts;
252 Cmp = CGF.Builder.CreateShuffleVector(
253 V1: Cmp, V2: llvm::Constant::getNullValue(Ty: Cmp->getType()), Mask: Indices);
254 }
255
256 return CGF.Builder.CreateBitCast(V: Cmp,
257 DestTy: IntegerType::get(C&: CGF.getLLVMContext(),
258 NumBits: std::max(a: NumElts, b: 8U)));
259}
260
261static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
262 bool Signed, ArrayRef<Value *> Ops) {
263 assert((Ops.size() == 2 || Ops.size() == 4) &&
264 "Unexpected number of arguments");
265 unsigned NumElts =
266 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
267 Value *Cmp;
268
269 if (CC == 3) {
270 Cmp = Constant::getNullValue(
271 Ty: llvm::FixedVectorType::get(ElementType: CGF.Builder.getInt1Ty(), NumElts));
272 } else if (CC == 7) {
273 Cmp = Constant::getAllOnesValue(
274 Ty: llvm::FixedVectorType::get(ElementType: CGF.Builder.getInt1Ty(), NumElts));
275 } else {
276 ICmpInst::Predicate Pred;
277 switch (CC) {
278 default: llvm_unreachable("Unknown condition code");
279 case 0: Pred = ICmpInst::ICMP_EQ; break;
280 case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
281 case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
282 case 4: Pred = ICmpInst::ICMP_NE; break;
283 case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
284 case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
285 }
286 Cmp = CGF.Builder.CreateICmp(P: Pred, LHS: Ops[0], RHS: Ops[1]);
287 }
288
289 Value *MaskIn = nullptr;
290 if (Ops.size() == 4)
291 MaskIn = Ops[3];
292
293 return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
294}
295
296static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
297 Value *Zero = Constant::getNullValue(Ty: In->getType());
298 return EmitX86MaskedCompare(CGF, CC: 1, Signed: true, Ops: { In, Zero });
299}
300
301static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
302 ArrayRef<Value *> Ops, bool IsSigned) {
303 unsigned Rnd = cast<llvm::ConstantInt>(Val: Ops[3])->getZExtValue();
304 llvm::Type *Ty = Ops[1]->getType();
305
306 Value *Res;
307 if (Rnd != 4) {
308 Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
309 : Intrinsic::x86_avx512_uitofp_round;
310 Function *F = CGF.CGM.getIntrinsic(IID, Tys: { Ty, Ops[0]->getType() });
311 Res = CGF.Builder.CreateCall(Callee: F, Args: { Ops[0], Ops[3] });
312 } else {
313 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
314 Res = IsSigned ? CGF.Builder.CreateSIToFP(V: Ops[0], DestTy: Ty)
315 : CGF.Builder.CreateUIToFP(V: Ops[0], DestTy: Ty);
316 }
317
318 return EmitX86Select(CGF, Mask: Ops[2], Op0: Res, Op1: Ops[1]);
319}
320
321// Lowers X86 FMA intrinsics to IR.
322static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
323 ArrayRef<Value *> Ops, unsigned BuiltinID,
324 bool IsAddSub) {
325
326 bool Subtract = false;
327 Intrinsic::ID IID = Intrinsic::not_intrinsic;
328 switch (BuiltinID) {
329 default: break;
330 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
331 Subtract = true;
332 [[fallthrough]];
333 case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
334 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
335 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
336 IID = Intrinsic::x86_avx512fp16_vfmadd_ph_512;
337 break;
338 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
339 Subtract = true;
340 [[fallthrough]];
341 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
342 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
343 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
344 IID = Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
345 break;
346 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
347 Subtract = true;
348 [[fallthrough]];
349 case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
350 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
351 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
352 IID = Intrinsic::x86_avx512_vfmadd_ps_512; break;
353 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
354 Subtract = true;
355 [[fallthrough]];
356 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
357 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
358 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
359 IID = Intrinsic::x86_avx512_vfmadd_pd_512; break;
360 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
361 Subtract = true;
362 [[fallthrough]];
363 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
364 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
365 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
366 IID = Intrinsic::x86_avx512_vfmaddsub_ps_512;
367 break;
368 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
369 Subtract = true;
370 [[fallthrough]];
371 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
372 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
373 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
374 IID = Intrinsic::x86_avx512_vfmaddsub_pd_512;
375 break;
376 }
377
378 Value *A = Ops[0];
379 Value *B = Ops[1];
380 Value *C = Ops[2];
381
382 if (Subtract)
383 C = CGF.Builder.CreateFNeg(V: C);
384
385 Value *Res;
386
387 // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
388 if (IID != Intrinsic::not_intrinsic &&
389 (cast<llvm::ConstantInt>(Val: Ops.back())->getZExtValue() != (uint64_t)4 ||
390 IsAddSub)) {
391 Function *Intr = CGF.CGM.getIntrinsic(IID);
392 Res = CGF.Builder.CreateCall(Callee: Intr, Args: {A, B, C, Ops.back() });
393 } else {
394 llvm::Type *Ty = A->getType();
395 Function *FMA;
396 if (CGF.Builder.getIsFPConstrained()) {
397 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
398 FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
399 Res = CGF.Builder.CreateConstrainedFPCall(Callee: FMA, Args: {A, B, C});
400 } else {
401 FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
402 Res = CGF.Builder.CreateCall(Callee: FMA, Args: {A, B, C});
403 }
404 }
405
406 // Handle any required masking.
407 Value *MaskFalseVal = nullptr;
408 switch (BuiltinID) {
409 case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
410 case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
411 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
412 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
413 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
414 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
415 MaskFalseVal = Ops[0];
416 break;
417 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
418 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
419 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
420 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
421 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
422 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
423 MaskFalseVal = Constant::getNullValue(Ty: Ops[0]->getType());
424 break;
425 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
426 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
427 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
428 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
429 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
430 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
431 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
432 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
433 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
434 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
435 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
436 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
437 MaskFalseVal = Ops[2];
438 break;
439 }
440
441 if (MaskFalseVal)
442 return EmitX86Select(CGF, Mask: Ops[3], Op0: Res, Op1: MaskFalseVal);
443
444 return Res;
445}
446
447static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
448 MutableArrayRef<Value *> Ops, Value *Upper,
449 bool ZeroMask = false, unsigned PTIdx = 0,
450 bool NegAcc = false) {
451 unsigned Rnd = 4;
452 if (Ops.size() > 4)
453 Rnd = cast<llvm::ConstantInt>(Val: Ops[4])->getZExtValue();
454
455 if (NegAcc)
456 Ops[2] = CGF.Builder.CreateFNeg(V: Ops[2]);
457
458 Ops[0] = CGF.Builder.CreateExtractElement(Vec: Ops[0], Idx: (uint64_t)0);
459 Ops[1] = CGF.Builder.CreateExtractElement(Vec: Ops[1], Idx: (uint64_t)0);
460 Ops[2] = CGF.Builder.CreateExtractElement(Vec: Ops[2], Idx: (uint64_t)0);
461 Value *Res;
462 if (Rnd != 4) {
463 Intrinsic::ID IID;
464
465 switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
466 case 16:
467 IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
468 break;
469 case 32:
470 IID = Intrinsic::x86_avx512_vfmadd_f32;
471 break;
472 case 64:
473 IID = Intrinsic::x86_avx512_vfmadd_f64;
474 break;
475 default:
476 llvm_unreachable("Unexpected size");
477 }
478 Res = CGF.Builder.CreateCall(Callee: CGF.CGM.getIntrinsic(IID),
479 Args: {Ops[0], Ops[1], Ops[2], Ops[4]});
480 } else if (CGF.Builder.getIsFPConstrained()) {
481 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
482 Function *FMA = CGF.CGM.getIntrinsic(
483 Intrinsic::experimental_constrained_fma, Ops[0]->getType());
484 Res = CGF.Builder.CreateConstrainedFPCall(Callee: FMA, Args: Ops.slice(N: 0, M: 3));
485 } else {
486 Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
487 Res = CGF.Builder.CreateCall(Callee: FMA, Args: Ops.slice(N: 0, M: 3));
488 }
489 // If we have more than 3 arguments, we need to do masking.
490 if (Ops.size() > 3) {
491 Value *PassThru = ZeroMask ? Constant::getNullValue(Ty: Res->getType())
492 : Ops[PTIdx];
493
494 // If we negated the accumulator and the its the PassThru value we need to
495 // bypass the negate. Conveniently Upper should be the same thing in this
496 // case.
497 if (NegAcc && PTIdx == 2)
498 PassThru = CGF.Builder.CreateExtractElement(Vec: Upper, Idx: (uint64_t)0);
499
500 Res = EmitX86ScalarSelect(CGF, Mask: Ops[3], Op0: Res, Op1: PassThru);
501 }
502 return CGF.Builder.CreateInsertElement(Vec: Upper, NewElt: Res, Idx: (uint64_t)0);
503}
504
505static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
506 ArrayRef<Value *> Ops) {
507 llvm::Type *Ty = Ops[0]->getType();
508 // Arguments have a vXi32 type so cast to vXi64.
509 Ty = llvm::FixedVectorType::get(ElementType: CGF.Int64Ty,
510 NumElts: Ty->getPrimitiveSizeInBits() / 64);
511 Value *LHS = CGF.Builder.CreateBitCast(V: Ops[0], DestTy: Ty);
512 Value *RHS = CGF.Builder.CreateBitCast(V: Ops[1], DestTy: Ty);
513
514 if (IsSigned) {
515 // Shift left then arithmetic shift right.
516 Constant *ShiftAmt = ConstantInt::get(Ty, V: 32);
517 LHS = CGF.Builder.CreateShl(LHS, RHS: ShiftAmt);
518 LHS = CGF.Builder.CreateAShr(LHS, RHS: ShiftAmt);
519 RHS = CGF.Builder.CreateShl(LHS: RHS, RHS: ShiftAmt);
520 RHS = CGF.Builder.CreateAShr(LHS: RHS, RHS: ShiftAmt);
521 } else {
522 // Clear the upper bits.
523 Constant *Mask = ConstantInt::get(Ty, V: 0xffffffff);
524 LHS = CGF.Builder.CreateAnd(LHS, RHS: Mask);
525 RHS = CGF.Builder.CreateAnd(LHS: RHS, RHS: Mask);
526 }
527
528 return CGF.Builder.CreateMul(LHS, RHS);
529}
530
531// Emit a masked pternlog intrinsic. This only exists because the header has to
532// use a macro and we aren't able to pass the input argument to a pternlog
533// builtin and a select builtin without evaluating it twice.
534static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
535 ArrayRef<Value *> Ops) {
536 llvm::Type *Ty = Ops[0]->getType();
537
538 unsigned VecWidth = Ty->getPrimitiveSizeInBits();
539 unsigned EltWidth = Ty->getScalarSizeInBits();
540 Intrinsic::ID IID;
541 if (VecWidth == 128 && EltWidth == 32)
542 IID = Intrinsic::x86_avx512_pternlog_d_128;
543 else if (VecWidth == 256 && EltWidth == 32)
544 IID = Intrinsic::x86_avx512_pternlog_d_256;
545 else if (VecWidth == 512 && EltWidth == 32)
546 IID = Intrinsic::x86_avx512_pternlog_d_512;
547 else if (VecWidth == 128 && EltWidth == 64)
548 IID = Intrinsic::x86_avx512_pternlog_q_128;
549 else if (VecWidth == 256 && EltWidth == 64)
550 IID = Intrinsic::x86_avx512_pternlog_q_256;
551 else if (VecWidth == 512 && EltWidth == 64)
552 IID = Intrinsic::x86_avx512_pternlog_q_512;
553 else
554 llvm_unreachable("Unexpected intrinsic");
555
556 Value *Ternlog = CGF.Builder.CreateCall(Callee: CGF.CGM.getIntrinsic(IID),
557 Args: Ops.drop_back());
558 Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
559 return EmitX86Select(CGF, Mask: Ops[4], Op0: Ternlog, Op1: PassThru);
560}
561
562static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
563 llvm::Type *DstTy) {
564 unsigned NumberOfElements =
565 cast<llvm::FixedVectorType>(Val: DstTy)->getNumElements();
566 Value *Mask = getMaskVecValue(CGF, Mask: Op, NumElts: NumberOfElements);
567 return CGF.Builder.CreateSExt(V: Mask, DestTy: DstTy, Name: "vpmovm2");
568}
569
570Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
571 const Expr *CPUExpr = E->getArg(Arg: 0)->IgnoreParenCasts();
572 StringRef CPUStr = cast<clang::StringLiteral>(Val: CPUExpr)->getString();
573 return EmitX86CpuIs(CPUStr);
574}
575
576// Convert F16 halfs to floats.
577static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
578 ArrayRef<Value *> Ops,
579 llvm::Type *DstTy) {
580 assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
581 "Unknown cvtph2ps intrinsic");
582
583 // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
584 if (Ops.size() == 4 && cast<llvm::ConstantInt>(Val: Ops[3])->getZExtValue() != 4) {
585 Function *F =
586 CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
587 return CGF.Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1], Ops[2], Ops[3]});
588 }
589
590 unsigned NumDstElts = cast<llvm::FixedVectorType>(Val: DstTy)->getNumElements();
591 Value *Src = Ops[0];
592
593 // Extract the subvector.
594 if (NumDstElts !=
595 cast<llvm::FixedVectorType>(Val: Src->getType())->getNumElements()) {
596 assert(NumDstElts == 4 && "Unexpected vector size");
597 Src = CGF.Builder.CreateShuffleVector(V: Src, Mask: {0, 1, 2, 3});
598 }
599
600 // Bitcast from vXi16 to vXf16.
601 auto *HalfTy = llvm::FixedVectorType::get(
602 ElementType: llvm::Type::getHalfTy(C&: CGF.getLLVMContext()), NumElts: NumDstElts);
603 Src = CGF.Builder.CreateBitCast(V: Src, DestTy: HalfTy);
604
605 // Perform the fp-extension.
606 Value *Res = CGF.Builder.CreateFPExt(V: Src, DestTy: DstTy, Name: "cvtph2ps");
607
608 if (Ops.size() >= 3)
609 Res = EmitX86Select(CGF, Mask: Ops[2], Op0: Res, Op1: Ops[1]);
610 return Res;
611}
612
613Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
614
615 llvm::Type *Int32Ty = Builder.getInt32Ty();
616
617 // Matching the struct layout from the compiler-rt/libgcc structure that is
618 // filled in:
619 // unsigned int __cpu_vendor;
620 // unsigned int __cpu_type;
621 // unsigned int __cpu_subtype;
622 // unsigned int __cpu_features[1];
623 llvm::Type *STy = llvm::StructType::get(elt1: Int32Ty, elts: Int32Ty, elts: Int32Ty,
624 elts: llvm::ArrayType::get(ElementType: Int32Ty, NumElements: 1));
625
626 // Grab the global __cpu_model.
627 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(Ty: STy, Name: "__cpu_model");
628 cast<llvm::GlobalValue>(Val: CpuModel)->setDSOLocal(true);
629
630 // Calculate the index needed to access the correct field based on the
631 // range. Also adjust the expected value.
632 auto [Index, Value] = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
633#define X86_VENDOR(ENUM, STRING) \
634 .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
635#define X86_CPU_TYPE_ALIAS(ENUM, ALIAS) \
636 .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
637#define X86_CPU_TYPE(ENUM, STR) \
638 .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
639#define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS) \
640 .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
641#define X86_CPU_SUBTYPE(ENUM, STR) \
642 .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
643#include "llvm/TargetParser/X86TargetParser.def"
644 .Default(Value: {0, 0});
645 assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
646
647 // Grab the appropriate field from __cpu_model.
648 llvm::Value *Idxs[] = {ConstantInt::get(Ty: Int32Ty, V: 0),
649 ConstantInt::get(Ty: Int32Ty, V: Index)};
650 llvm::Value *CpuValue = Builder.CreateInBoundsGEP(Ty: STy, Ptr: CpuModel, IdxList: Idxs);
651 CpuValue = Builder.CreateAlignedLoad(Ty: Int32Ty, Addr: CpuValue,
652 Align: CharUnits::fromQuantity(Quantity: 4));
653
654 // Check the value of the field against the requested value.
655 return Builder.CreateICmpEQ(LHS: CpuValue,
656 RHS: llvm::ConstantInt::get(Ty: Int32Ty, V: Value));
657}
658
659Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
660 const Expr *FeatureExpr = E->getArg(Arg: 0)->IgnoreParenCasts();
661 StringRef FeatureStr = cast<StringLiteral>(Val: FeatureExpr)->getString();
662 if (!getContext().getTargetInfo().validateCpuSupports(Name: FeatureStr))
663 return Builder.getFalse();
664 return EmitX86CpuSupports(FeatureStrs: FeatureStr);
665}
666
667Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
668 return EmitX86CpuSupports(FeatureMask: llvm::X86::getCpuSupportsMask(FeatureStrs));
669}
670
671llvm::Value *
672CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
673 Value *Result = Builder.getTrue();
674 if (FeatureMask[0] != 0) {
675 // Matching the struct layout from the compiler-rt/libgcc structure that is
676 // filled in:
677 // unsigned int __cpu_vendor;
678 // unsigned int __cpu_type;
679 // unsigned int __cpu_subtype;
680 // unsigned int __cpu_features[1];
681 llvm::Type *STy = llvm::StructType::get(elt1: Int32Ty, elts: Int32Ty, elts: Int32Ty,
682 elts: llvm::ArrayType::get(ElementType: Int32Ty, NumElements: 1));
683
684 // Grab the global __cpu_model.
685 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(Ty: STy, Name: "__cpu_model");
686 cast<llvm::GlobalValue>(Val: CpuModel)->setDSOLocal(true);
687
688 // Grab the first (0th) element from the field __cpu_features off of the
689 // global in the struct STy.
690 Value *Idxs[] = {Builder.getInt32(C: 0), Builder.getInt32(C: 3),
691 Builder.getInt32(C: 0)};
692 Value *CpuFeatures = Builder.CreateInBoundsGEP(Ty: STy, Ptr: CpuModel, IdxList: Idxs);
693 Value *Features = Builder.CreateAlignedLoad(Ty: Int32Ty, Addr: CpuFeatures,
694 Align: CharUnits::fromQuantity(Quantity: 4));
695
696 // Check the value of the bit corresponding to the feature requested.
697 Value *Mask = Builder.getInt32(C: FeatureMask[0]);
698 Value *Bitset = Builder.CreateAnd(LHS: Features, RHS: Mask);
699 Value *Cmp = Builder.CreateICmpEQ(LHS: Bitset, RHS: Mask);
700 Result = Builder.CreateAnd(LHS: Result, RHS: Cmp);
701 }
702
703 llvm::Type *ATy = llvm::ArrayType::get(ElementType: Int32Ty, NumElements: 3);
704 llvm::Constant *CpuFeatures2 =
705 CGM.CreateRuntimeVariable(Ty: ATy, Name: "__cpu_features2");
706 cast<llvm::GlobalValue>(Val: CpuFeatures2)->setDSOLocal(true);
707 for (int i = 1; i != 4; ++i) {
708 const uint32_t M = FeatureMask[i];
709 if (!M)
710 continue;
711 Value *Idxs[] = {Builder.getInt32(C: 0), Builder.getInt32(C: i - 1)};
712 Value *Features = Builder.CreateAlignedLoad(
713 Ty: Int32Ty, Addr: Builder.CreateInBoundsGEP(Ty: ATy, Ptr: CpuFeatures2, IdxList: Idxs),
714 Align: CharUnits::fromQuantity(Quantity: 4));
715 // Check the value of the bit corresponding to the feature requested.
716 Value *Mask = Builder.getInt32(C: M);
717 Value *Bitset = Builder.CreateAnd(LHS: Features, RHS: Mask);
718 Value *Cmp = Builder.CreateICmpEQ(LHS: Bitset, RHS: Mask);
719 Result = Builder.CreateAnd(LHS: Result, RHS: Cmp);
720 }
721
722 return Result;
723}
724
725Value *CodeGenFunction::EmitX86CpuInit() {
726 llvm::FunctionType *FTy = llvm::FunctionType::get(Result: VoidTy,
727 /*Variadic*/ isVarArg: false);
728 llvm::FunctionCallee Func =
729 CGM.CreateRuntimeFunction(Ty: FTy, Name: "__cpu_indicator_init");
730 cast<llvm::GlobalValue>(Val: Func.getCallee())->setDSOLocal(true);
731 cast<llvm::GlobalValue>(Val: Func.getCallee())
732 ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
733 return Builder.CreateCall(Callee: Func);
734}
735
736
737Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
738 const CallExpr *E) {
739 if (BuiltinID == Builtin::BI__builtin_cpu_is)
740 return EmitX86CpuIs(E);
741 if (BuiltinID == Builtin::BI__builtin_cpu_supports)
742 return EmitX86CpuSupports(E);
743 if (BuiltinID == Builtin::BI__builtin_cpu_init)
744 return EmitX86CpuInit();
745
746 // Handle MSVC intrinsics before argument evaluation to prevent double
747 // evaluation.
748 if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
749 return EmitMSVCBuiltinExpr(BuiltinID: *MsvcIntId, E);
750
751 SmallVector<Value*, 4> Ops;
752 bool IsMaskFCmp = false;
753 bool IsConjFMA = false;
754
755 // Find out if any arguments are required to be integer constant expressions.
756 unsigned ICEArguments = 0;
757 ASTContext::GetBuiltinTypeError Error;
758 getContext().GetBuiltinType(ID: BuiltinID, Error, IntegerConstantArgs: &ICEArguments);
759 assert(Error == ASTContext::GE_None && "Should not codegen an error");
760
761 for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
762 Ops.push_back(Elt: EmitScalarOrConstFoldImmArg(ICEArguments, Idx: i, E));
763 }
764
765 // These exist so that the builtin that takes an immediate can be bounds
766 // checked by clang to avoid passing bad immediates to the backend. Since
767 // AVX has a larger immediate than SSE we would need separate builtins to
768 // do the different bounds checking. Rather than create a clang specific
769 // SSE only builtin, this implements eight separate builtins to match gcc
770 // implementation.
771 auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
772 Ops.push_back(Elt: llvm::ConstantInt::get(Ty: Int8Ty, V: Imm));
773 llvm::Function *F = CGM.getIntrinsic(IID: ID);
774 return Builder.CreateCall(Callee: F, Args: Ops);
775 };
776
777 // For the vector forms of FP comparisons, translate the builtins directly to
778 // IR.
779 // TODO: The builtins could be removed if the SSE header files used vector
780 // extension comparisons directly (vector ordered/unordered may need
781 // additional support via __builtin_isnan()).
782 auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
783 bool IsSignaling) {
784 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
785 Value *Cmp;
786 if (IsSignaling)
787 Cmp = Builder.CreateFCmpS(P: Pred, LHS: Ops[0], RHS: Ops[1]);
788 else
789 Cmp = Builder.CreateFCmp(P: Pred, LHS: Ops[0], RHS: Ops[1]);
790 llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Val: Ops[0]->getType());
791 llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(VTy: FPVecTy);
792 Value *Sext = Builder.CreateSExt(V: Cmp, DestTy: IntVecTy);
793 return Builder.CreateBitCast(V: Sext, DestTy: FPVecTy);
794 };
795
796 switch (BuiltinID) {
797 default: return nullptr;
798 case X86::BI_mm_prefetch: {
799 Value *Address = Ops[0];
800 ConstantInt *C = cast<ConstantInt>(Val: Ops[1]);
801 Value *RW = ConstantInt::get(Ty: Int32Ty, V: (C->getZExtValue() >> 2) & 0x1);
802 Value *Locality = ConstantInt::get(Ty: Int32Ty, V: C->getZExtValue() & 0x3);
803 Value *Data = ConstantInt::get(Ty: Int32Ty, V: 1);
804 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
805 return Builder.CreateCall(Callee: F, Args: {Address, RW, Locality, Data});
806 }
807 case X86::BI_mm_clflush: {
808 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
809 Ops[0]);
810 }
811 case X86::BI_mm_lfence: {
812 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
813 }
814 case X86::BI_mm_mfence: {
815 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
816 }
817 case X86::BI_mm_sfence: {
818 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
819 }
820 case X86::BI_mm_pause: {
821 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
822 }
823 case X86::BI__rdtsc: {
824 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
825 }
826 case X86::BI__builtin_ia32_rdtscp: {
827 Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
828 Builder.CreateDefaultAlignedStore(Val: Builder.CreateExtractValue(Agg: Call, Idxs: 1),
829 Addr: Ops[0]);
830 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
831 }
832 case X86::BI__builtin_ia32_lzcnt_u16:
833 case X86::BI__builtin_ia32_lzcnt_u32:
834 case X86::BI__builtin_ia32_lzcnt_u64: {
835 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
836 return Builder.CreateCall(Callee: F, Args: {Ops[0], Builder.getInt1(V: false)});
837 }
838 case X86::BI__builtin_ia32_tzcnt_u16:
839 case X86::BI__builtin_ia32_tzcnt_u32:
840 case X86::BI__builtin_ia32_tzcnt_u64: {
841 Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
842 return Builder.CreateCall(Callee: F, Args: {Ops[0], Builder.getInt1(V: false)});
843 }
844 case X86::BI__builtin_ia32_undef128:
845 case X86::BI__builtin_ia32_undef256:
846 case X86::BI__builtin_ia32_undef512:
847 // The x86 definition of "undef" is not the same as the LLVM definition
848 // (PR32176). We leave optimizing away an unnecessary zero constant to the
849 // IR optimizer and backend.
850 // TODO: If we had a "freeze" IR instruction to generate a fixed undef
851 // value, we should use that here instead of a zero.
852 return llvm::Constant::getNullValue(Ty: ConvertType(E->getType()));
853 case X86::BI__builtin_ia32_vec_ext_v4hi:
854 case X86::BI__builtin_ia32_vec_ext_v16qi:
855 case X86::BI__builtin_ia32_vec_ext_v8hi:
856 case X86::BI__builtin_ia32_vec_ext_v4si:
857 case X86::BI__builtin_ia32_vec_ext_v4sf:
858 case X86::BI__builtin_ia32_vec_ext_v2di:
859 case X86::BI__builtin_ia32_vec_ext_v32qi:
860 case X86::BI__builtin_ia32_vec_ext_v16hi:
861 case X86::BI__builtin_ia32_vec_ext_v8si:
862 case X86::BI__builtin_ia32_vec_ext_v4di: {
863 unsigned NumElts =
864 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
865 uint64_t Index = cast<ConstantInt>(Val: Ops[1])->getZExtValue();
866 Index &= NumElts - 1;
867 // These builtins exist so we can ensure the index is an ICE and in range.
868 // Otherwise we could just do this in the header file.
869 return Builder.CreateExtractElement(Vec: Ops[0], Idx: Index);
870 }
871 case X86::BI__builtin_ia32_vec_set_v4hi:
872 case X86::BI__builtin_ia32_vec_set_v16qi:
873 case X86::BI__builtin_ia32_vec_set_v8hi:
874 case X86::BI__builtin_ia32_vec_set_v4si:
875 case X86::BI__builtin_ia32_vec_set_v2di:
876 case X86::BI__builtin_ia32_vec_set_v32qi:
877 case X86::BI__builtin_ia32_vec_set_v16hi:
878 case X86::BI__builtin_ia32_vec_set_v8si:
879 case X86::BI__builtin_ia32_vec_set_v4di: {
880 unsigned NumElts =
881 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
882 unsigned Index = cast<ConstantInt>(Val: Ops[2])->getZExtValue();
883 Index &= NumElts - 1;
884 // These builtins exist so we can ensure the index is an ICE and in range.
885 // Otherwise we could just do this in the header file.
886 return Builder.CreateInsertElement(Vec: Ops[0], NewElt: Ops[1], Idx: Index);
887 }
888 case X86::BI_mm_setcsr:
889 case X86::BI__builtin_ia32_ldmxcsr: {
890 RawAddress Tmp = CreateMemTemp(T: E->getArg(Arg: 0)->getType());
891 Builder.CreateStore(Val: Ops[0], Addr: Tmp);
892 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
893 Tmp.getPointer());
894 }
895 case X86::BI_mm_getcsr:
896 case X86::BI__builtin_ia32_stmxcsr: {
897 RawAddress Tmp = CreateMemTemp(E->getType());
898 Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
899 Tmp.getPointer());
900 return Builder.CreateLoad(Addr: Tmp, Name: "stmxcsr");
901 }
902 case X86::BI__builtin_ia32_xsave:
903 case X86::BI__builtin_ia32_xsave64:
904 case X86::BI__builtin_ia32_xrstor:
905 case X86::BI__builtin_ia32_xrstor64:
906 case X86::BI__builtin_ia32_xsaveopt:
907 case X86::BI__builtin_ia32_xsaveopt64:
908 case X86::BI__builtin_ia32_xrstors:
909 case X86::BI__builtin_ia32_xrstors64:
910 case X86::BI__builtin_ia32_xsavec:
911 case X86::BI__builtin_ia32_xsavec64:
912 case X86::BI__builtin_ia32_xsaves:
913 case X86::BI__builtin_ia32_xsaves64:
914 case X86::BI__builtin_ia32_xsetbv:
915 case X86::BI_xsetbv: {
916 Intrinsic::ID ID;
917#define INTRINSIC_X86_XSAVE_ID(NAME) \
918 case X86::BI__builtin_ia32_##NAME: \
919 ID = Intrinsic::x86_##NAME; \
920 break
921 switch (BuiltinID) {
922 default: llvm_unreachable("Unsupported intrinsic!");
923 INTRINSIC_X86_XSAVE_ID(xsave);
924 INTRINSIC_X86_XSAVE_ID(xsave64);
925 INTRINSIC_X86_XSAVE_ID(xrstor);
926 INTRINSIC_X86_XSAVE_ID(xrstor64);
927 INTRINSIC_X86_XSAVE_ID(xsaveopt);
928 INTRINSIC_X86_XSAVE_ID(xsaveopt64);
929 INTRINSIC_X86_XSAVE_ID(xrstors);
930 INTRINSIC_X86_XSAVE_ID(xrstors64);
931 INTRINSIC_X86_XSAVE_ID(xsavec);
932 INTRINSIC_X86_XSAVE_ID(xsavec64);
933 INTRINSIC_X86_XSAVE_ID(xsaves);
934 INTRINSIC_X86_XSAVE_ID(xsaves64);
935 INTRINSIC_X86_XSAVE_ID(xsetbv);
936 case X86::BI_xsetbv:
937 ID = Intrinsic::x86_xsetbv;
938 break;
939 }
940#undef INTRINSIC_X86_XSAVE_ID
941 Value *Mhi = Builder.CreateTrunc(
942 V: Builder.CreateLShr(LHS: Ops[1], RHS: ConstantInt::get(Ty: Int64Ty, V: 32)), DestTy: Int32Ty);
943 Value *Mlo = Builder.CreateTrunc(V: Ops[1], DestTy: Int32Ty);
944 Ops[1] = Mhi;
945 Ops.push_back(Elt: Mlo);
946 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID), Args: Ops);
947 }
948 case X86::BI__builtin_ia32_xgetbv:
949 case X86::BI_xgetbv:
950 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
951 case X86::BI__builtin_ia32_storedqudi128_mask:
952 case X86::BI__builtin_ia32_storedqusi128_mask:
953 case X86::BI__builtin_ia32_storedquhi128_mask:
954 case X86::BI__builtin_ia32_storedquqi128_mask:
955 case X86::BI__builtin_ia32_storeupd128_mask:
956 case X86::BI__builtin_ia32_storeups128_mask:
957 case X86::BI__builtin_ia32_storedqudi256_mask:
958 case X86::BI__builtin_ia32_storedqusi256_mask:
959 case X86::BI__builtin_ia32_storedquhi256_mask:
960 case X86::BI__builtin_ia32_storedquqi256_mask:
961 case X86::BI__builtin_ia32_storeupd256_mask:
962 case X86::BI__builtin_ia32_storeups256_mask:
963 case X86::BI__builtin_ia32_storedqudi512_mask:
964 case X86::BI__builtin_ia32_storedqusi512_mask:
965 case X86::BI__builtin_ia32_storedquhi512_mask:
966 case X86::BI__builtin_ia32_storedquqi512_mask:
967 case X86::BI__builtin_ia32_storeupd512_mask:
968 case X86::BI__builtin_ia32_storeups512_mask:
969 return EmitX86MaskedStore(CGF&: *this, Ops, Alignment: Align(1));
970
971 case X86::BI__builtin_ia32_storesbf16128_mask:
972 case X86::BI__builtin_ia32_storesh128_mask:
973 case X86::BI__builtin_ia32_storess128_mask:
974 case X86::BI__builtin_ia32_storesd128_mask:
975 return EmitX86MaskedStore(CGF&: *this, Ops, Alignment: Align(1));
976
977 case X86::BI__builtin_ia32_cvtmask2b128:
978 case X86::BI__builtin_ia32_cvtmask2b256:
979 case X86::BI__builtin_ia32_cvtmask2b512:
980 case X86::BI__builtin_ia32_cvtmask2w128:
981 case X86::BI__builtin_ia32_cvtmask2w256:
982 case X86::BI__builtin_ia32_cvtmask2w512:
983 case X86::BI__builtin_ia32_cvtmask2d128:
984 case X86::BI__builtin_ia32_cvtmask2d256:
985 case X86::BI__builtin_ia32_cvtmask2d512:
986 case X86::BI__builtin_ia32_cvtmask2q128:
987 case X86::BI__builtin_ia32_cvtmask2q256:
988 case X86::BI__builtin_ia32_cvtmask2q512:
989 return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
990
991 case X86::BI__builtin_ia32_cvtb2mask128:
992 case X86::BI__builtin_ia32_cvtb2mask256:
993 case X86::BI__builtin_ia32_cvtb2mask512:
994 case X86::BI__builtin_ia32_cvtw2mask128:
995 case X86::BI__builtin_ia32_cvtw2mask256:
996 case X86::BI__builtin_ia32_cvtw2mask512:
997 case X86::BI__builtin_ia32_cvtd2mask128:
998 case X86::BI__builtin_ia32_cvtd2mask256:
999 case X86::BI__builtin_ia32_cvtd2mask512:
1000 case X86::BI__builtin_ia32_cvtq2mask128:
1001 case X86::BI__builtin_ia32_cvtq2mask256:
1002 case X86::BI__builtin_ia32_cvtq2mask512:
1003 return EmitX86ConvertToMask(CGF&: *this, In: Ops[0]);
1004
1005 case X86::BI__builtin_ia32_cvtdq2ps512_mask:
1006 case X86::BI__builtin_ia32_cvtqq2ps512_mask:
1007 case X86::BI__builtin_ia32_cvtqq2pd512_mask:
1008 case X86::BI__builtin_ia32_vcvtw2ph512_mask:
1009 case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
1010 case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
1011 return EmitX86ConvertIntToFp(CGF&: *this, E, Ops, /*IsSigned*/ true);
1012 case X86::BI__builtin_ia32_cvtudq2ps512_mask:
1013 case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
1014 case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
1015 case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
1016 case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
1017 case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
1018 return EmitX86ConvertIntToFp(CGF&: *this, E, Ops, /*IsSigned*/ false);
1019
1020 case X86::BI__builtin_ia32_vfmaddss3:
1021 case X86::BI__builtin_ia32_vfmaddsd3:
1022 case X86::BI__builtin_ia32_vfmaddsh3_mask:
1023 case X86::BI__builtin_ia32_vfmaddss3_mask:
1024 case X86::BI__builtin_ia32_vfmaddsd3_mask:
1025 return EmitScalarFMAExpr(CGF&: *this, E, Ops, Upper: Ops[0]);
1026 case X86::BI__builtin_ia32_vfmaddss:
1027 case X86::BI__builtin_ia32_vfmaddsd:
1028 return EmitScalarFMAExpr(CGF&: *this, E, Ops,
1029 Upper: Constant::getNullValue(Ty: Ops[0]->getType()));
1030 case X86::BI__builtin_ia32_vfmaddsh3_maskz:
1031 case X86::BI__builtin_ia32_vfmaddss3_maskz:
1032 case X86::BI__builtin_ia32_vfmaddsd3_maskz:
1033 return EmitScalarFMAExpr(CGF&: *this, E, Ops, Upper: Ops[0], /*ZeroMask*/ true);
1034 case X86::BI__builtin_ia32_vfmaddsh3_mask3:
1035 case X86::BI__builtin_ia32_vfmaddss3_mask3:
1036 case X86::BI__builtin_ia32_vfmaddsd3_mask3:
1037 return EmitScalarFMAExpr(CGF&: *this, E, Ops, Upper: Ops[2], /*ZeroMask*/ false, PTIdx: 2);
1038 case X86::BI__builtin_ia32_vfmsubsh3_mask3:
1039 case X86::BI__builtin_ia32_vfmsubss3_mask3:
1040 case X86::BI__builtin_ia32_vfmsubsd3_mask3:
1041 return EmitScalarFMAExpr(CGF&: *this, E, Ops, Upper: Ops[2], /*ZeroMask*/ false, PTIdx: 2,
1042 /*NegAcc*/ true);
1043 case X86::BI__builtin_ia32_vfmaddph:
1044 case X86::BI__builtin_ia32_vfmaddps:
1045 case X86::BI__builtin_ia32_vfmaddpd:
1046 case X86::BI__builtin_ia32_vfmaddph256:
1047 case X86::BI__builtin_ia32_vfmaddps256:
1048 case X86::BI__builtin_ia32_vfmaddpd256:
1049 case X86::BI__builtin_ia32_vfmaddph512_mask:
1050 case X86::BI__builtin_ia32_vfmaddph512_maskz:
1051 case X86::BI__builtin_ia32_vfmaddph512_mask3:
1052 case X86::BI__builtin_ia32_vfmaddbf16128:
1053 case X86::BI__builtin_ia32_vfmaddbf16256:
1054 case X86::BI__builtin_ia32_vfmaddbf16512:
1055 case X86::BI__builtin_ia32_vfmaddps512_mask:
1056 case X86::BI__builtin_ia32_vfmaddps512_maskz:
1057 case X86::BI__builtin_ia32_vfmaddps512_mask3:
1058 case X86::BI__builtin_ia32_vfmsubps512_mask3:
1059 case X86::BI__builtin_ia32_vfmaddpd512_mask:
1060 case X86::BI__builtin_ia32_vfmaddpd512_maskz:
1061 case X86::BI__builtin_ia32_vfmaddpd512_mask3:
1062 case X86::BI__builtin_ia32_vfmsubpd512_mask3:
1063 case X86::BI__builtin_ia32_vfmsubph512_mask3:
1064 return EmitX86FMAExpr(CGF&: *this, E, Ops, BuiltinID, /*IsAddSub*/ false);
1065 case X86::BI__builtin_ia32_vfmaddsubph512_mask:
1066 case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
1067 case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
1068 case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
1069 case X86::BI__builtin_ia32_vfmaddsubps512_mask:
1070 case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
1071 case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
1072 case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
1073 case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
1074 case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
1075 case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
1076 case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
1077 return EmitX86FMAExpr(CGF&: *this, E, Ops, BuiltinID, /*IsAddSub*/ true);
1078
1079 case X86::BI__builtin_ia32_movdqa32store128_mask:
1080 case X86::BI__builtin_ia32_movdqa64store128_mask:
1081 case X86::BI__builtin_ia32_storeaps128_mask:
1082 case X86::BI__builtin_ia32_storeapd128_mask:
1083 case X86::BI__builtin_ia32_movdqa32store256_mask:
1084 case X86::BI__builtin_ia32_movdqa64store256_mask:
1085 case X86::BI__builtin_ia32_storeaps256_mask:
1086 case X86::BI__builtin_ia32_storeapd256_mask:
1087 case X86::BI__builtin_ia32_movdqa32store512_mask:
1088 case X86::BI__builtin_ia32_movdqa64store512_mask:
1089 case X86::BI__builtin_ia32_storeaps512_mask:
1090 case X86::BI__builtin_ia32_storeapd512_mask:
1091 return EmitX86MaskedStore(
1092 CGF&: *this, Ops,
1093 Alignment: getContext().getTypeAlignInChars(T: E->getArg(Arg: 1)->getType()).getAsAlign());
1094
1095 case X86::BI__builtin_ia32_loadups128_mask:
1096 case X86::BI__builtin_ia32_loadups256_mask:
1097 case X86::BI__builtin_ia32_loadups512_mask:
1098 case X86::BI__builtin_ia32_loadupd128_mask:
1099 case X86::BI__builtin_ia32_loadupd256_mask:
1100 case X86::BI__builtin_ia32_loadupd512_mask:
1101 case X86::BI__builtin_ia32_loaddquqi128_mask:
1102 case X86::BI__builtin_ia32_loaddquqi256_mask:
1103 case X86::BI__builtin_ia32_loaddquqi512_mask:
1104 case X86::BI__builtin_ia32_loaddquhi128_mask:
1105 case X86::BI__builtin_ia32_loaddquhi256_mask:
1106 case X86::BI__builtin_ia32_loaddquhi512_mask:
1107 case X86::BI__builtin_ia32_loaddqusi128_mask:
1108 case X86::BI__builtin_ia32_loaddqusi256_mask:
1109 case X86::BI__builtin_ia32_loaddqusi512_mask:
1110 case X86::BI__builtin_ia32_loaddqudi128_mask:
1111 case X86::BI__builtin_ia32_loaddqudi256_mask:
1112 case X86::BI__builtin_ia32_loaddqudi512_mask:
1113 return EmitX86MaskedLoad(CGF&: *this, Ops, Alignment: Align(1));
1114
1115 case X86::BI__builtin_ia32_loadsbf16128_mask:
1116 case X86::BI__builtin_ia32_loadsh128_mask:
1117 case X86::BI__builtin_ia32_loadss128_mask:
1118 case X86::BI__builtin_ia32_loadsd128_mask:
1119 return EmitX86MaskedLoad(CGF&: *this, Ops, Alignment: Align(1));
1120
1121 case X86::BI__builtin_ia32_loadaps128_mask:
1122 case X86::BI__builtin_ia32_loadaps256_mask:
1123 case X86::BI__builtin_ia32_loadaps512_mask:
1124 case X86::BI__builtin_ia32_loadapd128_mask:
1125 case X86::BI__builtin_ia32_loadapd256_mask:
1126 case X86::BI__builtin_ia32_loadapd512_mask:
1127 case X86::BI__builtin_ia32_movdqa32load128_mask:
1128 case X86::BI__builtin_ia32_movdqa32load256_mask:
1129 case X86::BI__builtin_ia32_movdqa32load512_mask:
1130 case X86::BI__builtin_ia32_movdqa64load128_mask:
1131 case X86::BI__builtin_ia32_movdqa64load256_mask:
1132 case X86::BI__builtin_ia32_movdqa64load512_mask:
1133 return EmitX86MaskedLoad(
1134 CGF&: *this, Ops,
1135 Alignment: getContext().getTypeAlignInChars(T: E->getArg(Arg: 1)->getType()).getAsAlign());
1136
1137 case X86::BI__builtin_ia32_expandloaddf128_mask:
1138 case X86::BI__builtin_ia32_expandloaddf256_mask:
1139 case X86::BI__builtin_ia32_expandloaddf512_mask:
1140 case X86::BI__builtin_ia32_expandloadsf128_mask:
1141 case X86::BI__builtin_ia32_expandloadsf256_mask:
1142 case X86::BI__builtin_ia32_expandloadsf512_mask:
1143 case X86::BI__builtin_ia32_expandloaddi128_mask:
1144 case X86::BI__builtin_ia32_expandloaddi256_mask:
1145 case X86::BI__builtin_ia32_expandloaddi512_mask:
1146 case X86::BI__builtin_ia32_expandloadsi128_mask:
1147 case X86::BI__builtin_ia32_expandloadsi256_mask:
1148 case X86::BI__builtin_ia32_expandloadsi512_mask:
1149 case X86::BI__builtin_ia32_expandloadhi128_mask:
1150 case X86::BI__builtin_ia32_expandloadhi256_mask:
1151 case X86::BI__builtin_ia32_expandloadhi512_mask:
1152 case X86::BI__builtin_ia32_expandloadqi128_mask:
1153 case X86::BI__builtin_ia32_expandloadqi256_mask:
1154 case X86::BI__builtin_ia32_expandloadqi512_mask:
1155 return EmitX86ExpandLoad(CGF&: *this, Ops);
1156
1157 case X86::BI__builtin_ia32_compressstoredf128_mask:
1158 case X86::BI__builtin_ia32_compressstoredf256_mask:
1159 case X86::BI__builtin_ia32_compressstoredf512_mask:
1160 case X86::BI__builtin_ia32_compressstoresf128_mask:
1161 case X86::BI__builtin_ia32_compressstoresf256_mask:
1162 case X86::BI__builtin_ia32_compressstoresf512_mask:
1163 case X86::BI__builtin_ia32_compressstoredi128_mask:
1164 case X86::BI__builtin_ia32_compressstoredi256_mask:
1165 case X86::BI__builtin_ia32_compressstoredi512_mask:
1166 case X86::BI__builtin_ia32_compressstoresi128_mask:
1167 case X86::BI__builtin_ia32_compressstoresi256_mask:
1168 case X86::BI__builtin_ia32_compressstoresi512_mask:
1169 case X86::BI__builtin_ia32_compressstorehi128_mask:
1170 case X86::BI__builtin_ia32_compressstorehi256_mask:
1171 case X86::BI__builtin_ia32_compressstorehi512_mask:
1172 case X86::BI__builtin_ia32_compressstoreqi128_mask:
1173 case X86::BI__builtin_ia32_compressstoreqi256_mask:
1174 case X86::BI__builtin_ia32_compressstoreqi512_mask:
1175 return EmitX86CompressStore(CGF&: *this, Ops);
1176
1177 case X86::BI__builtin_ia32_expanddf128_mask:
1178 case X86::BI__builtin_ia32_expanddf256_mask:
1179 case X86::BI__builtin_ia32_expanddf512_mask:
1180 case X86::BI__builtin_ia32_expandsf128_mask:
1181 case X86::BI__builtin_ia32_expandsf256_mask:
1182 case X86::BI__builtin_ia32_expandsf512_mask:
1183 case X86::BI__builtin_ia32_expanddi128_mask:
1184 case X86::BI__builtin_ia32_expanddi256_mask:
1185 case X86::BI__builtin_ia32_expanddi512_mask:
1186 case X86::BI__builtin_ia32_expandsi128_mask:
1187 case X86::BI__builtin_ia32_expandsi256_mask:
1188 case X86::BI__builtin_ia32_expandsi512_mask:
1189 case X86::BI__builtin_ia32_expandhi128_mask:
1190 case X86::BI__builtin_ia32_expandhi256_mask:
1191 case X86::BI__builtin_ia32_expandhi512_mask:
1192 case X86::BI__builtin_ia32_expandqi128_mask:
1193 case X86::BI__builtin_ia32_expandqi256_mask:
1194 case X86::BI__builtin_ia32_expandqi512_mask:
1195 return EmitX86CompressExpand(CGF&: *this, Ops, /*IsCompress*/false);
1196
1197 case X86::BI__builtin_ia32_compressdf128_mask:
1198 case X86::BI__builtin_ia32_compressdf256_mask:
1199 case X86::BI__builtin_ia32_compressdf512_mask:
1200 case X86::BI__builtin_ia32_compresssf128_mask:
1201 case X86::BI__builtin_ia32_compresssf256_mask:
1202 case X86::BI__builtin_ia32_compresssf512_mask:
1203 case X86::BI__builtin_ia32_compressdi128_mask:
1204 case X86::BI__builtin_ia32_compressdi256_mask:
1205 case X86::BI__builtin_ia32_compressdi512_mask:
1206 case X86::BI__builtin_ia32_compresssi128_mask:
1207 case X86::BI__builtin_ia32_compresssi256_mask:
1208 case X86::BI__builtin_ia32_compresssi512_mask:
1209 case X86::BI__builtin_ia32_compresshi128_mask:
1210 case X86::BI__builtin_ia32_compresshi256_mask:
1211 case X86::BI__builtin_ia32_compresshi512_mask:
1212 case X86::BI__builtin_ia32_compressqi128_mask:
1213 case X86::BI__builtin_ia32_compressqi256_mask:
1214 case X86::BI__builtin_ia32_compressqi512_mask:
1215 return EmitX86CompressExpand(CGF&: *this, Ops, /*IsCompress*/true);
1216
1217 case X86::BI__builtin_ia32_gather3div2df:
1218 case X86::BI__builtin_ia32_gather3div2di:
1219 case X86::BI__builtin_ia32_gather3div4df:
1220 case X86::BI__builtin_ia32_gather3div4di:
1221 case X86::BI__builtin_ia32_gather3div4sf:
1222 case X86::BI__builtin_ia32_gather3div4si:
1223 case X86::BI__builtin_ia32_gather3div8sf:
1224 case X86::BI__builtin_ia32_gather3div8si:
1225 case X86::BI__builtin_ia32_gather3siv2df:
1226 case X86::BI__builtin_ia32_gather3siv2di:
1227 case X86::BI__builtin_ia32_gather3siv4df:
1228 case X86::BI__builtin_ia32_gather3siv4di:
1229 case X86::BI__builtin_ia32_gather3siv4sf:
1230 case X86::BI__builtin_ia32_gather3siv4si:
1231 case X86::BI__builtin_ia32_gather3siv8sf:
1232 case X86::BI__builtin_ia32_gather3siv8si:
1233 case X86::BI__builtin_ia32_gathersiv8df:
1234 case X86::BI__builtin_ia32_gathersiv16sf:
1235 case X86::BI__builtin_ia32_gatherdiv8df:
1236 case X86::BI__builtin_ia32_gatherdiv16sf:
1237 case X86::BI__builtin_ia32_gathersiv8di:
1238 case X86::BI__builtin_ia32_gathersiv16si:
1239 case X86::BI__builtin_ia32_gatherdiv8di:
1240 case X86::BI__builtin_ia32_gatherdiv16si: {
1241 Intrinsic::ID IID;
1242 switch (BuiltinID) {
1243 default: llvm_unreachable("Unexpected builtin");
1244 case X86::BI__builtin_ia32_gather3div2df:
1245 IID = Intrinsic::x86_avx512_mask_gather3div2_df;
1246 break;
1247 case X86::BI__builtin_ia32_gather3div2di:
1248 IID = Intrinsic::x86_avx512_mask_gather3div2_di;
1249 break;
1250 case X86::BI__builtin_ia32_gather3div4df:
1251 IID = Intrinsic::x86_avx512_mask_gather3div4_df;
1252 break;
1253 case X86::BI__builtin_ia32_gather3div4di:
1254 IID = Intrinsic::x86_avx512_mask_gather3div4_di;
1255 break;
1256 case X86::BI__builtin_ia32_gather3div4sf:
1257 IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
1258 break;
1259 case X86::BI__builtin_ia32_gather3div4si:
1260 IID = Intrinsic::x86_avx512_mask_gather3div4_si;
1261 break;
1262 case X86::BI__builtin_ia32_gather3div8sf:
1263 IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
1264 break;
1265 case X86::BI__builtin_ia32_gather3div8si:
1266 IID = Intrinsic::x86_avx512_mask_gather3div8_si;
1267 break;
1268 case X86::BI__builtin_ia32_gather3siv2df:
1269 IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
1270 break;
1271 case X86::BI__builtin_ia32_gather3siv2di:
1272 IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
1273 break;
1274 case X86::BI__builtin_ia32_gather3siv4df:
1275 IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
1276 break;
1277 case X86::BI__builtin_ia32_gather3siv4di:
1278 IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
1279 break;
1280 case X86::BI__builtin_ia32_gather3siv4sf:
1281 IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
1282 break;
1283 case X86::BI__builtin_ia32_gather3siv4si:
1284 IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
1285 break;
1286 case X86::BI__builtin_ia32_gather3siv8sf:
1287 IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
1288 break;
1289 case X86::BI__builtin_ia32_gather3siv8si:
1290 IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
1291 break;
1292 case X86::BI__builtin_ia32_gathersiv8df:
1293 IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
1294 break;
1295 case X86::BI__builtin_ia32_gathersiv16sf:
1296 IID = Intrinsic::x86_avx512_mask_gather_dps_512;
1297 break;
1298 case X86::BI__builtin_ia32_gatherdiv8df:
1299 IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
1300 break;
1301 case X86::BI__builtin_ia32_gatherdiv16sf:
1302 IID = Intrinsic::x86_avx512_mask_gather_qps_512;
1303 break;
1304 case X86::BI__builtin_ia32_gathersiv8di:
1305 IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
1306 break;
1307 case X86::BI__builtin_ia32_gathersiv16si:
1308 IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
1309 break;
1310 case X86::BI__builtin_ia32_gatherdiv8di:
1311 IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
1312 break;
1313 case X86::BI__builtin_ia32_gatherdiv16si:
1314 IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
1315 break;
1316 }
1317
1318 unsigned MinElts = std::min(
1319 a: cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements(),
1320 b: cast<llvm::FixedVectorType>(Val: Ops[2]->getType())->getNumElements());
1321 Ops[3] = getMaskVecValue(CGF&: *this, Mask: Ops[3], NumElts: MinElts);
1322 Function *Intr = CGM.getIntrinsic(IID);
1323 return Builder.CreateCall(Callee: Intr, Args: Ops);
1324 }
1325
1326 case X86::BI__builtin_ia32_scattersiv8df:
1327 case X86::BI__builtin_ia32_scattersiv16sf:
1328 case X86::BI__builtin_ia32_scatterdiv8df:
1329 case X86::BI__builtin_ia32_scatterdiv16sf:
1330 case X86::BI__builtin_ia32_scattersiv8di:
1331 case X86::BI__builtin_ia32_scattersiv16si:
1332 case X86::BI__builtin_ia32_scatterdiv8di:
1333 case X86::BI__builtin_ia32_scatterdiv16si:
1334 case X86::BI__builtin_ia32_scatterdiv2df:
1335 case X86::BI__builtin_ia32_scatterdiv2di:
1336 case X86::BI__builtin_ia32_scatterdiv4df:
1337 case X86::BI__builtin_ia32_scatterdiv4di:
1338 case X86::BI__builtin_ia32_scatterdiv4sf:
1339 case X86::BI__builtin_ia32_scatterdiv4si:
1340 case X86::BI__builtin_ia32_scatterdiv8sf:
1341 case X86::BI__builtin_ia32_scatterdiv8si:
1342 case X86::BI__builtin_ia32_scattersiv2df:
1343 case X86::BI__builtin_ia32_scattersiv2di:
1344 case X86::BI__builtin_ia32_scattersiv4df:
1345 case X86::BI__builtin_ia32_scattersiv4di:
1346 case X86::BI__builtin_ia32_scattersiv4sf:
1347 case X86::BI__builtin_ia32_scattersiv4si:
1348 case X86::BI__builtin_ia32_scattersiv8sf:
1349 case X86::BI__builtin_ia32_scattersiv8si: {
1350 Intrinsic::ID IID;
1351 switch (BuiltinID) {
1352 default: llvm_unreachable("Unexpected builtin");
1353 case X86::BI__builtin_ia32_scattersiv8df:
1354 IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
1355 break;
1356 case X86::BI__builtin_ia32_scattersiv16sf:
1357 IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
1358 break;
1359 case X86::BI__builtin_ia32_scatterdiv8df:
1360 IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
1361 break;
1362 case X86::BI__builtin_ia32_scatterdiv16sf:
1363 IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
1364 break;
1365 case X86::BI__builtin_ia32_scattersiv8di:
1366 IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
1367 break;
1368 case X86::BI__builtin_ia32_scattersiv16si:
1369 IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
1370 break;
1371 case X86::BI__builtin_ia32_scatterdiv8di:
1372 IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
1373 break;
1374 case X86::BI__builtin_ia32_scatterdiv16si:
1375 IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
1376 break;
1377 case X86::BI__builtin_ia32_scatterdiv2df:
1378 IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
1379 break;
1380 case X86::BI__builtin_ia32_scatterdiv2di:
1381 IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
1382 break;
1383 case X86::BI__builtin_ia32_scatterdiv4df:
1384 IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
1385 break;
1386 case X86::BI__builtin_ia32_scatterdiv4di:
1387 IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
1388 break;
1389 case X86::BI__builtin_ia32_scatterdiv4sf:
1390 IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
1391 break;
1392 case X86::BI__builtin_ia32_scatterdiv4si:
1393 IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
1394 break;
1395 case X86::BI__builtin_ia32_scatterdiv8sf:
1396 IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
1397 break;
1398 case X86::BI__builtin_ia32_scatterdiv8si:
1399 IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
1400 break;
1401 case X86::BI__builtin_ia32_scattersiv2df:
1402 IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
1403 break;
1404 case X86::BI__builtin_ia32_scattersiv2di:
1405 IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
1406 break;
1407 case X86::BI__builtin_ia32_scattersiv4df:
1408 IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
1409 break;
1410 case X86::BI__builtin_ia32_scattersiv4di:
1411 IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
1412 break;
1413 case X86::BI__builtin_ia32_scattersiv4sf:
1414 IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
1415 break;
1416 case X86::BI__builtin_ia32_scattersiv4si:
1417 IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
1418 break;
1419 case X86::BI__builtin_ia32_scattersiv8sf:
1420 IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
1421 break;
1422 case X86::BI__builtin_ia32_scattersiv8si:
1423 IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
1424 break;
1425 }
1426
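// As with the gathers above, the intrinsic's mask covers only
// min(#indices, #data elements) lanes when the index and data widths differ.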
1427 unsigned MinElts = std::min(
1428 a: cast<llvm::FixedVectorType>(Val: Ops[2]->getType())->getNumElements(),
1429 b: cast<llvm::FixedVectorType>(Val: Ops[3]->getType())->getNumElements());
1430 Ops[1] = getMaskVecValue(CGF&: *this, Mask: Ops[1], NumElts: MinElts);
1431 Function *Intr = CGM.getIntrinsic(IID);
1432 return Builder.CreateCall(Callee: Intr, Args: Ops);
1433 }
1434
1435 case X86::BI__builtin_ia32_vextractf128_pd256:
1436 case X86::BI__builtin_ia32_vextractf128_ps256:
1437 case X86::BI__builtin_ia32_vextractf128_si256:
1438 case X86::BI__builtin_ia32_extract128i256:
1439 case X86::BI__builtin_ia32_extractf64x4_mask:
1440 case X86::BI__builtin_ia32_extractf32x4_mask:
1441 case X86::BI__builtin_ia32_extracti64x4_mask:
1442 case X86::BI__builtin_ia32_extracti32x4_mask:
1443 case X86::BI__builtin_ia32_extractf32x8_mask:
1444 case X86::BI__builtin_ia32_extracti32x8_mask:
1445 case X86::BI__builtin_ia32_extractf32x4_256_mask:
1446 case X86::BI__builtin_ia32_extracti32x4_256_mask:
1447 case X86::BI__builtin_ia32_extractf64x2_256_mask:
1448 case X86::BI__builtin_ia32_extracti64x2_256_mask:
1449 case X86::BI__builtin_ia32_extractf64x2_512_mask:
1450 case X86::BI__builtin_ia32_extracti64x2_512_mask: {
1451 auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
1452 unsigned NumElts = DstTy->getNumElements();
1453 unsigned SrcNumElts =
1454 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1455 unsigned SubVectors = SrcNumElts / NumElts;
1456 unsigned Index = cast<ConstantInt>(Val: Ops[1])->getZExtValue();
1457 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
1458 Index &= SubVectors - 1; // Remove any extra bits.
1459 Index *= NumElts;
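// For example (roughly what _mm512_extractf64x4_pd(v, 1) becomes):
// NumElts = 4 and SrcNumElts = 8, so Index ends up as 4 and the shuffle
// below selects elements <4, 5, 6, 7>.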
1460
1461 int Indices[16];
1462 for (unsigned i = 0; i != NumElts; ++i)
1463 Indices[i] = i + Index;
1464
1465 Value *Res = Builder.CreateShuffleVector(V: Ops[0], Mask: ArrayRef(Indices, NumElts),
1466 Name: "extract");
1467
1468 if (Ops.size() == 4)
1469 Res = EmitX86Select(CGF&: *this, Mask: Ops[3], Op0: Res, Op1: Ops[2]);
1470
1471 return Res;
1472 }
1473 case X86::BI__builtin_ia32_vinsertf128_pd256:
1474 case X86::BI__builtin_ia32_vinsertf128_ps256:
1475 case X86::BI__builtin_ia32_vinsertf128_si256:
1476 case X86::BI__builtin_ia32_insert128i256:
1477 case X86::BI__builtin_ia32_insertf64x4:
1478 case X86::BI__builtin_ia32_insertf32x4:
1479 case X86::BI__builtin_ia32_inserti64x4:
1480 case X86::BI__builtin_ia32_inserti32x4:
1481 case X86::BI__builtin_ia32_insertf32x8:
1482 case X86::BI__builtin_ia32_inserti32x8:
1483 case X86::BI__builtin_ia32_insertf32x4_256:
1484 case X86::BI__builtin_ia32_inserti32x4_256:
1485 case X86::BI__builtin_ia32_insertf64x2_256:
1486 case X86::BI__builtin_ia32_inserti64x2_256:
1487 case X86::BI__builtin_ia32_insertf64x2_512:
1488 case X86::BI__builtin_ia32_inserti64x2_512: {
1489 unsigned DstNumElts =
1490 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1491 unsigned SrcNumElts =
1492 cast<llvm::FixedVectorType>(Val: Ops[1]->getType())->getNumElements();
1493 unsigned SubVectors = DstNumElts / SrcNumElts;
1494 unsigned Index = cast<ConstantInt>(Val: Ops[2])->getZExtValue();
1495 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
1496 Index &= SubVectors - 1; // Remove any extra bits.
1497 Index *= SrcNumElts;
1498
1499 int Indices[16];
1500 for (unsigned i = 0; i != DstNumElts; ++i)
1501 Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
1502
1503 Value *Op1 = Builder.CreateShuffleVector(
1504 V: Ops[1], Mask: ArrayRef(Indices, DstNumElts), Name: "widen");
1505
1506 for (unsigned i = 0; i != DstNumElts; ++i) {
1507 if (i >= Index && i < (Index + SrcNumElts))
1508 Indices[i] = (i - Index) + DstNumElts;
1509 else
1510 Indices[i] = i;
1511 }
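// E.g. inserting a 4-element subvector into an 8-element destination at
// Index 4: Op1 holds the widened source <s0, s1, s2, s3, poison...> and the
// final mask is <0, 1, 2, 3, 8, 9, 10, 11>, i.e. dst[0..3] then src[0..3].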
1512
1513 return Builder.CreateShuffleVector(V1: Ops[0], V2: Op1,
1514 Mask: ArrayRef(Indices, DstNumElts), Name: "insert");
1515 }
1516 case X86::BI__builtin_ia32_pmovqd512_mask:
1517 case X86::BI__builtin_ia32_pmovwb512_mask: {
1518 Value *Res = Builder.CreateTrunc(V: Ops[0], DestTy: Ops[1]->getType());
1519 return EmitX86Select(CGF&: *this, Mask: Ops[2], Op0: Res, Op1: Ops[1]);
1520 }
1521 case X86::BI__builtin_ia32_pmovdb512_mask:
1522 case X86::BI__builtin_ia32_pmovdw512_mask:
1523 case X86::BI__builtin_ia32_pmovqw512_mask: {
1524 if (const auto *C = dyn_cast<Constant>(Val: Ops[2]))
1525 if (C->isAllOnesValue())
1526 return Builder.CreateTrunc(V: Ops[0], DestTy: Ops[1]->getType());
1527
1528 Intrinsic::ID IID;
1529 switch (BuiltinID) {
1530 default: llvm_unreachable("Unsupported intrinsic!");
1531 case X86::BI__builtin_ia32_pmovdb512_mask:
1532 IID = Intrinsic::x86_avx512_mask_pmov_db_512;
1533 break;
1534 case X86::BI__builtin_ia32_pmovdw512_mask:
1535 IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
1536 break;
1537 case X86::BI__builtin_ia32_pmovqw512_mask:
1538 IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
1539 break;
1540 }
1541
1542 Function *Intr = CGM.getIntrinsic(IID);
1543 return Builder.CreateCall(Callee: Intr, Args: Ops);
1544 }
1545 case X86::BI__builtin_ia32_pblendw128:
1546 case X86::BI__builtin_ia32_blendpd:
1547 case X86::BI__builtin_ia32_blendps:
1548 case X86::BI__builtin_ia32_blendpd256:
1549 case X86::BI__builtin_ia32_blendps256:
1550 case X86::BI__builtin_ia32_pblendw256:
1551 case X86::BI__builtin_ia32_pblendd128:
1552 case X86::BI__builtin_ia32_pblendd256: {
1553 unsigned NumElts =
1554 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1555 unsigned Imm = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue();
1556
1557 int Indices[16];
1558 // If there are more than 8 elements, the 8-bit immediate is reused for the
1559 // upper elements, so index its bits modulo 8.
1560 for (unsigned i = 0; i != NumElts; ++i)
1561 Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
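// E.g. pblendw on 8 x i16 with Imm = 0x0F yields <8, 9, 10, 11, 4, 5, 6, 7>:
// the low four lanes come from Ops[1], the rest from Ops[0].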
1562
1563 return Builder.CreateShuffleVector(V1: Ops[0], V2: Ops[1],
1564 Mask: ArrayRef(Indices, NumElts), Name: "blend");
1565 }
1566 case X86::BI__builtin_ia32_pshuflw:
1567 case X86::BI__builtin_ia32_pshuflw256:
1568 case X86::BI__builtin_ia32_pshuflw512: {
1569 uint32_t Imm = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue();
1570 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1571 unsigned NumElts = Ty->getNumElements();
1572
1573 // Splat the 8 bits of the immediate four times to help the loop wrap around.
1574 Imm = (Imm & 0xff) * 0x01010101;
1575
1576 int Indices[32];
1577 for (unsigned l = 0; l != NumElts; l += 8) {
1578 for (unsigned i = 0; i != 4; ++i) {
1579 Indices[l + i] = l + (Imm & 3);
1580 Imm >>= 2;
1581 }
1582 for (unsigned i = 4; i != 8; ++i)
1583 Indices[l + i] = l + i;
1584 }
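// E.g. with Imm = 0x1B this produces <3, 2, 1, 0, 4, 5, 6, 7> per 128-bit
// lane: the low four words are reversed and the high four pass through.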
1585
1586 return Builder.CreateShuffleVector(V: Ops[0], Mask: ArrayRef(Indices, NumElts),
1587 Name: "pshuflw");
1588 }
1589 case X86::BI__builtin_ia32_pshufhw:
1590 case X86::BI__builtin_ia32_pshufhw256:
1591 case X86::BI__builtin_ia32_pshufhw512: {
1592 uint32_t Imm = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue();
1593 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1594 unsigned NumElts = Ty->getNumElements();
1595
1596 // Splat the 8 bits of the immediate four times to help the loop wrap around.
1597 Imm = (Imm & 0xff) * 0x01010101;
1598
1599 int Indices[32];
1600 for (unsigned l = 0; l != NumElts; l += 8) {
1601 for (unsigned i = 0; i != 4; ++i)
1602 Indices[l + i] = l + i;
1603 for (unsigned i = 4; i != 8; ++i) {
1604 Indices[l + i] = l + 4 + (Imm & 3);
1605 Imm >>= 2;
1606 }
1607 }
1608
1609 return Builder.CreateShuffleVector(V: Ops[0], Mask: ArrayRef(Indices, NumElts),
1610 Name: "pshufhw");
1611 }
1612 case X86::BI__builtin_ia32_pshufd:
1613 case X86::BI__builtin_ia32_pshufd256:
1614 case X86::BI__builtin_ia32_pshufd512:
1615 case X86::BI__builtin_ia32_vpermilpd:
1616 case X86::BI__builtin_ia32_vpermilps:
1617 case X86::BI__builtin_ia32_vpermilpd256:
1618 case X86::BI__builtin_ia32_vpermilps256:
1619 case X86::BI__builtin_ia32_vpermilpd512:
1620 case X86::BI__builtin_ia32_vpermilps512: {
1621 uint32_t Imm = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue();
1622 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1623 unsigned NumElts = Ty->getNumElements();
1624 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
1625 unsigned NumLaneElts = NumElts / NumLanes;
1626
1627 // Splat the 8 bits of the immediate four times to help the loop wrap around.
1628 Imm = (Imm & 0xff) * 0x01010101;
1629
1630 int Indices[16];
1631 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1632 for (unsigned i = 0; i != NumLaneElts; ++i) {
1633 Indices[i + l] = (Imm % NumLaneElts) + l;
1634 Imm /= NumLaneElts;
1635 }
1636 }
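// E.g. pshufd with Imm = 0x4E on 4 x i32 produces <2, 3, 0, 1>, swapping the
// two 64-bit halves of each 128-bit lane.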
1637
1638 return Builder.CreateShuffleVector(V: Ops[0], Mask: ArrayRef(Indices, NumElts),
1639 Name: "permil");
1640 }
1641 case X86::BI__builtin_ia32_shufpd:
1642 case X86::BI__builtin_ia32_shufpd256:
1643 case X86::BI__builtin_ia32_shufpd512:
1644 case X86::BI__builtin_ia32_shufps:
1645 case X86::BI__builtin_ia32_shufps256:
1646 case X86::BI__builtin_ia32_shufps512: {
1647 uint32_t Imm = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue();
1648 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1649 unsigned NumElts = Ty->getNumElements();
1650 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
1651 unsigned NumLaneElts = NumElts / NumLanes;
1652
1653 // Splat the 8 bits of the immediate four times to help the loop wrap around.
1654 Imm = (Imm & 0xff) * 0x01010101;
1655
1656 int Indices[16];
1657 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1658 for (unsigned i = 0; i != NumLaneElts; ++i) {
1659 unsigned Index = Imm % NumLaneElts;
1660 Imm /= NumLaneElts;
1661 if (i >= (NumLaneElts / 2))
1662 Index += NumElts;
1663 Indices[l + i] = l + Index;
1664 }
1665 }
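// E.g. shufps with Imm = 0x44 on 4 x float produces <0, 1, 4, 5>: the low
// half of each lane comes from Ops[0] and the high half from Ops[1].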
1666
1667 return Builder.CreateShuffleVector(V1: Ops[0], V2: Ops[1],
1668 Mask: ArrayRef(Indices, NumElts), Name: "shufp");
1669 }
1670 case X86::BI__builtin_ia32_permdi256:
1671 case X86::BI__builtin_ia32_permdf256:
1672 case X86::BI__builtin_ia32_permdi512:
1673 case X86::BI__builtin_ia32_permdf512: {
1674 unsigned Imm = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue();
1675 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1676 unsigned NumElts = Ty->getNumElements();
1677
1678 // These intrinsics operate on 256-bit lanes of four 64-bit elements.
1679 int Indices[8];
1680 for (unsigned l = 0; l != NumElts; l += 4)
1681 for (unsigned i = 0; i != 4; ++i)
1682 Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
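// E.g. with Imm = 0x1B this produces <3, 2, 1, 0>, reversing the four 64-bit
// elements of each 256-bit lane.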
1683
1684 return Builder.CreateShuffleVector(V: Ops[0], Mask: ArrayRef(Indices, NumElts),
1685 Name: "perm");
1686 }
1687 case X86::BI__builtin_ia32_palignr128:
1688 case X86::BI__builtin_ia32_palignr256:
1689 case X86::BI__builtin_ia32_palignr512: {
1690 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0xff;
1691
1692 unsigned NumElts =
1693 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1694 assert(NumElts % 16 == 0);
1695
1696 // If palignr is shifting the pair of vectors more than the size of two
1697 // lanes, emit zero.
1698 if (ShiftVal >= 32)
1699 return llvm::Constant::getNullValue(Ty: ConvertType(E->getType()));
1700
1701 // If palignr is shifting the pair of input vectors more than one lane,
1702 // but less than two lanes, convert to shifting in zeroes.
1703 if (ShiftVal > 16) {
1704 ShiftVal -= 16;
1705 Ops[1] = Ops[0];
1706 Ops[0] = llvm::Constant::getNullValue(Ty: Ops[0]->getType());
1707 }
1708
1709 int Indices[64];
1710 // 256/512-bit palignr operates on 128-bit lanes, so handle each lane separately.
1711 for (unsigned l = 0; l != NumElts; l += 16) {
1712 for (unsigned i = 0; i != 16; ++i) {
1713 unsigned Idx = ShiftVal + i;
1714 if (Idx >= 16)
1715 Idx += NumElts - 16; // End of lane, switch operand.
1716 Indices[l + i] = Idx + l;
1717 }
1718 }
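// E.g. a 128-bit palignr with ShiftVal = 4 selects bytes 4..15 of Ops[1]
// followed by bytes 0..3 of Ops[0], matching a right shift of the
// concatenation Ops[0]:Ops[1] by 4 bytes.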
1719
1720 return Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[0],
1721 Mask: ArrayRef(Indices, NumElts), Name: "palignr");
1722 }
1723 case X86::BI__builtin_ia32_alignd128:
1724 case X86::BI__builtin_ia32_alignd256:
1725 case X86::BI__builtin_ia32_alignd512:
1726 case X86::BI__builtin_ia32_alignq128:
1727 case X86::BI__builtin_ia32_alignq256:
1728 case X86::BI__builtin_ia32_alignq512: {
1729 unsigned NumElts =
1730 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1731 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0xff;
1732
1733 // Mask the shift amount to the width of the vector.
1734 ShiftVal &= NumElts - 1;
1735
1736 int Indices[16];
1737 for (unsigned i = 0; i != NumElts; ++i)
1738 Indices[i] = i + ShiftVal;
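// E.g. valignd on 4 x i32 with ShiftVal = 3 produces <3, 4, 5, 6>, i.e.
// Ops[1][3] followed by Ops[0][0..2].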
1739
1740 return Builder.CreateShuffleVector(V1: Ops[1], V2: Ops[0],
1741 Mask: ArrayRef(Indices, NumElts), Name: "valign");
1742 }
1743 case X86::BI__builtin_ia32_shuf_f32x4_256:
1744 case X86::BI__builtin_ia32_shuf_f64x2_256:
1745 case X86::BI__builtin_ia32_shuf_i32x4_256:
1746 case X86::BI__builtin_ia32_shuf_i64x2_256:
1747 case X86::BI__builtin_ia32_shuf_f32x4:
1748 case X86::BI__builtin_ia32_shuf_f64x2:
1749 case X86::BI__builtin_ia32_shuf_i32x4:
1750 case X86::BI__builtin_ia32_shuf_i64x2: {
1751 unsigned Imm = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue();
1752 auto *Ty = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1753 unsigned NumElts = Ty->getNumElements();
1754 unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
1755 unsigned NumLaneElts = NumElts / NumLanes;
1756
1757 int Indices[16];
1758 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1759 unsigned Index = (Imm % NumLanes) * NumLaneElts;
1760 Imm /= NumLanes; // Discard the bits we just used.
1761 if (l >= (NumElts / 2))
1762 Index += NumElts; // Switch to other source.
1763 for (unsigned i = 0; i != NumLaneElts; ++i) {
1764 Indices[l + i] = Index + i;
1765 }
1766 }
1767
1768 return Builder.CreateShuffleVector(V1: Ops[0], V2: Ops[1],
1769 Mask: ArrayRef(Indices, NumElts), Name: "shuf");
1770 }
1771
1772 case X86::BI__builtin_ia32_vperm2f128_pd256:
1773 case X86::BI__builtin_ia32_vperm2f128_ps256:
1774 case X86::BI__builtin_ia32_vperm2f128_si256:
1775 case X86::BI__builtin_ia32_permti256: {
1776 unsigned Imm = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue();
1777 unsigned NumElts =
1778 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
1779
1780 // This takes a very simple approach since there are two lanes and a
1781 // shuffle can have 2 inputs. So we reserve the first input for the first
1782 // lane and the second input for the second lane. This may result in
1783 // duplicate sources, but this can be dealt with in the backend.
1784
1785 Value *OutOps[2];
1786 int Indices[8];
1787 for (unsigned l = 0; l != 2; ++l) {
1788 // Determine the source for this lane.
1789 if (Imm & (1 << ((l * 4) + 3)))
1790 OutOps[l] = llvm::ConstantAggregateZero::get(Ty: Ops[0]->getType());
1791 else if (Imm & (1 << ((l * 4) + 1)))
1792 OutOps[l] = Ops[1];
1793 else
1794 OutOps[l] = Ops[0];
1795
1796 for (unsigned i = 0; i != NumElts/2; ++i) {
1797 // Start with ith element of the source for this lane.
1798 unsigned Idx = (l * NumElts) + i;
1799 // If bit 0 of the immediate half is set, switch to the high half of
1800 // the source.
1801 if (Imm & (1 << (l * 4)))
1802 Idx += NumElts/2;
1803 Indices[(l * (NumElts/2)) + i] = Idx;
1804 }
1805 }
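// E.g. with Imm = 0x21 the low result lane is the high half of Ops[0] and
// the high result lane is the low half of Ops[1].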
1806
1807 return Builder.CreateShuffleVector(V1: OutOps[0], V2: OutOps[1],
1808 Mask: ArrayRef(Indices, NumElts), Name: "vperm");
1809 }
1810
1811 case X86::BI__builtin_ia32_pslldqi128_byteshift:
1812 case X86::BI__builtin_ia32_pslldqi256_byteshift:
1813 case X86::BI__builtin_ia32_pslldqi512_byteshift: {
1814 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue() & 0xff;
1815 auto *ResultType = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1816 // The builtin's operand type is vXi64, so multiply by 8 to get the byte count.
1817 unsigned NumElts = ResultType->getNumElements() * 8;
1818
1819 // If pslldq is shifting the vector more than 15 bytes, emit zero.
1820 if (ShiftVal >= 16)
1821 return llvm::Constant::getNullValue(Ty: ResultType);
1822
1823 int Indices[64];
1824 // 256/512-bit pslldq operates on 128-bit lanes, so handle each lane separately.
1825 for (unsigned l = 0; l != NumElts; l += 16) {
1826 for (unsigned i = 0; i != 16; ++i) {
1827 unsigned Idx = NumElts + i - ShiftVal;
1828 if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
1829 Indices[l + i] = Idx + l;
1830 }
1831 }
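// E.g. a byte shift of 4 turns each 128-bit lane into
// <0, 0, 0, 0, x[0], ..., x[11]>: the low four bytes are taken from the zero
// vector and the rest from the start of the lane.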
1832
1833 auto *VecTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts);
1834 Value *Cast = Builder.CreateBitCast(V: Ops[0], DestTy: VecTy, Name: "cast");
1835 Value *Zero = llvm::Constant::getNullValue(Ty: VecTy);
1836 Value *SV = Builder.CreateShuffleVector(
1837 V1: Zero, V2: Cast, Mask: ArrayRef(Indices, NumElts), Name: "pslldq");
1838 return Builder.CreateBitCast(V: SV, DestTy: Ops[0]->getType(), Name: "cast");
1839 }
1840 case X86::BI__builtin_ia32_psrldqi128_byteshift:
1841 case X86::BI__builtin_ia32_psrldqi256_byteshift:
1842 case X86::BI__builtin_ia32_psrldqi512_byteshift: {
1843 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue() & 0xff;
1844 auto *ResultType = cast<llvm::FixedVectorType>(Val: Ops[0]->getType());
1845 // The builtin's operand type is vXi64, so multiply by 8 to get the byte count.
1846 unsigned NumElts = ResultType->getNumElements() * 8;
1847
1848 // If psrldq is shifting the vector more than 15 bytes, emit zero.
1849 if (ShiftVal >= 16)
1850 return llvm::Constant::getNullValue(Ty: ResultType);
1851
1852 int Indices[64];
1853 // 256/512-bit psrldq operates on 128-bit lanes, so handle each lane separately.
1854 for (unsigned l = 0; l != NumElts; l += 16) {
1855 for (unsigned i = 0; i != 16; ++i) {
1856 unsigned Idx = i + ShiftVal;
1857 if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
1858 Indices[l + i] = Idx + l;
1859 }
1860 }
1861
1862 auto *VecTy = llvm::FixedVectorType::get(ElementType: Int8Ty, NumElts);
1863 Value *Cast = Builder.CreateBitCast(V: Ops[0], DestTy: VecTy, Name: "cast");
1864 Value *Zero = llvm::Constant::getNullValue(Ty: VecTy);
1865 Value *SV = Builder.CreateShuffleVector(
1866 V1: Cast, V2: Zero, Mask: ArrayRef(Indices, NumElts), Name: "psrldq");
1867 return Builder.CreateBitCast(V: SV, DestTy: ResultType, Name: "cast");
1868 }
1869 case X86::BI__builtin_ia32_kshiftliqi:
1870 case X86::BI__builtin_ia32_kshiftlihi:
1871 case X86::BI__builtin_ia32_kshiftlisi:
1872 case X86::BI__builtin_ia32_kshiftlidi: {
1873 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue() & 0xff;
1874 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
1875
1876 if (ShiftVal >= NumElts)
1877 return llvm::Constant::getNullValue(Ty: Ops[0]->getType());
1878
1879 Value *In = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
1880
1881 int Indices[64];
1882 for (unsigned i = 0; i != NumElts; ++i)
1883 Indices[i] = NumElts + i - ShiftVal;
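// E.g. an 8-bit mask shifted left by 2 yields indices <6, 7, ..., 13>: the
// low two result bits come from the zero vector and the rest from In[0..5].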
1884
1885 Value *Zero = llvm::Constant::getNullValue(Ty: In->getType());
1886 Value *SV = Builder.CreateShuffleVector(
1887 V1: Zero, V2: In, Mask: ArrayRef(Indices, NumElts), Name: "kshiftl");
1888 return Builder.CreateBitCast(V: SV, DestTy: Ops[0]->getType());
1889 }
1890 case X86::BI__builtin_ia32_kshiftriqi:
1891 case X86::BI__builtin_ia32_kshiftrihi:
1892 case X86::BI__builtin_ia32_kshiftrisi:
1893 case X86::BI__builtin_ia32_kshiftridi: {
1894 unsigned ShiftVal = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue() & 0xff;
1895 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
1896
1897 if (ShiftVal >= NumElts)
1898 return llvm::Constant::getNullValue(Ty: Ops[0]->getType());
1899
1900 Value *In = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
1901
1902 int Indices[64];
1903 for (unsigned i = 0; i != NumElts; ++i)
1904 Indices[i] = i + ShiftVal;
1905
1906 Value *Zero = llvm::Constant::getNullValue(Ty: In->getType());
1907 Value *SV = Builder.CreateShuffleVector(
1908 V1: In, V2: Zero, Mask: ArrayRef(Indices, NumElts), Name: "kshiftr");
1909 return Builder.CreateBitCast(V: SV, DestTy: Ops[0]->getType());
1910 }
1911 case X86::BI__builtin_ia32_movnti:
1912 case X86::BI__builtin_ia32_movnti64:
1913 case X86::BI__builtin_ia32_movntsd:
1914 case X86::BI__builtin_ia32_movntss: {
1915 llvm::MDNode *Node = llvm::MDNode::get(
1916 Context&: getLLVMContext(), MDs: llvm::ConstantAsMetadata::get(C: Builder.getInt32(C: 1)));
1917
1918 Value *Ptr = Ops[0];
1919 Value *Src = Ops[1];
1920
1921 // Extract the 0'th element of the source vector.
1922 if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
1923 BuiltinID == X86::BI__builtin_ia32_movntss)
1924 Src = Builder.CreateExtractElement(Vec: Src, Idx: (uint64_t)0, Name: "extract");
1925
1926 // Unaligned nontemporal store of the scalar value.
1927 StoreInst *SI = Builder.CreateDefaultAlignedStore(Val: Src, Addr: Ptr);
1928 SI->setMetadata(KindID: llvm::LLVMContext::MD_nontemporal, Node);
1929 SI->setAlignment(llvm::Align(1));
1930 return SI;
1931 }
1932 // Rotate is a special case of funnel shift: the first two arguments are the same.
1933 case X86::BI__builtin_ia32_vprotb:
1934 case X86::BI__builtin_ia32_vprotw:
1935 case X86::BI__builtin_ia32_vprotd:
1936 case X86::BI__builtin_ia32_vprotq:
1937 case X86::BI__builtin_ia32_vprotbi:
1938 case X86::BI__builtin_ia32_vprotwi:
1939 case X86::BI__builtin_ia32_vprotdi:
1940 case X86::BI__builtin_ia32_vprotqi:
1941 case X86::BI__builtin_ia32_prold128:
1942 case X86::BI__builtin_ia32_prold256:
1943 case X86::BI__builtin_ia32_prold512:
1944 case X86::BI__builtin_ia32_prolq128:
1945 case X86::BI__builtin_ia32_prolq256:
1946 case X86::BI__builtin_ia32_prolq512:
1947 case X86::BI__builtin_ia32_prolvd128:
1948 case X86::BI__builtin_ia32_prolvd256:
1949 case X86::BI__builtin_ia32_prolvd512:
1950 case X86::BI__builtin_ia32_prolvq128:
1951 case X86::BI__builtin_ia32_prolvq256:
1952 case X86::BI__builtin_ia32_prolvq512:
1953 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[0], Op1: Ops[0], Amt: Ops[1], IsRight: false);
1954 case X86::BI__builtin_ia32_prord128:
1955 case X86::BI__builtin_ia32_prord256:
1956 case X86::BI__builtin_ia32_prord512:
1957 case X86::BI__builtin_ia32_prorq128:
1958 case X86::BI__builtin_ia32_prorq256:
1959 case X86::BI__builtin_ia32_prorq512:
1960 case X86::BI__builtin_ia32_prorvd128:
1961 case X86::BI__builtin_ia32_prorvd256:
1962 case X86::BI__builtin_ia32_prorvd512:
1963 case X86::BI__builtin_ia32_prorvq128:
1964 case X86::BI__builtin_ia32_prorvq256:
1965 case X86::BI__builtin_ia32_prorvq512:
1966 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[0], Op1: Ops[0], Amt: Ops[1], IsRight: true);
1967 case X86::BI__builtin_ia32_selectb_128:
1968 case X86::BI__builtin_ia32_selectb_256:
1969 case X86::BI__builtin_ia32_selectb_512:
1970 case X86::BI__builtin_ia32_selectw_128:
1971 case X86::BI__builtin_ia32_selectw_256:
1972 case X86::BI__builtin_ia32_selectw_512:
1973 case X86::BI__builtin_ia32_selectd_128:
1974 case X86::BI__builtin_ia32_selectd_256:
1975 case X86::BI__builtin_ia32_selectd_512:
1976 case X86::BI__builtin_ia32_selectq_128:
1977 case X86::BI__builtin_ia32_selectq_256:
1978 case X86::BI__builtin_ia32_selectq_512:
1979 case X86::BI__builtin_ia32_selectph_128:
1980 case X86::BI__builtin_ia32_selectph_256:
1981 case X86::BI__builtin_ia32_selectph_512:
1982 case X86::BI__builtin_ia32_selectpbf_128:
1983 case X86::BI__builtin_ia32_selectpbf_256:
1984 case X86::BI__builtin_ia32_selectpbf_512:
1985 case X86::BI__builtin_ia32_selectps_128:
1986 case X86::BI__builtin_ia32_selectps_256:
1987 case X86::BI__builtin_ia32_selectps_512:
1988 case X86::BI__builtin_ia32_selectpd_128:
1989 case X86::BI__builtin_ia32_selectpd_256:
1990 case X86::BI__builtin_ia32_selectpd_512:
1991 return EmitX86Select(CGF&: *this, Mask: Ops[0], Op0: Ops[1], Op1: Ops[2]);
1992 case X86::BI__builtin_ia32_selectsh_128:
1993 case X86::BI__builtin_ia32_selectsbf_128:
1994 case X86::BI__builtin_ia32_selectss_128:
1995 case X86::BI__builtin_ia32_selectsd_128: {
1996 Value *A = Builder.CreateExtractElement(Vec: Ops[1], Idx: (uint64_t)0);
1997 Value *B = Builder.CreateExtractElement(Vec: Ops[2], Idx: (uint64_t)0);
1998 A = EmitX86ScalarSelect(CGF&: *this, Mask: Ops[0], Op0: A, Op1: B);
1999 return Builder.CreateInsertElement(Vec: Ops[1], NewElt: A, Idx: (uint64_t)0);
2000 }
2001 case X86::BI__builtin_ia32_cmpb128_mask:
2002 case X86::BI__builtin_ia32_cmpb256_mask:
2003 case X86::BI__builtin_ia32_cmpb512_mask:
2004 case X86::BI__builtin_ia32_cmpw128_mask:
2005 case X86::BI__builtin_ia32_cmpw256_mask:
2006 case X86::BI__builtin_ia32_cmpw512_mask:
2007 case X86::BI__builtin_ia32_cmpd128_mask:
2008 case X86::BI__builtin_ia32_cmpd256_mask:
2009 case X86::BI__builtin_ia32_cmpd512_mask:
2010 case X86::BI__builtin_ia32_cmpq128_mask:
2011 case X86::BI__builtin_ia32_cmpq256_mask:
2012 case X86::BI__builtin_ia32_cmpq512_mask: {
2013 unsigned CC = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0x7;
2014 return EmitX86MaskedCompare(CGF&: *this, CC, Signed: true, Ops);
2015 }
2016 case X86::BI__builtin_ia32_ucmpb128_mask:
2017 case X86::BI__builtin_ia32_ucmpb256_mask:
2018 case X86::BI__builtin_ia32_ucmpb512_mask:
2019 case X86::BI__builtin_ia32_ucmpw128_mask:
2020 case X86::BI__builtin_ia32_ucmpw256_mask:
2021 case X86::BI__builtin_ia32_ucmpw512_mask:
2022 case X86::BI__builtin_ia32_ucmpd128_mask:
2023 case X86::BI__builtin_ia32_ucmpd256_mask:
2024 case X86::BI__builtin_ia32_ucmpd512_mask:
2025 case X86::BI__builtin_ia32_ucmpq128_mask:
2026 case X86::BI__builtin_ia32_ucmpq256_mask:
2027 case X86::BI__builtin_ia32_ucmpq512_mask: {
2028 unsigned CC = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0x7;
2029 return EmitX86MaskedCompare(CGF&: *this, CC, Signed: false, Ops);
2030 }
2031 case X86::BI__builtin_ia32_vpcomb:
2032 case X86::BI__builtin_ia32_vpcomw:
2033 case X86::BI__builtin_ia32_vpcomd:
2034 case X86::BI__builtin_ia32_vpcomq:
2035 return EmitX86vpcom(CGF&: *this, Ops, IsSigned: true);
2036 case X86::BI__builtin_ia32_vpcomub:
2037 case X86::BI__builtin_ia32_vpcomuw:
2038 case X86::BI__builtin_ia32_vpcomud:
2039 case X86::BI__builtin_ia32_vpcomuq:
2040 return EmitX86vpcom(CGF&: *this, Ops, IsSigned: false);
2041
2042 case X86::BI__builtin_ia32_kortestcqi:
2043 case X86::BI__builtin_ia32_kortestchi:
2044 case X86::BI__builtin_ia32_kortestcsi:
2045 case X86::BI__builtin_ia32_kortestcdi: {
2046 Value *Or = EmitX86MaskLogic(CGF&: *this, Opc: Instruction::Or, Ops);
2047 Value *C = llvm::Constant::getAllOnesValue(Ty: Ops[0]->getType());
2048 Value *Cmp = Builder.CreateICmpEQ(LHS: Or, RHS: C);
2049 return Builder.CreateZExt(V: Cmp, DestTy: ConvertType(E->getType()));
2050 }
2051 case X86::BI__builtin_ia32_kortestzqi:
2052 case X86::BI__builtin_ia32_kortestzhi:
2053 case X86::BI__builtin_ia32_kortestzsi:
2054 case X86::BI__builtin_ia32_kortestzdi: {
2055 Value *Or = EmitX86MaskLogic(CGF&: *this, Opc: Instruction::Or, Ops);
2056 Value *C = llvm::Constant::getNullValue(Ty: Ops[0]->getType());
2057 Value *Cmp = Builder.CreateICmpEQ(LHS: Or, RHS: C);
2058 return Builder.CreateZExt(V: Cmp, DestTy: ConvertType(E->getType()));
2059 }
2060
2061 case X86::BI__builtin_ia32_ktestcqi:
2062 case X86::BI__builtin_ia32_ktestzqi:
2063 case X86::BI__builtin_ia32_ktestchi:
2064 case X86::BI__builtin_ia32_ktestzhi:
2065 case X86::BI__builtin_ia32_ktestcsi:
2066 case X86::BI__builtin_ia32_ktestzsi:
2067 case X86::BI__builtin_ia32_ktestcdi:
2068 case X86::BI__builtin_ia32_ktestzdi: {
2069 Intrinsic::ID IID;
2070 switch (BuiltinID) {
2071 default: llvm_unreachable("Unsupported intrinsic!");
2072 case X86::BI__builtin_ia32_ktestcqi:
2073 IID = Intrinsic::x86_avx512_ktestc_b;
2074 break;
2075 case X86::BI__builtin_ia32_ktestzqi:
2076 IID = Intrinsic::x86_avx512_ktestz_b;
2077 break;
2078 case X86::BI__builtin_ia32_ktestchi:
2079 IID = Intrinsic::x86_avx512_ktestc_w;
2080 break;
2081 case X86::BI__builtin_ia32_ktestzhi:
2082 IID = Intrinsic::x86_avx512_ktestz_w;
2083 break;
2084 case X86::BI__builtin_ia32_ktestcsi:
2085 IID = Intrinsic::x86_avx512_ktestc_d;
2086 break;
2087 case X86::BI__builtin_ia32_ktestzsi:
2088 IID = Intrinsic::x86_avx512_ktestz_d;
2089 break;
2090 case X86::BI__builtin_ia32_ktestcdi:
2091 IID = Intrinsic::x86_avx512_ktestc_q;
2092 break;
2093 case X86::BI__builtin_ia32_ktestzdi:
2094 IID = Intrinsic::x86_avx512_ktestz_q;
2095 break;
2096 }
2097
2098 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2099 Value *LHS = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
2100 Value *RHS = getMaskVecValue(CGF&: *this, Mask: Ops[1], NumElts);
2101 Function *Intr = CGM.getIntrinsic(IID);
2102 return Builder.CreateCall(Callee: Intr, Args: {LHS, RHS});
2103 }
2104
2105 case X86::BI__builtin_ia32_kaddqi:
2106 case X86::BI__builtin_ia32_kaddhi:
2107 case X86::BI__builtin_ia32_kaddsi:
2108 case X86::BI__builtin_ia32_kadddi: {
2109 Intrinsic::ID IID;
2110 switch (BuiltinID) {
2111 default: llvm_unreachable("Unsupported intrinsic!");
2112 case X86::BI__builtin_ia32_kaddqi:
2113 IID = Intrinsic::x86_avx512_kadd_b;
2114 break;
2115 case X86::BI__builtin_ia32_kaddhi:
2116 IID = Intrinsic::x86_avx512_kadd_w;
2117 break;
2118 case X86::BI__builtin_ia32_kaddsi:
2119 IID = Intrinsic::x86_avx512_kadd_d;
2120 break;
2121 case X86::BI__builtin_ia32_kadddi:
2122 IID = Intrinsic::x86_avx512_kadd_q;
2123 break;
2124 }
2125
2126 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2127 Value *LHS = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
2128 Value *RHS = getMaskVecValue(CGF&: *this, Mask: Ops[1], NumElts);
2129 Function *Intr = CGM.getIntrinsic(IID);
2130 Value *Res = Builder.CreateCall(Callee: Intr, Args: {LHS, RHS});
2131 return Builder.CreateBitCast(V: Res, DestTy: Ops[0]->getType());
2132 }
2133 case X86::BI__builtin_ia32_kandqi:
2134 case X86::BI__builtin_ia32_kandhi:
2135 case X86::BI__builtin_ia32_kandsi:
2136 case X86::BI__builtin_ia32_kanddi:
2137 return EmitX86MaskLogic(CGF&: *this, Opc: Instruction::And, Ops);
2138 case X86::BI__builtin_ia32_kandnqi:
2139 case X86::BI__builtin_ia32_kandnhi:
2140 case X86::BI__builtin_ia32_kandnsi:
2141 case X86::BI__builtin_ia32_kandndi:
2142 return EmitX86MaskLogic(CGF&: *this, Opc: Instruction::And, Ops, InvertLHS: true);
2143 case X86::BI__builtin_ia32_korqi:
2144 case X86::BI__builtin_ia32_korhi:
2145 case X86::BI__builtin_ia32_korsi:
2146 case X86::BI__builtin_ia32_kordi:
2147 return EmitX86MaskLogic(CGF&: *this, Opc: Instruction::Or, Ops);
2148 case X86::BI__builtin_ia32_kxnorqi:
2149 case X86::BI__builtin_ia32_kxnorhi:
2150 case X86::BI__builtin_ia32_kxnorsi:
2151 case X86::BI__builtin_ia32_kxnordi:
2152 return EmitX86MaskLogic(CGF&: *this, Opc: Instruction::Xor, Ops, InvertLHS: true);
2153 case X86::BI__builtin_ia32_kxorqi:
2154 case X86::BI__builtin_ia32_kxorhi:
2155 case X86::BI__builtin_ia32_kxorsi:
2156 case X86::BI__builtin_ia32_kxordi:
2157 return EmitX86MaskLogic(CGF&: *this, Opc: Instruction::Xor, Ops);
2158 case X86::BI__builtin_ia32_knotqi:
2159 case X86::BI__builtin_ia32_knothi:
2160 case X86::BI__builtin_ia32_knotsi:
2161 case X86::BI__builtin_ia32_knotdi: {
2162 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2163 Value *Res = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
2164 return Builder.CreateBitCast(V: Builder.CreateNot(V: Res),
2165 DestTy: Ops[0]->getType());
2166 }
2167 case X86::BI__builtin_ia32_kmovb:
2168 case X86::BI__builtin_ia32_kmovw:
2169 case X86::BI__builtin_ia32_kmovd:
2170 case X86::BI__builtin_ia32_kmovq: {
2171 // Bitcast to vXi1 type and then back to integer. This gets the mask
2172 // register type into the IR, but it might be optimized away depending on
2173 // the surrounding code.
2174 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2175 Value *Res = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
2176 return Builder.CreateBitCast(V: Res, DestTy: Ops[0]->getType());
2177 }
2178
2179 case X86::BI__builtin_ia32_kunpckdi:
2180 case X86::BI__builtin_ia32_kunpcksi:
2181 case X86::BI__builtin_ia32_kunpckhi: {
2182 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2183 Value *LHS = getMaskVecValue(CGF&: *this, Mask: Ops[0], NumElts);
2184 Value *RHS = getMaskVecValue(CGF&: *this, Mask: Ops[1], NumElts);
2185 int Indices[64];
2186 for (unsigned i = 0; i != NumElts; ++i)
2187 Indices[i] = i;
2188
2189 // First extract half of each vector. This gives better codegen than
2190 // doing it in a single shuffle.
2191 LHS = Builder.CreateShuffleVector(V1: LHS, V2: LHS, Mask: ArrayRef(Indices, NumElts / 2));
2192 RHS = Builder.CreateShuffleVector(V1: RHS, V2: RHS, Mask: ArrayRef(Indices, NumElts / 2));
2193 // Concat the vectors.
2194 // NOTE: Operands are swapped to match the intrinsic definition.
2195 Value *Res =
2196 Builder.CreateShuffleVector(V1: RHS, V2: LHS, Mask: ArrayRef(Indices, NumElts));
2197 return Builder.CreateBitCast(V: Res, DestTy: Ops[0]->getType());
2198 }
2199
2200 case X86::BI__builtin_ia32_vplzcntd_128:
2201 case X86::BI__builtin_ia32_vplzcntd_256:
2202 case X86::BI__builtin_ia32_vplzcntd_512:
2203 case X86::BI__builtin_ia32_vplzcntq_128:
2204 case X86::BI__builtin_ia32_vplzcntq_256:
2205 case X86::BI__builtin_ia32_vplzcntq_512: {
2206 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
2207 return Builder.CreateCall(Callee: F, Args: {Ops[0],Builder.getInt1(V: false)});
2208 }
2209 case X86::BI__builtin_ia32_sqrtss:
2210 case X86::BI__builtin_ia32_sqrtsd: {
2211 Value *A = Builder.CreateExtractElement(Vec: Ops[0], Idx: (uint64_t)0);
2212 Function *F;
2213 if (Builder.getIsFPConstrained()) {
2214 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2215 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2216 A->getType());
2217 A = Builder.CreateConstrainedFPCall(Callee: F, Args: {A});
2218 } else {
2219 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
2220 A = Builder.CreateCall(Callee: F, Args: {A});
2221 }
2222 return Builder.CreateInsertElement(Vec: Ops[0], NewElt: A, Idx: (uint64_t)0);
2223 }
2224 case X86::BI__builtin_ia32_sqrtsh_round_mask:
2225 case X86::BI__builtin_ia32_sqrtsd_round_mask:
2226 case X86::BI__builtin_ia32_sqrtss_round_mask: {
2227 unsigned CC = cast<llvm::ConstantInt>(Val: Ops[4])->getZExtValue();
2228 // Lower to generic IR only if the rounding mode is 4 (i.e. CUR_DIRECTION);
2229 // otherwise keep the target-specific intrinsic.
2230 if (CC != 4) {
2231 Intrinsic::ID IID;
2232
2233 switch (BuiltinID) {
2234 default:
2235 llvm_unreachable("Unsupported intrinsic!");
2236 case X86::BI__builtin_ia32_sqrtsh_round_mask:
2237 IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
2238 break;
2239 case X86::BI__builtin_ia32_sqrtsd_round_mask:
2240 IID = Intrinsic::x86_avx512_mask_sqrt_sd;
2241 break;
2242 case X86::BI__builtin_ia32_sqrtss_round_mask:
2243 IID = Intrinsic::x86_avx512_mask_sqrt_ss;
2244 break;
2245 }
2246 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
2247 }
2248 Value *A = Builder.CreateExtractElement(Vec: Ops[1], Idx: (uint64_t)0);
2249 Function *F;
2250 if (Builder.getIsFPConstrained()) {
2251 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2252 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2253 A->getType());
2254 A = Builder.CreateConstrainedFPCall(Callee: F, Args: A);
2255 } else {
2256 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
2257 A = Builder.CreateCall(Callee: F, Args: A);
2258 }
2259 Value *Src = Builder.CreateExtractElement(Vec: Ops[2], Idx: (uint64_t)0);
2260 A = EmitX86ScalarSelect(CGF&: *this, Mask: Ops[3], Op0: A, Op1: Src);
2261 return Builder.CreateInsertElement(Vec: Ops[0], NewElt: A, Idx: (uint64_t)0);
2262 }
2263 case X86::BI__builtin_ia32_sqrtpd256:
2264 case X86::BI__builtin_ia32_sqrtpd:
2265 case X86::BI__builtin_ia32_sqrtps256:
2266 case X86::BI__builtin_ia32_sqrtps:
2267 case X86::BI__builtin_ia32_sqrtph256:
2268 case X86::BI__builtin_ia32_sqrtph:
2269 case X86::BI__builtin_ia32_sqrtph512:
2270 case X86::BI__builtin_ia32_vsqrtbf16256:
2271 case X86::BI__builtin_ia32_vsqrtbf16:
2272 case X86::BI__builtin_ia32_vsqrtbf16512:
2273 case X86::BI__builtin_ia32_sqrtps512:
2274 case X86::BI__builtin_ia32_sqrtpd512: {
2275 if (Ops.size() == 2) {
2276 unsigned CC = cast<llvm::ConstantInt>(Val: Ops[1])->getZExtValue();
2277 // Lower to generic IR only if the rounding mode is 4 (i.e. CUR_DIRECTION);
2278 // otherwise keep the target-specific intrinsic.
2279 if (CC != 4) {
2280 Intrinsic::ID IID;
2281
2282 switch (BuiltinID) {
2283 default:
2284 llvm_unreachable("Unsupported intrinsic!");
2285 case X86::BI__builtin_ia32_sqrtph512:
2286 IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
2287 break;
2288 case X86::BI__builtin_ia32_sqrtps512:
2289 IID = Intrinsic::x86_avx512_sqrt_ps_512;
2290 break;
2291 case X86::BI__builtin_ia32_sqrtpd512:
2292 IID = Intrinsic::x86_avx512_sqrt_pd_512;
2293 break;
2294 }
2295 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
2296 }
2297 }
2298 if (Builder.getIsFPConstrained()) {
2299 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2300 Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2301 Ops[0]->getType());
2302 return Builder.CreateConstrainedFPCall(Callee: F, Args: Ops[0]);
2303 } else {
2304 Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
2305 return Builder.CreateCall(Callee: F, Args: Ops[0]);
2306 }
2307 }
2308
2309 case X86::BI__builtin_ia32_pmuludq128:
2310 case X86::BI__builtin_ia32_pmuludq256:
2311 case X86::BI__builtin_ia32_pmuludq512:
2312 return EmitX86Muldq(CGF&: *this, /*IsSigned*/false, Ops);
2313
2314 case X86::BI__builtin_ia32_pmuldq128:
2315 case X86::BI__builtin_ia32_pmuldq256:
2316 case X86::BI__builtin_ia32_pmuldq512:
2317 return EmitX86Muldq(CGF&: *this, /*IsSigned*/true, Ops);
2318
2319 case X86::BI__builtin_ia32_pternlogd512_mask:
2320 case X86::BI__builtin_ia32_pternlogq512_mask:
2321 case X86::BI__builtin_ia32_pternlogd128_mask:
2322 case X86::BI__builtin_ia32_pternlogd256_mask:
2323 case X86::BI__builtin_ia32_pternlogq128_mask:
2324 case X86::BI__builtin_ia32_pternlogq256_mask:
2325 return EmitX86Ternlog(CGF&: *this, /*ZeroMask*/false, Ops);
2326
2327 case X86::BI__builtin_ia32_pternlogd512_maskz:
2328 case X86::BI__builtin_ia32_pternlogq512_maskz:
2329 case X86::BI__builtin_ia32_pternlogd128_maskz:
2330 case X86::BI__builtin_ia32_pternlogd256_maskz:
2331 case X86::BI__builtin_ia32_pternlogq128_maskz:
2332 case X86::BI__builtin_ia32_pternlogq256_maskz:
2333 return EmitX86Ternlog(CGF&: *this, /*ZeroMask*/true, Ops);
2334
2335 case X86::BI__builtin_ia32_vpshldd128:
2336 case X86::BI__builtin_ia32_vpshldd256:
2337 case X86::BI__builtin_ia32_vpshldd512:
2338 case X86::BI__builtin_ia32_vpshldq128:
2339 case X86::BI__builtin_ia32_vpshldq256:
2340 case X86::BI__builtin_ia32_vpshldq512:
2341 case X86::BI__builtin_ia32_vpshldw128:
2342 case X86::BI__builtin_ia32_vpshldw256:
2343 case X86::BI__builtin_ia32_vpshldw512:
2344 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[0], Op1: Ops[1], Amt: Ops[2], IsRight: false);
2345
2346 case X86::BI__builtin_ia32_vpshrdd128:
2347 case X86::BI__builtin_ia32_vpshrdd256:
2348 case X86::BI__builtin_ia32_vpshrdd512:
2349 case X86::BI__builtin_ia32_vpshrdq128:
2350 case X86::BI__builtin_ia32_vpshrdq256:
2351 case X86::BI__builtin_ia32_vpshrdq512:
2352 case X86::BI__builtin_ia32_vpshrdw128:
2353 case X86::BI__builtin_ia32_vpshrdw256:
2354 case X86::BI__builtin_ia32_vpshrdw512:
2355 // Ops 0 and 1 are swapped.
2356 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[1], Op1: Ops[0], Amt: Ops[2], IsRight: true);
2357
2358 case X86::BI__builtin_ia32_vpshldvd128:
2359 case X86::BI__builtin_ia32_vpshldvd256:
2360 case X86::BI__builtin_ia32_vpshldvd512:
2361 case X86::BI__builtin_ia32_vpshldvq128:
2362 case X86::BI__builtin_ia32_vpshldvq256:
2363 case X86::BI__builtin_ia32_vpshldvq512:
2364 case X86::BI__builtin_ia32_vpshldvw128:
2365 case X86::BI__builtin_ia32_vpshldvw256:
2366 case X86::BI__builtin_ia32_vpshldvw512:
2367 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[0], Op1: Ops[1], Amt: Ops[2], IsRight: false);
2368
2369 case X86::BI__builtin_ia32_vpshrdvd128:
2370 case X86::BI__builtin_ia32_vpshrdvd256:
2371 case X86::BI__builtin_ia32_vpshrdvd512:
2372 case X86::BI__builtin_ia32_vpshrdvq128:
2373 case X86::BI__builtin_ia32_vpshrdvq256:
2374 case X86::BI__builtin_ia32_vpshrdvq512:
2375 case X86::BI__builtin_ia32_vpshrdvw128:
2376 case X86::BI__builtin_ia32_vpshrdvw256:
2377 case X86::BI__builtin_ia32_vpshrdvw512:
2378 // Ops 0 and 1 are swapped.
2379 return EmitX86FunnelShift(CGF&: *this, Op0: Ops[1], Op1: Ops[0], Amt: Ops[2], IsRight: true);
2380
2381 // Reductions
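// These map the reduce_* builtins onto the generic vector reduction
// intrinsics. For fadd/fmul, Ops[0] is the scalar start value; reassociation
// (or no-NaNs for fmax/fmin) is enabled so the backend may use an unordered
// tree reduction.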
2382 case X86::BI__builtin_ia32_reduce_fadd_pd512:
2383 case X86::BI__builtin_ia32_reduce_fadd_ps512:
2384 case X86::BI__builtin_ia32_reduce_fadd_ph512:
2385 case X86::BI__builtin_ia32_reduce_fadd_ph256:
2386 case X86::BI__builtin_ia32_reduce_fadd_ph128: {
2387 Function *F =
2388 CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
2389 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2390 Builder.getFastMathFlags().setAllowReassoc();
2391 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1]});
2392 }
2393 case X86::BI__builtin_ia32_reduce_fmul_pd512:
2394 case X86::BI__builtin_ia32_reduce_fmul_ps512:
2395 case X86::BI__builtin_ia32_reduce_fmul_ph512:
2396 case X86::BI__builtin_ia32_reduce_fmul_ph256:
2397 case X86::BI__builtin_ia32_reduce_fmul_ph128: {
2398 Function *F =
2399 CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
2400 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2401 Builder.getFastMathFlags().setAllowReassoc();
2402 return Builder.CreateCall(Callee: F, Args: {Ops[0], Ops[1]});
2403 }
2404 case X86::BI__builtin_ia32_reduce_fmax_pd512:
2405 case X86::BI__builtin_ia32_reduce_fmax_ps512:
2406 case X86::BI__builtin_ia32_reduce_fmax_ph512:
2407 case X86::BI__builtin_ia32_reduce_fmax_ph256:
2408 case X86::BI__builtin_ia32_reduce_fmax_ph128: {
2409 Function *F =
2410 CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
2411 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2412 Builder.getFastMathFlags().setNoNaNs();
2413 return Builder.CreateCall(Callee: F, Args: {Ops[0]});
2414 }
2415 case X86::BI__builtin_ia32_reduce_fmin_pd512:
2416 case X86::BI__builtin_ia32_reduce_fmin_ps512:
2417 case X86::BI__builtin_ia32_reduce_fmin_ph512:
2418 case X86::BI__builtin_ia32_reduce_fmin_ph256:
2419 case X86::BI__builtin_ia32_reduce_fmin_ph128: {
2420 Function *F =
2421 CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
2422 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2423 Builder.getFastMathFlags().setNoNaNs();
2424 return Builder.CreateCall(Callee: F, Args: {Ops[0]});
2425 }
2426
2427 case X86::BI__builtin_ia32_rdrand16_step:
2428 case X86::BI__builtin_ia32_rdrand32_step:
2429 case X86::BI__builtin_ia32_rdrand64_step:
2430 case X86::BI__builtin_ia32_rdseed16_step:
2431 case X86::BI__builtin_ia32_rdseed32_step:
2432 case X86::BI__builtin_ia32_rdseed64_step: {
2433 Intrinsic::ID ID;
2434 switch (BuiltinID) {
2435 default: llvm_unreachable("Unsupported intrinsic!");
2436 case X86::BI__builtin_ia32_rdrand16_step:
2437 ID = Intrinsic::x86_rdrand_16;
2438 break;
2439 case X86::BI__builtin_ia32_rdrand32_step:
2440 ID = Intrinsic::x86_rdrand_32;
2441 break;
2442 case X86::BI__builtin_ia32_rdrand64_step:
2443 ID = Intrinsic::x86_rdrand_64;
2444 break;
2445 case X86::BI__builtin_ia32_rdseed16_step:
2446 ID = Intrinsic::x86_rdseed_16;
2447 break;
2448 case X86::BI__builtin_ia32_rdseed32_step:
2449 ID = Intrinsic::x86_rdseed_32;
2450 break;
2451 case X86::BI__builtin_ia32_rdseed64_step:
2452 ID = Intrinsic::x86_rdseed_64;
2453 break;
2454 }
2455
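// The intrinsic returns { random value, success flag }; store the value
// through the out-pointer and return the flag.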
2456 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID));
2457 Builder.CreateDefaultAlignedStore(Val: Builder.CreateExtractValue(Agg: Call, Idxs: 0),
2458 Addr: Ops[0]);
2459 return Builder.CreateExtractValue(Agg: Call, Idxs: 1);
2460 }
2461 case X86::BI__builtin_ia32_addcarryx_u32:
2462 case X86::BI__builtin_ia32_addcarryx_u64:
2463 case X86::BI__builtin_ia32_subborrow_u32:
2464 case X86::BI__builtin_ia32_subborrow_u64: {
2465 Intrinsic::ID IID;
2466 switch (BuiltinID) {
2467 default: llvm_unreachable("Unsupported intrinsic!");
2468 case X86::BI__builtin_ia32_addcarryx_u32:
2469 IID = Intrinsic::x86_addcarry_32;
2470 break;
2471 case X86::BI__builtin_ia32_addcarryx_u64:
2472 IID = Intrinsic::x86_addcarry_64;
2473 break;
2474 case X86::BI__builtin_ia32_subborrow_u32:
2475 IID = Intrinsic::x86_subborrow_32;
2476 break;
2477 case X86::BI__builtin_ia32_subborrow_u64:
2478 IID = Intrinsic::x86_subborrow_64;
2479 break;
2480 }
2481
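// The intrinsic returns { carry-out, result }; store the result through the
// out-pointer (Ops[3]) and return the carry-out.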
2482 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID),
2483 Args: { Ops[0], Ops[1], Ops[2] });
2484 Builder.CreateDefaultAlignedStore(Val: Builder.CreateExtractValue(Agg: Call, Idxs: 1),
2485 Addr: Ops[3]);
2486 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
2487 }
2488
2489 case X86::BI__builtin_ia32_fpclassps128_mask:
2490 case X86::BI__builtin_ia32_fpclassps256_mask:
2491 case X86::BI__builtin_ia32_fpclassps512_mask:
2492 case X86::BI__builtin_ia32_vfpclassbf16128_mask:
2493 case X86::BI__builtin_ia32_vfpclassbf16256_mask:
2494 case X86::BI__builtin_ia32_vfpclassbf16512_mask:
2495 case X86::BI__builtin_ia32_fpclassph128_mask:
2496 case X86::BI__builtin_ia32_fpclassph256_mask:
2497 case X86::BI__builtin_ia32_fpclassph512_mask:
2498 case X86::BI__builtin_ia32_fpclasspd128_mask:
2499 case X86::BI__builtin_ia32_fpclasspd256_mask:
2500 case X86::BI__builtin_ia32_fpclasspd512_mask: {
2501 unsigned NumElts =
2502 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
2503 Value *MaskIn = Ops[2];
2504 Ops.erase(CI: &Ops[2]);
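// The mask operand is dropped from the intrinsic call and instead applied
// to the i1 results via EmitX86MaskedCompareResult below.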
2505
2506 Intrinsic::ID ID;
2507 switch (BuiltinID) {
2508 default: llvm_unreachable("Unsupported intrinsic!");
2509 case X86::BI__builtin_ia32_vfpclassbf16128_mask:
2510 ID = Intrinsic::x86_avx10_fpclass_bf16_128;
2511 break;
2512 case X86::BI__builtin_ia32_vfpclassbf16256_mask:
2513 ID = Intrinsic::x86_avx10_fpclass_bf16_256;
2514 break;
2515 case X86::BI__builtin_ia32_vfpclassbf16512_mask:
2516 ID = Intrinsic::x86_avx10_fpclass_bf16_512;
2517 break;
2518 case X86::BI__builtin_ia32_fpclassph128_mask:
2519 ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
2520 break;
2521 case X86::BI__builtin_ia32_fpclassph256_mask:
2522 ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
2523 break;
2524 case X86::BI__builtin_ia32_fpclassph512_mask:
2525 ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
2526 break;
2527 case X86::BI__builtin_ia32_fpclassps128_mask:
2528 ID = Intrinsic::x86_avx512_fpclass_ps_128;
2529 break;
2530 case X86::BI__builtin_ia32_fpclassps256_mask:
2531 ID = Intrinsic::x86_avx512_fpclass_ps_256;
2532 break;
2533 case X86::BI__builtin_ia32_fpclassps512_mask:
2534 ID = Intrinsic::x86_avx512_fpclass_ps_512;
2535 break;
2536 case X86::BI__builtin_ia32_fpclasspd128_mask:
2537 ID = Intrinsic::x86_avx512_fpclass_pd_128;
2538 break;
2539 case X86::BI__builtin_ia32_fpclasspd256_mask:
2540 ID = Intrinsic::x86_avx512_fpclass_pd_256;
2541 break;
2542 case X86::BI__builtin_ia32_fpclasspd512_mask:
2543 ID = Intrinsic::x86_avx512_fpclass_pd_512;
2544 break;
2545 }
2546
2547 Value *Fpclass = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID), Args: Ops);
2548 return EmitX86MaskedCompareResult(CGF&: *this, Cmp: Fpclass, NumElts, MaskIn);
2549 }
2550
2551 case X86::BI__builtin_ia32_vp2intersect_q_512:
2552 case X86::BI__builtin_ia32_vp2intersect_q_256:
2553 case X86::BI__builtin_ia32_vp2intersect_q_128:
2554 case X86::BI__builtin_ia32_vp2intersect_d_512:
2555 case X86::BI__builtin_ia32_vp2intersect_d_256:
2556 case X86::BI__builtin_ia32_vp2intersect_d_128: {
2557 unsigned NumElts =
2558 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
2559 Intrinsic::ID ID;
2560
2561 switch (BuiltinID) {
2562 default: llvm_unreachable("Unsupported intrinsic!");
2563 case X86::BI__builtin_ia32_vp2intersect_q_512:
2564 ID = Intrinsic::x86_avx512_vp2intersect_q_512;
2565 break;
2566 case X86::BI__builtin_ia32_vp2intersect_q_256:
2567 ID = Intrinsic::x86_avx512_vp2intersect_q_256;
2568 break;
2569 case X86::BI__builtin_ia32_vp2intersect_q_128:
2570 ID = Intrinsic::x86_avx512_vp2intersect_q_128;
2571 break;
2572 case X86::BI__builtin_ia32_vp2intersect_d_512:
2573 ID = Intrinsic::x86_avx512_vp2intersect_d_512;
2574 break;
2575 case X86::BI__builtin_ia32_vp2intersect_d_256:
2576 ID = Intrinsic::x86_avx512_vp2intersect_d_256;
2577 break;
2578 case X86::BI__builtin_ia32_vp2intersect_d_128:
2579 ID = Intrinsic::x86_avx512_vp2intersect_d_128;
2580 break;
2581 }
2582
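// The intrinsic returns a pair of i1 mask vectors; each is widened to the
// builtin's mask width and stored through the corresponding out-pointer.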
2583 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID), Args: {Ops[0], Ops[1]});
2584 Value *Result = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
2585 Result = EmitX86MaskedCompareResult(CGF&: *this, Cmp: Result, NumElts, MaskIn: nullptr);
2586 Builder.CreateDefaultAlignedStore(Val: Result, Addr: Ops[2]);
2587
2588 Result = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
2589 Result = EmitX86MaskedCompareResult(CGF&: *this, Cmp: Result, NumElts, MaskIn: nullptr);
2590 return Builder.CreateDefaultAlignedStore(Val: Result, Addr: Ops[3]);
2591 }
2592
2593 case X86::BI__builtin_ia32_vpmultishiftqb128:
2594 case X86::BI__builtin_ia32_vpmultishiftqb256:
2595 case X86::BI__builtin_ia32_vpmultishiftqb512: {
2596 Intrinsic::ID ID;
2597 switch (BuiltinID) {
2598 default: llvm_unreachable("Unsupported intrinsic!");
2599 case X86::BI__builtin_ia32_vpmultishiftqb128:
2600 ID = Intrinsic::x86_avx512_pmultishift_qb_128;
2601 break;
2602 case X86::BI__builtin_ia32_vpmultishiftqb256:
2603 ID = Intrinsic::x86_avx512_pmultishift_qb_256;
2604 break;
2605 case X86::BI__builtin_ia32_vpmultishiftqb512:
2606 ID = Intrinsic::x86_avx512_pmultishift_qb_512;
2607 break;
2608 }
2609
2610 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID), Args: Ops);
2611 }
2612
2613 case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
2614 case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
2615 case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
2616 unsigned NumElts =
2617 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
2618 Value *MaskIn = Ops[2];
2619 Ops.erase(CI: &Ops[2]);
2620
2621 Intrinsic::ID ID;
2622 switch (BuiltinID) {
2623 default: llvm_unreachable("Unsupported intrinsic!");
2624 case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
2625 ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
2626 break;
2627 case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
2628 ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
2629 break;
2630 case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
2631 ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
2632 break;
2633 }
2634
2635 Value *Shufbit = Builder.CreateCall(Callee: CGM.getIntrinsic(IID: ID), Args: Ops);
2636 return EmitX86MaskedCompareResult(CGF&: *this, Cmp: Shufbit, NumElts, MaskIn);
2637 }
2638
2639 // Packed comparison intrinsics.
2640 case X86::BI__builtin_ia32_cmpeqps:
2641 case X86::BI__builtin_ia32_cmpeqpd:
2642 return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
2643 case X86::BI__builtin_ia32_cmpltps:
2644 case X86::BI__builtin_ia32_cmpltpd:
2645 return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
2646 case X86::BI__builtin_ia32_cmpleps:
2647 case X86::BI__builtin_ia32_cmplepd:
2648 return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
2649 case X86::BI__builtin_ia32_cmpunordps:
2650 case X86::BI__builtin_ia32_cmpunordpd:
2651 return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
2652 case X86::BI__builtin_ia32_cmpneqps:
2653 case X86::BI__builtin_ia32_cmpneqpd:
2654 return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
2655 case X86::BI__builtin_ia32_cmpnltps:
2656 case X86::BI__builtin_ia32_cmpnltpd:
2657 return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
2658 case X86::BI__builtin_ia32_cmpnleps:
2659 case X86::BI__builtin_ia32_cmpnlepd:
2660 return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
2661 case X86::BI__builtin_ia32_cmpordps:
2662 case X86::BI__builtin_ia32_cmpordpd:
2663 return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
2664 case X86::BI__builtin_ia32_cmpph128_mask:
2665 case X86::BI__builtin_ia32_cmpph256_mask:
2666 case X86::BI__builtin_ia32_cmpph512_mask:
2667 case X86::BI__builtin_ia32_cmpps128_mask:
2668 case X86::BI__builtin_ia32_cmpps256_mask:
2669 case X86::BI__builtin_ia32_cmpps512_mask:
2670 case X86::BI__builtin_ia32_cmppd128_mask:
2671 case X86::BI__builtin_ia32_cmppd256_mask:
2672 case X86::BI__builtin_ia32_cmppd512_mask:
2673 case X86::BI__builtin_ia32_vcmpbf16512_mask:
2674 case X86::BI__builtin_ia32_vcmpbf16256_mask:
2675 case X86::BI__builtin_ia32_vcmpbf16128_mask:
2676 IsMaskFCmp = true;
2677 [[fallthrough]];
2678 case X86::BI__builtin_ia32_cmpps:
2679 case X86::BI__builtin_ia32_cmpps256:
2680 case X86::BI__builtin_ia32_cmppd:
2681 case X86::BI__builtin_ia32_cmppd256: {
2682 // Lower these vector comparisons to fcmp instructions, ignoring the
2683 // requested signaling behaviour and the requested rounding mode.
2684 // This is only possible if the FP model is not strict and FENV_ACCESS
2685 // is off.
2686
2687 // The third argument is the comparison condition, an integer in the
2688 // range [0, 31].
2689 unsigned CC = cast<llvm::ConstantInt>(Val: Ops[2])->getZExtValue() & 0x1f;
2690
2691 // Lower to an IR fcmp instruction, ignoring the requested signaling
2692 // behaviour; e.g. both _CMP_GT_OS and _CMP_GT_OQ are translated to
2693 // FCMP_OGT.
2694 FCmpInst::Predicate Pred;
2695 bool IsSignaling;
2696 // Predicates for 16-31 repeat the 0-15 predicates. Only the signaling
2697 // behavior is inverted. We'll handle that after the switch.
2698 switch (CC & 0xf) {
2699 case 0x00: Pred = FCmpInst::FCMP_OEQ; IsSignaling = false; break;
2700 case 0x01: Pred = FCmpInst::FCMP_OLT; IsSignaling = true; break;
2701 case 0x02: Pred = FCmpInst::FCMP_OLE; IsSignaling = true; break;
2702 case 0x03: Pred = FCmpInst::FCMP_UNO; IsSignaling = false; break;
2703 case 0x04: Pred = FCmpInst::FCMP_UNE; IsSignaling = false; break;
2704 case 0x05: Pred = FCmpInst::FCMP_UGE; IsSignaling = true; break;
2705 case 0x06: Pred = FCmpInst::FCMP_UGT; IsSignaling = true; break;
2706 case 0x07: Pred = FCmpInst::FCMP_ORD; IsSignaling = false; break;
2707 case 0x08: Pred = FCmpInst::FCMP_UEQ; IsSignaling = false; break;
2708 case 0x09: Pred = FCmpInst::FCMP_ULT; IsSignaling = true; break;
2709 case 0x0a: Pred = FCmpInst::FCMP_ULE; IsSignaling = true; break;
2710 case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
2711 case 0x0c: Pred = FCmpInst::FCMP_ONE; IsSignaling = false; break;
2712 case 0x0d: Pred = FCmpInst::FCMP_OGE; IsSignaling = true; break;
2713 case 0x0e: Pred = FCmpInst::FCMP_OGT; IsSignaling = true; break;
2714 case 0x0f: Pred = FCmpInst::FCMP_TRUE; IsSignaling = false; break;
2715 default: llvm_unreachable("Unhandled CC");
2716 }
2717
2718 // Invert the signalling behavior for 16-31.
2719 if (CC & 0x10)
2720 IsSignaling = !IsSignaling;
2721
2722 // If the predicate is true or false and we're using constrained intrinsics,
2723 // we don't have a compare intrinsic we can use. Just use the legacy X86
2724 // specific intrinsic.
2725 // If the intrinsic is mask enabled and we're using constrained intrinsics,
2726 // use the legacy X86 specific intrinsic.
2727 if (Builder.getIsFPConstrained() &&
2728 (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
2729 IsMaskFCmp)) {
2730
2731 Intrinsic::ID IID;
2732 switch (BuiltinID) {
2733 default: llvm_unreachable("Unexpected builtin");
2734 case X86::BI__builtin_ia32_cmpps:
2735 IID = Intrinsic::x86_sse_cmp_ps;
2736 break;
2737 case X86::BI__builtin_ia32_cmpps256:
2738 IID = Intrinsic::x86_avx_cmp_ps_256;
2739 break;
2740 case X86::BI__builtin_ia32_cmppd:
2741 IID = Intrinsic::x86_sse2_cmp_pd;
2742 break;
2743 case X86::BI__builtin_ia32_cmppd256:
2744 IID = Intrinsic::x86_avx_cmp_pd_256;
2745 break;
2746 case X86::BI__builtin_ia32_cmpph128_mask:
2747 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128;
2748 break;
2749 case X86::BI__builtin_ia32_cmpph256_mask:
2750 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256;
2751 break;
2752 case X86::BI__builtin_ia32_cmpph512_mask:
2753 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512;
2754 break;
2755 case X86::BI__builtin_ia32_cmpps512_mask:
2756 IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
2757 break;
2758 case X86::BI__builtin_ia32_cmppd512_mask:
2759 IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
2760 break;
2761 case X86::BI__builtin_ia32_cmpps128_mask:
2762 IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
2763 break;
2764 case X86::BI__builtin_ia32_cmpps256_mask:
2765 IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
2766 break;
2767 case X86::BI__builtin_ia32_cmppd128_mask:
2768 IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
2769 break;
2770 case X86::BI__builtin_ia32_cmppd256_mask:
2771 IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
2772 break;
2773 }
2774
2775 Function *Intr = CGM.getIntrinsic(IID);
2776 if (IsMaskFCmp) {
2777 unsigned NumElts =
2778 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
2779 Ops[3] = getMaskVecValue(CGF&: *this, Mask: Ops[3], NumElts);
2780 Value *Cmp = Builder.CreateCall(Callee: Intr, Args: Ops);
2781 return EmitX86MaskedCompareResult(CGF&: *this, Cmp, NumElts, MaskIn: nullptr);
2782 }
2783
2784 return Builder.CreateCall(Callee: Intr, Args: Ops);
2785 }
2786
2787 // Builtins with the _mask suffix produce an integer mask result; builtins
2788 // without it return a vector of integers of the same width as the inputs.
2789 if (IsMaskFCmp) {
2790 // We ignore SAE if strict FP is disabled. We only keep precise
2791 // exception behavior under strict FP.
2792 // NOTE: If strict FP ever does go through here, a CGFPOptionsRAII
2793 // object will be required.
2794 unsigned NumElts =
2795 cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements();
2796 Value *Cmp;
2797 if (IsSignaling)
2798 Cmp = Builder.CreateFCmpS(P: Pred, LHS: Ops[0], RHS: Ops[1]);
2799 else
2800 Cmp = Builder.CreateFCmp(P: Pred, LHS: Ops[0], RHS: Ops[1]);
2801 return EmitX86MaskedCompareResult(CGF&: *this, Cmp, NumElts, MaskIn: Ops[3]);
2802 }
2803
2804 return getVectorFCmpIR(Pred, IsSignaling);
2805 }
2806
2807 // SSE scalar comparison intrinsics
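// The second argument of getCmpIntrinsicCall is the immediate predicate
// encoding used by cmpss/cmpsd:
// 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD.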
2808 case X86::BI__builtin_ia32_cmpeqss:
2809 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
2810 case X86::BI__builtin_ia32_cmpltss:
2811 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
2812 case X86::BI__builtin_ia32_cmpless:
2813 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
2814 case X86::BI__builtin_ia32_cmpunordss:
2815 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
2816 case X86::BI__builtin_ia32_cmpneqss:
2817 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
2818 case X86::BI__builtin_ia32_cmpnltss:
2819 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
2820 case X86::BI__builtin_ia32_cmpnless:
2821 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
2822 case X86::BI__builtin_ia32_cmpordss:
2823 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
2824 case X86::BI__builtin_ia32_cmpeqsd:
2825 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
2826 case X86::BI__builtin_ia32_cmpltsd:
2827 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
2828 case X86::BI__builtin_ia32_cmplesd:
2829 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
2830 case X86::BI__builtin_ia32_cmpunordsd:
2831 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
2832 case X86::BI__builtin_ia32_cmpneqsd:
2833 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
2834 case X86::BI__builtin_ia32_cmpnltsd:
2835 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
2836 case X86::BI__builtin_ia32_cmpnlesd:
2837 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
2838 case X86::BI__builtin_ia32_cmpordsd:
2839 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
2840
2841 // f16c half2float intrinsics
2842 case X86::BI__builtin_ia32_vcvtph2ps:
2843 case X86::BI__builtin_ia32_vcvtph2ps256:
2844 case X86::BI__builtin_ia32_vcvtph2ps_mask:
2845 case X86::BI__builtin_ia32_vcvtph2ps256_mask:
2846 case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
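// Shared helper: reinterprets the i16 source lanes as half, extends them to
// float, and applies the mask for the *_mask variants.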
2847 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2848 return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
2849 }
2850
2851 // AVX512 bf16 intrinsics
2852 case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
2853 Ops[2] = getMaskVecValue(
2854 CGF&: *this, Mask: Ops[2],
2855 NumElts: cast<llvm::FixedVectorType>(Val: Ops[0]->getType())->getNumElements());
2856 Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
2857 return Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
2858 }
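// Converting a single bf16 value to float only needs an fpext; no target
// intrinsic is required.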
2859 case X86::BI__builtin_ia32_cvtsbf162ss_32:
2860 return Builder.CreateFPExt(V: Ops[0], DestTy: Builder.getFloatTy());
2861
2862 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
2863 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
2864 Intrinsic::ID IID;
2865 switch (BuiltinID) {
2866 default: llvm_unreachable("Unsupported intrinsic!");
2867 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
2868 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
2869 break;
2870 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
2871 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
2872 break;
2873 }
2874 Value *Res = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops[0]);
2875 return EmitX86Select(CGF&: *this, Mask: Ops[2], Op0: Res, Op1: Ops[1]);
2876 }
2877
2878 case X86::BI__cpuid:
2879 case X86::BI__cpuidex: {
2880 Value *FuncId = EmitScalarExpr(E: E->getArg(Arg: 1));
2881 Value *SubFuncId = BuiltinID == X86::BI__cpuidex
2882 ? EmitScalarExpr(E->getArg(2))
2883 : llvm::ConstantInt::get(Int32Ty, 0);
2884
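// cpuid is emitted as inline asm: the leaf goes in EAX and the subleaf in
// ECX, and EAX, EBX, ECX and EDX come back as a four-element struct.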
2885 llvm::StructType *CpuidRetTy =
2886 llvm::StructType::get(elt1: Int32Ty, elts: Int32Ty, elts: Int32Ty, elts: Int32Ty);
2887 llvm::FunctionType *FTy =
2888 llvm::FunctionType::get(Result: CpuidRetTy, Params: {Int32Ty, Int32Ty}, isVarArg: false);
2889
2890 StringRef Asm, Constraints;
2891 if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
2892 Asm = "cpuid";
2893 Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
2894 } else {
2895 // x86-64 uses %rbx as the base register, so preserve it.
2896 Asm = "xchgq %rbx, ${1:q}\n"
2897 "cpuid\n"
2898 "xchgq %rbx, ${1:q}";
2899 Constraints = "={ax},=r,={cx},={dx},0,2";
2900 }
2901
2902 llvm::InlineAsm *IA = llvm::InlineAsm::get(Ty: FTy, AsmString: Asm, Constraints,
2903 /*hasSideEffects=*/false);
2904 Value *IACall = Builder.CreateCall(Callee: IA, Args: {FuncId, SubFuncId});
2905 Value *BasePtr = EmitScalarExpr(E: E->getArg(Arg: 0));
2906 Value *Store = nullptr;
2907 for (unsigned i = 0; i < 4; i++) {
2908 Value *Extracted = Builder.CreateExtractValue(Agg: IACall, Idxs: i);
2909 Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Ty: Int32Ty, Ptr: BasePtr, Idx0: i);
2910 Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
2911 }
2912
2913 // Return the last store instruction to signal that we have emitted
2914 // the intrinsic.
2915 return Store;
2916 }
2917
2918 case X86::BI__emul:
2919 case X86::BI__emulu: {
2920 llvm::Type *Int64Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 64);
2921 bool isSigned = (BuiltinID == X86::BI__emul);
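// __emul/__emulu return the full 64-bit product of two 32-bit operands, so
// widen the operands before multiplying.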
2922 Value *LHS = Builder.CreateIntCast(V: Ops[0], DestTy: Int64Ty, isSigned);
2923 Value *RHS = Builder.CreateIntCast(V: Ops[1], DestTy: Int64Ty, isSigned);
2924 return Builder.CreateMul(LHS, RHS, Name: "", HasNUW: !isSigned, HasNSW: isSigned);
2925 }
2926 case X86::BI__mulh:
2927 case X86::BI__umulh:
2928 case X86::BI_mul128:
2929 case X86::BI_umul128: {
2930 llvm::Type *ResType = ConvertType(E->getType());
2931 llvm::Type *Int128Ty = llvm::IntegerType::get(C&: getLLVMContext(), NumBits: 128);
2932
2933 bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
2934 Value *LHS = Builder.CreateIntCast(V: Ops[0], DestTy: Int128Ty, isSigned: IsSigned);
2935 Value *RHS = Builder.CreateIntCast(V: Ops[1], DestTy: Int128Ty, isSigned: IsSigned);
2936
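// Do the multiply in 128 bits so the high 64 bits of the product can be
// taken with a simple shift. __mulh/__umulh return only the high half;
// _mul128/_umul128 also store it through the third argument and return the
// low half.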
2937 Value *MulResult, *HigherBits;
2938 if (IsSigned) {
2939 MulResult = Builder.CreateNSWMul(LHS, RHS);
2940 HigherBits = Builder.CreateAShr(LHS: MulResult, RHS: 64);
2941 } else {
2942 MulResult = Builder.CreateNUWMul(LHS, RHS);
2943 HigherBits = Builder.CreateLShr(LHS: MulResult, RHS: 64);
2944 }
2945 HigherBits = Builder.CreateIntCast(V: HigherBits, DestTy: ResType, isSigned: IsSigned);
2946
2947 if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
2948 return HigherBits;
2949
2950 Address HighBitsAddress = EmitPointerWithAlignment(Addr: E->getArg(Arg: 2));
2951 Builder.CreateStore(Val: HigherBits, Addr: HighBitsAddress);
2952 return Builder.CreateIntCast(V: MulResult, DestTy: ResType, isSigned: IsSigned);
2953 }
2954
2955 case X86::BI__faststorefence: {
2956 return Builder.CreateFence(Ordering: llvm::AtomicOrdering::SequentiallyConsistent,
2957 SSID: llvm::SyncScope::System);
2958 }
2959 case X86::BI__shiftleft128:
2960 case X86::BI__shiftright128: {
2961 llvm::Function *F = CGM.getIntrinsic(
2962 BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
2963 Int64Ty);
2964 // Flip low/high ops and zero-extend amount to matching type.
2965 // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
2966 // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
2967 std::swap(a&: Ops[0], b&: Ops[1]);
2968 Ops[2] = Builder.CreateZExt(V: Ops[2], DestTy: Int64Ty);
2969 return Builder.CreateCall(Callee: F, Args: Ops);
2970 }
2971 case X86::BI_ReadWriteBarrier:
2972 case X86::BI_ReadBarrier:
2973 case X86::BI_WriteBarrier: {
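// These are compiler barriers only, so a single-thread fence suffices; it
// constrains reordering in the compiler without emitting a machine fence.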
2974 return Builder.CreateFence(Ordering: llvm::AtomicOrdering::SequentiallyConsistent,
2975 SSID: llvm::SyncScope::SingleThread);
2976 }
2977
2978 case X86::BI_AddressOfReturnAddress: {
2979 Function *F =
2980 CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
2981 return Builder.CreateCall(Callee: F);
2982 }
2983 case X86::BI__stosb: {
2984 // We treat __stosb as a volatile memset - it may not generate a "rep stosb"
2985 // instruction, but it will create a memset that won't be optimized away.
2986 return Builder.CreateMemSet(Ptr: Ops[0], Val: Ops[1], Size: Ops[2], Align: Align(1), isVolatile: true);
2987 }
2988 // These correspond to intrinsics that return two tiles (tile0_tile1).
2989 case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
2990 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
2991 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
2992 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
2993 case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
2994 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
2995 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
2996 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
2997 Intrinsic::ID IID;
2998 switch (BuiltinID) {
2999 default:
3000 llvm_unreachable("Unsupported intrinsic!");
3001 case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
3002 IID = Intrinsic::x86_t2rpntlvwz0_internal;
3003 break;
3004 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
3005 IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
3006 break;
3007 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
3008 IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
3009 break;
3010 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
3011 IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
3012 break;
3013 case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
3014 IID = Intrinsic::x86_t2rpntlvwz1_internal;
3015 break;
3016 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
3017 IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
3018 break;
3019 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
3020 IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
3021 break;
3022 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
3023 IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
3024 break;
3025 }
3026
3027 // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
3028 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID),
3029 Args: {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
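// Only the shape (Row0, Col0, Col1), the source pointer and the stride are
// passed to the intrinsic; the destination pointers are handled by the
// explicit stores below.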
3030
3031 auto *PtrTy = E->getArg(Arg: 3)->getType()->getAs<PointerType>();
3032 assert(PtrTy && "arg3 must be of pointer type");
3033 QualType PtreeTy = PtrTy->getPointeeType();
3034 llvm::Type *TyPtee = ConvertType(T: PtreeTy);
3035
3036 // Bitcast the AMX type (x86_amx) to a vector type (256 x i32),
3037 // then store tile0 into DstPtr0.
3038 Value *T0 = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3039 Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
3040 {TyPtee}, {T0});
3041 Builder.CreateDefaultAlignedStore(Val: VecT0, Addr: Ops[3]);
3042
3043 // Then store tile1 into DstPtr1
3044 Value *T1 = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
3045 Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
3046 {TyPtee}, {T1});
3047 Value *Store = Builder.CreateDefaultAlignedStore(Val: VecT1, Addr: Ops[4]);
3048
3049 // Note: we deliberately avoid storing the results with
3050 // x86_tilestored64_internal here, because it cannot express the scope of
3051 // the memory it writes. Doing so could force shape reloads after the first
3052 // AMX intrinsic, which the current AMX register allocation cannot handle.
3053
3054 return Store;
3055 }
3056 case X86::BI__ud2:
3057 // llvm.trap makes a ud2a instruction on x86.
3058 return EmitTrapCall(Intrinsic::trap);
3059 case X86::BI__int2c: {
3060 // This syscall signals a driver assertion failure in x86 NT kernels.
3061 llvm::FunctionType *FTy = llvm::FunctionType::get(Result: VoidTy, isVarArg: false);
3062 llvm::InlineAsm *IA =
3063 llvm::InlineAsm::get(Ty: FTy, AsmString: "int $$0x2c", Constraints: "", /*hasSideEffects=*/true);
3064 llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
3065 getLLVMContext(), llvm::AttributeList::FunctionIndex,
3066 llvm::Attribute::NoReturn);
3067 llvm::CallInst *CI = Builder.CreateCall(Callee: IA);
3068 CI->setAttributes(NoReturnAttr);
3069 return CI;
3070 }
3071 case X86::BI__readfsbyte:
3072 case X86::BI__readfsword:
3073 case X86::BI__readfsdword:
3074 case X86::BI__readfsqword: {
3075 llvm::Type *IntTy = ConvertType(E->getType());
3076 Value *Ptr = Builder.CreateIntToPtr(
3077 V: Ops[0], DestTy: llvm::PointerType::get(C&: getLLVMContext(), AddressSpace: 257));
3078 LoadInst *Load = Builder.CreateAlignedLoad(
3079 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
3080 Load->setVolatile(true);
3081 return Load;
3082 }
3083 case X86::BI__readgsbyte:
3084 case X86::BI__readgsword:
3085 case X86::BI__readgsdword:
3086 case X86::BI__readgsqword: {
3087 llvm::Type *IntTy = ConvertType(E->getType());
3088 Value *Ptr = Builder.CreateIntToPtr(
3089 V: Ops[0], DestTy: llvm::PointerType::get(C&: getLLVMContext(), AddressSpace: 256));
3090 LoadInst *Load = Builder.CreateAlignedLoad(
3091 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
3092 Load->setVolatile(true);
3093 return Load;
3094 }
3095 case X86::BI__builtin_ia32_encodekey128_u32: {
3096 Intrinsic::ID IID = Intrinsic::x86_encodekey128;
3097
3098 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: {Ops[0], Ops[1]});
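// Element 0 of the result is the builtin's 32-bit return value; elements
// 1-3 form the 384-bit key handle and are stored below as three unaligned
// 16-byte chunks.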
3099
3100 for (int i = 0; i < 3; ++i) {
3101 Value *Extract = Builder.CreateExtractValue(Agg: Call, Idxs: i + 1);
3102 Value *Ptr = Builder.CreateConstGEP1_32(Ty: Int8Ty, Ptr: Ops[2], Idx0: i * 16);
3103 Builder.CreateAlignedStore(Val: Extract, Ptr, Align: Align(1));
3104 }
3105
3106 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3107 }
3108 case X86::BI__builtin_ia32_encodekey256_u32: {
3109 Intrinsic::ID IID = Intrinsic::x86_encodekey256;
3110
3111 Value *Call =
3112 Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: {Ops[0], Ops[1], Ops[2]});
3113
3114 for (int i = 0; i < 4; ++i) {
3115 Value *Extract = Builder.CreateExtractValue(Agg: Call, Idxs: i + 1);
3116 Value *Ptr = Builder.CreateConstGEP1_32(Ty: Int8Ty, Ptr: Ops[3], Idx0: i * 16);
3117 Builder.CreateAlignedStore(Val: Extract, Ptr, Align: Align(1));
3118 }
3119
3120 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3121 }
3122 case X86::BI__builtin_ia32_aesenc128kl_u8:
3123 case X86::BI__builtin_ia32_aesdec128kl_u8:
3124 case X86::BI__builtin_ia32_aesenc256kl_u8:
3125 case X86::BI__builtin_ia32_aesdec256kl_u8: {
3126 Intrinsic::ID IID;
3127 StringRef BlockName;
3128 switch (BuiltinID) {
3129 default:
3130 llvm_unreachable("Unexpected builtin");
3131 case X86::BI__builtin_ia32_aesenc128kl_u8:
3132 IID = Intrinsic::x86_aesenc128kl;
3133 BlockName = "aesenc128kl";
3134 break;
3135 case X86::BI__builtin_ia32_aesdec128kl_u8:
3136 IID = Intrinsic::x86_aesdec128kl;
3137 BlockName = "aesdec128kl";
3138 break;
3139 case X86::BI__builtin_ia32_aesenc256kl_u8:
3140 IID = Intrinsic::x86_aesenc256kl;
3141 BlockName = "aesenc256kl";
3142 break;
3143 case X86::BI__builtin_ia32_aesdec256kl_u8:
3144 IID = Intrinsic::x86_aesdec256kl;
3145 BlockName = "aesdec256kl";
3146 break;
3147 }
3148
3149 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: {Ops[1], Ops[2]});
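// Bit 0 of the returned status selects the path: on success store the
// processed block to the output pointer, on failure store zeroes. The
// status value itself is returned either way.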
3150
3151 BasicBlock *NoError =
3152 createBasicBlock(name: BlockName + "_no_error", parent: this->CurFn);
3153 BasicBlock *Error = createBasicBlock(name: BlockName + "_error", parent: this->CurFn);
3154 BasicBlock *End = createBasicBlock(name: BlockName + "_end", parent: this->CurFn);
3155
3156 Value *Ret = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3157 Value *Succ = Builder.CreateTrunc(V: Ret, DestTy: Builder.getInt1Ty());
3158 Value *Out = Builder.CreateExtractValue(Agg: Call, Idxs: 1);
3159 Builder.CreateCondBr(Cond: Succ, True: NoError, False: Error);
3160
3161 Builder.SetInsertPoint(NoError);
3162 Builder.CreateDefaultAlignedStore(Val: Out, Addr: Ops[0]);
3163 Builder.CreateBr(Dest: End);
3164
3165 Builder.SetInsertPoint(Error);
3166 Constant *Zero = llvm::Constant::getNullValue(Ty: Out->getType());
3167 Builder.CreateDefaultAlignedStore(Val: Zero, Addr: Ops[0]);
3168 Builder.CreateBr(Dest: End);
3169
3170 Builder.SetInsertPoint(End);
3171 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3172 }
3173 case X86::BI__builtin_ia32_aesencwide128kl_u8:
3174 case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3175 case X86::BI__builtin_ia32_aesencwide256kl_u8:
3176 case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
3177 Intrinsic::ID IID;
3178 StringRef BlockName;
3179 switch (BuiltinID) {
3180 case X86::BI__builtin_ia32_aesencwide128kl_u8:
3181 IID = Intrinsic::x86_aesencwide128kl;
3182 BlockName = "aesencwide128kl";
3183 break;
3184 case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3185 IID = Intrinsic::x86_aesdecwide128kl;
3186 BlockName = "aesdecwide128kl";
3187 break;
3188 case X86::BI__builtin_ia32_aesencwide256kl_u8:
3189 IID = Intrinsic::x86_aesencwide256kl;
3190 BlockName = "aesencwide256kl";
3191 break;
3192 case X86::BI__builtin_ia32_aesdecwide256kl_u8:
3193 IID = Intrinsic::x86_aesdecwide256kl;
3194 BlockName = "aesdecwide256kl";
3195 break;
3196 }
3197
3198 llvm::Type *Ty = FixedVectorType::get(ElementType: Builder.getInt64Ty(), NumElts: 2);
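// The intrinsic takes the key handle (Ops[2]) followed by eight 128-bit
// data blocks loaded from the source pointer.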
3199 Value *InOps[9];
3200 InOps[0] = Ops[2];
3201 for (int i = 0; i != 8; ++i) {
3202 Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ptr: Ops[1], Idx0: i);
3203 InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align: Align(16));
3204 }
3205
3206 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: InOps);
3207
3208 BasicBlock *NoError =
3209 createBasicBlock(name: BlockName + "_no_error", parent: this->CurFn);
3210 BasicBlock *Error = createBasicBlock(name: BlockName + "_error", parent: this->CurFn);
3211 BasicBlock *End = createBasicBlock(name: BlockName + "_end", parent: this->CurFn);
3212
3213 Value *Ret = Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3214 Value *Succ = Builder.CreateTrunc(V: Ret, DestTy: Builder.getInt1Ty());
3215 Builder.CreateCondBr(Cond: Succ, True: NoError, False: Error);
3216
3217 Builder.SetInsertPoint(NoError);
3218 for (int i = 0; i != 8; ++i) {
3219 Value *Extract = Builder.CreateExtractValue(Agg: Call, Idxs: i + 1);
3220 Value *Ptr = Builder.CreateConstGEP1_32(Ty: Extract->getType(), Ptr: Ops[0], Idx0: i);
3221 Builder.CreateAlignedStore(Val: Extract, Ptr, Align: Align(16));
3222 }
3223 Builder.CreateBr(Dest: End);
3224
3225 Builder.SetInsertPoint(Error);
3226 for (int i = 0; i != 8; ++i) {
3227 Value *Out = Builder.CreateExtractValue(Agg: Call, Idxs: i + 1);
3228 Constant *Zero = llvm::Constant::getNullValue(Ty: Out->getType());
3229 Value *Ptr = Builder.CreateConstGEP1_32(Ty: Out->getType(), Ptr: Ops[0], Idx0: i);
3230 Builder.CreateAlignedStore(Val: Zero, Ptr, Align: Align(16));
3231 }
3232 Builder.CreateBr(Dest: End);
3233
3234 Builder.SetInsertPoint(End);
3235 return Builder.CreateExtractValue(Agg: Call, Idxs: 0);
3236 }
3237 case X86::BI__builtin_ia32_vfcmaddcph512_mask:
3238 IsConjFMA = true;
3239 [[fallthrough]];
3240 case X86::BI__builtin_ia32_vfmaddcph512_mask: {
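// Call the masked complex FMA intrinsic, then select lanes from the result
// or from the passthrough operand Ops[0] according to the mask in Ops[3].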
3241 Intrinsic::ID IID = IsConjFMA
3242 ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
3243 : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
3244 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
3245 return EmitX86Select(CGF&: *this, Mask: Ops[3], Op0: Call, Op1: Ops[0]);
3246 }
3247 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
3248 IsConjFMA = true;
3249 [[fallthrough]];
3250 case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
3251 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
3252 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
3253 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
3254 Value *And = Builder.CreateAnd(LHS: Ops[3], RHS: llvm::ConstantInt::get(Ty: Int8Ty, V: 1));
3255 return EmitX86Select(CGF&: *this, Mask: And, Op0: Call, Op1: Ops[0]);
3256 }
3257 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
3258 IsConjFMA = true;
3259 [[fallthrough]];
3260 case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
3261 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
3262 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
3263 Value *Call = Builder.CreateCall(Callee: CGM.getIntrinsic(IID), Args: Ops);
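// Build the mask3 result: lane 0 comes from the intrinsic result, lanes 1-3
// come from Ops[2] (shuffle indices 5-7 address the second operand).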
3264 static constexpr int Mask[] = {0, 5, 6, 7};
3265 return Builder.CreateShuffleVector(V1: Call, V2: Ops[2], Mask);
3266 }
3267 case X86::BI__builtin_ia32_prefetchi:
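// Lower to llvm.prefetch with rw = 0 (read) and cache type = 0 (instruction
// cache); the builtin's hint operand supplies the locality.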
3268 return Builder.CreateCall(
3269 CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
3270 {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
3271 llvm::ConstantInt::get(Int32Ty, 0)});
3272 }
3273}
3274
