AMDGPULibCalls.cpp source code [llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp]

1	//===- AMDGPULibCalls.cpp -------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This file does AMD library function optimizations.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPU.h"
15	#include "AMDGPULibFunc.h"
16	#include "GCNSubtarget.h"
17	#include "llvm/Analysis/AssumptionCache.h"
18	#include "llvm/Analysis/TargetLibraryInfo.h"
19	#include "llvm/Analysis/ValueTracking.h"
20	#include "llvm/IR/AttributeMask.h"
21	#include "llvm/IR/Dominators.h"
22	#include "llvm/IR/IRBuilder.h"
23	#include "llvm/IR/IntrinsicInst.h"
24	#include "llvm/IR/IntrinsicsAMDGPU.h"
25	#include "llvm/IR/PatternMatch.h"
26	#include "llvm/InitializePasses.h"
27	#include <cmath>
28
29	#define DEBUG_TYPE "amdgpu-simplifylib"
30
31	using namespace llvm;
32	using namespace llvm::PatternMatch;
33
34	static cl::opt<bool> EnablePreLink("amdgpu-prelink",
35	cl::desc ("Enable pre-link mode optimizations"),
36	cl::init(Val: false),
37	cl::Hidden);
38
39	static cl::list<std::string> UseNative("amdgpu-use-native",
40	cl::desc ("Comma separated list of functions to replace with native, or all"),
41	cl::CommaSeparated, cl::ValueOptional,
42	cl::Hidden);
43
// Short aliases for the llvm::numbers constants used by the lookup tables
// below.
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
48
49	namespace llvm {
50
51	class AMDGPULibCalls {
52	private:
53	const TargetLibraryInfo TLInfo = nullptr*;
54	AssumptionCache AC = nullptr*;
55	DominatorTree DT = nullptr*;
56
57	typedef llvm::AMDGPULibFunc FuncInfo;
58
59	bool UnsafeFPMath = false;
60
61	// -fuse-native.
62	bool AllNative = false;
63
64	bool useNativeFunc(const StringRef F) const;
65
66	// Return a pointer (pointer expr) to the function if function definition with
67	// "FuncName" exists. It may create a new function prototype in pre-link mode.
68	FunctionCallee getFunction(Module M, const* FuncInfo &fInfo);
69
70	bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
71
72	bool TDOFold(CallInst CI, const* FuncInfo &FInfo);
73
74	/ Specialized optimizations /
75
76	// pow/powr/pown
77	bool fold_pow(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
78
79	// rootn
80	bool fold_rootn(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
81
82	// -fuse-native for sincos
83	bool sincosUseNative(CallInst aCI, const* FuncInfo &FInfo);
84
85	// evaluate calls if calls' arguments are constants.
86	bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
87	Constant copr0, Constant copr1);
88	bool evaluateCall(CallInst aCI, const* FuncInfo &FInfo);
89
90	/// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
91	/// of cos, sincos call).
92	std::tuple<Value , Value , Value > insertSinCos(Value Arg,
93	FastMathFlags FMF,
94	IRBuilder<> &B,
95	FunctionCallee Fsincos);
96
97	// sin/cos
98	bool fold_sincos(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
99
100	// __read_pipe/__write_pipe
101	bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
102	const FuncInfo &FInfo);
103
104	// Get a scalar native builtin single argument FP function
105	FunctionCallee getNativeFunction(Module M, const* FuncInfo &FInfo);
106
107	/// Substitute a call to a known libcall with an intrinsic call. If \p
108	/// AllowMinSize is true, allow the replacement in a minsize function.
109	bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
110	bool AllowMinSizeF32 = false,
111	bool AllowF64 = false,
112	bool AllowStrictFP = false);
113	void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
114	Intrinsic::ID IntrID);
115
116	bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
117	Intrinsic::ID IntrID,
118	bool AllowMinSizeF32 = false,
119	bool AllowF64 = false,
120	bool AllowStrictFP = false);
121
122	protected:
123	bool isUnsafeMath(const FPMathOperator FPOp) const*;
124	bool isUnsafeFiniteOnlyMath(const FPMathOperator FPOp) const*;
125
126	bool canIncreasePrecisionOfConstantFold(const FPMathOperator FPOp) const*;
127
128	static void replaceCall(Instruction I, Value With) {
129	I->replaceAllUsesWith(V: With);
130	I->eraseFromParent();
131	}
132
133	static void replaceCall(FPMathOperator I, Value With) {
134	replaceCall(I: cast<Instruction>(Val: I), With);
135	}
136
137	public:
138	AMDGPULibCalls() {}
139
140	bool fold(CallInst *CI);
141
142	void initFunction(Function &F, FunctionAnalysisManager &FAM);
143	void initNativeFuncs();
144
145	// Replace a normal math function call with that native version
146	bool useNative(CallInst *CI);
147	};
148
149	} // end llvm namespace
150
151	template <typename IRB>
152	static CallInst CreateCallEx(IRB &B, FunctionCallee Callee, Value Arg,
153	const Twine &Name = "") {
154	CallInst *R = B.CreateCall(Callee, Arg, Name);
155	if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
156	R->setCallingConv(F->getCallingConv());
157	return R;
158	}
159
160	template <typename IRB>
161	static CallInst CreateCallEx2(IRB &B, FunctionCallee Callee, Value Arg1,
162	Value Arg2, const* Twine &Name = "") {
163	CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
164	if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
165	R->setCallingConv(F->getCallingConv());
166	return R;
167	}
168
169	static FunctionType getPownType(FunctionType FT) {
170	Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext());
171	if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType()))
172	PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount());
173
174	return FunctionType::get(Result: FT->getReturnType(),
175	Params: {FT->getParamType(i: `0`), PowNExpTy}, isVarArg: false);
176	}
177
178	// Data structures for table-driven optimizations.
179	// FuncTbl works for both f32 and f64 functions with 1 input argument
180
/// One row of a table-driven fold: a call whose constant argument equals
/// `input` is replaced by the constant `result`.
struct TableEntry {
  double result; // value the call folds to
  double input;  // argument value that triggers the fold
};
185
186	/ a list of {result, input} /
187	static const TableEntry tbl_acos[] = {
188	{MATH_PI / `2.0`, .input: `0.0`},
189	{MATH_PI / `2.0`, .input: -`0.0`},
190	{.result: `0.0`, .input: `1.0`},
191	{MATH_PI, .input: -`1.0`}
192	};
193	static const TableEntry tbl_acosh[] = {
194	{.result: `0.0`, .input: `1.0`}
195	};
196	static const TableEntry tbl_acospi[] = {
197	{.result: `0.5`, .input: `0.0`},
198	{.result: `0.5`, .input: -`0.0`},
199	{.result: `0.0`, .input: `1.0`},
200	{.result: `1.0`, .input: -`1.0`}
201	};
202	static const TableEntry tbl_asin[] = {
203	{.result: `0.0`, .input: `0.0`},
204	{.result: -`0.0`, .input: -`0.0`},
205	{MATH_PI / `2.0`, .input: `1.0`},
206	{.result: -MATH_PI / `2.0`, .input: -`1.0`}
207	};
208	static const TableEntry tbl_asinh[] = {
209	{.result: `0.0`, .input: `0.0`},
210	{.result: -`0.0`, .input: -`0.0`}
211	};
212	static const TableEntry tbl_asinpi[] = {
213	{.result: `0.0`, .input: `0.0`},
214	{.result: -`0.0`, .input: -`0.0`},
215	{.result: `0.5`, .input: `1.0`},
216	{.result: -`0.5`, .input: -`1.0`}
217	};
218	static const TableEntry tbl_atan[] = {
219	{.result: `0.0`, .input: `0.0`},
220	{.result: -`0.0`, .input: -`0.0`},
221	{MATH_PI / `4.0`, .input: `1.0`},
222	{.result: -MATH_PI / `4.0`, .input: -`1.0`}
223	};
224	static const TableEntry tbl_atanh[] = {
225	{.result: `0.0`, .input: `0.0`},
226	{.result: -`0.0`, .input: -`0.0`}
227	};
228	static const TableEntry tbl_atanpi[] = {
229	{.result: `0.0`, .input: `0.0`},
230	{.result: -`0.0`, .input: -`0.0`},
231	{.result: `0.25`, .input: `1.0`},
232	{.result: -`0.25`, .input: -`1.0`}
233	};
234	static const TableEntry tbl_cbrt[] = {
235	{.result: `0.0`, .input: `0.0`},
236	{.result: -`0.0`, .input: -`0.0`},
237	{.result: `1.0`, .input: `1.0`},
238	{.result: -`1.0`, .input: -`1.0`},
239	};
240	static const TableEntry tbl_cos[] = {
241	{.result: `1.0`, .input: `0.0`},
242	{.result: `1.0`, .input: -`0.0`}
243	};
244	static const TableEntry tbl_cosh[] = {
245	{.result: `1.0`, .input: `0.0`},
246	{.result: `1.0`, .input: -`0.0`}
247	};
248	static const TableEntry tbl_cospi[] = {
249	{.result: `1.0`, .input: `0.0`},
250	{.result: `1.0`, .input: -`0.0`}
251	};
252	static const TableEntry tbl_erfc[] = {
253	{.result: `1.0`, .input: `0.0`},
254	{.result: `1.0`, .input: -`0.0`}
255	};
256	static const TableEntry tbl_erf[] = {
257	{.result: `0.0`, .input: `0.0`},
258	{.result: -`0.0`, .input: -`0.0`}
259	};
260	static const TableEntry tbl_exp[] = {
261	{.result: `1.0`, .input: `0.0`},
262	{.result: `1.0`, .input: -`0.0`},
263	{MATH_E, .input: `1.0`}
264	};
265	static const TableEntry tbl_exp2[] = {
266	{.result: `1.0`, .input: `0.0`},
267	{.result: `1.0`, .input: -`0.0`},
268	{.result: `2.0`, .input: `1.0`}
269	};
270	static const TableEntry tbl_exp10[] = {
271	{.result: `1.0`, .input: `0.0`},
272	{.result: `1.0`, .input: -`0.0`},
273	{.result: `10.0`, .input: `1.0`}
274	};
275	static const TableEntry tbl_expm1[] = {
276	{.result: `0.0`, .input: `0.0`},
277	{.result: -`0.0`, .input: -`0.0`}
278	};
279	static const TableEntry tbl_log[] = {
280	{.result: `0.0`, .input: `1.0`},
281	{.result: `1.0`, MATH_E}
282	};
283	static const TableEntry tbl_log2[] = {
284	{.result: `0.0`, .input: `1.0`},
285	{.result: `1.0`, .input: `2.0`}
286	};
287	static const TableEntry tbl_log10[] = {
288	{.result: `0.0`, .input: `1.0`},
289	{.result: `1.0`, .input: `10.0`}
290	};
291	static const TableEntry tbl_rsqrt[] = {
292	{.result: `1.0`, .input: `1.0`},
293	{MATH_SQRT1_2, .input: `2.0`}
294	};
295	static const TableEntry tbl_sin[] = {
296	{.result: `0.0`, .input: `0.0`},
297	{.result: -`0.0`, .input: -`0.0`}
298	};
299	static const TableEntry tbl_sinh[] = {
300	{.result: `0.0`, .input: `0.0`},
301	{.result: -`0.0`, .input: -`0.0`}
302	};
303	static const TableEntry tbl_sinpi[] = {
304	{.result: `0.0`, .input: `0.0`},
305	{.result: -`0.0`, .input: -`0.0`}
306	};
307	static const TableEntry tbl_sqrt[] = {
308	{.result: `0.0`, .input: `0.0`},
309	{.result: `1.0`, .input: `1.0`},
310	{MATH_SQRT2, .input: `2.0`}
311	};
312	static const TableEntry tbl_tan[] = {
313	{.result: `0.0`, .input: `0.0`},
314	{.result: -`0.0`, .input: -`0.0`}
315	};
316	static const TableEntry tbl_tanh[] = {
317	{.result: `0.0`, .input: `0.0`},
318	{.result: -`0.0`, .input: -`0.0`}
319	};
320	static const TableEntry tbl_tanpi[] = {
321	{.result: `0.0`, .input: `0.0`},
322	{.result: -`0.0`, .input: -`0.0`}
323	};
324	static const TableEntry tbl_tgamma[] = {
325	{.result: `1.0`, .input: `1.0`},
326	{.result: `1.0`, .input: `2.0`},
327	{.result: `2.0`, .input: `3.0`},
328	{.result: `6.0`, .input: `4.0`}
329	};
330
331	static bool HasNative(AMDGPULibFunc::EFuncId id) {
332	switch(id) {
333	case AMDGPULibFunc::EI_DIVIDE:
334	case AMDGPULibFunc::EI_COS:
335	case AMDGPULibFunc::EI_EXP:
336	case AMDGPULibFunc::EI_EXP2:
337	case AMDGPULibFunc::EI_EXP10:
338	case AMDGPULibFunc::EI_LOG:
339	case AMDGPULibFunc::EI_LOG2:
340	case AMDGPULibFunc::EI_LOG10:
341	case AMDGPULibFunc::EI_POWR:
342	case AMDGPULibFunc::EI_RECIP:
343	case AMDGPULibFunc::EI_RSQRT:
344	case AMDGPULibFunc::EI_SIN:
345	case AMDGPULibFunc::EI_SINCOS:
346	case AMDGPULibFunc::EI_SQRT:
347	case AMDGPULibFunc::EI_TAN:
348	return true;
349	default:;
350	}
351	return false;
352	}
353
354	using TableRef = ArrayRef<TableEntry>;
355
356	static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
357	switch(id) {
358	case AMDGPULibFunc::EI_ACOS: return TableRef (tbl_acos);
359	case AMDGPULibFunc::EI_ACOSH: return TableRef (tbl_acosh);
360	case AMDGPULibFunc::EI_ACOSPI: return TableRef (tbl_acospi);
361	case AMDGPULibFunc::EI_ASIN: return TableRef (tbl_asin);
362	case AMDGPULibFunc::EI_ASINH: return TableRef (tbl_asinh);
363	case AMDGPULibFunc::EI_ASINPI: return TableRef (tbl_asinpi);
364	case AMDGPULibFunc::EI_ATAN: return TableRef (tbl_atan);
365	case AMDGPULibFunc::EI_ATANH: return TableRef (tbl_atanh);
366	case AMDGPULibFunc::EI_ATANPI: return TableRef (tbl_atanpi);
367	case AMDGPULibFunc::EI_CBRT: return TableRef (tbl_cbrt);
368	case AMDGPULibFunc::EI_NCOS:
369	case AMDGPULibFunc::EI_COS: return TableRef (tbl_cos);
370	case AMDGPULibFunc::EI_COSH: return TableRef (tbl_cosh);
371	case AMDGPULibFunc::EI_COSPI: return TableRef (tbl_cospi);
372	case AMDGPULibFunc::EI_ERFC: return TableRef (tbl_erfc);
373	case AMDGPULibFunc::EI_ERF: return TableRef (tbl_erf);
374	case AMDGPULibFunc::EI_EXP: return TableRef (tbl_exp);
375	case AMDGPULibFunc::EI_NEXP2:
376	case AMDGPULibFunc::EI_EXP2: return TableRef (tbl_exp2);
377	case AMDGPULibFunc::EI_EXP10: return TableRef (tbl_exp10);
378	case AMDGPULibFunc::EI_EXPM1: return TableRef (tbl_expm1);
379	case AMDGPULibFunc::EI_LOG: return TableRef (tbl_log);
380	case AMDGPULibFunc::EI_NLOG2:
381	case AMDGPULibFunc::EI_LOG2: return TableRef (tbl_log2);
382	case AMDGPULibFunc::EI_LOG10: return TableRef (tbl_log10);
383	case AMDGPULibFunc::EI_NRSQRT:
384	case AMDGPULibFunc::EI_RSQRT: return TableRef (tbl_rsqrt);
385	case AMDGPULibFunc::EI_NSIN:
386	case AMDGPULibFunc::EI_SIN: return TableRef (tbl_sin);
387	case AMDGPULibFunc::EI_SINH: return TableRef (tbl_sinh);
388	case AMDGPULibFunc::EI_SINPI: return TableRef (tbl_sinpi);
389	case AMDGPULibFunc::EI_NSQRT:
390	case AMDGPULibFunc::EI_SQRT: return TableRef (tbl_sqrt);
391	case AMDGPULibFunc::EI_TAN: return TableRef (tbl_tan);
392	case AMDGPULibFunc::EI_TANH: return TableRef (tbl_tanh);
393	case AMDGPULibFunc::EI_TANPI: return TableRef (tbl_tanpi);
394	case AMDGPULibFunc::EI_TGAMMA: return TableRef (tbl_tgamma);
395	default:;
396	}
397	return TableRef ();
398	}
399
400	static inline int getVecSize(const AMDGPULibFunc& FInfo) {
401	return FInfo.getLeads()[`0`].VectorSize;
402	}
403
404	static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
405	return (AMDGPULibFunc::EType)FInfo.getLeads()[`0`].ArgType;
406	}
407
408	FunctionCallee AMDGPULibCalls::getFunction(Module M, const* FuncInfo &fInfo) {
409	// If we are doing PreLinkOpt, the function is external. So it is safe to
410	// use getOrInsertFunction() at this stage.
411
412	return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
413	: AMDGPULibFunc::getFunction(M, fInfo);
414	}
415
416	bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
417	FuncInfo &FInfo) {
418	return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo);
419	}
420
421	bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator FPOp) const* {
422	return UnsafeFPMath \|\| FPOp->isFast();
423	}
424
425	bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator FPOp) const* {
426	return UnsafeFPMath \|\|
427	(FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
428	}
429
430	bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
431	const FPMathOperator FPOp) const* {
432	// TODO: Refine to approxFunc or contract
433	return isUnsafeMath(FPOp);
434	}
435
436	void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
437	UnsafeFPMath = F.getFnAttribute(Kind: "unsafe-fp-math").getValueAsBool();
438	AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
439	TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
440	DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
441	}
442
443	bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
444	return AllNative \|\| llvm::is_contained(Range&: UseNative, Element: F);
445	}
446
447	void AMDGPULibCalls::initNativeFuncs() {
448	AllNative = useNativeFunc(F: "all") \|\|
449	(UseNative.getNumOccurrences() && UseNative.size() == `1` &&
450	UseNative.begin()->empty());
451	}
452
453	bool AMDGPULibCalls::sincosUseNative(CallInst aCI, const* FuncInfo &FInfo) {
454	bool native_sin = useNativeFunc(F: "sin");
455	bool native_cos = useNativeFunc(F: "cos");
456
457	if (native_sin && native_cos) {
458	Module *M = aCI->getModule();
459	Value *opr0 = aCI->getArgOperand(i: `0`);
460
461	AMDGPULibFunc nf;
462	nf.getLeads()[`0`].ArgType = FInfo.getLeads()[`0`].ArgType;
463	nf.getLeads()[`0`].VectorSize = FInfo.getLeads()[`0`].VectorSize;
464
465	nf.setPrefix(AMDGPULibFunc::NATIVE);
466	nf.setId(AMDGPULibFunc::EI_SIN);
467	FunctionCallee sinExpr = getFunction(M, fInfo: nf);
468
469	nf.setPrefix(AMDGPULibFunc::NATIVE);
470	nf.setId(AMDGPULibFunc::EI_COS);
471	FunctionCallee cosExpr = getFunction(M, fInfo: nf);
472	if (sinExpr && cosExpr) {
473	Value *sinval =
474	CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
475	Value *cosval =
476	CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
477	new StoreInst (cosval, aCI->getArgOperand(i: `1`), aCI->getIterator());
478
479	DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
480	<< " with native version of sin/cos");
481
482	replaceCall(I: aCI, With: sinval);
483	return true;
484	}
485	}
486	return false;
487	}
488
489	bool AMDGPULibCalls::useNative(CallInst *aCI) {
490	Function *Callee = aCI->getCalledFunction();
491	if (!Callee \|\| aCI->isNoBuiltin())
492	return false;
493
494	FuncInfo FInfo;
495	if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) \|\| !FInfo.isMangled() \|\|
496	FInfo.getPrefix() != AMDGPULibFunc::NOPFX \|\|
497	getArgType(FInfo) == AMDGPULibFunc::F64 \|\| !HasNative(id: FInfo.getId()) \|\|
498	!(AllNative \|\| useNativeFunc(F: FInfo.getName()))) {
499	return false;
500	}
501
502	if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
503	return sincosUseNative(aCI, FInfo);
504
505	FInfo.setPrefix(AMDGPULibFunc::NATIVE);
506	FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
507	if (!F)
508	return false;
509
510	aCI->setCalledFunction(F);
511	DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
512	<< " with native version");
513	return true;
514	}
515
516	// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
517	// builtin, with appended type size and alignment arguments, where 2 or 4
518	// indicates the original number of arguments. The library has optimized version
519	// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
520	// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
521	// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
522	// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
523	bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
524	const FuncInfo &FInfo) {
525	auto *Callee = CI->getCalledFunction();
526	if (!Callee->isDeclaration())
527	return false;
528
529	assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
530	auto *M = Callee->getParent();
531	std::string Name = std::string (Callee->getName());
532	auto NumArg = CI->arg_size();
533	if (NumArg != `4` && NumArg != `6`)
534	return false;
535	ConstantInt *PacketSize =
536	dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - `2`));
537	ConstantInt *PacketAlign =
538	dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - `1`));
539	if (!PacketSize \|\| !PacketAlign)
540	return false;
541
542	unsigned Size = PacketSize->getZExtValue();
543	Align Alignment = PacketAlign->getAlignValue();
544	if (Alignment != Size)
545	return false;
546
547	unsigned PtrArgLoc = CI->arg_size() - `3`;
548	Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
549	Type *PtrTy = PtrArg->getType();
550
551	SmallVector<llvm::Type *, `6`> ArgTys;
552	for (unsigned I = `0`; I != PtrArgLoc; ++I)
553	ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
554	ArgTys.push_back(Elt: PtrTy);
555
556	Name = Name + "_" + std::to_string(val: Size);
557	auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
558	Params: ArrayRef<Type >(ArgTys), isVarArg: false*);
559	AMDGPULibFunc NewLibFunc(Name, FTy);
560	FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
561	if (!F)
562	return false;
563
564	SmallVector<Value *, `6`> Args;
565	for (unsigned I = `0`; I != PtrArgLoc; ++I)
566	Args.push_back(Elt: CI->getArgOperand(i: I));
567	Args.push_back(Elt: PtrArg);
568
569	auto *NCI = B.CreateCall(Callee: F, Args);
570	NCI->setAttributes(CI->getAttributes());
571	CI->replaceAllUsesWith(V: NCI);
572	CI->dropAllReferences();
573	CI->eraseFromParent();
574
575	return true;
576	}
577
578	static bool isKnownIntegral(const Value V, const* DataLayout &DL,
579	FastMathFlags FMF) {
580	if (isa<UndefValue>(Val: V))
581	return true;
582
583	if (const ConstantFP *CF = dyn_cast<ConstantFP>(Val: V))
584	return CF->getValueAPF().isInteger();
585
586	if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: V)) {
587	for (unsigned i = `0`, e = CDV->getNumElements(); i != e; ++i) {
588	Constant *ConstElt = CDV->getElementAsConstant(i);
589	if (isa<UndefValue>(Val: ConstElt))
590	continue;
591	const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: ConstElt);
592	if (!CFP \|\| !CFP->getValue().isInteger())
593	return false;
594	}
595
596	return true;
597	}
598
599	const Instruction *I = dyn_cast<Instruction>(Val: V);
600	if (!I)
601	return false;
602
603	switch (I->getOpcode()) {
604	case Instruction::SIToFP:
605	case Instruction::UIToFP:
606	// TODO: Could check nofpclass(inf) on incoming argument
607	if (FMF.noInfs())
608	return true;
609
610	// Need to check int size cannot produce infinity, which computeKnownFPClass
611	// knows how to do already.
612	return isKnownNeverInfinity(V: I, /Depth=/`0`, SQ: SimplifyQuery (DL));
613	case Instruction::Call: {
614	const CallInst *CI = cast<CallInst>(Val: I);
615	switch (CI->getIntrinsicID()) {
616	case Intrinsic::trunc:
617	case Intrinsic::floor:
618	case Intrinsic::ceil:
619	case Intrinsic::rint:
620	case Intrinsic::nearbyint:
621	case Intrinsic::round:
622	case Intrinsic::roundeven:
623	return (FMF.noInfs() && FMF.noNaNs()) \|\|
624	isKnownNeverInfOrNaN(V: I, /Depth=/`0`, SQ: SimplifyQuery (DL));
625	default:
626	break;
627	}
628
629	break;
630	}
631	default:
632	break;
633	}
634
635	return false;
636	}
637
638	// This function returns false if no change; return true otherwise.
639	bool AMDGPULibCalls::fold(CallInst *CI) {
640	Function *Callee = CI->getCalledFunction();
641	// Ignore indirect calls.
642	if (!Callee \|\| Callee->isIntrinsic() \|\| CI->isNoBuiltin())
643	return false;
644
645	FuncInfo FInfo;
646	if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
647	return false;
648
649	// Further check the number of arguments to see if they match.
650	// TODO: Check calling convention matches too
651	if (!FInfo.isCompatibleSignature(FuncTy: CI->getFunctionType()))
652	return false;
653
654	LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << `'\n'`);
655
656	if (TDOFold(CI, FInfo))
657	return true;
658
659	IRBuilder<> B(CI);
660	if (CI->isStrictFP())
661	B.setIsFPConstrained(true);
662
663	if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
664	// Under unsafe-math, evaluate calls if possible.
665	// According to Brian Sumner, we can do this for all f32 function calls
666	// using host's double function calls.
667	if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
668	return true;
669
670	// Copy fast flags from the original call.
671	FastMathFlags FMF = FPOp->getFastMathFlags();
672	B.setFastMathFlags(FMF);
673
674	// Specialized optimizations for each function call.
675	//
676	// TODO: Handle native functions
677	switch (FInfo.getId()) {
678	case AMDGPULibFunc::EI_EXP:
679	if (FMF.none())
680	return false;
681	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: exp,
682	AllowMinSizeF32: FMF.approxFunc());
683	case AMDGPULibFunc::EI_EXP2:
684	if (FMF.none())
685	return false;
686	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: exp2,
687	AllowMinSizeF32: FMF.approxFunc());
688	case AMDGPULibFunc::EI_LOG:
689	if (FMF.none())
690	return false;
691	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log,
692	AllowMinSizeF32: FMF.approxFunc());
693	case AMDGPULibFunc::EI_LOG2:
694	if (FMF.none())
695	return false;
696	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log2,
697	AllowMinSizeF32: FMF.approxFunc());
698	case AMDGPULibFunc::EI_LOG10:
699	if (FMF.none())
700	return false;
701	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log10,
702	AllowMinSizeF32: FMF.approxFunc());
703	case AMDGPULibFunc::EI_FMIN:
704	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: minnum,
705	AllowMinSizeF32: true, AllowF64: true);
706	case AMDGPULibFunc::EI_FMAX:
707	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: maxnum,
708	AllowMinSizeF32: true, AllowF64: true);
709	case AMDGPULibFunc::EI_FMA:
710	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fma, AllowMinSizeF32: true,
711	AllowF64: true);
712	case AMDGPULibFunc::EI_MAD:
713	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fmuladd,
714	AllowMinSizeF32: true, AllowF64: true);
715	case AMDGPULibFunc::EI_FABS:
716	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fabs, AllowMinSizeF32: true,
717	AllowF64: true, AllowStrictFP: true);
718	case AMDGPULibFunc::EI_COPYSIGN:
719	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: copysign,
720	AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
721	case AMDGPULibFunc::EI_FLOOR:
722	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: floor, AllowMinSizeF32: true,
723	AllowF64: true);
724	case AMDGPULibFunc::EI_CEIL:
725	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: ceil, AllowMinSizeF32: true,
726	AllowF64: true);
727	case AMDGPULibFunc::EI_TRUNC:
728	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: trunc, AllowMinSizeF32: true,
729	AllowF64: true);
730	case AMDGPULibFunc::EI_RINT:
731	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: rint, AllowMinSizeF32: true,
732	AllowF64: true);
733	case AMDGPULibFunc::EI_ROUND:
734	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: round, AllowMinSizeF32: true,
735	AllowF64: true);
736	case AMDGPULibFunc::EI_LDEXP: {
737	if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
738	return false;
739
740	Value *Arg1 = CI->getArgOperand(i: `1`);
741	if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
742	VecTy && !isa<VectorType>(Val: Arg1->getType())) {
743	Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
744	CI->setArgOperand(i: `1`, v: SplatArg1);
745	}
746
747	CI->setCalledFunction(Intrinsic::getDeclaration(
748	M: CI->getModule(), Intrinsic::id: ldexp,
749	Tys: {CI->getType(), CI->getArgOperand(i: `1`)->getType()}));
750	return true;
751	}
752	case AMDGPULibFunc::EI_POW: {
753	Module *M = Callee->getParent();
754	AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
755	FunctionCallee PowrFunc = getFunction(M, fInfo: PowrInfo);
756	CallInst *Call = cast<CallInst>(Val: FPOp);
757
758	// pow(x, y) -> powr(x, y) for x >= -0.0
759	// TODO: Account for flags on current call
760	if (PowrFunc &&
761	cannotBeOrderedLessThanZero(
762	V: FPOp->getOperand(i: `0`), /Depth=/`0`,
763	SQ: SimplifyQuery (M->getDataLayout(), TLInfo, DT, AC, Call))) {
764	Call->setCalledFunction(PowrFunc);
765	return fold_pow(FPOp, B, FInfo: PowrInfo) \|\| true;
766	}
767
768	// pow(x, y) -> pown(x, y) for known integral y
769	if (isKnownIntegral(V: FPOp->getOperand(i: `1`), DL: M->getDataLayout(),
770	FMF: FPOp->getFastMathFlags())) {
771	FunctionType *PownType = getPownType(FT: CI->getFunctionType());
772	AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
773	FunctionCallee PownFunc = getFunction(M, fInfo: PownInfo);
774	if (PownFunc) {
775	// TODO: If the incoming integral value is an sitofp/uitofp, it won't
776	// fold out without a known range. We can probably take the source
777	// value directly.
778	Value *CastedArg =
779	B.CreateFPToSI(V: FPOp->getOperand(i: `1`), DestTy: PownType->getParamType(i: `1`));
780	// Have to drop any nofpclass attributes on the original call site.
781	Call->removeParamAttrs(
782	ArgNo: `1`, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType()));
783	Call->setCalledFunction(PownFunc);
784	Call->setArgOperand(i: `1`, v: CastedArg);
785	return fold_pow(FPOp, B, FInfo: PownInfo) \|\| true;
786	}
787	}
788
789	return fold_pow(FPOp, B, FInfo);
790	}
791	case AMDGPULibFunc::EI_POWR:
792	case AMDGPULibFunc::EI_POWN:
793	return fold_pow(FPOp, B, FInfo);
794	case AMDGPULibFunc::EI_ROOTN:
795	return fold_rootn(FPOp, B, FInfo);
796	case AMDGPULibFunc::EI_SQRT:
797	// TODO: Allow with strictfp + constrained intrinsic
798	return tryReplaceLibcallWithSimpleIntrinsic(
799	B, CI, Intrinsic::IntrID: sqrt, AllowMinSizeF32: true, AllowF64: true, /AllowStrictFP=/false);
800	case AMDGPULibFunc::EI_COS:
801	case AMDGPULibFunc::EI_SIN:
802	return fold_sincos(FPOp, B, FInfo);
803	default:
804	break;
805	}
806	} else {
807	// Specialized optimizations for each function call
808	switch (FInfo.getId()) {
809	case AMDGPULibFunc::EI_READ_PIPE_2:
810	case AMDGPULibFunc::EI_READ_PIPE_4:
811	case AMDGPULibFunc::EI_WRITE_PIPE_2:
812	case AMDGPULibFunc::EI_WRITE_PIPE_4:
813	return fold_read_write_pipe(CI, B, FInfo);
814	default:
815	break;
816	}
817	}
818
819	return false;
820	}
821
822	bool AMDGPULibCalls::TDOFold(CallInst CI, const* FuncInfo &FInfo) {
823	// Table-Driven optimization
824	const TableRef tr = getOptTable(id: FInfo.getId());
825	if (tr.empty())
826	return false;
827
828	int const sz = (int)tr.size();
829	Value *opr0 = CI->getArgOperand(i: `0`);
830
831	if (getVecSize(FInfo) > `1`) {
832	if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Val: opr0)) {
833	SmallVector<double, `0`> DVal;
834	for (int eltNo = `0`; eltNo < getVecSize(FInfo); ++eltNo) {
835	ConstantFP *eltval = dyn_cast<ConstantFP>(
836	Val: CV->getElementAsConstant(i: (unsigned)eltNo));
837	assert(eltval && "Non-FP arguments in math function!");
838	bool found = false;
839	for (int i=`0`; i < sz; ++i) {
840	if (eltval->isExactlyValue(V: tr [i].input)) {
841	DVal.push_back(Elt: tr [i].result);
842	found = true;
843	break;
844	}
845	}
846	if (!found) {
847	// This vector constants not handled yet.
848	return false;
849	}
850	}
851	LLVMContext &context = CI->getParent()->getParent()->getContext();
852	Constant *nval;
853	if (getArgType(FInfo) == AMDGPULibFunc::F32) {
854	SmallVector<float, `0`> FVal;
855	for (unsigned i = `0`; i < DVal.size(); ++i) {
856	FVal.push_back(Elt: (float)DVal [i]);
857	}
858	ArrayRef<float> tmp(FVal);
859	nval = ConstantDataVector::get(Context&: context, Elts: tmp);
860	} else { // F64
861	ArrayRef<double> tmp(DVal);
862	nval = ConstantDataVector::get(Context&: context, Elts: tmp);
863	}
864	LLVM_DEBUG(errs() << "AMDIC: " << CI << " ---> " << nval << "\n");
865	replaceCall(I: CI, With: nval);
866	return true;
867	}
868	} else {
869	// Scalar version
870	if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
871	for (int i = `0`; i < sz; ++i) {
872	if (CF->isExactlyValue(V: tr [i].input)) {
873	Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr [i].result);
874	LLVM_DEBUG(errs() << "AMDIC: " << CI << " ---> " << nval << "\n");
875	replaceCall(I: CI, With: nval);
876	return true;
877	}
878	}
879	}
880	}
881
882	return false;
883	}
884
namespace llvm {
/// Base-2 logarithm of \p V. Uses the C library's log2 when the feature-test
/// macros indicate it is available; otherwise computes log(V) / ln(2).
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
}
894
895	bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
896	const FuncInfo &FInfo) {
897	assert((FInfo.getId() == AMDGPULibFunc::EI_POW \|\|
898	FInfo.getId() == AMDGPULibFunc::EI_POWR \|\|
899	FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
900	"fold_pow: encounter a wrong function call");
901
902	Module *M = B.GetInsertBlock()->getModule();
903	Type *eltType = FPOp->getType()->getScalarType();
904	Value *opr0 = FPOp->getOperand(i: `0`);
905	Value *opr1 = FPOp->getOperand(i: `1`);
906
907	const APFloat CF = nullptr*;
908	const APInt CINT = nullptr*;
909	if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
910	match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));
911
912	// 0x1111111 means that we don't do anything for this call.
913	int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : `0x1111111`);
914
915	if ((CF && CF->isZero()) \|\| (CINT && ci_opr1 == `0`)) {
916	// pow/powr/pown(x, 0) == 1
917	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
918	Constant *cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
919	if (getVecSize(FInfo) > `1`) {
920	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
921	}
922	replaceCall(I: FPOp, With: cnval);
923	return true;
924	}
925	if ((CF && CF->isExactlyValue(V: `1.0`)) \|\| (CINT && ci_opr1 == `1`)) {
926	// pow/powr/pown(x, 1.0) = x
927	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << "\n");
928	replaceCall(I: FPOp, With: opr0);
929	return true;
930	}
931	if ((CF && CF->isExactlyValue(V: `2.0`)) \|\| (CINT && ci_opr1 == `2`)) {
932	// pow/powr/pown(x, 2.0) = xx*
933	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << " * "
934	<< *opr0 << "\n");
935	Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
936	replaceCall(I: FPOp, With: nval);
937	return true;
938	}
939	if ((CF && CF->isExactlyValue(V: -`1.0`)) \|\| (CINT && ci_opr1 == -`1`)) {
940	// pow/powr/pown(x, -1.0) = 1.0/x
941	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> 1 / " << opr0 << "\n");
942	Constant *cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
943	if (getVecSize(FInfo) > `1`) {
944	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
945	}
946	Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
947	replaceCall(I: FPOp, With: nval);
948	return true;
949	}
950
951	if (CF && (CF->isExactlyValue(V: `0.5`) \|\| CF->isExactlyValue(V: -`0.5`))) {
952	// pow[r](x, [-]0.5) = sqrt(x)
953	bool issqrt = CF->isExactlyValue(V: `0.5`);
954	if (FunctionCallee FPExpr =
955	getFunction(M, fInfo: AMDGPULibFunc (issqrt ? AMDGPULibFunc::EI_SQRT
956	: AMDGPULibFunc::EI_RSQRT,
957	FInfo))) {
958	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
959	<< `'('` << *opr0 << ")\n");
960	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
961	: "__pow2rsqrt");
962	replaceCall(I: FPOp, With: nval);
963	return true;
964	}
965	}
966
967	if (!isUnsafeFiniteOnlyMath(FPOp))
968	return false;
969
970	// Unsafe Math optimization
971
972	// Remember that ci_opr1 is set if opr1 is integral
973	if (CF) {
974	double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
975	? (double)CF->convertToFloat()
976	: CF->convertToDouble();
977	int ival = (int)dval;
978	if ((double)ival == dval) {
979	ci_opr1 = ival;
980	} else
981	ci_opr1 = `0x11111111`;
982	}
983
984	// pow/powr/pown(x, c) = [1/](xx..x); where
985	// trunc(c) == c && the number of x == c && \|c\| <= 12
986	unsigned abs_opr1 = (ci_opr1 < `0`) ? -ci_opr1 : ci_opr1;
987	if (abs_opr1 <= `12`) {
988	Constant *cnval;
989	Value *nval;
990	if (abs_opr1 == `0`) {
991	cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
992	if (getVecSize(FInfo) > `1`) {
993	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
994	}
995	nval = cnval;
996	} else {
997	Value valx2 = nullptr*;
998	nval = nullptr;
999	while (abs_opr1 > `0`) {
1000	valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2") : opr0;
1001	if (abs_opr1 & `1`) {
1002	nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod") : valx2;
1003	}
1004	abs_opr1 >>= `1`;
1005	}
1006	}
1007
1008	if (ci_opr1 < `0`) {
1009	cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
1010	if (getVecSize(FInfo) > `1`) {
1011	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
1012	}
1013	nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod");
1014	}
1015	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1016	<< ((ci_opr1 < `0`) ? "1/prod(" : "prod(") << *opr0
1017	<< ")\n");
1018	replaceCall(I: FPOp, With: nval);
1019	return true;
1020	}
1021
1022	// If we should use the generic intrinsic instead of emitting a libcall
1023	const bool ShouldUseIntrinsic = eltType->isFloatTy() \|\| eltType->isHalfTy();
1024
1025	// powr ---> exp2(y log2(x))*
1026	// pown/pow ---> powr(fabs(x), y) \| (x & ((int)y << 31))
1027	FunctionCallee ExpExpr;
1028	if (ShouldUseIntrinsic)
1029	ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::id: exp2, Tys: {FPOp->getType()});
1030	else {
1031	ExpExpr = getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_EXP2, FInfo));
1032	if (!ExpExpr)
1033	return false;
1034	}
1035
1036	bool needlog = false;
1037	bool needabs = false;
1038	bool needcopysign = false;
1039	Constant cnval = nullptr*;
1040	if (getVecSize(FInfo) == `1`) {
1041	CF = nullptr;
1042	match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));
1043
1044	if (CF) {
1045	double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
1046	? (double)CF->convertToFloat()
1047	: CF->convertToDouble();
1048
1049	V = log2(V: std::abs(x: V));
1050	cnval = ConstantFP::get(Ty: eltType, V);
1051	needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
1052	CF->isNegative();
1053	} else {
1054	needlog = true;
1055	needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
1056	}
1057	} else {
1058	ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);
1059
1060	if (!CDV) {
1061	needlog = true;
1062	needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
1063	} else {
1064	assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
1065	"Wrong vector size detected");
1066
1067	SmallVector<double, `0`> DVal;
1068	for (int i=`0`; i < getVecSize(FInfo); ++i) {
1069	double V = CDV->getElementAsAPFloat(i).convertToDouble();
1070	if (V < `0.0`) needcopysign = true;
1071	V = log2(V: std::abs(x: V));
1072	DVal.push_back(Elt: V);
1073	}
1074	if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1075	SmallVector<float, `0`> FVal;
1076	for (unsigned i=`0`; i < DVal.size(); ++i) {
1077	FVal.push_back(Elt: (float)DVal [i]);
1078	}
1079	ArrayRef<float> tmp(FVal);
1080	cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1081	} else {
1082	ArrayRef<double> tmp(DVal);
1083	cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1084	}
1085	}
1086	}
1087
1088	if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
1089	// We cannot handle corner cases for a general pow() function, give up
1090	// unless y is a constant integral value. Then proceed as if it were pown.
1091	if (!isKnownIntegral(V: opr1, DL: M->getDataLayout(), FMF: FPOp->getFastMathFlags()))
1092	return false;
1093	}
1094
1095	Value *nval;
1096	if (needabs) {
1097	nval = B.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: opr0, FMFSource: nullptr, Name: "__fabs");
1098	} else {
1099	nval = cnval ? cnval : opr0;
1100	}
1101	if (needlog) {
1102	FunctionCallee LogExpr;
1103	if (ShouldUseIntrinsic) {
1104	LogExpr =
1105	Intrinsic::getDeclaration(M, Intrinsic::id: log2, Tys: {FPOp->getType()});
1106	} else {
1107	LogExpr = getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_LOG2, FInfo));
1108	if (!LogExpr)
1109	return false;
1110	}
1111
1112	nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2");
1113	}
1114
1115	if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
1116	// convert int(32) to fp(f32 or f64)
1117	opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F");
1118	}
1119	nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx");
1120	nval = CreateCallEx(B,Callee: ExpExpr, Arg: nval, Name: "__exp2");
1121
1122	if (needcopysign) {
1123	Value *opr_n;
1124	Type* rTy = opr0->getType();
1125	Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
1126	Type *nTy = nTyS;
1127	if (const auto *vTy = dyn_cast<FixedVectorType>(Val: rTy))
1128	nTy = FixedVectorType::get(ElementType: nTyS, FVTy: vTy);
1129	unsigned size = nTy->getScalarSizeInBits();
1130	opr_n = FPOp->getOperand(i: `1`);
1131	if (opr_n->getType()->isIntegerTy())
1132	opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou");
1133	else
1134	opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou");
1135
1136	Value *sign = B.CreateShl(LHS: opr_n, RHS: size-`1`, Name: "__yeven");
1137	sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign");
1138	nval = B.CreateOr(LHS: B.CreateBitCast(V: nval, DestTy: nTy), RHS: sign);
1139	nval = B.CreateBitCast(V: nval, DestTy: opr0->getType());
1140	}
1141
1142	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1143	<< "exp2(" << opr1 << " log2(" << *opr0 << "))\n");
1144	replaceCall(I: FPOp, With: nval);
1145
1146	return true;
1147	}
1148
1149	bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
1150	const FuncInfo &FInfo) {
1151	// skip vector function
1152	if (getVecSize(FInfo) != `1`)
1153	return false;
1154
1155	Value *opr0 = FPOp->getOperand(i: `0`);
1156	Value *opr1 = FPOp->getOperand(i: `1`);
1157
1158	ConstantInt *CINT = dyn_cast<ConstantInt>(Val: opr1);
1159	if (!CINT) {
1160	return false;
1161	}
1162	int ci_opr1 = (int)CINT->getSExtValue();
1163	if (ci_opr1 == `1`) { // rootn(x, 1) = x
1164	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << "\n");
1165	replaceCall(I: FPOp, With: opr0);
1166	return true;
1167	}
1168
1169	Module *M = B.GetInsertBlock()->getModule();
1170	if (ci_opr1 == `2`) { // rootn(x, 2) = sqrt(x)
1171	if (FunctionCallee FPExpr =
1172	getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_SQRT, FInfo))) {
1173	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> sqrt(" << opr0
1174	<< ")\n");
1175	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2sqrt");
1176	replaceCall(I: FPOp, With: nval);
1177	return true;
1178	}
1179	} else if (ci_opr1 == `3`) { // rootn(x, 3) = cbrt(x)
1180	if (FunctionCallee FPExpr =
1181	getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_CBRT, FInfo))) {
1182	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> cbrt(" << opr0
1183	<< ")\n");
1184	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt");
1185	replaceCall(I: FPOp, With: nval);
1186	return true;
1187	}
1188	} else if (ci_opr1 == -`1`) { // rootn(x, -1) = 1.0/x
1189	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> 1.0 / " << opr0 << "\n");
1190	Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: `1.0`),
1191	R: opr0,
1192	Name: "__rootn2div");
1193	replaceCall(I: FPOp, With: nval);
1194	return true;
1195	} else if (ci_opr1 == -`2`) { // rootn(x, -2) = rsqrt(x)
1196	if (FunctionCallee FPExpr =
1197	getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_RSQRT, FInfo))) {
1198	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> rsqrt(" << opr0
1199	<< ")\n");
1200	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2rsqrt");
1201	replaceCall(I: FPOp, With: nval);
1202	return true;
1203	}
1204	}
1205	return false;
1206	}
1207
1208	// Get a scalar native builtin single argument FP function
1209	FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
1210	const FuncInfo &FInfo) {
1211	if (getArgType(FInfo) == AMDGPULibFunc::F64 \|\| !HasNative(id: FInfo.getId()))
1212	return nullptr;
1213	FuncInfo nf = FInfo;
1214	nf.setPrefix(AMDGPULibFunc::NATIVE);
1215	return getFunction(M, fInfo: nf);
1216	}
1217
1218	// Some library calls are just wrappers around llvm intrinsics, but compiled
1219	// conservatively. Preserve the flags from the original call site by
1220	// substituting them with direct calls with all the flags.
1221	bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
1222	bool AllowMinSizeF32,
1223	bool AllowF64,
1224	bool AllowStrictFP) {
1225	Type *FltTy = CI->getType()->getScalarType();
1226	const bool IsF32 = FltTy->isFloatTy();
1227
1228	// f64 intrinsics aren't implemented for most operations.
1229	if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 \|\| !FltTy->isDoubleTy()))
1230	return false;
1231
1232	// We're implicitly inlining by replacing the libcall with the intrinsic, so
1233	// don't do it for noinline call sites.
1234	if (CI->isNoInline())
1235	return false;
1236
1237	const Function *ParentF = CI->getFunction();
1238	// TODO: Handle strictfp
1239	if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
1240	return false;
1241
1242	if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
1243	return false;
1244	return true;
1245	}
1246
1247	void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
1248	CallInst *CI,
1249	Intrinsic::ID IntrID) {
1250	if (CI->arg_size() == `2`) {
1251	Value *Arg0 = CI->getArgOperand(i: `0`);
1252	Value *Arg1 = CI->getArgOperand(i: `1`);
1253	VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType());
1254	VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType());
1255	if (Arg0VecTy && !Arg1VecTy) {
1256	Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1);
1257	CI->setArgOperand(i: `1`, v: SplatRHS);
1258	} else if (!Arg0VecTy && Arg1VecTy) {
1259	Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0);
1260	CI->setArgOperand(i: `0`, v: SplatLHS);
1261	}
1262	}
1263
1264	CI->setCalledFunction(
1265	Intrinsic::getDeclaration(M: CI->getModule(), id: IntrID, Tys: {CI->getType()}));
1266	}
1267
1268	bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
1269	IRBuilder<> &B, CallInst CI, Intrinsic::ID IntrID, bool* AllowMinSizeF32,
1270	bool AllowF64, bool AllowStrictFP) {
1271	if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
1272	AllowStrictFP))
1273	return false;
1274	replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
1275	return true;
1276	}
1277
1278	std::tuple<Value , Value , Value *>
1279	AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
1280	FunctionCallee Fsincos) {
1281	DebugLoc DL = B.getCurrentDebugLocation();
1282	Function *F = B.GetInsertBlock()->getParent();
1283	B.SetInsertPointPastAllocas(F);
1284
1285	AllocaInst Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr*, Name: "__sincos_");
1286
1287	if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
1288	// If the argument is an instruction, it must dominate all uses so put our
1289	// sincos call there. Otherwise, right after the allocas works well enough
1290	// if it's an argument or constant.
1291
1292	B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());
1293
1294	// SetInsertPoint unwelcomely always tries to set the debug loc.
1295	B.SetCurrentDebugLocation(DL);
1296	}
1297
1298	Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: `1`);
1299
1300	// The allocaInst allocates the memory in private address space. This need
1301	// to be addrspacecasted to point to the address space of cos pointer type.
1302	// In OpenCL 2.0 this is generic, while in 1.2 that is private.
1303	Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);
1304
1305	CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);
1306
1307	// TODO: Is it worth trying to preserve the location for the cos calls for the
1308	// load?
1309
1310	LoadInst *LoadCos = B.CreateLoad(Ty: Alloc->getAllocatedType(), Ptr: Alloc);
1311	return {SinCos, LoadCos, SinCos};
1312	}
1313
1314	// fold sin, cos -> sincos.
1315	bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
1316	const FuncInfo &fInfo) {
1317	assert(fInfo.getId() == AMDGPULibFunc::EI_SIN \|\|
1318	fInfo.getId() == AMDGPULibFunc::EI_COS);
1319
1320	if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
1321	getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) \|\|
1322	fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
1323	return false;
1324
1325	bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
1326
1327	Value *CArgVal = FPOp->getOperand(i: `0`);
1328	CallInst *CI = cast<CallInst>(Val: FPOp);
1329
1330	Function *F = B.GetInsertBlock()->getParent();
1331	Module *M = F->getParent();
1332
1333	// Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
1334	// implementation. Prefer the private form if available.
1335	AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
1336	SinCosLibFuncPrivate.getLeads()[`0`].PtrKind =
1337	AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);
1338
1339	AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
1340	SinCosLibFuncGeneric.getLeads()[`0`].PtrKind =
1341	AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);
1342
1343	FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
1344	FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
1345	FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
1346	if (!FSinCos)
1347	return false;
1348
1349	SmallVector<CallInst *> SinCalls;
1350	SmallVector<CallInst *> CosCalls;
1351	SmallVector<CallInst *> SinCosCalls;
1352	FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
1353	fInfo);
1354	const std::string PairName = PartnerInfo.mangle();
1355
1356	StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
1357	StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
1358	const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
1359	const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
1360
1361	// Intersect the two sets of flags.
1362	FastMathFlags FMF = FPOp->getFastMathFlags();
1363	MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);
1364
1365	SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
1366
1367	for (User* U : CArgVal->users()) {
1368	CallInst *XI = dyn_cast<CallInst>(Val: U);
1369	if (!XI \|\| XI->getFunction() != F \|\| XI->isNoBuiltin())
1370	continue;
1371
1372	Function *UCallee = XI->getCalledFunction();
1373	if (!UCallee)
1374	continue;
1375
1376	bool Handled = true;
1377
1378	if (UCallee->getName() == SinName)
1379	SinCalls.push_back(Elt: XI);
1380	else if (UCallee->getName() == CosName)
1381	CosCalls.push_back(Elt: XI);
1382	else if (UCallee->getName() == SinCosPrivateName \|\|
1383	UCallee->getName() == SinCosGenericName)
1384	SinCosCalls.push_back(Elt: XI);
1385	else
1386	Handled = false;
1387
1388	if (Handled) {
1389	MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
1390	auto *OtherOp = cast<FPMathOperator>(Val: XI);
1391	FMF &= OtherOp->getFastMathFlags();
1392	FPMath = MDNode::getMostGenericFPMath(
1393	A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
1394	}
1395	}
1396
1397	if (SinCalls.empty() \|\| CosCalls.empty())
1398	return false;
1399
1400	B.setFastMathFlags(FMF);
1401	B.setDefaultFPMathTag(FPMath);
1402	DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
1403	B.SetCurrentDebugLocation(DbgLoc);
1404
1405	auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);
1406
1407	auto replaceTrigInsts = [](ArrayRef<CallInst > Calls, Value Res) {
1408	for (CallInst *C : Calls)
1409	C->replaceAllUsesWith(V: Res);
1410
1411	// Leave the other dead instructions to avoid clobbering iterators.
1412	};
1413
1414	replaceTrigInsts (SinCalls, Sin);
1415	replaceTrigInsts (CosCalls, Cos);
1416	replaceTrigInsts (SinCosCalls, SinCos);
1417
1418	// It's safe to delete the original now.
1419	CI->eraseFromParent();
1420	return true;
1421	}
1422
1423	bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
1424	double &Res1, Constant *copr0,
1425	Constant *copr1) {
1426	// By default, opr0/opr1/opr3 holds values of float/double type.
1427	// If they are not float/double, each function has to its
1428	// operand separately.
1429	double opr0 = `0.0`, opr1 = `0.0`;
1430	ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0);
1431	ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1);
1432	if (fpopr0) {
1433	opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1434	? fpopr0->getValueAPF().convertToDouble()
1435	: (double)fpopr0->getValueAPF().convertToFloat();
1436	}
1437
1438	if (fpopr1) {
1439	opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1440	? fpopr1->getValueAPF().convertToDouble()
1441	: (double)fpopr1->getValueAPF().convertToFloat();
1442	}
1443
1444	switch (FInfo.getId()) {
1445	default : return false;
1446
1447	case AMDGPULibFunc::EI_ACOS:
1448	Res0 = acos(x: opr0);
1449	return true;
1450
1451	case AMDGPULibFunc::EI_ACOSH:
1452	// acosh(x) == log(x + sqrt(xx - 1))*
1453	Res0 = log(x: opr0 + sqrt(x: opr0*opr0 - `1.0`));
1454	return true;
1455
1456	case AMDGPULibFunc::EI_ACOSPI:
1457	Res0 = acos(x: opr0) / MATH_PI;
1458	return true;
1459
1460	case AMDGPULibFunc::EI_ASIN:
1461	Res0 = asin(x: opr0);
1462	return true;
1463
1464	case AMDGPULibFunc::EI_ASINH:
1465	// asinh(x) == log(x + sqrt(xx + 1))*
1466	Res0 = log(x: opr0 + sqrt(x: opr0*opr0 + `1.0`));
1467	return true;
1468
1469	case AMDGPULibFunc::EI_ASINPI:
1470	Res0 = asin(x: opr0) / MATH_PI;
1471	return true;
1472
1473	case AMDGPULibFunc::EI_ATAN:
1474	Res0 = atan(x: opr0);
1475	return true;
1476
1477	case AMDGPULibFunc::EI_ATANH:
1478	// atanh(x) == (log(x+1) - log(x-1))/2;
1479	Res0 = (log(x: opr0 + `1.0`) - log(x: opr0 - `1.0`))/`2.0`;
1480	return true;
1481
1482	case AMDGPULibFunc::EI_ATANPI:
1483	Res0 = atan(x: opr0) / MATH_PI;
1484	return true;
1485
1486	case AMDGPULibFunc::EI_CBRT:
1487	Res0 = (opr0 < `0.0`) ? -pow(x: -opr0, y: `1.0`/`3.0`) : pow(x: opr0, y: `1.0`/`3.0`);
1488	return true;
1489
1490	case AMDGPULibFunc::EI_COS:
1491	Res0 = cos(x: opr0);
1492	return true;
1493
1494	case AMDGPULibFunc::EI_COSH:
1495	Res0 = cosh(x: opr0);
1496	return true;
1497
1498	case AMDGPULibFunc::EI_COSPI:
1499	Res0 = cos(MATH_PI * opr0);
1500	return true;
1501
1502	case AMDGPULibFunc::EI_EXP:
1503	Res0 = exp(x: opr0);
1504	return true;
1505
1506	case AMDGPULibFunc::EI_EXP2:
1507	Res0 = pow(x: `2.0`, y: opr0);
1508	return true;
1509
1510	case AMDGPULibFunc::EI_EXP10:
1511	Res0 = pow(x: `10.0`, y: opr0);
1512	return true;
1513
1514	case AMDGPULibFunc::EI_LOG:
1515	Res0 = log(x: opr0);
1516	return true;
1517
1518	case AMDGPULibFunc::EI_LOG2:
1519	Res0 = log(x: opr0) / log(x: `2.0`);
1520	return true;
1521
1522	case AMDGPULibFunc::EI_LOG10:
1523	Res0 = log(x: opr0) / log(x: `10.0`);
1524	return true;
1525
1526	case AMDGPULibFunc::EI_RSQRT:
1527	Res0 = `1.0` / sqrt(x: opr0);
1528	return true;
1529
1530	case AMDGPULibFunc::EI_SIN:
1531	Res0 = sin(x: opr0);
1532	return true;
1533
1534	case AMDGPULibFunc::EI_SINH:
1535	Res0 = sinh(x: opr0);
1536	return true;
1537
1538	case AMDGPULibFunc::EI_SINPI:
1539	Res0 = sin(MATH_PI * opr0);
1540	return true;
1541
1542	case AMDGPULibFunc::EI_TAN:
1543	Res0 = tan(x: opr0);
1544	return true;
1545
1546	case AMDGPULibFunc::EI_TANH:
1547	Res0 = tanh(x: opr0);
1548	return true;
1549
1550	case AMDGPULibFunc::EI_TANPI:
1551	Res0 = tan(MATH_PI * opr0);
1552	return true;
1553
1554	// two-arg functions
1555	case AMDGPULibFunc::EI_POW:
1556	case AMDGPULibFunc::EI_POWR:
1557	Res0 = pow(x: opr0, y: opr1);
1558	return true;
1559
1560	case AMDGPULibFunc::EI_POWN: {
1561	if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1562	double val = (double)iopr1->getSExtValue();
1563	Res0 = pow(x: opr0, y: val);
1564	return true;
1565	}
1566	return false;
1567	}
1568
1569	case AMDGPULibFunc::EI_ROOTN: {
1570	if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1571	double val = (double)iopr1->getSExtValue();
1572	Res0 = pow(x: opr0, y: `1.0` / val);
1573	return true;
1574	}
1575	return false;
1576	}
1577
1578	// with ptr arg
1579	case AMDGPULibFunc::EI_SINCOS:
1580	Res0 = sin(x: opr0);
1581	Res1 = cos(x: opr0);
1582	return true;
1583	}
1584
1585	return false;
1586	}
1587
1588	bool AMDGPULibCalls::evaluateCall(CallInst aCI, const* FuncInfo &FInfo) {
1589	int numArgs = (int)aCI->arg_size();
1590	if (numArgs > `3`)
1591	return false;
1592
1593	Constant copr0 = nullptr*;
1594	Constant copr1 = nullptr*;
1595	if (numArgs > `0`) {
1596	if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: `0`))) == nullptr)
1597	return false;
1598	}
1599
1600	if (numArgs > `1`) {
1601	if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: `1`))) == nullptr) {
1602	if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
1603	return false;
1604	}
1605	}
1606
1607	// At this point, all arguments to aCI are constants.
1608
1609	// max vector size is 16, and sincos will generate two results.
1610	double DVal0[`16`], DVal1[`16`];
1611	int FuncVecSize = getVecSize(FInfo);
1612	bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
1613	if (FuncVecSize == `1`) {
1614	if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[`0`], Res1&: DVal1[`0`], copr0, copr1)) {
1615	return false;
1616	}
1617	} else {
1618	ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0);
1619	ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1);
1620	for (int i = `0`; i < FuncVecSize; ++i) {
1621	Constant celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr*;
1622	Constant celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr*;
1623	if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[i], Res1&: DVal1[i], copr0: celt0, copr1: celt1)) {
1624	return false;
1625	}
1626	}
1627	}
1628
1629	LLVMContext &context = aCI->getContext();
1630	Constant nval0, nval1;
1631	if (FuncVecSize == `1`) {
1632	nval0 = ConstantFP::get(Ty: aCI->getType(), V: DVal0[`0`]);
1633	if (hasTwoResults)
1634	nval1 = ConstantFP::get(Ty: aCI->getType(), V: DVal1[`0`]);
1635	} else {
1636	if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1637	SmallVector <float, `0`> FVal0, FVal1;
1638	for (int i = `0`; i < FuncVecSize; ++i)
1639	FVal0.push_back(Elt: (float)DVal0[i]);
1640	ArrayRef<float> tmp0(FVal0);
1641	nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0);
1642	if (hasTwoResults) {
1643	for (int i = `0`; i < FuncVecSize; ++i)
1644	FVal1.push_back(Elt: (float)DVal1[i]);
1645	ArrayRef<float> tmp1(FVal1);
1646	nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1);
1647	}
1648	} else {
1649	ArrayRef<double> tmp0(DVal0);
1650	nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0);
1651	if (hasTwoResults) {
1652	ArrayRef<double> tmp1(DVal1);
1653	nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1);
1654	}
1655	}
1656	}
1657
1658	if (hasTwoResults) {
1659	// sincos
1660	assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
1661	"math function with ptr arg not supported yet");
1662	new StoreInst (nval1, aCI->getArgOperand(i: `1`), aCI->getIterator());
1663	}
1664
1665	replaceCall(I: aCI, With: nval0);
1666	return true;
1667	}
1668
1669	PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
1670	FunctionAnalysisManager &AM) {
1671	AMDGPULibCalls Simplifier;
1672	Simplifier.initNativeFuncs();
1673	Simplifier.initFunction(F, FAM&: AM);
1674
1675	bool Changed = false;
1676
1677	LLVM_DEBUG(dbgs() << "AMDIC: process function ";
1678	F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << `'\n'`;);
1679
1680	for (auto &BB : F) {
1681	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
1682	// Ignore non-calls.
1683	CallInst *CI = dyn_cast<CallInst>(Val&: I);
1684	++I;
1685
1686	if (CI) {
1687	if (Simplifier.fold(CI))
1688	Changed = true;
1689	}
1690	}
1691	}
1692	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
1693	}
1694
1695	PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
1696	FunctionAnalysisManager &AM) {
1697	if (UseNative.empty())
1698	return PreservedAnalyses::all();
1699
1700	AMDGPULibCalls Simplifier;
1701	Simplifier.initNativeFuncs();
1702	Simplifier.initFunction(F, FAM&: AM);
1703
1704	bool Changed = false;
1705	for (auto &BB : F) {
1706	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
1707	// Ignore non-calls.
1708	CallInst *CI = dyn_cast<CallInst>(Val&: I);
1709	++I;
1710	if (CI && Simplifier.useNative(aCI: CI))
1711	Changed = true;
1712	}
1713	}
1714	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
1715	}
1716

source code of llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp