1 | //===- AMDGPULibCalls.cpp -------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file does AMD library function optimizations. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "AMDGPULibFunc.h" |
16 | #include "GCNSubtarget.h" |
17 | #include "llvm/Analysis/AssumptionCache.h" |
18 | #include "llvm/Analysis/TargetLibraryInfo.h" |
19 | #include "llvm/Analysis/ValueTracking.h" |
20 | #include "llvm/IR/AttributeMask.h" |
21 | #include "llvm/IR/Dominators.h" |
22 | #include "llvm/IR/IRBuilder.h" |
23 | #include "llvm/IR/IntrinsicInst.h" |
24 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
25 | #include "llvm/IR/PatternMatch.h" |
26 | #include "llvm/InitializePasses.h" |
27 | #include <cmath> |
28 | |
29 | #define DEBUG_TYPE "amdgpu-simplifylib" |
30 | |
31 | using namespace llvm; |
32 | using namespace llvm::PatternMatch; |
33 | |
34 | static cl::opt<bool> EnablePreLink("amdgpu-prelink" , |
35 | cl::desc("Enable pre-link mode optimizations" ), |
36 | cl::init(Val: false), |
37 | cl::Hidden); |
38 | |
39 | static cl::list<std::string> UseNative("amdgpu-use-native" , |
40 | cl::desc("Comma separated list of functions to replace with native, or all" ), |
41 | cl::CommaSeparated, cl::ValueOptional, |
42 | cl::Hidden); |
43 | |
44 | #define MATH_PI numbers::pi |
45 | #define MATH_E numbers::e |
46 | #define MATH_SQRT2 numbers::sqrt2 |
47 | #define MATH_SQRT1_2 numbers::inv_sqrt2 |
48 | |
49 | namespace llvm { |
50 | |
51 | class AMDGPULibCalls { |
52 | private: |
53 | const TargetLibraryInfo *TLInfo = nullptr; |
54 | AssumptionCache *AC = nullptr; |
55 | DominatorTree *DT = nullptr; |
56 | |
57 | typedef llvm::AMDGPULibFunc FuncInfo; |
58 | |
59 | bool UnsafeFPMath = false; |
60 | |
61 | // -fuse-native. |
62 | bool AllNative = false; |
63 | |
64 | bool useNativeFunc(const StringRef F) const; |
65 | |
66 | // Return a pointer (pointer expr) to the function if function definition with |
67 | // "FuncName" exists. It may create a new function prototype in pre-link mode. |
68 | FunctionCallee getFunction(Module *M, const FuncInfo &fInfo); |
69 | |
70 | bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo); |
71 | |
72 | bool TDOFold(CallInst *CI, const FuncInfo &FInfo); |
73 | |
74 | /* Specialized optimizations */ |
75 | |
76 | // pow/powr/pown |
77 | bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); |
78 | |
79 | // rootn |
80 | bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); |
81 | |
82 | // -fuse-native for sincos |
83 | bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo); |
84 | |
85 | // evaluate calls if calls' arguments are constants. |
86 | bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1, |
87 | Constant *copr0, Constant *copr1); |
88 | bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); |
89 | |
90 | /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value |
91 | /// of cos, sincos call). |
92 | std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg, |
93 | FastMathFlags FMF, |
94 | IRBuilder<> &B, |
95 | FunctionCallee Fsincos); |
96 | |
97 | // sin/cos |
98 | bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); |
99 | |
100 | // __read_pipe/__write_pipe |
101 | bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, |
102 | const FuncInfo &FInfo); |
103 | |
104 | // Get a scalar native builtin single argument FP function |
105 | FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo); |
106 | |
107 | /// Substitute a call to a known libcall with an intrinsic call. If \p |
108 | /// AllowMinSize is true, allow the replacement in a minsize function. |
109 | bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI, |
110 | bool AllowMinSizeF32 = false, |
111 | bool AllowF64 = false, |
112 | bool AllowStrictFP = false); |
113 | void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI, |
114 | Intrinsic::ID IntrID); |
115 | |
116 | bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI, |
117 | Intrinsic::ID IntrID, |
118 | bool AllowMinSizeF32 = false, |
119 | bool AllowF64 = false, |
120 | bool AllowStrictFP = false); |
121 | |
122 | protected: |
123 | bool isUnsafeMath(const FPMathOperator *FPOp) const; |
124 | bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; |
125 | |
126 | bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; |
127 | |
128 | static void replaceCall(Instruction *I, Value *With) { |
129 | I->replaceAllUsesWith(V: With); |
130 | I->eraseFromParent(); |
131 | } |
132 | |
133 | static void replaceCall(FPMathOperator *I, Value *With) { |
134 | replaceCall(I: cast<Instruction>(Val: I), With); |
135 | } |
136 | |
137 | public: |
138 | AMDGPULibCalls() {} |
139 | |
140 | bool fold(CallInst *CI); |
141 | |
142 | void initFunction(Function &F, FunctionAnalysisManager &FAM); |
143 | void initNativeFuncs(); |
144 | |
145 | // Replace a normal math function call with that native version |
146 | bool useNative(CallInst *CI); |
147 | }; |
148 | |
149 | } // end llvm namespace |
150 | |
151 | template <typename IRB> |
152 | static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, |
153 | const Twine &Name = "" ) { |
154 | CallInst *R = B.CreateCall(Callee, Arg, Name); |
155 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
156 | R->setCallingConv(F->getCallingConv()); |
157 | return R; |
158 | } |
159 | |
160 | template <typename IRB> |
161 | static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, |
162 | Value *Arg2, const Twine &Name = "" ) { |
163 | CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name); |
164 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
165 | R->setCallingConv(F->getCallingConv()); |
166 | return R; |
167 | } |
168 | |
169 | static FunctionType *getPownType(FunctionType *FT) { |
170 | Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext()); |
171 | if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType())) |
172 | PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount()); |
173 | |
174 | return FunctionType::get(Result: FT->getReturnType(), |
175 | Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false); |
176 | } |
177 | |
178 | // Data structures for table-driven optimizations. |
179 | // FuncTbl works for both f32 and f64 functions with 1 input argument |
180 | |
namespace {

/// One row of a table-driven fold: calling the function on exactly \p input
/// yields \p result. Kept in an anonymous namespace so this generically named,
/// file-local type cannot clash with a type of the same name in another
/// translation unit.
struct TableEntry {
  double result;
  double input;
};

} // end anonymous namespace
185 | |
186 | /* a list of {result, input} */ |
187 | static const TableEntry tbl_acos[] = { |
188 | {MATH_PI / 2.0, .input: 0.0}, |
189 | {MATH_PI / 2.0, .input: -0.0}, |
190 | {.result: 0.0, .input: 1.0}, |
191 | {MATH_PI, .input: -1.0} |
192 | }; |
193 | static const TableEntry tbl_acosh[] = { |
194 | {.result: 0.0, .input: 1.0} |
195 | }; |
196 | static const TableEntry tbl_acospi[] = { |
197 | {.result: 0.5, .input: 0.0}, |
198 | {.result: 0.5, .input: -0.0}, |
199 | {.result: 0.0, .input: 1.0}, |
200 | {.result: 1.0, .input: -1.0} |
201 | }; |
202 | static const TableEntry tbl_asin[] = { |
203 | {.result: 0.0, .input: 0.0}, |
204 | {.result: -0.0, .input: -0.0}, |
205 | {MATH_PI / 2.0, .input: 1.0}, |
206 | {.result: -MATH_PI / 2.0, .input: -1.0} |
207 | }; |
208 | static const TableEntry tbl_asinh[] = { |
209 | {.result: 0.0, .input: 0.0}, |
210 | {.result: -0.0, .input: -0.0} |
211 | }; |
212 | static const TableEntry tbl_asinpi[] = { |
213 | {.result: 0.0, .input: 0.0}, |
214 | {.result: -0.0, .input: -0.0}, |
215 | {.result: 0.5, .input: 1.0}, |
216 | {.result: -0.5, .input: -1.0} |
217 | }; |
218 | static const TableEntry tbl_atan[] = { |
219 | {.result: 0.0, .input: 0.0}, |
220 | {.result: -0.0, .input: -0.0}, |
221 | {MATH_PI / 4.0, .input: 1.0}, |
222 | {.result: -MATH_PI / 4.0, .input: -1.0} |
223 | }; |
224 | static const TableEntry tbl_atanh[] = { |
225 | {.result: 0.0, .input: 0.0}, |
226 | {.result: -0.0, .input: -0.0} |
227 | }; |
228 | static const TableEntry tbl_atanpi[] = { |
229 | {.result: 0.0, .input: 0.0}, |
230 | {.result: -0.0, .input: -0.0}, |
231 | {.result: 0.25, .input: 1.0}, |
232 | {.result: -0.25, .input: -1.0} |
233 | }; |
234 | static const TableEntry tbl_cbrt[] = { |
235 | {.result: 0.0, .input: 0.0}, |
236 | {.result: -0.0, .input: -0.0}, |
237 | {.result: 1.0, .input: 1.0}, |
238 | {.result: -1.0, .input: -1.0}, |
239 | }; |
240 | static const TableEntry tbl_cos[] = { |
241 | {.result: 1.0, .input: 0.0}, |
242 | {.result: 1.0, .input: -0.0} |
243 | }; |
244 | static const TableEntry tbl_cosh[] = { |
245 | {.result: 1.0, .input: 0.0}, |
246 | {.result: 1.0, .input: -0.0} |
247 | }; |
248 | static const TableEntry tbl_cospi[] = { |
249 | {.result: 1.0, .input: 0.0}, |
250 | {.result: 1.0, .input: -0.0} |
251 | }; |
252 | static const TableEntry tbl_erfc[] = { |
253 | {.result: 1.0, .input: 0.0}, |
254 | {.result: 1.0, .input: -0.0} |
255 | }; |
256 | static const TableEntry tbl_erf[] = { |
257 | {.result: 0.0, .input: 0.0}, |
258 | {.result: -0.0, .input: -0.0} |
259 | }; |
260 | static const TableEntry tbl_exp[] = { |
261 | {.result: 1.0, .input: 0.0}, |
262 | {.result: 1.0, .input: -0.0}, |
263 | {MATH_E, .input: 1.0} |
264 | }; |
265 | static const TableEntry tbl_exp2[] = { |
266 | {.result: 1.0, .input: 0.0}, |
267 | {.result: 1.0, .input: -0.0}, |
268 | {.result: 2.0, .input: 1.0} |
269 | }; |
270 | static const TableEntry tbl_exp10[] = { |
271 | {.result: 1.0, .input: 0.0}, |
272 | {.result: 1.0, .input: -0.0}, |
273 | {.result: 10.0, .input: 1.0} |
274 | }; |
275 | static const TableEntry tbl_expm1[] = { |
276 | {.result: 0.0, .input: 0.0}, |
277 | {.result: -0.0, .input: -0.0} |
278 | }; |
279 | static const TableEntry tbl_log[] = { |
280 | {.result: 0.0, .input: 1.0}, |
281 | {.result: 1.0, MATH_E} |
282 | }; |
283 | static const TableEntry tbl_log2[] = { |
284 | {.result: 0.0, .input: 1.0}, |
285 | {.result: 1.0, .input: 2.0} |
286 | }; |
287 | static const TableEntry tbl_log10[] = { |
288 | {.result: 0.0, .input: 1.0}, |
289 | {.result: 1.0, .input: 10.0} |
290 | }; |
291 | static const TableEntry tbl_rsqrt[] = { |
292 | {.result: 1.0, .input: 1.0}, |
293 | {MATH_SQRT1_2, .input: 2.0} |
294 | }; |
295 | static const TableEntry tbl_sin[] = { |
296 | {.result: 0.0, .input: 0.0}, |
297 | {.result: -0.0, .input: -0.0} |
298 | }; |
299 | static const TableEntry tbl_sinh[] = { |
300 | {.result: 0.0, .input: 0.0}, |
301 | {.result: -0.0, .input: -0.0} |
302 | }; |
303 | static const TableEntry tbl_sinpi[] = { |
304 | {.result: 0.0, .input: 0.0}, |
305 | {.result: -0.0, .input: -0.0} |
306 | }; |
307 | static const TableEntry tbl_sqrt[] = { |
308 | {.result: 0.0, .input: 0.0}, |
309 | {.result: 1.0, .input: 1.0}, |
310 | {MATH_SQRT2, .input: 2.0} |
311 | }; |
312 | static const TableEntry tbl_tan[] = { |
313 | {.result: 0.0, .input: 0.0}, |
314 | {.result: -0.0, .input: -0.0} |
315 | }; |
316 | static const TableEntry tbl_tanh[] = { |
317 | {.result: 0.0, .input: 0.0}, |
318 | {.result: -0.0, .input: -0.0} |
319 | }; |
320 | static const TableEntry tbl_tanpi[] = { |
321 | {.result: 0.0, .input: 0.0}, |
322 | {.result: -0.0, .input: -0.0} |
323 | }; |
324 | static const TableEntry tbl_tgamma[] = { |
325 | {.result: 1.0, .input: 1.0}, |
326 | {.result: 1.0, .input: 2.0}, |
327 | {.result: 2.0, .input: 3.0}, |
328 | {.result: 6.0, .input: 4.0} |
329 | }; |
330 | |
331 | static bool HasNative(AMDGPULibFunc::EFuncId id) { |
332 | switch(id) { |
333 | case AMDGPULibFunc::EI_DIVIDE: |
334 | case AMDGPULibFunc::EI_COS: |
335 | case AMDGPULibFunc::EI_EXP: |
336 | case AMDGPULibFunc::EI_EXP2: |
337 | case AMDGPULibFunc::EI_EXP10: |
338 | case AMDGPULibFunc::EI_LOG: |
339 | case AMDGPULibFunc::EI_LOG2: |
340 | case AMDGPULibFunc::EI_LOG10: |
341 | case AMDGPULibFunc::EI_POWR: |
342 | case AMDGPULibFunc::EI_RECIP: |
343 | case AMDGPULibFunc::EI_RSQRT: |
344 | case AMDGPULibFunc::EI_SIN: |
345 | case AMDGPULibFunc::EI_SINCOS: |
346 | case AMDGPULibFunc::EI_SQRT: |
347 | case AMDGPULibFunc::EI_TAN: |
348 | return true; |
349 | default:; |
350 | } |
351 | return false; |
352 | } |
353 | |
354 | using TableRef = ArrayRef<TableEntry>; |
355 | |
356 | static TableRef getOptTable(AMDGPULibFunc::EFuncId id) { |
357 | switch(id) { |
358 | case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos); |
359 | case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh); |
360 | case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi); |
361 | case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin); |
362 | case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh); |
363 | case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi); |
364 | case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan); |
365 | case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh); |
366 | case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi); |
367 | case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt); |
368 | case AMDGPULibFunc::EI_NCOS: |
369 | case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos); |
370 | case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh); |
371 | case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi); |
372 | case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc); |
373 | case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf); |
374 | case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp); |
375 | case AMDGPULibFunc::EI_NEXP2: |
376 | case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2); |
377 | case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10); |
378 | case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1); |
379 | case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log); |
380 | case AMDGPULibFunc::EI_NLOG2: |
381 | case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2); |
382 | case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10); |
383 | case AMDGPULibFunc::EI_NRSQRT: |
384 | case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt); |
385 | case AMDGPULibFunc::EI_NSIN: |
386 | case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin); |
387 | case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh); |
388 | case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi); |
389 | case AMDGPULibFunc::EI_NSQRT: |
390 | case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt); |
391 | case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan); |
392 | case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh); |
393 | case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi); |
394 | case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma); |
395 | default:; |
396 | } |
397 | return TableRef(); |
398 | } |
399 | |
400 | static inline int getVecSize(const AMDGPULibFunc& FInfo) { |
401 | return FInfo.getLeads()[0].VectorSize; |
402 | } |
403 | |
404 | static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) { |
405 | return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType; |
406 | } |
407 | |
408 | FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) { |
409 | // If we are doing PreLinkOpt, the function is external. So it is safe to |
410 | // use getOrInsertFunction() at this stage. |
411 | |
412 | return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo) |
413 | : AMDGPULibFunc::getFunction(M, fInfo); |
414 | } |
415 | |
416 | bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, |
417 | FuncInfo &FInfo) { |
418 | return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo); |
419 | } |
420 | |
421 | bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { |
422 | return UnsafeFPMath || FPOp->isFast(); |
423 | } |
424 | |
425 | bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { |
426 | return UnsafeFPMath || |
427 | (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); |
428 | } |
429 | |
430 | bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( |
431 | const FPMathOperator *FPOp) const { |
432 | // TODO: Refine to approxFunc or contract |
433 | return isUnsafeMath(FPOp); |
434 | } |
435 | |
436 | void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { |
437 | UnsafeFPMath = F.getFnAttribute(Kind: "unsafe-fp-math" ).getValueAsBool(); |
438 | AC = &FAM.getResult<AssumptionAnalysis>(IR&: F); |
439 | TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
440 | DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F); |
441 | } |
442 | |
443 | bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { |
444 | return AllNative || llvm::is_contained(Range&: UseNative, Element: F); |
445 | } |
446 | |
447 | void AMDGPULibCalls::initNativeFuncs() { |
448 | AllNative = useNativeFunc(F: "all" ) || |
449 | (UseNative.getNumOccurrences() && UseNative.size() == 1 && |
450 | UseNative.begin()->empty()); |
451 | } |
452 | |
453 | bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) { |
454 | bool native_sin = useNativeFunc(F: "sin" ); |
455 | bool native_cos = useNativeFunc(F: "cos" ); |
456 | |
457 | if (native_sin && native_cos) { |
458 | Module *M = aCI->getModule(); |
459 | Value *opr0 = aCI->getArgOperand(i: 0); |
460 | |
461 | AMDGPULibFunc nf; |
462 | nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType; |
463 | nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize; |
464 | |
465 | nf.setPrefix(AMDGPULibFunc::NATIVE); |
466 | nf.setId(AMDGPULibFunc::EI_SIN); |
467 | FunctionCallee sinExpr = getFunction(M, fInfo: nf); |
468 | |
469 | nf.setPrefix(AMDGPULibFunc::NATIVE); |
470 | nf.setId(AMDGPULibFunc::EI_COS); |
471 | FunctionCallee cosExpr = getFunction(M, fInfo: nf); |
472 | if (sinExpr && cosExpr) { |
473 | Value *sinval = |
474 | CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin" , InsertBefore: aCI->getIterator()); |
475 | Value *cosval = |
476 | CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos" , InsertBefore: aCI->getIterator()); |
477 | new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator()); |
478 | |
479 | DEBUG_WITH_TYPE("usenative" , dbgs() << "<useNative> replace " << *aCI |
480 | << " with native version of sin/cos" ); |
481 | |
482 | replaceCall(I: aCI, With: sinval); |
483 | return true; |
484 | } |
485 | } |
486 | return false; |
487 | } |
488 | |
489 | bool AMDGPULibCalls::useNative(CallInst *aCI) { |
490 | Function *Callee = aCI->getCalledFunction(); |
491 | if (!Callee || aCI->isNoBuiltin()) |
492 | return false; |
493 | |
494 | FuncInfo FInfo; |
495 | if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() || |
496 | FInfo.getPrefix() != AMDGPULibFunc::NOPFX || |
497 | getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) || |
498 | !(AllNative || useNativeFunc(F: FInfo.getName()))) { |
499 | return false; |
500 | } |
501 | |
502 | if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS) |
503 | return sincosUseNative(aCI, FInfo); |
504 | |
505 | FInfo.setPrefix(AMDGPULibFunc::NATIVE); |
506 | FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo); |
507 | if (!F) |
508 | return false; |
509 | |
510 | aCI->setCalledFunction(F); |
511 | DEBUG_WITH_TYPE("usenative" , dbgs() << "<useNative> replace " << *aCI |
512 | << " with native version" ); |
513 | return true; |
514 | } |
515 | |
516 | // Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe |
517 | // builtin, with appended type size and alignment arguments, where 2 or 4 |
518 | // indicates the original number of arguments. The library has optimized version |
519 | // of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same |
520 | // power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N |
521 | // for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ..., |
522 | // 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4. |
523 | bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, |
524 | const FuncInfo &FInfo) { |
525 | auto *Callee = CI->getCalledFunction(); |
526 | if (!Callee->isDeclaration()) |
527 | return false; |
528 | |
529 | assert(Callee->hasName() && "Invalid read_pipe/write_pipe function" ); |
530 | auto *M = Callee->getParent(); |
531 | std::string Name = std::string(Callee->getName()); |
532 | auto NumArg = CI->arg_size(); |
533 | if (NumArg != 4 && NumArg != 6) |
534 | return false; |
535 | ConstantInt *PacketSize = |
536 | dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2)); |
537 | ConstantInt *PacketAlign = |
538 | dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1)); |
539 | if (!PacketSize || !PacketAlign) |
540 | return false; |
541 | |
542 | unsigned Size = PacketSize->getZExtValue(); |
543 | Align Alignment = PacketAlign->getAlignValue(); |
544 | if (Alignment != Size) |
545 | return false; |
546 | |
547 | unsigned PtrArgLoc = CI->arg_size() - 3; |
548 | Value *PtrArg = CI->getArgOperand(i: PtrArgLoc); |
549 | Type *PtrTy = PtrArg->getType(); |
550 | |
551 | SmallVector<llvm::Type *, 6> ArgTys; |
552 | for (unsigned I = 0; I != PtrArgLoc; ++I) |
553 | ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType()); |
554 | ArgTys.push_back(Elt: PtrTy); |
555 | |
556 | Name = Name + "_" + std::to_string(val: Size); |
557 | auto *FTy = FunctionType::get(Result: Callee->getReturnType(), |
558 | Params: ArrayRef<Type *>(ArgTys), isVarArg: false); |
559 | AMDGPULibFunc NewLibFunc(Name, FTy); |
560 | FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc); |
561 | if (!F) |
562 | return false; |
563 | |
564 | SmallVector<Value *, 6> Args; |
565 | for (unsigned I = 0; I != PtrArgLoc; ++I) |
566 | Args.push_back(Elt: CI->getArgOperand(i: I)); |
567 | Args.push_back(Elt: PtrArg); |
568 | |
569 | auto *NCI = B.CreateCall(Callee: F, Args); |
570 | NCI->setAttributes(CI->getAttributes()); |
571 | CI->replaceAllUsesWith(V: NCI); |
572 | CI->dropAllReferences(); |
573 | CI->eraseFromParent(); |
574 | |
575 | return true; |
576 | } |
577 | |
578 | static bool isKnownIntegral(const Value *V, const DataLayout &DL, |
579 | FastMathFlags FMF) { |
580 | if (isa<UndefValue>(Val: V)) |
581 | return true; |
582 | |
583 | if (const ConstantFP *CF = dyn_cast<ConstantFP>(Val: V)) |
584 | return CF->getValueAPF().isInteger(); |
585 | |
586 | if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: V)) { |
587 | for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) { |
588 | Constant *ConstElt = CDV->getElementAsConstant(i); |
589 | if (isa<UndefValue>(Val: ConstElt)) |
590 | continue; |
591 | const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: ConstElt); |
592 | if (!CFP || !CFP->getValue().isInteger()) |
593 | return false; |
594 | } |
595 | |
596 | return true; |
597 | } |
598 | |
599 | const Instruction *I = dyn_cast<Instruction>(Val: V); |
600 | if (!I) |
601 | return false; |
602 | |
603 | switch (I->getOpcode()) { |
604 | case Instruction::SIToFP: |
605 | case Instruction::UIToFP: |
606 | // TODO: Could check nofpclass(inf) on incoming argument |
607 | if (FMF.noInfs()) |
608 | return true; |
609 | |
610 | // Need to check int size cannot produce infinity, which computeKnownFPClass |
611 | // knows how to do already. |
612 | return isKnownNeverInfinity(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL)); |
613 | case Instruction::Call: { |
614 | const CallInst *CI = cast<CallInst>(Val: I); |
615 | switch (CI->getIntrinsicID()) { |
616 | case Intrinsic::trunc: |
617 | case Intrinsic::floor: |
618 | case Intrinsic::ceil: |
619 | case Intrinsic::rint: |
620 | case Intrinsic::nearbyint: |
621 | case Intrinsic::round: |
622 | case Intrinsic::roundeven: |
623 | return (FMF.noInfs() && FMF.noNaNs()) || |
624 | isKnownNeverInfOrNaN(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL)); |
625 | default: |
626 | break; |
627 | } |
628 | |
629 | break; |
630 | } |
631 | default: |
632 | break; |
633 | } |
634 | |
635 | return false; |
636 | } |
637 | |
638 | // This function returns false if no change; return true otherwise. |
639 | bool AMDGPULibCalls::fold(CallInst *CI) { |
640 | Function *Callee = CI->getCalledFunction(); |
641 | // Ignore indirect calls. |
642 | if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin()) |
643 | return false; |
644 | |
645 | FuncInfo FInfo; |
646 | if (!parseFunctionName(FMangledName: Callee->getName(), FInfo)) |
647 | return false; |
648 | |
649 | // Further check the number of arguments to see if they match. |
650 | // TODO: Check calling convention matches too |
651 | if (!FInfo.isCompatibleSignature(FuncTy: CI->getFunctionType())) |
652 | return false; |
653 | |
654 | LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n'); |
655 | |
656 | if (TDOFold(CI, FInfo)) |
657 | return true; |
658 | |
659 | IRBuilder<> B(CI); |
660 | if (CI->isStrictFP()) |
661 | B.setIsFPConstrained(true); |
662 | |
663 | if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) { |
664 | // Under unsafe-math, evaluate calls if possible. |
665 | // According to Brian Sumner, we can do this for all f32 function calls |
666 | // using host's double function calls. |
667 | if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo)) |
668 | return true; |
669 | |
670 | // Copy fast flags from the original call. |
671 | FastMathFlags FMF = FPOp->getFastMathFlags(); |
672 | B.setFastMathFlags(FMF); |
673 | |
674 | // Specialized optimizations for each function call. |
675 | // |
676 | // TODO: Handle native functions |
677 | switch (FInfo.getId()) { |
678 | case AMDGPULibFunc::EI_EXP: |
679 | if (FMF.none()) |
680 | return false; |
681 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: exp, |
682 | AllowMinSizeF32: FMF.approxFunc()); |
683 | case AMDGPULibFunc::EI_EXP2: |
684 | if (FMF.none()) |
685 | return false; |
686 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: exp2, |
687 | AllowMinSizeF32: FMF.approxFunc()); |
688 | case AMDGPULibFunc::EI_LOG: |
689 | if (FMF.none()) |
690 | return false; |
691 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log, |
692 | AllowMinSizeF32: FMF.approxFunc()); |
693 | case AMDGPULibFunc::EI_LOG2: |
694 | if (FMF.none()) |
695 | return false; |
696 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log2, |
697 | AllowMinSizeF32: FMF.approxFunc()); |
698 | case AMDGPULibFunc::EI_LOG10: |
699 | if (FMF.none()) |
700 | return false; |
701 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: log10, |
702 | AllowMinSizeF32: FMF.approxFunc()); |
703 | case AMDGPULibFunc::EI_FMIN: |
704 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: minnum, |
705 | AllowMinSizeF32: true, AllowF64: true); |
706 | case AMDGPULibFunc::EI_FMAX: |
707 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: maxnum, |
708 | AllowMinSizeF32: true, AllowF64: true); |
709 | case AMDGPULibFunc::EI_FMA: |
710 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fma, AllowMinSizeF32: true, |
711 | AllowF64: true); |
712 | case AMDGPULibFunc::EI_MAD: |
713 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fmuladd, |
714 | AllowMinSizeF32: true, AllowF64: true); |
715 | case AMDGPULibFunc::EI_FABS: |
716 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: fabs, AllowMinSizeF32: true, |
717 | AllowF64: true, AllowStrictFP: true); |
718 | case AMDGPULibFunc::EI_COPYSIGN: |
719 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: copysign, |
720 | AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true); |
721 | case AMDGPULibFunc::EI_FLOOR: |
722 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: floor, AllowMinSizeF32: true, |
723 | AllowF64: true); |
724 | case AMDGPULibFunc::EI_CEIL: |
725 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: ceil, AllowMinSizeF32: true, |
726 | AllowF64: true); |
727 | case AMDGPULibFunc::EI_TRUNC: |
728 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: trunc, AllowMinSizeF32: true, |
729 | AllowF64: true); |
730 | case AMDGPULibFunc::EI_RINT: |
731 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: rint, AllowMinSizeF32: true, |
732 | AllowF64: true); |
733 | case AMDGPULibFunc::EI_ROUND: |
734 | return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::IntrID: round, AllowMinSizeF32: true, |
735 | AllowF64: true); |
736 | case AMDGPULibFunc::EI_LDEXP: { |
737 | if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true)) |
738 | return false; |
739 | |
740 | Value *Arg1 = CI->getArgOperand(i: 1); |
741 | if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType()); |
742 | VecTy && !isa<VectorType>(Val: Arg1->getType())) { |
743 | Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1); |
744 | CI->setArgOperand(i: 1, v: SplatArg1); |
745 | } |
746 | |
747 | CI->setCalledFunction(Intrinsic::getDeclaration( |
748 | M: CI->getModule(), Intrinsic::id: ldexp, |
749 | Tys: {CI->getType(), CI->getArgOperand(i: 1)->getType()})); |
750 | return true; |
751 | } |
752 | case AMDGPULibFunc::EI_POW: { |
753 | Module *M = Callee->getParent(); |
754 | AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo); |
755 | FunctionCallee PowrFunc = getFunction(M, fInfo: PowrInfo); |
756 | CallInst *Call = cast<CallInst>(Val: FPOp); |
757 | |
758 | // pow(x, y) -> powr(x, y) for x >= -0.0 |
759 | // TODO: Account for flags on current call |
760 | if (PowrFunc && |
761 | cannotBeOrderedLessThanZero( |
762 | V: FPOp->getOperand(i: 0), /*Depth=*/0, |
763 | SQ: SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) { |
764 | Call->setCalledFunction(PowrFunc); |
765 | return fold_pow(FPOp, B, FInfo: PowrInfo) || true; |
766 | } |
767 | |
768 | // pow(x, y) -> pown(x, y) for known integral y |
769 | if (isKnownIntegral(V: FPOp->getOperand(i: 1), DL: M->getDataLayout(), |
770 | FMF: FPOp->getFastMathFlags())) { |
771 | FunctionType *PownType = getPownType(FT: CI->getFunctionType()); |
772 | AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true); |
773 | FunctionCallee PownFunc = getFunction(M, fInfo: PownInfo); |
774 | if (PownFunc) { |
775 | // TODO: If the incoming integral value is an sitofp/uitofp, it won't |
776 | // fold out without a known range. We can probably take the source |
777 | // value directly. |
778 | Value *CastedArg = |
779 | B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1)); |
780 | // Have to drop any nofpclass attributes on the original call site. |
781 | Call->removeParamAttrs( |
782 | ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType())); |
783 | Call->setCalledFunction(PownFunc); |
784 | Call->setArgOperand(i: 1, v: CastedArg); |
785 | return fold_pow(FPOp, B, FInfo: PownInfo) || true; |
786 | } |
787 | } |
788 | |
789 | return fold_pow(FPOp, B, FInfo); |
790 | } |
791 | case AMDGPULibFunc::EI_POWR: |
792 | case AMDGPULibFunc::EI_POWN: |
793 | return fold_pow(FPOp, B, FInfo); |
794 | case AMDGPULibFunc::EI_ROOTN: |
795 | return fold_rootn(FPOp, B, FInfo); |
796 | case AMDGPULibFunc::EI_SQRT: |
797 | // TODO: Allow with strictfp + constrained intrinsic |
798 | return tryReplaceLibcallWithSimpleIntrinsic( |
799 | B, CI, Intrinsic::IntrID: sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false); |
800 | case AMDGPULibFunc::EI_COS: |
801 | case AMDGPULibFunc::EI_SIN: |
802 | return fold_sincos(FPOp, B, FInfo); |
803 | default: |
804 | break; |
805 | } |
806 | } else { |
807 | // Specialized optimizations for each function call |
808 | switch (FInfo.getId()) { |
809 | case AMDGPULibFunc::EI_READ_PIPE_2: |
810 | case AMDGPULibFunc::EI_READ_PIPE_4: |
811 | case AMDGPULibFunc::EI_WRITE_PIPE_2: |
812 | case AMDGPULibFunc::EI_WRITE_PIPE_4: |
813 | return fold_read_write_pipe(CI, B, FInfo); |
814 | default: |
815 | break; |
816 | } |
817 | } |
818 | |
819 | return false; |
820 | } |
821 | |
822 | bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { |
823 | // Table-Driven optimization |
824 | const TableRef tr = getOptTable(id: FInfo.getId()); |
825 | if (tr.empty()) |
826 | return false; |
827 | |
828 | int const sz = (int)tr.size(); |
829 | Value *opr0 = CI->getArgOperand(i: 0); |
830 | |
831 | if (getVecSize(FInfo) > 1) { |
832 | if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Val: opr0)) { |
833 | SmallVector<double, 0> DVal; |
834 | for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) { |
835 | ConstantFP *eltval = dyn_cast<ConstantFP>( |
836 | Val: CV->getElementAsConstant(i: (unsigned)eltNo)); |
837 | assert(eltval && "Non-FP arguments in math function!" ); |
838 | bool found = false; |
839 | for (int i=0; i < sz; ++i) { |
840 | if (eltval->isExactlyValue(V: tr[i].input)) { |
841 | DVal.push_back(Elt: tr[i].result); |
842 | found = true; |
843 | break; |
844 | } |
845 | } |
846 | if (!found) { |
847 | // This vector constants not handled yet. |
848 | return false; |
849 | } |
850 | } |
851 | LLVMContext &context = CI->getParent()->getParent()->getContext(); |
852 | Constant *nval; |
853 | if (getArgType(FInfo) == AMDGPULibFunc::F32) { |
854 | SmallVector<float, 0> FVal; |
855 | for (unsigned i = 0; i < DVal.size(); ++i) { |
856 | FVal.push_back(Elt: (float)DVal[i]); |
857 | } |
858 | ArrayRef<float> tmp(FVal); |
859 | nval = ConstantDataVector::get(Context&: context, Elts: tmp); |
860 | } else { // F64 |
861 | ArrayRef<double> tmp(DVal); |
862 | nval = ConstantDataVector::get(Context&: context, Elts: tmp); |
863 | } |
864 | LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" ); |
865 | replaceCall(I: CI, With: nval); |
866 | return true; |
867 | } |
868 | } else { |
869 | // Scalar version |
870 | if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) { |
871 | for (int i = 0; i < sz; ++i) { |
872 | if (CF->isExactlyValue(V: tr[i].input)) { |
873 | Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result); |
874 | LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" ); |
875 | replaceCall(I: CI, With: nval); |
876 | return true; |
877 | } |
878 | } |
879 | } |
880 | } |
881 | |
882 | return false; |
883 | } |
884 | |
namespace llvm {
/// Base-2 logarithm of \p V, used by the constant folders in this file.
///
/// Declared inside namespace llvm so that unqualified log2() calls made from
/// llvm:: scope resolve here. C++11 <cmath> guarantees std::log2, so no
/// feature-test macros or log(V)/ln2 fallback are needed; using the same
/// routine on every host keeps folded constants host-independent.
static double log2(double V) {
  return std::log2(V);
}
} // namespace llvm
894 | |
895 | bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, |
896 | const FuncInfo &FInfo) { |
897 | assert((FInfo.getId() == AMDGPULibFunc::EI_POW || |
898 | FInfo.getId() == AMDGPULibFunc::EI_POWR || |
899 | FInfo.getId() == AMDGPULibFunc::EI_POWN) && |
900 | "fold_pow: encounter a wrong function call" ); |
901 | |
902 | Module *M = B.GetInsertBlock()->getModule(); |
903 | Type *eltType = FPOp->getType()->getScalarType(); |
904 | Value *opr0 = FPOp->getOperand(i: 0); |
905 | Value *opr1 = FPOp->getOperand(i: 1); |
906 | |
907 | const APFloat *CF = nullptr; |
908 | const APInt *CINT = nullptr; |
909 | if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF))) |
910 | match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)); |
911 | |
912 | // 0x1111111 means that we don't do anything for this call. |
913 | int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111); |
914 | |
915 | if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) { |
916 | // pow/powr/pown(x, 0) == 1 |
917 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n" ); |
918 | Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0); |
919 | if (getVecSize(FInfo) > 1) { |
920 | cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval); |
921 | } |
922 | replaceCall(I: FPOp, With: cnval); |
923 | return true; |
924 | } |
925 | if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) { |
926 | // pow/powr/pown(x, 1.0) = x |
927 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n" ); |
928 | replaceCall(I: FPOp, With: opr0); |
929 | return true; |
930 | } |
931 | if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) { |
932 | // pow/powr/pown(x, 2.0) = x*x |
933 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * " |
934 | << *opr0 << "\n" ); |
935 | Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2" ); |
936 | replaceCall(I: FPOp, With: nval); |
937 | return true; |
938 | } |
939 | if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) { |
940 | // pow/powr/pown(x, -1.0) = 1.0/x |
941 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n" ); |
942 | Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0); |
943 | if (getVecSize(FInfo) > 1) { |
944 | cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval); |
945 | } |
946 | Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip" ); |
947 | replaceCall(I: FPOp, With: nval); |
948 | return true; |
949 | } |
950 | |
951 | if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) { |
952 | // pow[r](x, [-]0.5) = sqrt(x) |
953 | bool issqrt = CF->isExactlyValue(V: 0.5); |
954 | if (FunctionCallee FPExpr = |
955 | getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT |
956 | : AMDGPULibFunc::EI_RSQRT, |
957 | FInfo))) { |
958 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName() |
959 | << '(' << *opr0 << ")\n" ); |
960 | Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt" |
961 | : "__pow2rsqrt" ); |
962 | replaceCall(I: FPOp, With: nval); |
963 | return true; |
964 | } |
965 | } |
966 | |
967 | if (!isUnsafeFiniteOnlyMath(FPOp)) |
968 | return false; |
969 | |
970 | // Unsafe Math optimization |
971 | |
972 | // Remember that ci_opr1 is set if opr1 is integral |
973 | if (CF) { |
974 | double dval = (getArgType(FInfo) == AMDGPULibFunc::F32) |
975 | ? (double)CF->convertToFloat() |
976 | : CF->convertToDouble(); |
977 | int ival = (int)dval; |
978 | if ((double)ival == dval) { |
979 | ci_opr1 = ival; |
980 | } else |
981 | ci_opr1 = 0x11111111; |
982 | } |
983 | |
984 | // pow/powr/pown(x, c) = [1/](x*x*..x); where |
985 | // trunc(c) == c && the number of x == c && |c| <= 12 |
986 | unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1; |
987 | if (abs_opr1 <= 12) { |
988 | Constant *cnval; |
989 | Value *nval; |
990 | if (abs_opr1 == 0) { |
991 | cnval = ConstantFP::get(Ty: eltType, V: 1.0); |
992 | if (getVecSize(FInfo) > 1) { |
993 | cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval); |
994 | } |
995 | nval = cnval; |
996 | } else { |
997 | Value *valx2 = nullptr; |
998 | nval = nullptr; |
999 | while (abs_opr1 > 0) { |
1000 | valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2" ) : opr0; |
1001 | if (abs_opr1 & 1) { |
1002 | nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod" ) : valx2; |
1003 | } |
1004 | abs_opr1 >>= 1; |
1005 | } |
1006 | } |
1007 | |
1008 | if (ci_opr1 < 0) { |
1009 | cnval = ConstantFP::get(Ty: eltType, V: 1.0); |
1010 | if (getVecSize(FInfo) > 1) { |
1011 | cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval); |
1012 | } |
1013 | nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod" ); |
1014 | } |
1015 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " |
1016 | << ((ci_opr1 < 0) ? "1/prod(" : "prod(" ) << *opr0 |
1017 | << ")\n" ); |
1018 | replaceCall(I: FPOp, With: nval); |
1019 | return true; |
1020 | } |
1021 | |
1022 | // If we should use the generic intrinsic instead of emitting a libcall |
1023 | const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy(); |
1024 | |
1025 | // powr ---> exp2(y * log2(x)) |
1026 | // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) |
1027 | FunctionCallee ExpExpr; |
1028 | if (ShouldUseIntrinsic) |
1029 | ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::id: exp2, Tys: {FPOp->getType()}); |
1030 | else { |
1031 | ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); |
1032 | if (!ExpExpr) |
1033 | return false; |
1034 | } |
1035 | |
1036 | bool needlog = false; |
1037 | bool needabs = false; |
1038 | bool needcopysign = false; |
1039 | Constant *cnval = nullptr; |
1040 | if (getVecSize(FInfo) == 1) { |
1041 | CF = nullptr; |
1042 | match(V: opr0, P: m_APFloatAllowPoison(Res&: CF)); |
1043 | |
1044 | if (CF) { |
1045 | double V = (getArgType(FInfo) == AMDGPULibFunc::F32) |
1046 | ? (double)CF->convertToFloat() |
1047 | : CF->convertToDouble(); |
1048 | |
1049 | V = log2(V: std::abs(x: V)); |
1050 | cnval = ConstantFP::get(Ty: eltType, V); |
1051 | needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) && |
1052 | CF->isNegative(); |
1053 | } else { |
1054 | needlog = true; |
1055 | needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR; |
1056 | } |
1057 | } else { |
1058 | ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0); |
1059 | |
1060 | if (!CDV) { |
1061 | needlog = true; |
1062 | needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR; |
1063 | } else { |
1064 | assert ((int)CDV->getNumElements() == getVecSize(FInfo) && |
1065 | "Wrong vector size detected" ); |
1066 | |
1067 | SmallVector<double, 0> DVal; |
1068 | for (int i=0; i < getVecSize(FInfo); ++i) { |
1069 | double V = CDV->getElementAsAPFloat(i).convertToDouble(); |
1070 | if (V < 0.0) needcopysign = true; |
1071 | V = log2(V: std::abs(x: V)); |
1072 | DVal.push_back(Elt: V); |
1073 | } |
1074 | if (getArgType(FInfo) == AMDGPULibFunc::F32) { |
1075 | SmallVector<float, 0> FVal; |
1076 | for (unsigned i=0; i < DVal.size(); ++i) { |
1077 | FVal.push_back(Elt: (float)DVal[i]); |
1078 | } |
1079 | ArrayRef<float> tmp(FVal); |
1080 | cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp); |
1081 | } else { |
1082 | ArrayRef<double> tmp(DVal); |
1083 | cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp); |
1084 | } |
1085 | } |
1086 | } |
1087 | |
1088 | if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) { |
1089 | // We cannot handle corner cases for a general pow() function, give up |
1090 | // unless y is a constant integral value. Then proceed as if it were pown. |
1091 | if (!isKnownIntegral(V: opr1, DL: M->getDataLayout(), FMF: FPOp->getFastMathFlags())) |
1092 | return false; |
1093 | } |
1094 | |
1095 | Value *nval; |
1096 | if (needabs) { |
1097 | nval = B.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: opr0, FMFSource: nullptr, Name: "__fabs" ); |
1098 | } else { |
1099 | nval = cnval ? cnval : opr0; |
1100 | } |
1101 | if (needlog) { |
1102 | FunctionCallee LogExpr; |
1103 | if (ShouldUseIntrinsic) { |
1104 | LogExpr = |
1105 | Intrinsic::getDeclaration(M, Intrinsic::id: log2, Tys: {FPOp->getType()}); |
1106 | } else { |
1107 | LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); |
1108 | if (!LogExpr) |
1109 | return false; |
1110 | } |
1111 | |
1112 | nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2" ); |
1113 | } |
1114 | |
1115 | if (FInfo.getId() == AMDGPULibFunc::EI_POWN) { |
1116 | // convert int(32) to fp(f32 or f64) |
1117 | opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F" ); |
1118 | } |
1119 | nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx" ); |
1120 | nval = CreateCallEx(B,Callee: ExpExpr, Arg: nval, Name: "__exp2" ); |
1121 | |
1122 | if (needcopysign) { |
1123 | Value *opr_n; |
1124 | Type* rTy = opr0->getType(); |
1125 | Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits()); |
1126 | Type *nTy = nTyS; |
1127 | if (const auto *vTy = dyn_cast<FixedVectorType>(Val: rTy)) |
1128 | nTy = FixedVectorType::get(ElementType: nTyS, FVTy: vTy); |
1129 | unsigned size = nTy->getScalarSizeInBits(); |
1130 | opr_n = FPOp->getOperand(i: 1); |
1131 | if (opr_n->getType()->isIntegerTy()) |
1132 | opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou" ); |
1133 | else |
1134 | opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou" ); |
1135 | |
1136 | Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven" ); |
1137 | sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign" ); |
1138 | nval = B.CreateOr(LHS: B.CreateBitCast(V: nval, DestTy: nTy), RHS: sign); |
1139 | nval = B.CreateBitCast(V: nval, DestTy: opr0->getType()); |
1140 | } |
1141 | |
1142 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " |
1143 | << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n" ); |
1144 | replaceCall(I: FPOp, With: nval); |
1145 | |
1146 | return true; |
1147 | } |
1148 | |
1149 | bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, |
1150 | const FuncInfo &FInfo) { |
1151 | // skip vector function |
1152 | if (getVecSize(FInfo) != 1) |
1153 | return false; |
1154 | |
1155 | Value *opr0 = FPOp->getOperand(i: 0); |
1156 | Value *opr1 = FPOp->getOperand(i: 1); |
1157 | |
1158 | ConstantInt *CINT = dyn_cast<ConstantInt>(Val: opr1); |
1159 | if (!CINT) { |
1160 | return false; |
1161 | } |
1162 | int ci_opr1 = (int)CINT->getSExtValue(); |
1163 | if (ci_opr1 == 1) { // rootn(x, 1) = x |
1164 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n" ); |
1165 | replaceCall(I: FPOp, With: opr0); |
1166 | return true; |
1167 | } |
1168 | |
1169 | Module *M = B.GetInsertBlock()->getModule(); |
1170 | if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x) |
1171 | if (FunctionCallee FPExpr = |
1172 | getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { |
1173 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 |
1174 | << ")\n" ); |
1175 | Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2sqrt" ); |
1176 | replaceCall(I: FPOp, With: nval); |
1177 | return true; |
1178 | } |
1179 | } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x) |
1180 | if (FunctionCallee FPExpr = |
1181 | getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) { |
1182 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0 |
1183 | << ")\n" ); |
1184 | Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt" ); |
1185 | replaceCall(I: FPOp, With: nval); |
1186 | return true; |
1187 | } |
1188 | } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x |
1189 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n" ); |
1190 | Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), |
1191 | R: opr0, |
1192 | Name: "__rootn2div" ); |
1193 | replaceCall(I: FPOp, With: nval); |
1194 | return true; |
1195 | } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x) |
1196 | if (FunctionCallee FPExpr = |
1197 | getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) { |
1198 | LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0 |
1199 | << ")\n" ); |
1200 | Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2rsqrt" ); |
1201 | replaceCall(I: FPOp, With: nval); |
1202 | return true; |
1203 | } |
1204 | } |
1205 | return false; |
1206 | } |
1207 | |
1208 | // Get a scalar native builtin single argument FP function |
1209 | FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, |
1210 | const FuncInfo &FInfo) { |
1211 | if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId())) |
1212 | return nullptr; |
1213 | FuncInfo nf = FInfo; |
1214 | nf.setPrefix(AMDGPULibFunc::NATIVE); |
1215 | return getFunction(M, fInfo: nf); |
1216 | } |
1217 | |
1218 | // Some library calls are just wrappers around llvm intrinsics, but compiled |
1219 | // conservatively. Preserve the flags from the original call site by |
1220 | // substituting them with direct calls with all the flags. |
1221 | bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI, |
1222 | bool AllowMinSizeF32, |
1223 | bool AllowF64, |
1224 | bool AllowStrictFP) { |
1225 | Type *FltTy = CI->getType()->getScalarType(); |
1226 | const bool IsF32 = FltTy->isFloatTy(); |
1227 | |
1228 | // f64 intrinsics aren't implemented for most operations. |
1229 | if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy())) |
1230 | return false; |
1231 | |
1232 | // We're implicitly inlining by replacing the libcall with the intrinsic, so |
1233 | // don't do it for noinline call sites. |
1234 | if (CI->isNoInline()) |
1235 | return false; |
1236 | |
1237 | const Function *ParentF = CI->getFunction(); |
1238 | // TODO: Handle strictfp |
1239 | if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP)) |
1240 | return false; |
1241 | |
1242 | if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize()) |
1243 | return false; |
1244 | return true; |
1245 | } |
1246 | |
1247 | void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, |
1248 | CallInst *CI, |
1249 | Intrinsic::ID IntrID) { |
1250 | if (CI->arg_size() == 2) { |
1251 | Value *Arg0 = CI->getArgOperand(i: 0); |
1252 | Value *Arg1 = CI->getArgOperand(i: 1); |
1253 | VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType()); |
1254 | VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType()); |
1255 | if (Arg0VecTy && !Arg1VecTy) { |
1256 | Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1); |
1257 | CI->setArgOperand(i: 1, v: SplatRHS); |
1258 | } else if (!Arg0VecTy && Arg1VecTy) { |
1259 | Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0); |
1260 | CI->setArgOperand(i: 0, v: SplatLHS); |
1261 | } |
1262 | } |
1263 | |
1264 | CI->setCalledFunction( |
1265 | Intrinsic::getDeclaration(M: CI->getModule(), id: IntrID, Tys: {CI->getType()})); |
1266 | } |
1267 | |
1268 | bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( |
1269 | IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32, |
1270 | bool AllowF64, bool AllowStrictFP) { |
1271 | if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64, |
1272 | AllowStrictFP)) |
1273 | return false; |
1274 | replaceLibCallWithSimpleIntrinsic(B, CI, IntrID); |
1275 | return true; |
1276 | } |
1277 | |
1278 | std::tuple<Value *, Value *, Value *> |
1279 | AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, |
1280 | FunctionCallee Fsincos) { |
1281 | DebugLoc DL = B.getCurrentDebugLocation(); |
1282 | Function *F = B.GetInsertBlock()->getParent(); |
1283 | B.SetInsertPointPastAllocas(F); |
1284 | |
1285 | AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_" ); |
1286 | |
1287 | if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) { |
1288 | // If the argument is an instruction, it must dominate all uses so put our |
1289 | // sincos call there. Otherwise, right after the allocas works well enough |
1290 | // if it's an argument or constant. |
1291 | |
1292 | B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator()); |
1293 | |
1294 | // SetInsertPoint unwelcomely always tries to set the debug loc. |
1295 | B.SetCurrentDebugLocation(DL); |
1296 | } |
1297 | |
1298 | Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1); |
1299 | |
1300 | // The allocaInst allocates the memory in private address space. This need |
1301 | // to be addrspacecasted to point to the address space of cos pointer type. |
1302 | // In OpenCL 2.0 this is generic, while in 1.2 that is private. |
1303 | Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy); |
1304 | |
1305 | CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc); |
1306 | |
1307 | // TODO: Is it worth trying to preserve the location for the cos calls for the |
1308 | // load? |
1309 | |
1310 | LoadInst *LoadCos = B.CreateLoad(Ty: Alloc->getAllocatedType(), Ptr: Alloc); |
1311 | return {SinCos, LoadCos, SinCos}; |
1312 | } |
1313 | |
1314 | // fold sin, cos -> sincos. |
1315 | bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, |
1316 | const FuncInfo &fInfo) { |
1317 | assert(fInfo.getId() == AMDGPULibFunc::EI_SIN || |
1318 | fInfo.getId() == AMDGPULibFunc::EI_COS); |
1319 | |
1320 | if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 && |
1321 | getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) || |
1322 | fInfo.getPrefix() != AMDGPULibFunc::NOPFX) |
1323 | return false; |
1324 | |
1325 | bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN; |
1326 | |
1327 | Value *CArgVal = FPOp->getOperand(i: 0); |
1328 | CallInst *CI = cast<CallInst>(Val: FPOp); |
1329 | |
1330 | Function *F = B.GetInsertBlock()->getParent(); |
1331 | Module *M = F->getParent(); |
1332 | |
1333 | // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer |
1334 | // implementation. Prefer the private form if available. |
1335 | AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo); |
1336 | SinCosLibFuncPrivate.getLeads()[0].PtrKind = |
1337 | AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS); |
1338 | |
1339 | AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo); |
1340 | SinCosLibFuncGeneric.getLeads()[0].PtrKind = |
1341 | AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS); |
1342 | |
1343 | FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate); |
1344 | FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric); |
1345 | FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric; |
1346 | if (!FSinCos) |
1347 | return false; |
1348 | |
1349 | SmallVector<CallInst *> SinCalls; |
1350 | SmallVector<CallInst *> CosCalls; |
1351 | SmallVector<CallInst *> SinCosCalls; |
1352 | FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN, |
1353 | fInfo); |
1354 | const std::string PairName = PartnerInfo.mangle(); |
1355 | |
1356 | StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName; |
1357 | StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName(); |
1358 | const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle(); |
1359 | const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle(); |
1360 | |
1361 | // Intersect the two sets of flags. |
1362 | FastMathFlags FMF = FPOp->getFastMathFlags(); |
1363 | MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath); |
1364 | |
1365 | SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()}; |
1366 | |
1367 | for (User* U : CArgVal->users()) { |
1368 | CallInst *XI = dyn_cast<CallInst>(Val: U); |
1369 | if (!XI || XI->getFunction() != F || XI->isNoBuiltin()) |
1370 | continue; |
1371 | |
1372 | Function *UCallee = XI->getCalledFunction(); |
1373 | if (!UCallee) |
1374 | continue; |
1375 | |
1376 | bool Handled = true; |
1377 | |
1378 | if (UCallee->getName() == SinName) |
1379 | SinCalls.push_back(Elt: XI); |
1380 | else if (UCallee->getName() == CosName) |
1381 | CosCalls.push_back(Elt: XI); |
1382 | else if (UCallee->getName() == SinCosPrivateName || |
1383 | UCallee->getName() == SinCosGenericName) |
1384 | SinCosCalls.push_back(Elt: XI); |
1385 | else |
1386 | Handled = false; |
1387 | |
1388 | if (Handled) { |
1389 | MergeDbgLocs.push_back(Elt: XI->getDebugLoc()); |
1390 | auto *OtherOp = cast<FPMathOperator>(Val: XI); |
1391 | FMF &= OtherOp->getFastMathFlags(); |
1392 | FPMath = MDNode::getMostGenericFPMath( |
1393 | A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath)); |
1394 | } |
1395 | } |
1396 | |
1397 | if (SinCalls.empty() || CosCalls.empty()) |
1398 | return false; |
1399 | |
1400 | B.setFastMathFlags(FMF); |
1401 | B.setDefaultFPMathTag(FPMath); |
1402 | DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs); |
1403 | B.SetCurrentDebugLocation(DbgLoc); |
1404 | |
1405 | auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos); |
1406 | |
1407 | auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) { |
1408 | for (CallInst *C : Calls) |
1409 | C->replaceAllUsesWith(V: Res); |
1410 | |
1411 | // Leave the other dead instructions to avoid clobbering iterators. |
1412 | }; |
1413 | |
1414 | replaceTrigInsts(SinCalls, Sin); |
1415 | replaceTrigInsts(CosCalls, Cos); |
1416 | replaceTrigInsts(SinCosCalls, SinCos); |
1417 | |
1418 | // It's safe to delete the original now. |
1419 | CI->eraseFromParent(); |
1420 | return true; |
1421 | } |
1422 | |
1423 | bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, |
1424 | double &Res1, Constant *copr0, |
1425 | Constant *copr1) { |
1426 | // By default, opr0/opr1/opr3 holds values of float/double type. |
1427 | // If they are not float/double, each function has to its |
1428 | // operand separately. |
1429 | double opr0 = 0.0, opr1 = 0.0; |
1430 | ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0); |
1431 | ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1); |
1432 | if (fpopr0) { |
1433 | opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1434 | ? fpopr0->getValueAPF().convertToDouble() |
1435 | : (double)fpopr0->getValueAPF().convertToFloat(); |
1436 | } |
1437 | |
1438 | if (fpopr1) { |
1439 | opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1440 | ? fpopr1->getValueAPF().convertToDouble() |
1441 | : (double)fpopr1->getValueAPF().convertToFloat(); |
1442 | } |
1443 | |
1444 | switch (FInfo.getId()) { |
1445 | default : return false; |
1446 | |
1447 | case AMDGPULibFunc::EI_ACOS: |
1448 | Res0 = acos(x: opr0); |
1449 | return true; |
1450 | |
1451 | case AMDGPULibFunc::EI_ACOSH: |
1452 | // acosh(x) == log(x + sqrt(x*x - 1)) |
1453 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 - 1.0)); |
1454 | return true; |
1455 | |
1456 | case AMDGPULibFunc::EI_ACOSPI: |
1457 | Res0 = acos(x: opr0) / MATH_PI; |
1458 | return true; |
1459 | |
1460 | case AMDGPULibFunc::EI_ASIN: |
1461 | Res0 = asin(x: opr0); |
1462 | return true; |
1463 | |
1464 | case AMDGPULibFunc::EI_ASINH: |
1465 | // asinh(x) == log(x + sqrt(x*x + 1)) |
1466 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 + 1.0)); |
1467 | return true; |
1468 | |
1469 | case AMDGPULibFunc::EI_ASINPI: |
1470 | Res0 = asin(x: opr0) / MATH_PI; |
1471 | return true; |
1472 | |
1473 | case AMDGPULibFunc::EI_ATAN: |
1474 | Res0 = atan(x: opr0); |
1475 | return true; |
1476 | |
1477 | case AMDGPULibFunc::EI_ATANH: |
1478 | // atanh(x) == (log(x+1) - log(x-1))/2; |
1479 | Res0 = (log(x: opr0 + 1.0) - log(x: opr0 - 1.0))/2.0; |
1480 | return true; |
1481 | |
1482 | case AMDGPULibFunc::EI_ATANPI: |
1483 | Res0 = atan(x: opr0) / MATH_PI; |
1484 | return true; |
1485 | |
1486 | case AMDGPULibFunc::EI_CBRT: |
1487 | Res0 = (opr0 < 0.0) ? -pow(x: -opr0, y: 1.0/3.0) : pow(x: opr0, y: 1.0/3.0); |
1488 | return true; |
1489 | |
1490 | case AMDGPULibFunc::EI_COS: |
1491 | Res0 = cos(x: opr0); |
1492 | return true; |
1493 | |
1494 | case AMDGPULibFunc::EI_COSH: |
1495 | Res0 = cosh(x: opr0); |
1496 | return true; |
1497 | |
1498 | case AMDGPULibFunc::EI_COSPI: |
1499 | Res0 = cos(MATH_PI * opr0); |
1500 | return true; |
1501 | |
1502 | case AMDGPULibFunc::EI_EXP: |
1503 | Res0 = exp(x: opr0); |
1504 | return true; |
1505 | |
1506 | case AMDGPULibFunc::EI_EXP2: |
1507 | Res0 = pow(x: 2.0, y: opr0); |
1508 | return true; |
1509 | |
1510 | case AMDGPULibFunc::EI_EXP10: |
1511 | Res0 = pow(x: 10.0, y: opr0); |
1512 | return true; |
1513 | |
1514 | case AMDGPULibFunc::EI_LOG: |
1515 | Res0 = log(x: opr0); |
1516 | return true; |
1517 | |
1518 | case AMDGPULibFunc::EI_LOG2: |
1519 | Res0 = log(x: opr0) / log(x: 2.0); |
1520 | return true; |
1521 | |
1522 | case AMDGPULibFunc::EI_LOG10: |
1523 | Res0 = log(x: opr0) / log(x: 10.0); |
1524 | return true; |
1525 | |
1526 | case AMDGPULibFunc::EI_RSQRT: |
1527 | Res0 = 1.0 / sqrt(x: opr0); |
1528 | return true; |
1529 | |
1530 | case AMDGPULibFunc::EI_SIN: |
1531 | Res0 = sin(x: opr0); |
1532 | return true; |
1533 | |
1534 | case AMDGPULibFunc::EI_SINH: |
1535 | Res0 = sinh(x: opr0); |
1536 | return true; |
1537 | |
1538 | case AMDGPULibFunc::EI_SINPI: |
1539 | Res0 = sin(MATH_PI * opr0); |
1540 | return true; |
1541 | |
1542 | case AMDGPULibFunc::EI_TAN: |
1543 | Res0 = tan(x: opr0); |
1544 | return true; |
1545 | |
1546 | case AMDGPULibFunc::EI_TANH: |
1547 | Res0 = tanh(x: opr0); |
1548 | return true; |
1549 | |
1550 | case AMDGPULibFunc::EI_TANPI: |
1551 | Res0 = tan(MATH_PI * opr0); |
1552 | return true; |
1553 | |
1554 | // two-arg functions |
1555 | case AMDGPULibFunc::EI_POW: |
1556 | case AMDGPULibFunc::EI_POWR: |
1557 | Res0 = pow(x: opr0, y: opr1); |
1558 | return true; |
1559 | |
1560 | case AMDGPULibFunc::EI_POWN: { |
1561 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1562 | double val = (double)iopr1->getSExtValue(); |
1563 | Res0 = pow(x: opr0, y: val); |
1564 | return true; |
1565 | } |
1566 | return false; |
1567 | } |
1568 | |
1569 | case AMDGPULibFunc::EI_ROOTN: { |
1570 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1571 | double val = (double)iopr1->getSExtValue(); |
1572 | Res0 = pow(x: opr0, y: 1.0 / val); |
1573 | return true; |
1574 | } |
1575 | return false; |
1576 | } |
1577 | |
1578 | // with ptr arg |
1579 | case AMDGPULibFunc::EI_SINCOS: |
1580 | Res0 = sin(x: opr0); |
1581 | Res1 = cos(x: opr0); |
1582 | return true; |
1583 | } |
1584 | |
1585 | return false; |
1586 | } |
1587 | |
1588 | bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { |
1589 | int numArgs = (int)aCI->arg_size(); |
1590 | if (numArgs > 3) |
1591 | return false; |
1592 | |
1593 | Constant *copr0 = nullptr; |
1594 | Constant *copr1 = nullptr; |
1595 | if (numArgs > 0) { |
1596 | if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr) |
1597 | return false; |
1598 | } |
1599 | |
1600 | if (numArgs > 1) { |
1601 | if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) { |
1602 | if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS) |
1603 | return false; |
1604 | } |
1605 | } |
1606 | |
1607 | // At this point, all arguments to aCI are constants. |
1608 | |
1609 | // max vector size is 16, and sincos will generate two results. |
1610 | double DVal0[16], DVal1[16]; |
1611 | int FuncVecSize = getVecSize(FInfo); |
1612 | bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); |
1613 | if (FuncVecSize == 1) { |
1614 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[0], Res1&: DVal1[0], copr0, copr1)) { |
1615 | return false; |
1616 | } |
1617 | } else { |
1618 | ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0); |
1619 | ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1); |
1620 | for (int i = 0; i < FuncVecSize; ++i) { |
1621 | Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; |
1622 | Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; |
1623 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[i], Res1&: DVal1[i], copr0: celt0, copr1: celt1)) { |
1624 | return false; |
1625 | } |
1626 | } |
1627 | } |
1628 | |
1629 | LLVMContext &context = aCI->getContext(); |
1630 | Constant *nval0, *nval1; |
1631 | if (FuncVecSize == 1) { |
1632 | nval0 = ConstantFP::get(Ty: aCI->getType(), V: DVal0[0]); |
1633 | if (hasTwoResults) |
1634 | nval1 = ConstantFP::get(Ty: aCI->getType(), V: DVal1[0]); |
1635 | } else { |
1636 | if (getArgType(FInfo) == AMDGPULibFunc::F32) { |
1637 | SmallVector <float, 0> FVal0, FVal1; |
1638 | for (int i = 0; i < FuncVecSize; ++i) |
1639 | FVal0.push_back(Elt: (float)DVal0[i]); |
1640 | ArrayRef<float> tmp0(FVal0); |
1641 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1642 | if (hasTwoResults) { |
1643 | for (int i = 0; i < FuncVecSize; ++i) |
1644 | FVal1.push_back(Elt: (float)DVal1[i]); |
1645 | ArrayRef<float> tmp1(FVal1); |
1646 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1647 | } |
1648 | } else { |
1649 | ArrayRef<double> tmp0(DVal0); |
1650 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1651 | if (hasTwoResults) { |
1652 | ArrayRef<double> tmp1(DVal1); |
1653 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1654 | } |
1655 | } |
1656 | } |
1657 | |
1658 | if (hasTwoResults) { |
1659 | // sincos |
1660 | assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS && |
1661 | "math function with ptr arg not supported yet" ); |
1662 | new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator()); |
1663 | } |
1664 | |
1665 | replaceCall(I: aCI, With: nval0); |
1666 | return true; |
1667 | } |
1668 | |
1669 | PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, |
1670 | FunctionAnalysisManager &AM) { |
1671 | AMDGPULibCalls Simplifier; |
1672 | Simplifier.initNativeFuncs(); |
1673 | Simplifier.initFunction(F, FAM&: AM); |
1674 | |
1675 | bool Changed = false; |
1676 | |
1677 | LLVM_DEBUG(dbgs() << "AMDIC: process function " ; |
1678 | F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); |
1679 | |
1680 | for (auto &BB : F) { |
1681 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1682 | // Ignore non-calls. |
1683 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1684 | ++I; |
1685 | |
1686 | if (CI) { |
1687 | if (Simplifier.fold(CI)) |
1688 | Changed = true; |
1689 | } |
1690 | } |
1691 | } |
1692 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1693 | } |
1694 | |
1695 | PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, |
1696 | FunctionAnalysisManager &AM) { |
1697 | if (UseNative.empty()) |
1698 | return PreservedAnalyses::all(); |
1699 | |
1700 | AMDGPULibCalls Simplifier; |
1701 | Simplifier.initNativeFuncs(); |
1702 | Simplifier.initFunction(F, FAM&: AM); |
1703 | |
1704 | bool Changed = false; |
1705 | for (auto &BB : F) { |
1706 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1707 | // Ignore non-calls. |
1708 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1709 | ++I; |
1710 | if (CI && Simplifier.useNative(aCI: CI)) |
1711 | Changed = true; |
1712 | } |
1713 | } |
1714 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1715 | } |
1716 | |