| 1 | //===- OpToFuncCallLowering.h - GPU ops lowering to custom calls *- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | #ifndef MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_ |
| 9 | #define MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_ |
| 10 | |
| 11 | #include "mlir/Conversion/LLVMCommon/Pattern.h" |
| 12 | #include "mlir/Dialect/Arith/IR/Arith.h" |
| 13 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| 14 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| 15 | #include "mlir/IR/Builders.h" |
| 16 | |
| 17 | namespace mlir { |
| 18 | |
namespace detail {
/// Detection trait for the `getFastmath` instance method.
///
/// `has_get_fastmath_t<T>` names the return type of `T::getFastmath()` and is
/// ill-formed (in a SFINAE-friendly way) when `T` has no such method, which
/// makes it usable as the operation argument of a detection idiom.
///
/// The alias lives in a named namespace rather than an anonymous one: an
/// anonymous namespace in a header yields a distinct entity per translation
/// unit, which header-defined templates must not depend on.
template <typename T>
using has_get_fastmath_t = decltype(std::declval<T>().getFastmath());
} // namespace detail

// Keep the unqualified name visible in the enclosing namespace so that
// existing uses of `has_get_fastmath_t` continue to resolve.
using detail::has_get_fastmath_t;
| 24 | |
| 25 | /// Rewriting that replaces SourceOp with a CallOp to `f32Func` or `f64Func` or |
| 26 | /// `f32ApproxFunc` or `f16Func` or `i32Type` depending on the element type and |
| 27 | /// the fastMathFlag of that Op, if present. The function declaration is added |
| 28 | /// in case it was not added before. |
| 29 | /// |
| 30 | /// If the input values are of bf16 type (or f16 type if f16Func is empty), the |
| 31 | /// value is first casted to f32, the function called and then the result casted |
| 32 | /// back. |
| 33 | /// |
| 34 | /// Example with NVVM: |
| 35 | /// %exp_f32 = math.exp %arg_f32 : f32 |
| 36 | /// |
| 37 | /// will be transformed into |
| 38 | /// llvm.call @__nv_expf(%arg_f32) : (f32) -> f32 |
| 39 | /// |
| 40 | /// If the fastMathFlag attribute of SourceOp is `afn` or `fast`, this Op lowers |
| 41 | /// to the approximate calculation function. |
| 42 | /// |
| 43 | /// Also example with NVVM: |
| 44 | /// %exp_f32 = math.exp %arg_f32 fastmath<afn> : f32 |
| 45 | /// |
| 46 | /// will be transformed into |
| 47 | /// llvm.call @__nv_fast_expf(%arg_f32) : (f32) -> f32 |
| 48 | /// |
| 49 | /// Final example with NVVM: |
| 50 | /// %pow_f32 = math.fpowi %arg_f32, %arg_i32 |
| 51 | /// |
| 52 | /// will be transformed into |
| 53 | /// llvm.call @__nv_powif(%arg_f32, %arg_i32) : (f32, i32) -> f32 |
| 54 | template <typename SourceOp> |
| 55 | struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> { |
| 56 | public: |
| 57 | explicit OpToFuncCallLowering(const LLVMTypeConverter &lowering, |
| 58 | StringRef f32Func, StringRef f64Func, |
| 59 | StringRef f32ApproxFunc, StringRef f16Func, |
| 60 | StringRef i32Func = "" , |
| 61 | PatternBenefit benefit = 1) |
| 62 | : ConvertOpToLLVMPattern<SourceOp>(lowering, benefit), f32Func(f32Func), |
| 63 | f64Func(f64Func), f32ApproxFunc(f32ApproxFunc), f16Func(f16Func), |
| 64 | i32Func(i32Func) {} |
| 65 | |
| 66 | LogicalResult |
| 67 | matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, |
| 68 | ConversionPatternRewriter &rewriter) const override { |
| 69 | using LLVM::LLVMFuncOp; |
| 70 | |
| 71 | static_assert( |
| 72 | std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, |
| 73 | "expected single result op" ); |
| 74 | |
| 75 | bool isResultBool = op->getResultTypes().front().isInteger(1); |
| 76 | if constexpr (!std::is_base_of<OpTrait::SameOperandsAndResultType<SourceOp>, |
| 77 | SourceOp>::value) { |
| 78 | assert(op->getNumOperands() > 0 && |
| 79 | "expected op to take at least one operand" ); |
| 80 | assert((op->getResultTypes().front() == op->getOperand(0).getType() || |
| 81 | isResultBool) && |
| 82 | "expected op with same operand and result types" ); |
| 83 | } |
| 84 | |
| 85 | if (!op->template getParentOfType<FunctionOpInterface>()) { |
| 86 | return rewriter.notifyMatchFailure( |
| 87 | op, "expected op to be within a function region" ); |
| 88 | } |
| 89 | |
| 90 | SmallVector<Value, 1> castedOperands; |
| 91 | for (Value operand : adaptor.getOperands()) |
| 92 | castedOperands.push_back(Elt: maybeCast(operand, rewriter)); |
| 93 | |
| 94 | Type castedOperandType = castedOperands.front().getType(); |
| 95 | |
| 96 | // At ABI level, booleans are treated as i32. |
| 97 | Type resultType = |
| 98 | isResultBool ? rewriter.getIntegerType(32) : castedOperandType; |
| 99 | Type funcType = getFunctionType(resultType, operands: castedOperands); |
| 100 | StringRef funcName = getFunctionName(type: castedOperandType, op); |
| 101 | if (funcName.empty()) |
| 102 | return failure(); |
| 103 | |
| 104 | LLVMFuncOp funcOp = appendOrGetFuncOp(funcName, funcType, op); |
| 105 | auto callOp = |
| 106 | rewriter.create<LLVM::CallOp>(op->getLoc(), funcOp, castedOperands); |
| 107 | |
| 108 | if (resultType == adaptor.getOperands().front().getType()) { |
| 109 | rewriter.replaceOp(op, {callOp.getResult()}); |
| 110 | return success(); |
| 111 | } |
| 112 | |
| 113 | // Boolean result are mapping to i32 at the ABI level with zero values being |
| 114 | // interpreted as false and non-zero values being interpreted as true. Since |
| 115 | // there is no guarantee of a specific value being used to indicate true, |
| 116 | // compare for inequality with zero (rather than truncate or shift). |
| 117 | if (isResultBool) { |
| 118 | Value zero = rewriter.create<LLVM::ConstantOp>( |
| 119 | op->getLoc(), rewriter.getIntegerType(32), |
| 120 | rewriter.getI32IntegerAttr(0)); |
| 121 | Value truncated = rewriter.create<LLVM::ICmpOp>( |
| 122 | op->getLoc(), LLVM::ICmpPredicate::ne, callOp.getResult(), zero); |
| 123 | rewriter.replaceOp(op, {truncated}); |
| 124 | return success(); |
| 125 | } |
| 126 | |
| 127 | assert(callOp.getResult().getType().isF32() && |
| 128 | "only f32 types are supposed to be truncated back" ); |
| 129 | Value truncated = rewriter.create<LLVM::FPTruncOp>( |
| 130 | op->getLoc(), adaptor.getOperands().front().getType(), |
| 131 | callOp.getResult()); |
| 132 | rewriter.replaceOp(op, {truncated}); |
| 133 | return success(); |
| 134 | } |
| 135 | |
| 136 | Value maybeCast(Value operand, PatternRewriter &rewriter) const { |
| 137 | Type type = operand.getType(); |
| 138 | if (!isa<Float16Type, BFloat16Type>(type)) |
| 139 | return operand; |
| 140 | |
| 141 | // If there's an f16 function, no need to cast f16 values. |
| 142 | if (!f16Func.empty() && isa<Float16Type>(type)) |
| 143 | return operand; |
| 144 | |
| 145 | return rewriter.create<LLVM::FPExtOp>( |
| 146 | operand.getLoc(), Float32Type::get(rewriter.getContext()), operand); |
| 147 | } |
| 148 | |
| 149 | Type getFunctionType(Type resultType, ValueRange operands) const { |
| 150 | SmallVector<Type> operandTypes(operands.getTypes()); |
| 151 | return LLVM::LLVMFunctionType::get(resultType, operandTypes); |
| 152 | } |
| 153 | |
| 154 | LLVM::LLVMFuncOp appendOrGetFuncOp(StringRef funcName, Type funcType, |
| 155 | Operation *op) const { |
| 156 | using LLVM::LLVMFuncOp; |
| 157 | |
| 158 | auto funcAttr = StringAttr::get(op->getContext(), funcName); |
| 159 | auto funcOp = |
| 160 | SymbolTable::lookupNearestSymbolFrom<LLVMFuncOp>(op, funcAttr); |
| 161 | if (funcOp) |
| 162 | return funcOp; |
| 163 | |
| 164 | auto parentFunc = op->getParentOfType<FunctionOpInterface>(); |
| 165 | assert(parentFunc && "expected there to be a parent function" ); |
| 166 | OpBuilder b(parentFunc); |
| 167 | return b.create<LLVMFuncOp>(op->getLoc(), funcName, funcType); |
| 168 | } |
| 169 | |
| 170 | StringRef getFunctionName(Type type, SourceOp op) const { |
| 171 | bool useApprox = false; |
| 172 | if constexpr (llvm::is_detected<has_get_fastmath_t, SourceOp>::value) { |
| 173 | arith::FastMathFlags flag = op.getFastmath(); |
| 174 | useApprox = ((uint32_t)arith::FastMathFlags::afn & (uint32_t)flag) && |
| 175 | !f32ApproxFunc.empty(); |
| 176 | } |
| 177 | |
| 178 | if (isa<Float16Type>(type)) |
| 179 | return f16Func; |
| 180 | if (isa<Float32Type>(type)) { |
| 181 | if (useApprox) |
| 182 | return f32ApproxFunc; |
| 183 | return f32Func; |
| 184 | } |
| 185 | if (isa<Float64Type>(type)) |
| 186 | return f64Func; |
| 187 | |
| 188 | if (type.isInteger(width: 32)) |
| 189 | return i32Func; |
| 190 | return "" ; |
| 191 | } |
| 192 | |
| 193 | const std::string f32Func; |
| 194 | const std::string f64Func; |
| 195 | const std::string f32ApproxFunc; |
| 196 | const std::string f16Func; |
| 197 | const std::string i32Func; |
| 198 | }; |
| 199 | |
| 200 | } // namespace mlir |
| 201 | |
| 202 | #endif // MLIR_CONVERSION_GPUCOMMON_OPTOFUNCCALLLOWERING_H_ |
| 203 | |