| 1 | //===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements the GPU dialect kernel outlining pass. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "mlir/Dialect/GPU/Transforms/Passes.h" |
| 14 | |
| 15 | #include "mlir/AsmParser/AsmParser.h" |
| 16 | #include "mlir/Dialect/Arith/IR/Arith.h" |
| 17 | #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" |
| 18 | #include "mlir/Dialect/DLTI/DLTI.h" |
| 19 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| 20 | #include "mlir/Dialect/GPU/Utils/GPUUtils.h" |
| 21 | #include "mlir/Dialect/MemRef/IR/MemRef.h" |
| 22 | #include "mlir/IR/Builders.h" |
| 23 | #include "mlir/IR/BuiltinAttributes.h" |
| 24 | #include "mlir/IR/IRMapping.h" |
| 25 | #include "mlir/IR/Matchers.h" |
| 26 | #include "mlir/IR/SymbolTable.h" |
| 27 | #include "mlir/Support/LLVM.h" |
| 28 | #include "mlir/Transforms/RegionUtils.h" |
| 29 | #include <limits> |
| 30 | |
| 31 | namespace mlir { |
| 32 | #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONSPASS |
| 33 | #define GEN_PASS_DEF_GPUKERNELOUTLININGPASS |
| 34 | #include "mlir/Dialect/GPU/Transforms/Passes.h.inc" |
| 35 | } // namespace mlir |
| 36 | |
| 37 | using namespace mlir; |
| 38 | |
| 39 | template <typename OpTy> |
| 40 | static void createForAllDimensions(OpBuilder &builder, Location loc, |
| 41 | SmallVectorImpl<Value> &values) { |
| 42 | for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) |
| 43 | values.push_back(Elt: builder.create<OpTy>(loc, builder.getIndexType(), dim)); |
| 44 | } |
| 45 | |
| 46 | /// Adds operations generating block/thread ids and grid/block dimensions at the |
| 47 | /// beginning of the `launchFuncOpBody` region. Add mapping from argument in |
| 48 | /// entry block of `launchOpBody`, to the corresponding result value of the |
| 49 | /// added operations. |
| 50 | static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, |
| 51 | Region &launchOpBody, IRMapping &map, |
| 52 | bool hasCluster = false) { |
| 53 | OpBuilder builder(loc->getContext()); |
| 54 | Block &firstBlock = launchOpBody.front(); |
| 55 | builder.setInsertionPointToStart(&launchFuncOpBody.front()); |
| 56 | SmallVector<Value> indexOps; |
| 57 | // The order is important here, as it must match the order of the arguments |
| 58 | createForAllDimensions<gpu::BlockIdOp>(builder, loc, values&: indexOps); |
| 59 | createForAllDimensions<gpu::ThreadIdOp>(builder, loc, values&: indexOps); |
| 60 | createForAllDimensions<gpu::GridDimOp>(builder, loc, values&: indexOps); |
| 61 | createForAllDimensions<gpu::BlockDimOp>(builder, loc, values&: indexOps); |
| 62 | if (hasCluster) { |
| 63 | createForAllDimensions<gpu::ClusterIdOp>(builder, loc, values&: indexOps); |
| 64 | createForAllDimensions<gpu::ClusterDimOp>(builder, loc, values&: indexOps); |
| 65 | } |
| 66 | // Replace the leading 12 function args with the respective thread/block index |
| 67 | // operations. Iterate backwards since args are erased and indices change. |
| 68 | for (const auto &indexOp : enumerate(First&: indexOps)) |
| 69 | map.map(from: firstBlock.getArgument(i: indexOp.index()), to: indexOp.value()); |
| 70 | } |
| 71 | |
| 72 | /// Identifies operations that are beneficial to sink into kernels. These |
| 73 | /// operations may not have side-effects, as otherwise sinking (and hence |
| 74 | /// duplicating them) is not legal. |
| 75 | static bool isLikelyAnIndexComputation(Operation *op) { |
| 76 | return matchPattern(op, pattern: m_Constant()) || |
| 77 | isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(Val: op); |
| 78 | } |
| 79 | |
| 80 | /// For a given operation `op`, computes whether it is beneficial to sink the |
| 81 | /// operation into the kernel. An operation can be sunk if doing so does not |
| 82 | /// introduce new kernel arguments. Whether a value is already available in the |
| 83 | /// kernel (and hence does not introduce new arguments) is checked by |
| 84 | /// querying `existingDependencies` and `availableValues`. |
| 85 | /// If an operand is not yet available, we recursively check whether it can be |
| 86 | /// made available by siking its defining op. |
| 87 | /// Operations that are indentified for sinking are added to `beneficiaryOps` in |
| 88 | /// the order they should appear in the kernel. Furthermore, `availableValues` |
| 89 | /// is updated with results that will be available after sinking the identified |
| 90 | /// ops. |
| 91 | static bool ( |
| 92 | Operation *op, const SetVector<Value> &existingDependencies, |
| 93 | SetVector<Operation *> &beneficiaryOps, |
| 94 | llvm::SmallPtrSetImpl<Value> &availableValues, |
| 95 | llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) { |
| 96 | if (beneficiaryOps.count(key: op)) |
| 97 | return true; |
| 98 | |
| 99 | if (!isSinkingBeneficiary(op)) |
| 100 | return false; |
| 101 | |
| 102 | for (Value operand : op->getOperands()) { |
| 103 | // It is already visible in the kernel, keep going. |
| 104 | if (availableValues.count(Ptr: operand)) |
| 105 | continue; |
| 106 | // Else check whether it can be made available via sinking or already is a |
| 107 | // dependency. |
| 108 | Operation *definingOp = operand.getDefiningOp(); |
| 109 | if ((!definingOp || !extractBeneficiaryOps(op: definingOp, existingDependencies, |
| 110 | beneficiaryOps, availableValues, |
| 111 | isSinkingBeneficiary)) && |
| 112 | !existingDependencies.count(key: operand)) |
| 113 | return false; |
| 114 | } |
| 115 | // We will sink the operation, mark its results as now available. |
| 116 | beneficiaryOps.insert(X: op); |
| 117 | for (Value result : op->getResults()) |
| 118 | availableValues.insert(Ptr: result); |
| 119 | return true; |
| 120 | } |
| 121 | |
| 122 | LogicalResult mlir::sinkOperationsIntoLaunchOp( |
| 123 | gpu::LaunchOp launchOp, |
| 124 | llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) { |
| 125 | assert(isSinkingBeneficiary); |
| 126 | Region &launchOpBody = launchOp.getBody(); |
| 127 | |
| 128 | // Identify uses from values defined outside of the scope of the launch |
| 129 | // operation. |
| 130 | SetVector<Value> sinkCandidates; |
| 131 | getUsedValuesDefinedAbove(regions: launchOpBody, values&: sinkCandidates); |
| 132 | |
| 133 | SetVector<Operation *> toBeSunk; |
| 134 | llvm::SmallPtrSet<Value, 4> availableValues; |
| 135 | for (Value operand : sinkCandidates) { |
| 136 | Operation *operandOp = operand.getDefiningOp(); |
| 137 | if (!operandOp) |
| 138 | continue; |
| 139 | extractBeneficiaryOps(op: operandOp, existingDependencies: sinkCandidates, beneficiaryOps&: toBeSunk, availableValues, |
| 140 | isSinkingBeneficiary); |
| 141 | } |
| 142 | |
| 143 | // Insert operations so that the defs get cloned before uses. |
| 144 | IRMapping map; |
| 145 | OpBuilder builder(launchOpBody); |
| 146 | for (Operation *op : toBeSunk) { |
| 147 | Operation *clonedOp = builder.clone(op&: *op, mapper&: map); |
| 148 | // Only replace uses within the launch op. |
| 149 | for (auto pair : llvm::zip(t: op->getResults(), u: clonedOp->getResults())) |
| 150 | replaceAllUsesInRegionWith(orig: std::get<0>(t&: pair), replacement: std::get<1>(t&: pair), |
| 151 | region&: launchOp.getBody()); |
| 152 | } |
| 153 | return success(); |
| 154 | } |
| 155 | |
| 156 | /// Return the provided KernelDim3 as an array of i32 constants if possible. |
| 157 | static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims) { |
| 158 | SmallVector<int32_t, 3> constants; |
| 159 | MLIRContext *ctx = dims.x.getContext(); |
| 160 | for (Value v : {dims.x, dims.y, dims.z}) { |
| 161 | APInt constValue; |
| 162 | if (!matchPattern(value: v, pattern: m_ConstantInt(bind_value: &constValue))) |
| 163 | return nullptr; |
| 164 | // In the event someone called for a too-large block or grid dimension, |
| 165 | // don't set bounds as it is likely to cause more confusing behavior. |
| 166 | if (constValue.ugt(RHS: std::numeric_limits<uint32_t>::max())) |
| 167 | return nullptr; |
| 168 | constants.push_back( |
| 169 | Elt: constValue.getLimitedValue(Limit: std::numeric_limits<uint32_t>::max())); |
| 170 | } |
| 171 | return DenseI32ArrayAttr::get(context: ctx, content: constants); |
| 172 | } |
| 173 | |
| 174 | /// Outline the `gpu.launch` operation body into a kernel function. Replace |
| 175 | /// `gpu.terminator` operations by `gpu.return` in the generated function. |
| 176 | /// Set block and grid size bounds if known. |
| 177 | static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, |
| 178 | StringRef kernelFnName, |
| 179 | SetVector<Value> &operands) { |
| 180 | Location loc = launchOp.getLoc(); |
| 181 | // Create a builder with no insertion point, insertion will happen separately |
| 182 | // due to symbol table manipulation. |
| 183 | OpBuilder builder(launchOp.getContext()); |
| 184 | Region &launchOpBody = launchOp.getBody(); |
| 185 | |
| 186 | // Identify uses from values defined outside of the scope of the launch |
| 187 | // operation. |
| 188 | getUsedValuesDefinedAbove(regions: launchOpBody, values&: operands); |
| 189 | |
| 190 | // Create the gpu.func operation. |
| 191 | SmallVector<Type, 4> kernelOperandTypes; |
| 192 | kernelOperandTypes.reserve(N: operands.size()); |
| 193 | for (Value operand : operands) { |
| 194 | kernelOperandTypes.push_back(Elt: operand.getType()); |
| 195 | } |
| 196 | FunctionType type = |
| 197 | FunctionType::get(context: launchOp.getContext(), inputs: kernelOperandTypes, results: {}); |
| 198 | auto outlinedFunc = builder.create<gpu::GPUFuncOp>( |
| 199 | location: loc, args&: kernelFnName, args&: type, |
| 200 | args: TypeRange(ValueRange(launchOp.getWorkgroupAttributions())), |
| 201 | args: TypeRange(ValueRange(launchOp.getPrivateAttributions()))); |
| 202 | outlinedFunc->setAttr(name: gpu::GPUDialect::getKernelFuncAttrName(), |
| 203 | value: builder.getUnitAttr()); |
| 204 | |
| 205 | // If we can infer bounds on the grid and/or block sizes from the arguments |
| 206 | // to the launch op, propagate them to the generated kernel. This is safe |
| 207 | // because multiple launches with the same body are not deduplicated. |
| 208 | if (auto blockBounds = |
| 209 | maybeConstantDimsAttr(dims: launchOp.getBlockSizeOperandValues())) |
| 210 | outlinedFunc.setKnownBlockSizeAttr(blockBounds); |
| 211 | if (auto gridBounds = |
| 212 | maybeConstantDimsAttr(dims: launchOp.getGridSizeOperandValues())) |
| 213 | outlinedFunc.setKnownGridSizeAttr(gridBounds); |
| 214 | |
| 215 | IRMapping map; |
| 216 | |
| 217 | // Map the arguments corresponding to the launch parameters like blockIdx, |
| 218 | // threadIdx, etc. If cluster is present, then we also generate clusterIdx and |
| 219 | // clusterDim. |
| 220 | Region &outlinedFuncBody = outlinedFunc.getBody(); |
| 221 | injectGpuIndexOperations(loc, launchFuncOpBody&: outlinedFuncBody, launchOpBody, map, |
| 222 | hasCluster: launchOp.hasClusterSize()); |
| 223 | |
| 224 | // Map memory attributions from the LaunOp op to the GPUFuncOp attributions. |
| 225 | for (const auto &[launchArg, funcArg] : |
| 226 | llvm::zip(t: launchOp.getWorkgroupAttributions(), |
| 227 | u: outlinedFunc.getWorkgroupAttributions())) |
| 228 | map.map(from: launchArg, to: funcArg); |
| 229 | for (const auto &[launchArg, funcArg] : |
| 230 | llvm::zip(t: launchOp.getPrivateAttributions(), |
| 231 | u: outlinedFunc.getPrivateAttributions())) |
| 232 | map.map(from: launchArg, to: funcArg); |
| 233 | |
| 234 | // Map arguments from gpu.launch region to the arguments of the gpu.func |
| 235 | // operation. |
| 236 | Block &entryBlock = outlinedFuncBody.front(); |
| 237 | for (const auto &operand : enumerate(First&: operands)) |
| 238 | map.map(from: operand.value(), to: entryBlock.getArgument(i: operand.index())); |
| 239 | |
| 240 | // Clone the region of the gpu.launch operation into the gpu.func operation. |
| 241 | launchOpBody.cloneInto(dest: &outlinedFuncBody, mapper&: map); |
| 242 | |
| 243 | // Replace the terminator op with returns. |
| 244 | for (Block &block : launchOpBody) { |
| 245 | Block *clonedBlock = map.lookup(from: &block); |
| 246 | auto terminator = dyn_cast<gpu::TerminatorOp>(Val: clonedBlock->getTerminator()); |
| 247 | if (!terminator) |
| 248 | continue; |
| 249 | OpBuilder replacer(terminator); |
| 250 | replacer.create<gpu::ReturnOp>(location: terminator->getLoc()); |
| 251 | terminator->erase(); |
| 252 | } |
| 253 | |
| 254 | // Splice now the entry block of the gpu.launch operation at the end of the |
| 255 | // gpu.func entry block and erase the redundant block. |
| 256 | Block *clonedLaunchOpEntry = map.lookup(from: &launchOpBody.front()); |
| 257 | entryBlock.getOperations().splice(where: entryBlock.getOperations().end(), |
| 258 | L2&: clonedLaunchOpEntry->getOperations()); |
| 259 | clonedLaunchOpEntry->erase(); |
| 260 | |
| 261 | return outlinedFunc; |
| 262 | } |
| 263 | |
| 264 | gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp, |
| 265 | StringRef kernelFnName, |
| 266 | llvm::SmallVectorImpl<Value> &operands) { |
| 267 | DenseSet<Value> inputOperandSet; |
| 268 | inputOperandSet.insert_range(R&: operands); |
| 269 | SetVector<Value> operandSet(llvm::from_range, operands); |
| 270 | auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operands&: operandSet); |
| 271 | for (auto operand : operandSet) { |
| 272 | if (!inputOperandSet.count(V: operand)) |
| 273 | operands.push_back(Elt: operand); |
| 274 | } |
| 275 | return funcOp; |
| 276 | } |
| 277 | |
| 278 | /// Replace `gpu.launch` operations with an `gpu.launch_func` operation |
| 279 | /// launching `kernelFunc`. The kernel func contains the body of the |
| 280 | /// `gpu.launch` with constant region arguments inlined. |
| 281 | static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, |
| 282 | gpu::GPUFuncOp kernelFunc, |
| 283 | ValueRange operands) { |
| 284 | OpBuilder builder(launchOp); |
| 285 | // The launch op has an optional dynamic shared memory size. If it doesn't |
| 286 | // exist, we use zero. |
| 287 | Value asyncToken = launchOp.getAsyncToken(); |
| 288 | std::optional<gpu::KernelDim3> clusterSize = |
| 289 | launchOp.getClusterSizeOperandValues(); |
| 290 | auto launchFunc = builder.create<gpu::LaunchFuncOp>( |
| 291 | location: launchOp.getLoc(), args&: kernelFunc, args: launchOp.getGridSizeOperandValues(), |
| 292 | args: launchOp.getBlockSizeOperandValues(), |
| 293 | args: launchOp.getDynamicSharedMemorySize(), args&: operands, |
| 294 | args: asyncToken ? asyncToken.getType() : nullptr, |
| 295 | args: launchOp.getAsyncDependencies(), args&: clusterSize); |
| 296 | launchOp.replaceAllUsesWith(values&: launchFunc); |
| 297 | launchOp.erase(); |
| 298 | } |
| 299 | |
| 300 | namespace { |
| 301 | /// Pass that moves ops which are likely an index computation into gpu.launch |
| 302 | /// body. |
| 303 | class GpuLaunchSinkIndexComputationsPass |
| 304 | : public impl::GpuLaunchSinkIndexComputationsPassBase< |
| 305 | GpuLaunchSinkIndexComputationsPass> { |
| 306 | public: |
| 307 | void runOnOperation() override { |
| 308 | Operation *op = getOperation(); |
| 309 | if (op->walk(callback: [](gpu::LaunchOp launch) { |
| 310 | // Pull in instructions that can be sunk |
| 311 | if (failed(Result: sinkOperationsIntoLaunchOp(launchOp: launch, |
| 312 | isSinkingBeneficiary: isLikelyAnIndexComputation))) |
| 313 | return WalkResult::interrupt(); |
| 314 | |
| 315 | return WalkResult::advance(); |
| 316 | }).wasInterrupted()) |
| 317 | signalPassFailure(); |
| 318 | } |
| 319 | }; |
| 320 | |
| 321 | /// Pass that moves the kernel of each LaunchOp into its separate nested module. |
| 322 | /// |
| 323 | /// This pass moves the kernel code of each LaunchOp into a function created |
| 324 | /// inside a nested module. It also creates an external function of the same |
| 325 | /// name in the parent module. |
| 326 | /// |
| 327 | /// The gpu.modules are intended to be compiled to a cubin blob independently in |
| 328 | /// a separate pass. The external functions can then be annotated with the |
| 329 | /// symbol of the cubin accessor function. |
| 330 | class GpuKernelOutliningPass |
| 331 | : public impl::GpuKernelOutliningPassBase<GpuKernelOutliningPass> { |
| 332 | public: |
| 333 | using Base::Base; |
| 334 | |
| 335 | LogicalResult initialize(MLIRContext *context) override { |
| 336 | // Initialize the data layout specification from the data layout string. |
| 337 | if (!dataLayoutStr.empty()) { |
| 338 | Attribute resultAttr = mlir::parseAttribute(attrStr: dataLayoutStr, context); |
| 339 | if (!resultAttr) |
| 340 | return failure(); |
| 341 | |
| 342 | dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(Val&: resultAttr); |
| 343 | if (!dataLayoutSpec) |
| 344 | return failure(); |
| 345 | } |
| 346 | |
| 347 | return success(); |
| 348 | } |
| 349 | |
| 350 | void runOnOperation() override { |
| 351 | SymbolTable symbolTable(getOperation()); |
| 352 | bool modified = false; |
| 353 | for (auto func : getOperation().getOps<SymbolOpInterface>()) { |
| 354 | // Insert just after the function. |
| 355 | Block::iterator insertPt(func->getNextNode()); |
| 356 | auto funcWalkResult = func.walk(callback: [&](gpu::LaunchOp op) { |
| 357 | SetVector<Value> operands; |
| 358 | std::string kernelFnName; |
| 359 | if (op.getKernelFunc()) { |
| 360 | kernelFnName = op.getKernelFunc()->getRootReference().str(); |
| 361 | } else { |
| 362 | kernelFnName = |
| 363 | Twine(op->getParentOfType<SymbolOpInterface>().getName(), |
| 364 | "_kernel" ) |
| 365 | .str(); |
| 366 | } |
| 367 | |
| 368 | gpu::GPUFuncOp outlinedFunc = |
| 369 | outlineKernelFuncImpl(launchOp: op, kernelFnName, operands); |
| 370 | |
| 371 | // Create nested module and insert outlinedFunc. The module will |
| 372 | // originally get the same name as the function, but may be renamed on |
| 373 | // insertion into the parent module. |
| 374 | auto kernelModule = createKernelModule(gpuLaunchOp: op, kernelFunc: outlinedFunc, parentSymbolTable: symbolTable); |
| 375 | symbolTable.insert(symbol: kernelModule, insertPt); |
| 376 | |
| 377 | // Potentially changes signature, pulling in constants. |
| 378 | convertToLaunchFuncOp(launchOp: op, kernelFunc: outlinedFunc, operands: operands.getArrayRef()); |
| 379 | modified = true; |
| 380 | return WalkResult::advance(); |
| 381 | }); |
| 382 | if (funcWalkResult.wasInterrupted()) |
| 383 | return signalPassFailure(); |
| 384 | } |
| 385 | |
| 386 | // If any new module was inserted in this module, annotate this module as |
| 387 | // a container module. |
| 388 | if (modified) |
| 389 | getOperation()->setAttr(name: gpu::GPUDialect::getContainerModuleAttrName(), |
| 390 | value: UnitAttr::get(context: &getContext())); |
| 391 | } |
| 392 | |
| 393 | private: |
| 394 | /// Returns a gpu.module containing kernelFunc and all callees (recursive). |
| 395 | gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp, |
| 396 | gpu::GPUFuncOp kernelFunc, |
| 397 | const SymbolTable &parentSymbolTable) { |
| 398 | // TODO: This code cannot use an OpBuilder because it must be inserted into |
| 399 | // a SymbolTable by the caller. SymbolTable needs to be refactored to |
| 400 | // prevent manual building of Ops with symbols in code using SymbolTables |
| 401 | // and then this needs to use the OpBuilder. |
| 402 | auto *context = getOperation().getContext(); |
| 403 | OpBuilder builder(context); |
| 404 | std::string kernelModuleName; |
| 405 | gpu::GPUModuleOp kernelModule; |
| 406 | if (gpuLaunchOp.getKernelModule()) { |
| 407 | kernelModuleName = |
| 408 | gpuLaunchOp.getKernelModule()->getRootReference().str(); |
| 409 | kernelModule = |
| 410 | parentSymbolTable.lookup<gpu::GPUModuleOp>(name: kernelModuleName); |
| 411 | } else { |
| 412 | kernelModuleName = kernelFunc.getName(); |
| 413 | } |
| 414 | |
| 415 | // Check if the module already exists in the symbol table |
| 416 | if (!kernelModule) { |
| 417 | // If not found, create a new GPU module |
| 418 | kernelModule = builder.create<gpu::GPUModuleOp>(location: kernelFunc.getLoc(), |
| 419 | args&: kernelModuleName); |
| 420 | } |
| 421 | |
| 422 | // If a valid data layout spec was provided, attach it to the kernel module. |
| 423 | // Otherwise, the default data layout will be used. |
| 424 | if (dataLayoutSpec) |
| 425 | kernelModule->setAttr(name: DLTIDialect::kDataLayoutAttrName, value: dataLayoutSpec); |
| 426 | |
| 427 | SymbolTable symbolTable(kernelModule); |
| 428 | symbolTable.insert(symbol: kernelFunc); |
| 429 | |
| 430 | SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc}; |
| 431 | while (!symbolDefWorklist.empty()) { |
| 432 | if (std::optional<SymbolTable::UseRange> symbolUses = |
| 433 | SymbolTable::getSymbolUses(from: symbolDefWorklist.pop_back_val())) { |
| 434 | for (SymbolTable::SymbolUse symbolUse : *symbolUses) { |
| 435 | StringRef symbolName = |
| 436 | cast<FlatSymbolRefAttr>(Val: symbolUse.getSymbolRef()).getValue(); |
| 437 | if (symbolTable.lookup(name: symbolName)) |
| 438 | continue; |
| 439 | |
| 440 | Operation *symbolDefClone = |
| 441 | parentSymbolTable.lookup(name: symbolName)->clone(); |
| 442 | symbolDefWorklist.push_back(Elt: symbolDefClone); |
| 443 | symbolTable.insert(symbol: symbolDefClone); |
| 444 | } |
| 445 | } |
| 446 | } |
| 447 | |
| 448 | return kernelModule; |
| 449 | } |
| 450 | |
| 451 | DataLayoutSpecInterface dataLayoutSpec; |
| 452 | }; |
| 453 | |
| 454 | } // namespace |
| 455 | |