//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <limits>

namespace mlir {
#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONSPASS
#define GEN_PASS_DEF_GPUKERNELOUTLININGPASS
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value> &values) {
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
    values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
}

/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region, and adds a mapping from
/// each argument of the entry block of `launchOpBody` to the corresponding
/// result of the added operations.
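///
/// For reference, the entry block arguments of `launchOpBody` come in groups
/// of three (x, y, z), in this order: block ids, thread ids, grid dimensions,
/// block dimensions, and, when a cluster is present, cluster ids and cluster
/// dimensions. An illustrative sketch (value names are hypothetical):
///
///   (%bx, %by, %bz, %tx, %ty, %tz,       -> gpu.block_id / gpu.thread_id
///    %gdx, %gdy, %gdz, %bdx, %bdy, %bdz) -> gpu.grid_dim / gpu.block_dim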
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody, IRMapping &map,
                                     bool hasCluster = false) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value> indexOps;
  // The order is important here, as it must match the order of the arguments.
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  if (hasCluster) {
    createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
    createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
  }
  // Map the leading entry block arguments (12 without a cluster, 18 with one)
  // to the results of the corresponding index operations.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. These
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isLikelyAnIndexComputation(Operation *op) {
  return matchPattern(op, m_Constant()) ||
         isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
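///
/// An illustrative walk-through (names hypothetical): sinking
/// `%dim = memref.dim %mem, %c0` requires `%c0`; the recursion visits the
/// defining `arith.constant`, records it in `beneficiaryOps` first, and only
/// then records the `memref.dim`, so defs precede uses in the kernel.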
static bool extractBeneficiaryOps(
    Operation *op, const SetVector<Value> &existingDependencies,
    SetVector<Operation *> &beneficiaryOps,
    llvm::SmallPtrSetImpl<Value> &availableValues,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp ||
         !extractBeneficiaryOps(definingOp, existingDependencies,
                                beneficiaryOps, availableValues,
                                isSinkingBeneficiary)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

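// An illustrative before/after for sinkOperationsIntoLaunchOp below (a sketch
// with hypothetical names, not output of an actual test): given
//
//   %c0 = arith.constant 0 : index
//   %dim = memref.dim %mem, %c0 : memref<?xf32>
//   gpu.launch ... {
//     ... uses of %dim ...
//   }
//
// both defining ops are cloned into the launch body, so %dim no longer has
// to become an argument of the outlined kernel.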
LogicalResult mlir::sinkOperationsIntoLaunchOp(
    gpu::LaunchOp launchOp,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  assert(isSinkingBeneficiary);
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
                          isSinkingBeneficiary);
  }

  // Insert operations so that the defs get cloned before uses.
  IRMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.getBody());
  }
  return success();
}

/// Return the provided KernelDim3 as an array of i32 constants if possible.
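/// For example, dims holding the constants (32, 4, 1) yield dense<[32, 4, 1]>;
/// any non-constant dimension, or one exceeding the uint32_t range, yields a
/// null attribute (values here are illustrative).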
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims) {
  SmallVector<int32_t, 3> constants;
  MLIRContext *ctx = dims.x.getContext();
  for (Value v : {dims.x, dims.y, dims.z}) {
    APInt constValue;
    if (!matchPattern(v, m_ConstantInt(&constValue)))
      return nullptr;
    // In the event someone called for a too-large block or grid dimension,
    // don't set bounds as it is likely to cause more confusing behavior.
    if (constValue.ugt(std::numeric_limits<uint32_t>::max()))
      return nullptr;
    constants.push_back(
        constValue.getLimitedValue(std::numeric_limits<uint32_t>::max()));
  }
  return DenseI32ArrayAttr::get(ctx, constants);
}

/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Set block and grid size bounds if known.
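///
/// An illustrative sketch of the result (hypothetical names, syntax
/// abbreviated):
///
///   gpu.func @foo_kernel(%arg0: memref<?xf32>) kernel {
///     %0 = gpu.block_id x
///     ...
///     gpu.return
///   }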
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
      loc, kernelFnName, type,
      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());

  // If we can infer bounds on the grid and/or block sizes from the arguments
  // to the launch op, propagate them to the generated kernel. This is safe
  // because multiple launches with the same body are not deduplicated.
  if (auto blockBounds =
          maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
    outlinedFunc.setKnownBlockSizeAttr(blockBounds);
  if (auto gridBounds =
          maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
    outlinedFunc.setKnownGridSizeAttr(gridBounds);

  IRMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc. If a cluster is present, also generate clusterIdx and
  // clusterDim.
  Region &outlinedFuncBody = outlinedFunc.getBody();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
                           launchOp.hasClusterSize());

  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getWorkgroupAttributions(),
                 outlinedFunc.getWorkgroupAttributions()))
    map.map(launchArg, funcArg);
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getPrivateAttributions(),
                 outlinedFunc.getPrivateAttributions()))
    map.map(launchArg, funcArg);

  // Map arguments from the gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Replace the terminator op with returns.
  for (Block &block : launchOpBody) {
    Block *clonedBlock = map.lookup(&block);
    auto terminator = dyn_cast<gpu::TerminatorOp>(clonedBlock->getTerminator());
    if (!terminator)
      continue;
    OpBuilder replacer(terminator);
    replacer.create<gpu::ReturnOp>(terminator->getLoc());
    terminator->erase();
  }

  // Now splice the entry block of the gpu.launch operation at the end of the
  // gpu.func entry block and erase the redundant block.
  Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
  entryBlock.getOperations().splice(entryBlock.getOperations().end(),
                                    clonedLaunchOpEntry->getOperations());
  clonedLaunchOpEntry->erase();

  return outlinedFunc;
}

gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert_range(operands);
  SetVector<Value> operandSet(llvm::from_range, operands);
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
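///
/// An illustrative replacement (hypothetical names, syntax abbreviated):
///
///   gpu.launch_func @foo_kernel::@foo_kernel
///       blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
///       args(%arg0 : memref<?xf32>)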
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  Value asyncToken = launchOp.getAsyncToken();
  std::optional<gpu::KernelDim3> clusterSize =
      launchOp.getClusterSizeOperandValues();
  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(),
      launchOp.getDynamicSharedMemorySize(), operands,
      asyncToken ? asyncToken.getType() : nullptr,
      launchOp.getAsyncDependencies(), clusterSize);
  launchOp.replaceAllUsesWith(launchFunc);
  launchOp.erase();
}

namespace {
/// Pass that moves ops that are likely index computations into the gpu.launch
/// body.
class GpuLaunchSinkIndexComputationsPass
    : public impl::GpuLaunchSinkIndexComputationsPassBase<
          GpuLaunchSinkIndexComputationsPass> {
public:
  void runOnOperation() override {
    Operation *op = getOperation();
    if (op->walk([](gpu::LaunchOp launch) {
            // Pull in instructions that can be sunk.
            if (failed(sinkOperationsIntoLaunchOp(launch,
                                                  isLikelyAnIndexComputation)))
              return WalkResult::interrupt();

            return WalkResult::advance();
          }).wasInterrupted())
      signalPassFailure();
  }
};

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently in
/// a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
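///
/// An illustrative sketch of the resulting structure (hypothetical names):
///
///   module attributes {gpu.container_module} {
///     func.func @main(...) {
///       gpu.launch_func @main_kernel::@main_kernel ...
///     }
///     gpu.module @main_kernel {
///       gpu.func @main_kernel(...) kernel { ... }
///     }
///   }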
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningPassBase<GpuKernelOutliningPass> {
public:
  using Base::Base;

  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<SymbolOpInterface>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        std::string kernelFnName;
        if (op.getKernelFunc()) {
          kernelFnName = op.getKernelFunc()->getRootReference().str();
        } else {
          kernelFnName =
              Twine(op->getParentOfType<SymbolOpInterface>().getName(),
                    "_kernel")
                  .str();
        }

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(op, outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp,
                                      gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    std::string kernelModuleName;
    gpu::GPUModuleOp kernelModule;
    if (gpuLaunchOp.getKernelModule()) {
      kernelModuleName =
          gpuLaunchOp.getKernelModule()->getRootReference().str();
      kernelModule =
          parentSymbolTable.lookup<gpu::GPUModuleOp>(kernelModuleName);
    } else {
      kernelModuleName = kernelFunc.getName();
    }

    // Check if the module already exists in the symbol table.
    if (!kernelModule) {
      // If not found, create a new GPU module.
      kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                      kernelModuleName);
    }

    // If a valid data layout spec was provided, attach it to the kernel
    // module. Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (std::optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace