//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
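///
/// For example (an illustrative sketch only, assuming a 1-D memref<128xf32>
/// "from" value and the default three workgroup dimensions), the nest after
/// mapping looks roughly like:
///
///   scf.for %i = %tid_z to %c1 step %bdim_z {       // trivial loop
///     scf.for %j = %tid_y to %c1 step %bdim_y {     // trivial loop
///       scf.for %k = %tid_x to %c128 step %bdim_x { // coalesced copy loop
///         %0 = memref.load %from[%k]
///         memref.store %0, %to[%k]
///       }
///     }
///   }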
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = cast<MemRefType>(from.getType());
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions, these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                                  {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate the
/// barriers in the case where a value is only used by the thread that copies
/// it. Both copies are inserted unconditionally; an analysis would be required
/// to copy only live-in and live-out values when necessary. This copies the
/// entire memref pointed to by "from". If a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote that instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = cast<MemRefType>(from.getType());
  auto toType = cast<MemRefType>(to.getType());
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted in the beginning and in the end of the function.
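///
/// For example (an illustrative sketch only), promoting argument 0 of
///
///   gpu.func @f(%arg0: memref<32xf32>) kernel { ... }
///
/// produces, roughly,
///
///   gpu.func @f(%arg0: memref<32xf32>)
///       workgroup(%wg: memref<32xf32, #gpu.address_space<workgroup>>) kernel {
///     <copy loops: %arg0 -> %wg>
///     gpu.barrier
///     <... original body, now using %wg ...>
///     gpu.barrier
///     <copy loops: %wg -> %arg0>
///   }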
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = dyn_cast<MemRefType>(value.getType());
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
      op->getContext(), gpu::AddressSpace::Workgroup);
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
                                    MemRefLayoutAttrInterface{},
                                    Attribute(workgroupMemoryAddressSpace));
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}