//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
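/// For example, for a rank-3 memref the loops over dimensions (d0, d1, d2) are
/// mapped to threads (z, y, x) respectively, so that, for the default
/// row-major layout, consecutive threads along x access contiguous elements of
/// the innermost dimension.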
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = cast<MemRefType>(from.getType());
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                                  {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and back from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate barriers
/// in cases where a value is only used by the thread that copies it. Both
/// copies are inserted unconditionally; an analysis would be required to only
/// copy live-in and live-out values when necessary. This copies the entire
/// memref pointed to by "from". In case a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote it instead.
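/// For instance (illustrative only), a caller interested in a 32x32 tile of a
/// larger buffer could first create
/// `%tile = memref.subview %from[%i, %j] [32, 32] [1, 1]` and pass the
/// resulting memref as "from" instead.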
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = cast<MemRefType>(from.getType());
  auto toType = cast<MemRefType>(to.getType());
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
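///
/// For example (illustrative sketch; exact IR syntax may differ across MLIR
/// versions), promoting argument 0 of
///
///   gpu.func @f(%arg0: memref<4xf32>) kernel { <body using %arg0> }
///
/// yields roughly
///
///   gpu.func @f(%arg0: memref<4xf32>)
///       workgroup(%promoted: memref<4xf32, #gpu.address_space<workgroup>>)
///       kernel {
///     <copy %arg0 into %promoted>
///     gpu.barrier
///     <body now using %promoted>
///     gpu.barrier
///     <copy %promoted back into %arg0>
///   }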
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = dyn_cast<MemRefType>(value.getType());
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
      op->getContext(), gpu::AddressSpace::Workgroup);
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
                                    MemRefLayoutAttrInterface{},
                                    Attribute(workgroupMemoryAddressSpace));
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
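
// A minimal sketch of how a pass might drive this utility, assuming one wants
// to promote every statically shaped memref argument of a kernel. The helper
// name below is hypothetical and not part of this file.
//
//   void promoteAllMemRefArguments(gpu::GPUFuncOp func) {
//     for (BlockArgument arg : func.getArguments()) {
//       auto type = dyn_cast<MemRefType>(arg.getType());
//       if (type && type.hasStaticShape())
//         promoteToWorkgroupMemory(func, arg.getArgNumber());
//     }
//   }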