//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
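///
/// For example, for a 2-D "from" memref the nest contains one extra
/// single-iteration loop followed by two loops over the memref dimensions;
/// the innermost (fastest-varying) dimension is mapped to thread x, the next
/// one to thread y, and the single-iteration loop to thread z.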
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = cast<MemRefType>(from.getType());
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);
  // Make sure we have enough loops to use all thread dimensions; these trivial
  // loops should be outermost and are therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
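  // createOrFold folds the memref.dim op to a constant when the corresponding
  // dimension is static, so the bounds below become constants for
  // statically-shaped memrefs.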
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
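  // mapLoopToProcessorIds rewrites each loop into a cyclic distribution over
  // the given processor id: the lower bound becomes lb + threadId * step and
  // the step becomes step * blockDim. Mapping in reverse order assigns the
  // innermost loop to thread x, so consecutive threads access consecutive
  // elements.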
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                                  {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location at the
/// beginning of the region, and back from that location immediately before the
/// terminator of the region's first block. The region is expected to have one
/// block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %0 = load %from[%arg0, ..., %argN]
///           store %0, %to[%arg0, ..., %argN]
///         }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///         for %argN = <thread-id-x> to ... step <block-dim-x> {
///           %1 = load %to[%arg0, ..., %argN]
///           store %1, %from[%arg0, ..., %argN]
///         }
///       ...
///     }
///
/// The barriers are inserted unconditionally since a value copied by one
/// thread may be read by another. An analysis would be required to eliminate
/// barriers in cases where a value is only used by the thread that copies it.
/// Both copies are likewise inserted unconditionally; an analysis would be
/// required to copy only the live-in and live-out values when necessary. This
/// copies the entire memref pointed to by "from". If a smaller block would
/// suffice, the caller can create a subview of the memref and promote that
/// instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = cast<MemRefType>(from.getType());
  auto toType = cast<MemRefType>(to.getType());
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted at the beginning and the end of the function.
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = dyn_cast<MemRefType>(value.getType());
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
      op->getContext(), gpu::AddressSpace::Workgroup);
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
                                    MemRefLayoutAttrInterface{},
                                    Attribute(workgroupMemoryAddressSpace));
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}
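
// Illustrative sketch (not part of the upstream file): one way a caller might
// drive the promotion, turning every statically-shaped memref argument of a
// gpu.func into a workgroup-memory attribution. The helper name and the
// eligibility filter are assumptions made for this example only.
static void promoteAllStaticMemRefArgs(gpu::GPUFuncOp op) {
  for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) {
    auto memRefType = dyn_cast<MemRefType>(op.getArgument(i).getType());
    // promoteToWorkgroupMemory asserts on non-memref and dynamically shaped
    // arguments, so skip them here.
    if (!memRefType || !memRefType.hasStaticShape())
      continue;
    promoteToWorkgroupMemory(op, i);
  }
}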
