//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utilities that allow one to create IR moving the data
// across different levels of the GPU memory hierarchy.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/MemoryPromotion.h"

#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Pass/Pass.h"

using namespace mlir;
using namespace mlir::gpu;

/// Emits the (imperfect) loop nest performing the copy between "from" and "to"
/// values using the bounds derived from the "from" value. Emits at least
/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
/// single-iteration loops. Maps the innermost loops to thread dimensions, in
/// reverse order to enable access coalescing in the innermost loop.
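///
/// For example (an illustrative sketch only, assuming a 1-D memref<128xf32>
/// "from" value and the default three workgroup dimensions), the nest after
/// mapping looks roughly like:
///
///   scf.for %i = %tid_z to %c1 step %bdim_z {       // trivial loop
///     scf.for %j = %tid_y to %c1 step %bdim_y {     // trivial loop
///       scf.for %k = %tid_x to %c128 step %bdim_x { // coalesced copy loop
///         %0 = memref.load %from[%k]
///         memref.store %0, %to[%k]
///       }
///     }
///   }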
static void insertCopyLoops(ImplicitLocOpBuilder &b, Value from, Value to) {
  auto memRefType = cast<MemRefType>(from.getType());
  auto rank = memRefType.getRank();

  SmallVector<Value, 4> lbs, ubs, steps;
  Value zero = b.create<arith::ConstantIndexOp>(0);
  Value one = b.create<arith::ConstantIndexOp>(1);

  // Make sure we have enough loops to use all thread dimensions, these trivial
  // loops should be outermost and therefore inserted first.
  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
    lbs.resize(extraLoops, zero);
    ubs.resize(extraLoops, one);
    steps.resize(extraLoops, one);
  }

  // Add existing bounds.
  lbs.append(rank, zero);
  ubs.reserve(lbs.size());
  steps.reserve(lbs.size());
  for (auto idx = 0; idx < rank; ++idx) {
    ubs.push_back(b.createOrFold<memref::DimOp>(from, idx));
    steps.push_back(one);
  }

  // Obtain thread identifiers and block sizes, necessary to map to them.
  auto indexType = b.getIndexType();
  SmallVector<Value, 3> threadIds, blockDims;
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z}) {
    threadIds.push_back(b.create<gpu::ThreadIdOp>(indexType, dim));
    blockDims.push_back(b.create<gpu::BlockDimOp>(indexType, dim));
  }

  // Produce the loop nest with copies.
  SmallVector<Value, 8> ivs(lbs.size());
  mlir::scf::buildLoopNest(
      b, b.getLoc(), lbs, ubs, steps,
      [&](OpBuilder &b, Location loc, ValueRange loopIvs) {
        ivs.assign(loopIvs.begin(), loopIvs.end());
        auto activeIvs = llvm::ArrayRef(ivs).take_back(rank);
        Value loaded = b.create<memref::LoadOp>(loc, from, activeIvs);
        b.create<memref::StoreOp>(loc, loaded, to, activeIvs);
      });

  // Map the innermost loops to threads in reverse order.
  for (const auto &en :
       llvm::enumerate(llvm::reverse(llvm::ArrayRef(ivs).take_back(
           GPUDialect::getNumWorkgroupDimensions())))) {
    Value v = en.value();
    auto loop = cast<scf::ForOp>(v.getParentRegion()->getParentOp());
    affine::mapLoopToProcessorIds(loop, {threadIds[en.index()]},
                                  {blockDims[en.index()]});
  }
}

/// Emits the loop nests performing the copy to the designated location in the
/// beginning of the region, and from the designated location immediately
/// before the terminator of the first block of the region. The region is
/// expected to have one block. This boils down to the following structure:
///
///   ^bb(...):
///     <loop-bound-computation>
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %0 = load %from[%arg0, ..., %argN]
///         store %0, %to[%arg0, ..., %argN]
///       }
///       ...
///     }
///     gpu.barrier
///     <... original body ...>
///     gpu.barrier
///     for %arg0 = ... to ... step ... {
///       ...
///       for %argN = <thread-id-x> to ... step <block-dim-x> {
///         %1 = load %to[%arg0, ..., %argN]
///         store %1, %from[%arg0, ..., %argN]
///       }
///       ...
///     }
///
/// Inserts the barriers unconditionally since different threads may be copying
/// values and reading them. An analysis would be required to eliminate the
/// barriers in the case where a value is only used by the thread that copies
/// it. Both copies are inserted unconditionally; an analysis would be required
/// to copy only live-in and live-out values when necessary. This copies the
/// entire memref pointed to by "from". If a smaller block would be sufficient,
/// the caller can create a subview of the memref and promote that instead.
static void insertCopies(Region &region, Location loc, Value from, Value to) {
  auto fromType = cast<MemRefType>(from.getType());
  auto toType = cast<MemRefType>(to.getType());
  (void)fromType;
  (void)toType;
  assert(fromType.getShape() == toType.getShape());
  assert(fromType.getRank() != 0);
  assert(llvm::hasSingleElement(region) &&
         "unstructured control flow not supported");

  auto b = ImplicitLocOpBuilder::atBlockBegin(loc, &region.front());
  insertCopyLoops(b, from, to);
  b.create<gpu::BarrierOp>();

  b.setInsertionPoint(&region.front().back());
  b.create<gpu::BarrierOp>();
  insertCopyLoops(b, to, from);
}

/// Promotes a function argument to workgroup memory in the given function. The
/// copies will be inserted in the beginning and in the end of the function.
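///
/// For example (an illustrative sketch only), promoting argument 0 of
///
///   gpu.func @f(%arg0: memref<32xf32>) kernel { ... }
///
/// produces, roughly,
///
///   gpu.func @f(%arg0: memref<32xf32>)
///       workgroup(%wg: memref<32xf32, #gpu.address_space<workgroup>>) kernel {
///     <copy loops: %arg0 -> %wg>
///     gpu.barrier
///     <... original body, now using %wg ...>
///     gpu.barrier
///     <copy loops: %wg -> %arg0>
///   }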
void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
  Value value = op.getArgument(arg);
  auto type = dyn_cast<MemRefType>(value.getType());
  assert(type && type.hasStaticShape() && "can only promote memrefs");

  // Get the type of the buffer in the workgroup memory.
  auto workgroupMemoryAddressSpace = gpu::AddressSpaceAttr::get(
      op->getContext(), gpu::AddressSpace::Workgroup);
  auto bufferType = MemRefType::get(type.getShape(), type.getElementType(),
                                    MemRefLayoutAttrInterface{},
                                    Attribute(workgroupMemoryAddressSpace));
  Value attribution = op.addWorkgroupAttribution(bufferType, value.getLoc());

  // Replace the uses first since only the original uses are currently present.
  // Then insert the copies.
  value.replaceAllUsesWith(attribution);
  insertCopies(op.getBody(), op.getLoc(), value, attribution);
}