GPUHeuristics.cpp source code [mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp]

1	//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
10
11	#include "mlir/Dialect/GPU/IR/GPUDialect.h"
12	#include "llvm/ADT/ArrayRef.h"
13	#include "llvm/ADT/STLExtras.h"
14	#include "llvm/Support/CommandLine.h"
15	#include "llvm/Support/Debug.h"
16	#include "llvm/Support/InterleavedRange.h"
17	#include "llvm/Support/MathExtras.h"
18	#include "llvm/Support/raw_ostream.h"
19	#include <cmath>
20	#include <numeric>
21
22	using namespace mlir;
23
24	#define DEBUG_TYPE "linalg-transforms"
25	#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
26	#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
27
28	static Attribute linearId0(MLIRContext *ctx) {
29	return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
30	}
31	static Attribute linearId1(MLIRContext *ctx) {
32	return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
33	}
34	static Attribute linearId2(MLIRContext *ctx) {
35	return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
36	}
37
38	transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
39	int totalNumThreads,
40	int64_t desiredBitAlignment,
41	ArrayRef<int64_t> copySizes,
42	bool favorPredication,
43	int64_t elementalBitwidth) {
44	assert(!copySizes.empty() && copySizes.size() <= `3` &&
45	"only 1,2,3-D copies are supported for now");
46
47	LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
48	LLVM_DEBUG(DBGS() << "--copy shape: " << llvm::interleaved(copySizes)
49	<< "\n");
50
51	// Greedily find the largest vector size that can be used to copy the most
52	// minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
53	// contiguous memory transactions with as few threads as possible.
54	int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
55	alignment: desiredBitAlignment, numContiguousElements: copySizes.back(), elementalBitwidth);
56
57	LDBG("--greedily determined vectorSize: "
58	<< desiredVectorSize << " elements of " << elementalBitwidth
59	<< "b each -> " << (desiredVectorSize * elementalBitwidth)
60	<< "b total out of a max of " << kMaxVectorLoadBitWidth << "b");
61
62	status = inferNumThreads(totalNumThreads, sizes: copySizes, desiredVectorSize,
63	favorPredication);
64	if (status == Status::Invalid)
65	return;
66
67	LLVM_DEBUG(DBGS() << "--copy: " << llvm::interleaved(copySizes) << "\n"
68	<< "--numThreads: " << llvm::interleaved(this->numThreads)
69	<< "\n"
70	<< "--vectorSize: " << this->vectorSize << "\n");
71	assert(this->numThreads.size() == copySizes.size() &&
72	"compute copy mapping expected same number of threads and copy sizes");
73
74	// Compute the smallest bounding box.
75	this->smallestBoundingTileSizes = llvm::to_vector(
76	Range: llvm::map_range(C: llvm::zip(t&: copySizes, u&: this->numThreads), F: [](auto &&pair) {
77	int64_t size, numThreads;
78	std::tie(args&: size, args&: numThreads) = pair;
79	return llvm::divideCeilSigned(Numerator: size, Denominator: numThreads);
80	}));
81	SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
82	linearId0(ctx)};
83
84	// Set the thread mapping.
85	this->threadMapping =
86	llvm::to_vector(Range: ArrayRef(allThreadMappings)
87	.take_back(N: this->smallestBoundingTileSizes.size()));
88	LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
89	}
90
91	int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
92	int64_t desiredBitAlignment, int64_t numContiguousElements,
93	int64_t elementalBitwidth) {
94	assert(kMaxVectorLoadBitWidth % elementalBitwidth == `0` &&
95	"elemental bitwidth does not divide kMaxVectorLoadBitWidth");
96	assert(desiredBitAlignment % elementalBitwidth == `0` &&
97	"elemental bitwidth does not divide desired bit alignment");
98	return std::gcd(
99	m: std::gcd(m: desiredBitAlignment / elementalBitwidth, n: numContiguousElements),
100	n: kMaxVectorLoadBitWidth / elementalBitwidth);
101	}
102
103	/// Get the list of all factors that divide `val`, not just the prime factors.
104	static SmallVector<int64_t> getFactors(int64_t val) {
105	SmallVector<int64_t> factors;
106	factors.reserve(N: val);
107	for (int64_t factor = `1`; factor <= val; ++factor) {
108	if (val % factor != `0`)
109	continue;
110	factors.push_back(Elt: factor);
111	}
112	factors.push_back(Elt: val);
113	return factors;
114	}
115
116	static int64_t product(ArrayRef<int64_t> vals) {
117	int64_t res = `1`;
118	for (auto val : vals)
119	res *= val;
120	return res;
121	}
122
123	/// Extract `result` from `sizes` with the following constraints:
124	/// 1. sizes[i] % result[i] for all i
125	/// 2. product_of_threadsPerDim <= maxNumThreads
126	/// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
127	/// must be sizes[currentIndex].
128	/// This is used to greedily extract the maximum number of threads usable for
129	/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
130	/// ensuring coalesced access along the most minor dimension.
131	/// Return the number of threads used in the range:
132	/// threadsPerDim[currentIndex .. sizes.end()]
133	// The implementation uses a dynamic programming approach to greedily extract
134	// the best combination under the constraints.
135	// TODO: Implementation details can be improved but putting effort there is a
136	// tradeoffs: `sizes` is expected to be of small rank and contain small values.
137	static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
138	int64_t currentIndex,
139	int64_t maxNumThreads) {
140	assert(static_cast<size_t>(currentIndex) < sizes.size() &&
141	"currentIndex out of bounds");
142	std::string indent(`2` * currentIndex, `'-'`);
143	if (static_cast<size_t>(currentIndex) == sizes.size() - `1`) {
144	LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
145	return SmallVector<int64_t>{sizes [currentIndex]};
146	}
147
148	int64_t best = `0`;
149	int64_t s = sizes [currentIndex];
150	SmallVector<int64_t> factors = getFactors(val: s);
151	SmallVector<int64_t> localThreadsPerDim;
152	localThreadsPerDim.reserve(N: sizes.size());
153	LDBG(indent << "maximizeNumThreads in " << s
154	<< " with limit: " << maxNumThreads);
155	for (auto factor : factors) {
156	auto nestedThreadsPerDim =
157	maximizeNumThreads(sizes, currentIndex: currentIndex + `1`, maxNumThreads: maxNumThreads / factor);
158	int64_t localBest = factor * product(vals: nestedThreadsPerDim);
159	if (localBest > best && localBest <= maxNumThreads) {
160	LDBG(indent << "new localBest: " << localBest);
161	LDBG(indent << "nestedThreadsPerDim: "
162	<< llvm::interleaved(nestedThreadsPerDim));
163	localThreadsPerDim.clear();
164	localThreadsPerDim.push_back(Elt: factor);
165	llvm::append_range(C&: localThreadsPerDim, R&: nestedThreadsPerDim);
166	best = localBest;
167	}
168	}
169
170	LDBG(indent << "found globalBest: " << best);
171	LDBG(indent << "numThreads: " << llvm::interleaved(localThreadsPerDim));
172	return localThreadsPerDim;
173	}
174
175	transform::gpu::CopyMappingInfo::Status
176	transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
177	ArrayRef<int64_t> sizes,
178	int64_t desiredVectorSize,
179	bool favorPredication) {
180
181	if (!favorPredication) {
182	int64_t localVectorSize = desiredVectorSize;
183	for (; localVectorSize >= `1`; localVectorSize /= `2`) {
184	// Attempt to map the copy with predication and current fixed vector size:
185	// 1. if the status is Success, we are done.
186	// 2. if the status is Invalid, we fail immediately, no amount of
187	// vector size reduction can offset the bad tile size selection from the
188	// higher-level.
189	// 3. if the status is RequiresPredication, we try again with a smaller
190	// vector size.
191	Status status =
192	inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize: localVectorSize);
193	if (status == Status::Success \|\| status == Status::Invalid)
194	return status;
195
196	LDBG("requires predication, try reducing vector size to "
197	<< (localVectorSize / `2`));
198	}
199	}
200
201	// If we have not yet returned, it means that we have tried all vector sizes
202	// and we still require predication. Restart from the original vector size and
203	// do not attempt to
204	return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
205	}
206
207	transform::gpu::CopyMappingInfo::Status
208	transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
209	int64_t totalNumThreads, ArrayRef<int64_t> sizes,
210	int64_t desiredVectorSize) {
211	assert(sizes.back() % desiredVectorSize == `0` &&
212	"most-minor size not divisible by actualVectorSize");
213
214	LDBG("inferNumThreadsImpl with totalNumThreads: "
215	<< totalNumThreads << " and vectorSize: " << desiredVectorSize);
216
217	// Scale the most minor size to account for the chosen vector size and
218	// maximize the number of threads without exceeding the total number of
219	// threads.
220	SmallVector<int64_t> scaledSizes(sizes);
221	scaledSizes.back() /= desiredVectorSize;
222	if (scaledSizes.back() > totalNumThreads) {
223	LDBG("--Too few threads given the required vector size -> FAIL");
224	return Status::Invalid;
225	}
226	SmallVector<int64_t> inferredNumThreads =
227	maximizeNumThreads(sizes: scaledSizes, currentIndex: `0`, maxNumThreads: totalNumThreads);
228
229	LDBG("inferred numThreads: " << llvm::interleaved(inferredNumThreads));
230	LDBG("computed actualVectorSize: " << desiredVectorSize);
231
232	// Corner case: we cannot use more threads than available. If the dimension of
233	// the copy is so bad it is because higher-level tiling did not do its job, we
234	// do not try to recover from it here.
235	int64_t totalNumThreadsUsed = product(vals: inferredNumThreads);
236	LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
237	if (totalNumThreadsUsed == `0` \|\| totalNumThreadsUsed > totalNumThreads) {
238	LDBG("--Too few threads given the required vector size -> FAIL");
239	return Status::Invalid;
240	}
241
242	this->vectorSize = desiredVectorSize;
243	this->numThreads = inferredNumThreads;
244	if (totalNumThreadsUsed == totalNumThreads)
245	return Status::Success;
246
247	return Status::RequiresPredication;
248	}
249
250	void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
251	os << "MappingInfo{"
252	<< "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", "
253	<< "vectorSize: " << vectorSize << ", numThreads: {"
254	<< llvm::interleaved(R: numThreads) << "}, smallestBoundingTileSizes: {"
255	<< llvm::interleaved(R: smallestBoundingTileSizes) << "}, threadMapping: {"
256	<< llvm::interleaved(R: threadMapping) << "}}";
257	}
258

Provided by KDAB

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp