1//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
10
11#include "mlir/Dialect/GPU/IR/GPUDialect.h"
12#include "mlir/Support/MathExtras.h"
13#include "llvm/ADT/ArrayRef.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/Support/CommandLine.h"
16#include "llvm/Support/Debug.h"
17#include "llvm/Support/raw_ostream.h"
18#include <cmath>
19#include <numeric>
20
21using namespace mlir;
22
23#define DEBUG_TYPE "linalg-transforms"
24#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
25#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
26
27static Attribute linearId0(MLIRContext *ctx) {
28 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
29}
30static Attribute linearId1(MLIRContext *ctx) {
31 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
32}
33static Attribute linearId2(MLIRContext *ctx) {
34 return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
35}
36
37transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
38 int totalNumThreads,
39 int64_t desiredBitAlignment,
40 ArrayRef<int64_t> copySizes,
41 bool favorPredication,
42 int64_t elementalBitwidth) {
43 assert(!copySizes.empty() && copySizes.size() <= 3 &&
44 "only 1,2,3-D copies are supported for now");
45
46 LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
47 LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
48 llvm::dbgs() << "\n";);
49
50 // Greedily find the largest vector size that can be used to copy the most
51 // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
52 // contiguous memory transactions with as few threads as possible.
53 int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
54 alignment: desiredBitAlignment, numContiguousElements: copySizes.back(), elementalBitwidth);
55
56 LDBG("--greedily determined vectorSize: "
57 << desiredVectorSize << " elements of " << elementalBitwidth
58 << "b each -> " << (desiredVectorSize * elementalBitwidth)
59 << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");
60
61 status = inferNumThreads(totalNumThreads, sizes: copySizes, desiredVectorSize,
62 favorPredication);
63 if (status == Status::Invalid)
64 return;
65
66 LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
67 llvm::dbgs() << "\n"; llvm::interleaveComma(
68 this->numThreads, DBGS() << "--numThreads: ");
69 llvm::dbgs() << "\n";);
70 LDBG("--vectorSize: " << this->vectorSize);
71 assert(this->numThreads.size() == copySizes.size() &&
72 "compute copy mapping expected same number of threads and copy sizes");
73
74 // Compute the smallest bounding box.
75 this->smallestBoundingTileSizes = llvm::to_vector(
76 Range: llvm::map_range(C: llvm::zip(t&: copySizes, u&: this->numThreads), F: [](auto &&pair) {
77 int64_t size, numThreads;
78 std::tie(args&: size, args&: numThreads) = pair;
79 return mlir::ceilDiv(lhs: size, rhs: numThreads);
80 }));
81 SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
82 linearId0(ctx)};
83
84 // Set the thread mapping.
85 this->threadMapping =
86 llvm::to_vector(Range: ArrayRef(allThreadMappings)
87 .take_back(N: this->smallestBoundingTileSizes.size()));
88 LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
89}
90
91int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
92 int64_t desiredBitAlignment, int64_t numContiguousElements,
93 int64_t elementalBitwidth) {
94 assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
95 "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
96 assert(desiredBitAlignment % elementalBitwidth == 0 &&
97 "elemental bitwidth does not divide desired bit alignment");
98 return std::gcd(
99 m: std::gcd(m: desiredBitAlignment / elementalBitwidth, n: numContiguousElements),
100 n: kMaxVectorLoadBitWidth / elementalBitwidth);
101}
102
103/// Get the list of all factors that divide `val`, not just the prime factors.
104static SmallVector<int64_t> getFactors(int64_t val) {
105 SmallVector<int64_t> factors;
106 factors.reserve(N: val);
107 for (int64_t factor = 1; factor <= val; ++factor) {
108 if (val % factor != 0)
109 continue;
110 factors.push_back(Elt: factor);
111 }
112 factors.push_back(Elt: val);
113 return factors;
114}
115
116static int64_t product(ArrayRef<int64_t> vals) {
117 int64_t res = 1;
118 for (auto val : vals)
119 res *= val;
120 return res;
121}
122
123/// Extract `result` from `sizes` with the following constraints:
124/// 1. sizes[i] % result[i] for all i
125/// 2. product_of_threadsPerDim <= maxNumThreads
126/// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
127/// must be sizes[currentIndex].
128/// This is used to greedily extract the maximum number of threads usable for
129/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
130/// ensuring coalesced access along the most minor dimension.
131/// Return the number of threads used in the range:
132/// threadsPerDim[currentIndex .. sizes.end()]
133// The implementation uses a dynamic programming approach to greedily extract
134// the best combination under the constraints.
135// TODO: Implementation details can be improved but putting effort there is a
136// tradeoffs: `sizes` is expected to be of small rank and contain small values.
137static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
138 int64_t currentIndex,
139 int64_t maxNumThreads) {
140 assert(static_cast<size_t>(currentIndex) < sizes.size() &&
141 "currentIndex out of bounds");
142 std::string indent(2 * currentIndex, '-');
143 if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
144 LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
145 return SmallVector<int64_t>{sizes[currentIndex]};
146 }
147
148 int64_t best = 0;
149 int64_t s = sizes[currentIndex];
150 SmallVector<int64_t> factors = getFactors(val: s);
151 SmallVector<int64_t> localThreadsPerDim;
152 localThreadsPerDim.reserve(N: sizes.size());
153 LDBG(indent << "maximizeNumThreads in " << s
154 << " with limit: " << maxNumThreads);
155 for (auto factor : factors) {
156 auto nestedThreadsPerDim =
157 maximizeNumThreads(sizes, currentIndex: currentIndex + 1, maxNumThreads: maxNumThreads / factor);
158 int64_t localBest = factor * product(vals: nestedThreadsPerDim);
159 if (localBest > best && localBest <= maxNumThreads) {
160 LDBG(indent << "new localBest: " << localBest);
161 LLVM_DEBUG(
162 llvm::interleaveComma(nestedThreadsPerDim,
163 DBGS() << indent << "nestedThreadsPerDim: ");
164 llvm::dbgs() << "\n";);
165 localThreadsPerDim.clear();
166 localThreadsPerDim.push_back(Elt: factor);
167 llvm::append_range(C&: localThreadsPerDim, R&: nestedThreadsPerDim);
168 best = localBest;
169 }
170 }
171
172 LDBG(indent << "found globalBest: " << best);
173 LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
174 DBGS() << indent << "numThreads: ");
175 llvm::dbgs() << "\n";);
176
177 return localThreadsPerDim;
178}
179
180transform::gpu::CopyMappingInfo::Status
181transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
182 ArrayRef<int64_t> sizes,
183 int64_t desiredVectorSize,
184 bool favorPredication) {
185
186 if (!favorPredication) {
187 int64_t localVectorSize = desiredVectorSize;
188 for (; localVectorSize >= 1; localVectorSize /= 2) {
189 // Attempt to map the copy with predication and current fixed vector size:
190 // 1. if the status is Success, we are done.
191 // 2. if the status is Invalid, we fail immediately, no amount of
192 // vector size reduction can offset the bad tile size selection from the
193 // higher-level.
194 // 3. if the status is RequiresPredication, we try again with a smaller
195 // vector size.
196 Status status =
197 inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize: localVectorSize);
198 if (status == Status::Success || status == Status::Invalid)
199 return status;
200
201 LDBG("requires predication, try reducing vector size to "
202 << (localVectorSize / 2));
203 }
204 }
205
206 // If we have not yet returned, it means that we have tried all vector sizes
207 // and we still require predication. Restart from the original vector size and
208 // do not attempt to
209 return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
210}
211
212transform::gpu::CopyMappingInfo::Status
213transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
214 int64_t totalNumThreads, ArrayRef<int64_t> sizes,
215 int64_t desiredVectorSize) {
216 assert(sizes.back() % desiredVectorSize == 0 &&
217 "most-minor size not divisible by actualVectorSize");
218
219 LDBG("inferNumThreadsImpl with totalNumThreads: "
220 << totalNumThreads << " and vectorSize: " << desiredVectorSize);
221
222 // Scale the most minor size to account for the chosen vector size and
223 // maximize the number of threads without exceeding the total number of
224 // threads.
225 SmallVector<int64_t> scaledSizes{sizes};
226 scaledSizes.back() /= desiredVectorSize;
227 if (scaledSizes.back() > totalNumThreads) {
228 LDBG("--Too few threads given the required vector size -> FAIL");
229 return Status::Invalid;
230 }
231 SmallVector<int64_t> inferredNumThreads =
232 maximizeNumThreads(sizes: scaledSizes, currentIndex: 0, maxNumThreads: totalNumThreads);
233
234 LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
235 DBGS() << "inferred numThreads: ");
236 llvm::dbgs() << "\n";
237 LDBG("computed actualVectorSize: " << desiredVectorSize););
238
239 // Corner case: we cannot use more threads than available. If the dimension of
240 // the copy is so bad it is because higher-level tiling did not do its job, we
241 // do not try to recover from it here.
242 int64_t totalNumThreadsUsed = product(vals: inferredNumThreads);
243 LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
244 if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
245 LDBG("--Too few threads given the required vector size -> FAIL");
246 return Status::Invalid;
247 }
248
249 this->vectorSize = desiredVectorSize;
250 this->numThreads = inferredNumThreads;
251 if (totalNumThreadsUsed == totalNumThreads)
252 return Status::Success;
253
254 return Status::RequiresPredication;
255}
256
257void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
258 os << "MappingInfo{";
259 os << "CopyMappingInfo: ";
260 os << "valid: " << (status != Status::Invalid) << ", ";
261 os << "vectorSize: " << vectorSize << ", ";
262 llvm::interleaveComma(c: numThreads, os&: os << ", numThreads: {");
263 llvm::interleaveComma(c: smallestBoundingTileSizes,
264 os&: os << "}, smallestBoundingTileSizes: {");
265 llvm::interleaveComma(c: threadMapping, os&: os << "}, threadMapping: {");
266 os << "}}";
267}
268

source code of mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp