1 | //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h" |
10 | |
11 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
12 | #include "llvm/ADT/ArrayRef.h" |
13 | #include "llvm/ADT/STLExtras.h" |
14 | #include "llvm/Support/CommandLine.h" |
15 | #include "llvm/Support/Debug.h" |
16 | #include "llvm/Support/InterleavedRange.h" |
17 | #include "llvm/Support/MathExtras.h" |
18 | #include "llvm/Support/raw_ostream.h" |
19 | #include <cmath> |
20 | #include <numeric> |
21 | |
22 | using namespace mlir; |
23 | |
24 | #define DEBUG_TYPE "linalg-transforms" |
25 | #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
26 | #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
27 | |
28 | static Attribute linearId0(MLIRContext *ctx) { |
29 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0); |
30 | } |
31 | static Attribute linearId1(MLIRContext *ctx) { |
32 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1); |
33 | } |
34 | static Attribute linearId2(MLIRContext *ctx) { |
35 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2); |
36 | } |
37 | |
38 | transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx, |
39 | int totalNumThreads, |
40 | int64_t desiredBitAlignment, |
41 | ArrayRef<int64_t> copySizes, |
42 | bool favorPredication, |
43 | int64_t elementalBitwidth) { |
44 | assert(!copySizes.empty() && copySizes.size() <= 3 && |
45 | "only 1,2,3-D copies are supported for now" ); |
46 | |
47 | LDBG("START CopyMappingInfo, favorPredication: " << favorPredication); |
48 | LLVM_DEBUG(DBGS() << "--copy shape: " << llvm::interleaved(copySizes) |
49 | << "\n" ); |
50 | |
51 | // Greedily find the largest vector size that can be used to copy the most |
52 | // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth |
53 | // contiguous memory transactions with as few threads as possible. |
54 | int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer( |
55 | alignment: desiredBitAlignment, numContiguousElements: copySizes.back(), elementalBitwidth); |
56 | |
57 | LDBG("--greedily determined vectorSize: " |
58 | << desiredVectorSize << " elements of " << elementalBitwidth |
59 | << "b each -> " << (desiredVectorSize * elementalBitwidth) |
60 | << "b total out of a max of " << kMaxVectorLoadBitWidth << "b" ); |
61 | |
62 | status = inferNumThreads(totalNumThreads, sizes: copySizes, desiredVectorSize, |
63 | favorPredication); |
64 | if (status == Status::Invalid) |
65 | return; |
66 | |
67 | LLVM_DEBUG(DBGS() << "--copy: " << llvm::interleaved(copySizes) << "\n" |
68 | << "--numThreads: " << llvm::interleaved(this->numThreads) |
69 | << "\n" |
70 | << "--vectorSize: " << this->vectorSize << "\n" ); |
71 | assert(this->numThreads.size() == copySizes.size() && |
72 | "compute copy mapping expected same number of threads and copy sizes" ); |
73 | |
74 | // Compute the smallest bounding box. |
75 | this->smallestBoundingTileSizes = llvm::to_vector( |
76 | Range: llvm::map_range(C: llvm::zip(t&: copySizes, u&: this->numThreads), F: [](auto &&pair) { |
77 | int64_t size, numThreads; |
78 | std::tie(args&: size, args&: numThreads) = pair; |
79 | return llvm::divideCeilSigned(Numerator: size, Denominator: numThreads); |
80 | })); |
81 | SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx), |
82 | linearId0(ctx)}; |
83 | |
84 | // Set the thread mapping. |
85 | this->threadMapping = |
86 | llvm::to_vector(Range: ArrayRef(allThreadMappings) |
87 | .take_back(N: this->smallestBoundingTileSizes.size())); |
88 | LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n" ); |
89 | } |
90 | |
91 | int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer( |
92 | int64_t desiredBitAlignment, int64_t numContiguousElements, |
93 | int64_t elementalBitwidth) { |
94 | assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 && |
95 | "elemental bitwidth does not divide kMaxVectorLoadBitWidth" ); |
96 | assert(desiredBitAlignment % elementalBitwidth == 0 && |
97 | "elemental bitwidth does not divide desired bit alignment" ); |
98 | return std::gcd( |
99 | m: std::gcd(m: desiredBitAlignment / elementalBitwidth, n: numContiguousElements), |
100 | n: kMaxVectorLoadBitWidth / elementalBitwidth); |
101 | } |
102 | |
103 | /// Get the list of all factors that divide `val`, not just the prime factors. |
104 | static SmallVector<int64_t> getFactors(int64_t val) { |
105 | SmallVector<int64_t> factors; |
106 | factors.reserve(N: val); |
107 | for (int64_t factor = 1; factor <= val; ++factor) { |
108 | if (val % factor != 0) |
109 | continue; |
110 | factors.push_back(Elt: factor); |
111 | } |
112 | factors.push_back(Elt: val); |
113 | return factors; |
114 | } |
115 | |
116 | static int64_t product(ArrayRef<int64_t> vals) { |
117 | int64_t res = 1; |
118 | for (auto val : vals) |
119 | res *= val; |
120 | return res; |
121 | } |
122 | |
123 | /// Extract `result` from `sizes` with the following constraints: |
124 | /// 1. sizes[i] % result[i] for all i |
125 | /// 2. product_of_threadsPerDim <= maxNumThreads |
126 | /// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex] |
127 | /// must be sizes[currentIndex]. |
128 | /// This is used to greedily extract the maximum number of threads usable for |
129 | /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and |
130 | /// ensuring coalesced access along the most minor dimension. |
131 | /// Return the number of threads used in the range: |
132 | /// threadsPerDim[currentIndex .. sizes.end()] |
133 | // The implementation uses a dynamic programming approach to greedily extract |
134 | // the best combination under the constraints. |
135 | // TODO: Implementation details can be improved but putting effort there is a |
136 | // tradeoffs: `sizes` is expected to be of small rank and contain small values. |
137 | static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes, |
138 | int64_t currentIndex, |
139 | int64_t maxNumThreads) { |
140 | assert(static_cast<size_t>(currentIndex) < sizes.size() && |
141 | "currentIndex out of bounds" ); |
142 | std::string indent(2 * currentIndex, '-'); |
143 | if (static_cast<size_t>(currentIndex) == sizes.size() - 1) { |
144 | LDBG(indent << "mandated globalBest: " << sizes[currentIndex]); |
145 | return SmallVector<int64_t>{sizes[currentIndex]}; |
146 | } |
147 | |
148 | int64_t best = 0; |
149 | int64_t s = sizes[currentIndex]; |
150 | SmallVector<int64_t> factors = getFactors(val: s); |
151 | SmallVector<int64_t> localThreadsPerDim; |
152 | localThreadsPerDim.reserve(N: sizes.size()); |
153 | LDBG(indent << "maximizeNumThreads in " << s |
154 | << " with limit: " << maxNumThreads); |
155 | for (auto factor : factors) { |
156 | auto nestedThreadsPerDim = |
157 | maximizeNumThreads(sizes, currentIndex: currentIndex + 1, maxNumThreads: maxNumThreads / factor); |
158 | int64_t localBest = factor * product(vals: nestedThreadsPerDim); |
159 | if (localBest > best && localBest <= maxNumThreads) { |
160 | LDBG(indent << "new localBest: " << localBest); |
161 | LDBG(indent << "nestedThreadsPerDim: " |
162 | << llvm::interleaved(nestedThreadsPerDim)); |
163 | localThreadsPerDim.clear(); |
164 | localThreadsPerDim.push_back(Elt: factor); |
165 | llvm::append_range(C&: localThreadsPerDim, R&: nestedThreadsPerDim); |
166 | best = localBest; |
167 | } |
168 | } |
169 | |
170 | LDBG(indent << "found globalBest: " << best); |
171 | LDBG(indent << "numThreads: " << llvm::interleaved(localThreadsPerDim)); |
172 | return localThreadsPerDim; |
173 | } |
174 | |
175 | transform::gpu::CopyMappingInfo::Status |
176 | transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads, |
177 | ArrayRef<int64_t> sizes, |
178 | int64_t desiredVectorSize, |
179 | bool favorPredication) { |
180 | |
181 | if (!favorPredication) { |
182 | int64_t localVectorSize = desiredVectorSize; |
183 | for (; localVectorSize >= 1; localVectorSize /= 2) { |
184 | // Attempt to map the copy with predication and current fixed vector size: |
185 | // 1. if the status is Success, we are done. |
186 | // 2. if the status is Invalid, we fail immediately, no amount of |
187 | // vector size reduction can offset the bad tile size selection from the |
188 | // higher-level. |
189 | // 3. if the status is RequiresPredication, we try again with a smaller |
190 | // vector size. |
191 | Status status = |
192 | inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize: localVectorSize); |
193 | if (status == Status::Success || status == Status::Invalid) |
194 | return status; |
195 | |
196 | LDBG("requires predication, try reducing vector size to " |
197 | << (localVectorSize / 2)); |
198 | } |
199 | } |
200 | |
201 | // If we have not yet returned, it means that we have tried all vector sizes |
202 | // and we still require predication. Restart from the original vector size and |
203 | // do not attempt to |
204 | return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize); |
205 | } |
206 | |
207 | transform::gpu::CopyMappingInfo::Status |
208 | transform::gpu::CopyMappingInfo::inferNumThreadsImpl( |
209 | int64_t totalNumThreads, ArrayRef<int64_t> sizes, |
210 | int64_t desiredVectorSize) { |
211 | assert(sizes.back() % desiredVectorSize == 0 && |
212 | "most-minor size not divisible by actualVectorSize" ); |
213 | |
214 | LDBG("inferNumThreadsImpl with totalNumThreads: " |
215 | << totalNumThreads << " and vectorSize: " << desiredVectorSize); |
216 | |
217 | // Scale the most minor size to account for the chosen vector size and |
218 | // maximize the number of threads without exceeding the total number of |
219 | // threads. |
220 | SmallVector<int64_t> scaledSizes(sizes); |
221 | scaledSizes.back() /= desiredVectorSize; |
222 | if (scaledSizes.back() > totalNumThreads) { |
223 | LDBG("--Too few threads given the required vector size -> FAIL" ); |
224 | return Status::Invalid; |
225 | } |
226 | SmallVector<int64_t> inferredNumThreads = |
227 | maximizeNumThreads(sizes: scaledSizes, currentIndex: 0, maxNumThreads: totalNumThreads); |
228 | |
229 | LDBG("inferred numThreads: " << llvm::interleaved(inferredNumThreads)); |
230 | LDBG("computed actualVectorSize: " << desiredVectorSize); |
231 | |
232 | // Corner case: we cannot use more threads than available. If the dimension of |
233 | // the copy is so bad it is because higher-level tiling did not do its job, we |
234 | // do not try to recover from it here. |
235 | int64_t totalNumThreadsUsed = product(vals: inferredNumThreads); |
236 | LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed); |
237 | if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) { |
238 | LDBG("--Too few threads given the required vector size -> FAIL" ); |
239 | return Status::Invalid; |
240 | } |
241 | |
242 | this->vectorSize = desiredVectorSize; |
243 | this->numThreads = inferredNumThreads; |
244 | if (totalNumThreadsUsed == totalNumThreads) |
245 | return Status::Success; |
246 | |
247 | return Status::RequiresPredication; |
248 | } |
249 | |
250 | void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const { |
251 | os << "MappingInfo{" |
252 | << "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", " |
253 | << "vectorSize: " << vectorSize << ", numThreads: {" |
254 | << llvm::interleaved(R: numThreads) << "}, smallestBoundingTileSizes: {" |
255 | << llvm::interleaved(R: smallestBoundingTileSizes) << "}, threadMapping: {" |
256 | << llvm::interleaved(R: threadMapping) << "}}" ; |
257 | } |
258 | |