| 1 | //===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h" |
| 10 | |
| 11 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| 12 | #include "llvm/ADT/ArrayRef.h" |
| 13 | #include "llvm/ADT/STLExtras.h" |
| 14 | #include "llvm/Support/CommandLine.h" |
| 15 | #include "llvm/Support/Debug.h" |
| 16 | #include "llvm/Support/InterleavedRange.h" |
| 17 | #include "llvm/Support/MathExtras.h" |
| 18 | #include "llvm/Support/raw_ostream.h" |
| 19 | #include <cmath> |
| 20 | #include <numeric> |
| 21 | |
| 22 | using namespace mlir; |
| 23 | |
| 24 | #define DEBUG_TYPE "linalg-transforms" |
| 25 | #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") |
| 26 | #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") |
| 27 | |
| 28 | static Attribute linearId0(MLIRContext *ctx) { |
| 29 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0); |
| 30 | } |
| 31 | static Attribute linearId1(MLIRContext *ctx) { |
| 32 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1); |
| 33 | } |
| 34 | static Attribute linearId2(MLIRContext *ctx) { |
| 35 | return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2); |
| 36 | } |
| 37 | |
| 38 | transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx, |
| 39 | int totalNumThreads, |
| 40 | int64_t desiredBitAlignment, |
| 41 | ArrayRef<int64_t> copySizes, |
| 42 | bool favorPredication, |
| 43 | int64_t elementalBitwidth) { |
| 44 | assert(!copySizes.empty() && copySizes.size() <= 3 && |
| 45 | "only 1,2,3-D copies are supported for now" ); |
| 46 | |
| 47 | LDBG("START CopyMappingInfo, favorPredication: " << favorPredication); |
| 48 | LLVM_DEBUG(DBGS() << "--copy shape: " << llvm::interleaved(copySizes) |
| 49 | << "\n" ); |
| 50 | |
| 51 | // Greedily find the largest vector size that can be used to copy the most |
| 52 | // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth |
| 53 | // contiguous memory transactions with as few threads as possible. |
| 54 | int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer( |
| 55 | alignment: desiredBitAlignment, numContiguousElements: copySizes.back(), elementalBitwidth); |
| 56 | |
| 57 | LDBG("--greedily determined vectorSize: " |
| 58 | << desiredVectorSize << " elements of " << elementalBitwidth |
| 59 | << "b each -> " << (desiredVectorSize * elementalBitwidth) |
| 60 | << "b total out of a max of " << kMaxVectorLoadBitWidth << "b" ); |
| 61 | |
| 62 | status = inferNumThreads(totalNumThreads, sizes: copySizes, desiredVectorSize, |
| 63 | favorPredication); |
| 64 | if (status == Status::Invalid) |
| 65 | return; |
| 66 | |
| 67 | LLVM_DEBUG(DBGS() << "--copy: " << llvm::interleaved(copySizes) << "\n" |
| 68 | << "--numThreads: " << llvm::interleaved(this->numThreads) |
| 69 | << "\n" |
| 70 | << "--vectorSize: " << this->vectorSize << "\n" ); |
| 71 | assert(this->numThreads.size() == copySizes.size() && |
| 72 | "compute copy mapping expected same number of threads and copy sizes" ); |
| 73 | |
| 74 | // Compute the smallest bounding box. |
| 75 | this->smallestBoundingTileSizes = llvm::to_vector( |
| 76 | Range: llvm::map_range(C: llvm::zip(t&: copySizes, u&: this->numThreads), F: [](auto &&pair) { |
| 77 | int64_t size, numThreads; |
| 78 | std::tie(args&: size, args&: numThreads) = pair; |
| 79 | return llvm::divideCeilSigned(Numerator: size, Denominator: numThreads); |
| 80 | })); |
| 81 | SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx), |
| 82 | linearId0(ctx)}; |
| 83 | |
| 84 | // Set the thread mapping. |
| 85 | this->threadMapping = |
| 86 | llvm::to_vector(Range: ArrayRef(allThreadMappings) |
| 87 | .take_back(N: this->smallestBoundingTileSizes.size())); |
| 88 | LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n" ); |
| 89 | } |
| 90 | |
| 91 | int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer( |
| 92 | int64_t desiredBitAlignment, int64_t numContiguousElements, |
| 93 | int64_t elementalBitwidth) { |
| 94 | assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 && |
| 95 | "elemental bitwidth does not divide kMaxVectorLoadBitWidth" ); |
| 96 | assert(desiredBitAlignment % elementalBitwidth == 0 && |
| 97 | "elemental bitwidth does not divide desired bit alignment" ); |
| 98 | return std::gcd( |
| 99 | m: std::gcd(m: desiredBitAlignment / elementalBitwidth, n: numContiguousElements), |
| 100 | n: kMaxVectorLoadBitWidth / elementalBitwidth); |
| 101 | } |
| 102 | |
| 103 | /// Get the list of all factors that divide `val`, not just the prime factors. |
| 104 | static SmallVector<int64_t> getFactors(int64_t val) { |
| 105 | SmallVector<int64_t> factors; |
| 106 | factors.reserve(N: val); |
| 107 | for (int64_t factor = 1; factor <= val; ++factor) { |
| 108 | if (val % factor != 0) |
| 109 | continue; |
| 110 | factors.push_back(Elt: factor); |
| 111 | } |
| 112 | factors.push_back(Elt: val); |
| 113 | return factors; |
| 114 | } |
| 115 | |
| 116 | static int64_t product(ArrayRef<int64_t> vals) { |
| 117 | int64_t res = 1; |
| 118 | for (auto val : vals) |
| 119 | res *= val; |
| 120 | return res; |
| 121 | } |
| 122 | |
| 123 | /// Extract `result` from `sizes` with the following constraints: |
| 124 | /// 1. sizes[i] % result[i] for all i |
| 125 | /// 2. product_of_threadsPerDim <= maxNumThreads |
| 126 | /// 3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex] |
| 127 | /// must be sizes[currentIndex]. |
| 128 | /// This is used to greedily extract the maximum number of threads usable for |
| 129 | /// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and |
| 130 | /// ensuring coalesced access along the most minor dimension. |
| 131 | /// Return the number of threads used in the range: |
| 132 | /// threadsPerDim[currentIndex .. sizes.end()] |
| 133 | // The implementation uses a dynamic programming approach to greedily extract |
| 134 | // the best combination under the constraints. |
| 135 | // TODO: Implementation details can be improved but putting effort there is a |
| 136 | // tradeoffs: `sizes` is expected to be of small rank and contain small values. |
| 137 | static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes, |
| 138 | int64_t currentIndex, |
| 139 | int64_t maxNumThreads) { |
| 140 | assert(static_cast<size_t>(currentIndex) < sizes.size() && |
| 141 | "currentIndex out of bounds" ); |
| 142 | std::string indent(2 * currentIndex, '-'); |
| 143 | if (static_cast<size_t>(currentIndex) == sizes.size() - 1) { |
| 144 | LDBG(indent << "mandated globalBest: " << sizes[currentIndex]); |
| 145 | return SmallVector<int64_t>{sizes[currentIndex]}; |
| 146 | } |
| 147 | |
| 148 | int64_t best = 0; |
| 149 | int64_t s = sizes[currentIndex]; |
| 150 | SmallVector<int64_t> factors = getFactors(val: s); |
| 151 | SmallVector<int64_t> localThreadsPerDim; |
| 152 | localThreadsPerDim.reserve(N: sizes.size()); |
| 153 | LDBG(indent << "maximizeNumThreads in " << s |
| 154 | << " with limit: " << maxNumThreads); |
| 155 | for (auto factor : factors) { |
| 156 | auto nestedThreadsPerDim = |
| 157 | maximizeNumThreads(sizes, currentIndex: currentIndex + 1, maxNumThreads: maxNumThreads / factor); |
| 158 | int64_t localBest = factor * product(vals: nestedThreadsPerDim); |
| 159 | if (localBest > best && localBest <= maxNumThreads) { |
| 160 | LDBG(indent << "new localBest: " << localBest); |
| 161 | LDBG(indent << "nestedThreadsPerDim: " |
| 162 | << llvm::interleaved(nestedThreadsPerDim)); |
| 163 | localThreadsPerDim.clear(); |
| 164 | localThreadsPerDim.push_back(Elt: factor); |
| 165 | llvm::append_range(C&: localThreadsPerDim, R&: nestedThreadsPerDim); |
| 166 | best = localBest; |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | LDBG(indent << "found globalBest: " << best); |
| 171 | LDBG(indent << "numThreads: " << llvm::interleaved(localThreadsPerDim)); |
| 172 | return localThreadsPerDim; |
| 173 | } |
| 174 | |
| 175 | transform::gpu::CopyMappingInfo::Status |
| 176 | transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads, |
| 177 | ArrayRef<int64_t> sizes, |
| 178 | int64_t desiredVectorSize, |
| 179 | bool favorPredication) { |
| 180 | |
| 181 | if (!favorPredication) { |
| 182 | int64_t localVectorSize = desiredVectorSize; |
| 183 | for (; localVectorSize >= 1; localVectorSize /= 2) { |
| 184 | // Attempt to map the copy with predication and current fixed vector size: |
| 185 | // 1. if the status is Success, we are done. |
| 186 | // 2. if the status is Invalid, we fail immediately, no amount of |
| 187 | // vector size reduction can offset the bad tile size selection from the |
| 188 | // higher-level. |
| 189 | // 3. if the status is RequiresPredication, we try again with a smaller |
| 190 | // vector size. |
| 191 | Status status = |
| 192 | inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize: localVectorSize); |
| 193 | if (status == Status::Success || status == Status::Invalid) |
| 194 | return status; |
| 195 | |
| 196 | LDBG("requires predication, try reducing vector size to " |
| 197 | << (localVectorSize / 2)); |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | // If we have not yet returned, it means that we have tried all vector sizes |
| 202 | // and we still require predication. Restart from the original vector size and |
| 203 | // do not attempt to |
| 204 | return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize); |
| 205 | } |
| 206 | |
| 207 | transform::gpu::CopyMappingInfo::Status |
| 208 | transform::gpu::CopyMappingInfo::inferNumThreadsImpl( |
| 209 | int64_t totalNumThreads, ArrayRef<int64_t> sizes, |
| 210 | int64_t desiredVectorSize) { |
| 211 | assert(sizes.back() % desiredVectorSize == 0 && |
| 212 | "most-minor size not divisible by actualVectorSize" ); |
| 213 | |
| 214 | LDBG("inferNumThreadsImpl with totalNumThreads: " |
| 215 | << totalNumThreads << " and vectorSize: " << desiredVectorSize); |
| 216 | |
| 217 | // Scale the most minor size to account for the chosen vector size and |
| 218 | // maximize the number of threads without exceeding the total number of |
| 219 | // threads. |
| 220 | SmallVector<int64_t> scaledSizes(sizes); |
| 221 | scaledSizes.back() /= desiredVectorSize; |
| 222 | if (scaledSizes.back() > totalNumThreads) { |
| 223 | LDBG("--Too few threads given the required vector size -> FAIL" ); |
| 224 | return Status::Invalid; |
| 225 | } |
| 226 | SmallVector<int64_t> inferredNumThreads = |
| 227 | maximizeNumThreads(sizes: scaledSizes, currentIndex: 0, maxNumThreads: totalNumThreads); |
| 228 | |
| 229 | LDBG("inferred numThreads: " << llvm::interleaved(inferredNumThreads)); |
| 230 | LDBG("computed actualVectorSize: " << desiredVectorSize); |
| 231 | |
| 232 | // Corner case: we cannot use more threads than available. If the dimension of |
| 233 | // the copy is so bad it is because higher-level tiling did not do its job, we |
| 234 | // do not try to recover from it here. |
| 235 | int64_t totalNumThreadsUsed = product(vals: inferredNumThreads); |
| 236 | LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed); |
| 237 | if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) { |
| 238 | LDBG("--Too few threads given the required vector size -> FAIL" ); |
| 239 | return Status::Invalid; |
| 240 | } |
| 241 | |
| 242 | this->vectorSize = desiredVectorSize; |
| 243 | this->numThreads = inferredNumThreads; |
| 244 | if (totalNumThreadsUsed == totalNumThreads) |
| 245 | return Status::Success; |
| 246 | |
| 247 | return Status::RequiresPredication; |
| 248 | } |
| 249 | |
| 250 | void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const { |
| 251 | os << "MappingInfo{" |
| 252 | << "CopyMappingInfo: " << "valid: " << (status != Status::Invalid) << ", " |
| 253 | << "vectorSize: " << vectorSize << ", numThreads: {" |
| 254 | << llvm::interleaved(R: numThreads) << "}, smallestBoundingTileSizes: {" |
| 255 | << llvm::interleaved(R: smallestBoundingTileSizes) << "}, threadMapping: {" |
| 256 | << llvm::interleaved(R: threadMapping) << "}}" ; |
| 257 | } |
| 258 | |