//===- VectorUtils.cpp - MLIR Utilities for VectorOps ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the Vector dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Vector/Utils/VectorUtils.h"

#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IntegerSet.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Support/LLVM.h"

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/InterleavedRange.h"

#define DEBUG_TYPE "vector-utils"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

using namespace mlir;
/// Helper function that creates a memref::DimOp or tensor::DimOp depending on
/// the type of `source`.
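///
/// Example (illustrative): for `source : memref<4x?xf32>`, `dim = 0` folds to
/// a constant index 4, whereas `dim = 1` materializes a memref.dim op.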
Value mlir::vector::createOrFoldDimOp(OpBuilder &b, Location loc, Value source,
                                      int64_t dim) {
  if (isa<UnrankedMemRefType, MemRefType>(source.getType()))
    return b.createOrFold<memref::DimOp>(loc, source, dim);
  if (isa<UnrankedTensorType, RankedTensorType>(source.getType()))
    return b.createOrFold<tensor::DimOp>(loc, source, dim);
  llvm_unreachable("Expected MemRefType or TensorType");
}

/// Given the n-D transpose pattern 'transp', return true if 'dim0' and 'dim1'
/// should be transposed with each other within the context of their 2D
/// transposition slice.
///
/// Example 1: dim0 = 0, dim1 = 2, transp = [2, 1, 0]
/// Return true: dim0 and dim1 are transposed within the context of their 2D
/// transposition slice ([1, 0]).
///
/// Example 2: dim0 = 0, dim1 = 1, transp = [2, 1, 0]
/// Return true: dim0 and dim1 are transposed within the context of their 2D
/// transposition slice ([1, 0]). Paradoxically, note how dim1 (1) is *not*
/// transposed within the full context of the transposition.
///
/// Example 3: dim0 = 0, dim1 = 1, transp = [2, 0, 1]
/// Return false: dim0 and dim1 are *not* transposed within the context of
/// their 2D transposition slice ([0, 1]). Paradoxically, note how dim0 (0)
/// and dim1 (1) are transposed within the full context of the transposition.
static bool areDimsTransposedIn2DSlice(int64_t dim0, int64_t dim1,
                                       ArrayRef<int64_t> transp) {
  // Perform a linear scan along the dimensions of the transposed pattern. If
  // dim0 is found first, dim0 and dim1 are not transposed within the context
  // of their 2D slice. Otherwise, 'dim1' is found first and they are
  // transposed.
  for (int64_t permDim : transp) {
    if (permDim == dim0)
      return false;
    if (permDim == dim1)
      return true;
  }

  llvm_unreachable("Ill-formed transpose pattern");
}

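/// Example (illustrative): for a transpose of vector<1x8x1x4xf32> with
/// permutation [0, 3, 2, 1], the dims greater than one are 1 and 3, and they
/// are swapped within their 2-D slice, so this returns the pair (1, 3).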
FailureOr<std::pair<int, int>>
mlir::vector::isTranspose2DSlice(vector::TransposeOp op) {
  VectorType srcType = op.getSourceVectorType();
  SmallVector<int64_t> srcGtOneDims;
  for (auto [index, size] : llvm::enumerate(srcType.getShape()))
    if (size > 1)
      srcGtOneDims.push_back(index);

  if (srcGtOneDims.size() != 2)
    return failure();

  // Check whether the two source vector dimensions that are greater than one
  // must be transposed with each other so that we can apply one of the 2-D
  // transpose patterns. Otherwise, these patterns are not applicable.
  if (!areDimsTransposedIn2DSlice(srcGtOneDims[0], srcGtOneDims[1],
                                  op.getPermutation()))
    return failure();

  return std::pair<int, int>(srcGtOneDims[0], srcGtOneDims[1]);
}

/// Constructs a permutation map from memref indices to vector dimensions.
///
/// The implementation uses the knowledge of the mapping of enclosing loops to
/// vector dimensions. `enclosingLoopToVectorDim` carries this information as a
/// map with:
///   - keys representing "vectorized enclosing loops";
///   - values representing the corresponding vector dimension.
/// The algorithm traverses the "vectorized enclosing loops" and, for each one,
/// extracts the at-most-one MemRef index that varies along that loop. This
/// index is guaranteed to be at most one by construction: otherwise the MemRef
/// is not vectorizable.
/// If such a varying index is found, its dimension expression is added to the
/// permutation map at the loop's vector dimension.
/// If no index varies along the loop, 0 is added to the permutation map, which
/// corresponds to a vector broadcast along that dimension.
///
/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty,
/// signalling that no permutation map can be constructed given
/// `enclosingLoopToVectorDim`.
///
/// Examples can be found in the documentation of `makePermutationMap`, in the
/// header file.
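///
/// Example (illustrative): for a load A[%i1, %i0] enclosed in vectorized loops
/// %i0 (mapped to vector dim 0) and %i1 (mapped to vector dim 1), index 1
/// varies with %i0 and index 0 varies with %i1, yielding the permutation map
/// (d0, d1) -> (d1, d0).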
static AffineMap makePermutationMap(
    ArrayRef<Value> indices,
    const DenseMap<Operation *, unsigned> &enclosingLoopToVectorDim) {
  if (enclosingLoopToVectorDim.empty())
    return AffineMap();
  MLIRContext *context =
      enclosingLoopToVectorDim.begin()->getFirst()->getContext();
  SmallVector<AffineExpr> perm(enclosingLoopToVectorDim.size(),
                               getAffineConstantExpr(0, context));

  for (auto kvp : enclosingLoopToVectorDim) {
    assert(kvp.second < perm.size());
    auto invariants = affine::getInvariantAccesses(
        cast<affine::AffineForOp>(kvp.first).getInductionVar(), indices);
    unsigned numIndices = indices.size();
    unsigned countInvariantIndices = 0;
    for (unsigned dim = 0; dim < numIndices; ++dim) {
      if (!invariants.count(indices[dim])) {
        assert(perm[kvp.second] == getAffineConstantExpr(0, context) &&
               "permutationMap already has an entry along dim");
        perm[kvp.second] = getAffineDimExpr(dim, context);
      } else {
        ++countInvariantIndices;
      }
    }
    assert((countInvariantIndices == numIndices ||
            countInvariantIndices == numIndices - 1) &&
           "Vectorization prerequisite violated: at most 1 index may vary "
           "wrt a vectorized loop");
    (void)countInvariantIndices;
  }
  return AffineMap::get(indices.size(), 0, perm, context);
}

/// Implementation detail that walks up the parents and records the ones with
/// the specified type.
/// TODO: could also be implemented as a collect parents followed by a
/// filter and made available outside this file.
template <typename T>
static SetVector<Operation *> getParentsOfType(Block *block) {
  SetVector<Operation *> res;
  auto *current = block->getParentOp();
  while (current) {
    if ([[maybe_unused]] auto typedParent = dyn_cast<T>(current)) {
      assert(res.count(current) == 0 && "Already inserted");
      res.insert(current);
    }
    current = current->getParentOp();
  }
  return res;
}

/// Returns the enclosing AffineForOps, from closest to farthest.
static SetVector<Operation *> getEnclosingforOps(Block *block) {
  return getParentsOfType<affine::AffineForOp>(block);
}

AffineMap mlir::makePermutationMap(
    Block *insertPoint, ArrayRef<Value> indices,
    const DenseMap<Operation *, unsigned> &loopToVectorDim) {
  DenseMap<Operation *, unsigned> enclosingLoopToVectorDim;
  auto enclosingLoops = getEnclosingforOps(insertPoint);
  for (auto *forInst : enclosingLoops) {
    auto it = loopToVectorDim.find(forInst);
    if (it != loopToVectorDim.end()) {
      enclosingLoopToVectorDim.insert(*it);
    }
  }
  return ::makePermutationMap(indices, enclosingLoopToVectorDim);
}

AffineMap mlir::makePermutationMap(
    Operation *op, ArrayRef<Value> indices,
    const DenseMap<Operation *, unsigned> &loopToVectorDim) {
  return makePermutationMap(op->getBlock(), indices, loopToVectorDim);
}

bool matcher::operatesOnSuperVectorsOf(Operation &op,
                                       VectorType subVectorType) {
  // First, extract the vector type and distinguish between:
  //   a. ops that *must* lower a super-vector (i.e. vector.transfer_read,
  //      vector.transfer_write); and
  //   b. ops that *may* lower a super-vector (all other ops).
  // The ops that *may* lower a super-vector only do so if the super-vector to
  // sub-vector ratio exists. The ops that *must* lower a super-vector are
  // explicitly checked for this property.
  /// TODO: there should be a single function for all ops to do this so we
  /// do not have to special case. Maybe a trait, or just a method, unclear atm.
  bool mustDivide = false;
  (void)mustDivide;
  VectorType superVectorType;
  if (auto transfer = dyn_cast<VectorTransferOpInterface>(op)) {
    superVectorType = transfer.getVectorType();
    mustDivide = true;
  } else if (op.getNumResults() == 0) {
    if (!isa<func::ReturnOp>(op)) {
      op.emitError("NYI: assuming only return operations can have 0 "
                   "results at this point");
    }
    return false;
  } else if (op.getNumResults() == 1) {
    if (auto v = dyn_cast<VectorType>(op.getResult(0).getType())) {
      superVectorType = v;
    } else {
      // Not a vector type.
      return false;
    }
  } else {
    // Not a vector.transfer and has more than 1 result, fail hard for now to
    // wake us up when something changes.
    op.emitError("NYI: operation has more than 1 result");
    return false;
  }

  // Get the ratio.
  auto ratio =
      computeShapeRatio(superVectorType.getShape(), subVectorType.getShape());
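  // E.g. (illustrative): superVectorType = vector<4x8xf32> and subVectorType =
  // vector<8xf32> yield ratio = {4, 1}, while a super-vector of vector<3xf32>
  // over a sub-vector of vector<2xf32> yields std::nullopt.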

  // Sanity check.
  assert((ratio || !mustDivide) &&
         "vector.transfer operation in which super-vector size is not an"
         " integer multiple of sub-vector size");

  // This catches cases where divisibility is not strictly required, but the
  // super-vector shape still is not an integer multiple of the sub-vector
  // shape.
  // This could be useful information if we wanted to reshape at the level of
  // the vector type (but we would have to look at the compute and distinguish
  // between parallel, reduction and possibly other cases).
  return ratio.has_value();
}

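/// Example (illustrative): reading vector<1x3x4xf32> from memref<5x3x4xf32> is
/// a contiguous slice (the trailing dims match and the only mismatching,
/// leading dim is 1), whereas reading vector<2x2x4xf32> is not (after the
/// mismatch 2 != 3, the remaining leading dim is 2, not 1).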
bool vector::isContiguousSlice(MemRefType memrefType, VectorType vectorType) {
  if (vectorType.isScalable())
    return false;

  ArrayRef<int64_t> vectorShape = vectorType.getShape();
  auto vecRank = vectorType.getRank();

  if (!memrefType.areTrailingDimsContiguous(vecRank))
    return false;

  // Extract the trailing dims of the input memref.
  auto memrefShape = memrefType.getShape().take_back(vecRank);

  // Compare the dims of `vectorType` against `memrefType` (in reverse).
  // In the most basic case, all dims will match.
  auto firstNonMatchingDim =
      std::mismatch(vectorShape.rbegin(), vectorShape.rend(),
                    memrefShape.rbegin(), memrefShape.rend());
  if (firstNonMatchingDim.first == vectorShape.rend())
    return true;

  // One non-matching dim is still fine, however the remaining leading dims of
  // `vectorType` need to be 1.
  SmallVector<int64_t> leadingDims(++firstNonMatchingDim.first,
                                   vectorShape.rend());

  return llvm::all_of(leadingDims, [](auto x) { return x == 1; });
}

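/// Example (illustrative): for vector<3x4x[4]xf32> and targetRank = 1, the
/// trailing scalable dim is kept and the returned range iterates over the 3x4
/// leading tile offsets (0, 0), (0, 1), ..., (2, 3). For vector<[4]x4xf32> and
/// targetRank = 1, the leading dim is scalable and nothing can be unrolled.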
std::optional<StaticTileOffsetRange>
vector::createUnrollIterator(VectorType vType, int64_t targetRank) {
  if (vType.getRank() <= targetRank)
    return {};
  // Attempt to unroll until targetRank or the first scalable dimension (which
  // cannot be unrolled).
  auto shapeToUnroll = vType.getShape().drop_back(targetRank);
  auto scalableDimsToUnroll = vType.getScalableDims().drop_back(targetRank);
  auto it = llvm::find(scalableDimsToUnroll, true);
  auto firstScalableDim = it - scalableDimsToUnroll.begin();
  if (firstScalableDim == 0)
    return {};
  // All scalable dimensions should be removed now.
  scalableDimsToUnroll = scalableDimsToUnroll.slice(0, firstScalableDim);
  assert(!llvm::is_contained(scalableDimsToUnroll, true) &&
         "unexpected leading scalable dimension");
  // Create an unroll iterator for leading dimensions.
  shapeToUnroll = shapeToUnroll.slice(0, firstScalableDim);
  return StaticTileOffsetRange(shapeToUnroll, /*unrollStep=*/1);
}

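/// Example (illustrative): for a transfer op on memref<?x8xf32>, this returns
/// {%dim, 8}, where %dim is a newly created `memref.dim` on dim 0 and 8 is a
/// static size attribute.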
SmallVector<OpFoldResult> vector::getMixedSizesXfer(bool hasTensorSemantics,
                                                    Operation *xfer,
                                                    RewriterBase &rewriter) {
  auto loc = xfer->getLoc();

  Value base = TypeSwitch<Operation *, Value>(xfer)
                   .Case<vector::TransferReadOp>(
                       [&](auto readOp) { return readOp.getBase(); })
                   .Case<vector::TransferWriteOp>(
                       [&](auto writeOp) { return writeOp.getOperand(1); });

  SmallVector<OpFoldResult> mixedSourceDims =
      hasTensorSemantics ? tensor::getMixedSizes(rewriter, loc, base)
                         : memref::getMixedSizes(rewriter, loc, base);
  return mixedSourceDims;
}

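/// Example (illustrative): vector<2x[4]xf32> is linearizable (rank 2, one
/// scalable dim), whereas vector<4xf32> (rank 1) and vector<[2]x[4]xf32> (two
/// scalable dims) are not.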
bool vector::isLinearizableVector(VectorType type) {
  return (type.getRank() > 1) && (type.getNumScalableDims() <= 1);
}

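/// Example (illustrative): reading a tensor<3x?xf32> with inputVectorSizes =
/// [3, 4] and useInBoundsInsteadOfMasking = false produces roughly:
///
///   %c0 = arith.constant 0 : index
///   %dim = tensor.dim %source, %c1 : tensor<3x?xf32>
///   %mask = vector.create_mask %c3, %dim : vector<3x4xi1>
///   %read = vector.mask %mask {
///     vector.transfer_read %source[%c0, %c0], %pad {in_bounds = [true, true]}
///         : tensor<3x?xf32>, vector<3x4xf32>
///   } : vector<3x4xi1> -> vector<3x4xf32>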
Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                     Value source,
                                     ArrayRef<int64_t> inputVectorSizes,
                                     Value padValue,
                                     bool useInBoundsInsteadOfMasking) {
  assert(llvm::none_of(inputVectorSizes,
                       [](int64_t s) { return s == ShapedType::kDynamic; }) &&
         "invalid input vector sizes");
  auto sourceShapedType = cast<ShapedType>(source.getType());
  auto sourceShape = sourceShapedType.getShape();
  assert(sourceShape.size() == inputVectorSizes.size() &&
         "expected same ranks.");
  auto vectorType = VectorType::get(inputVectorSizes, padValue.getType());
  assert(padValue.getType() == sourceShapedType.getElementType() &&
         "expected pad element type to match source element type");
  int64_t readRank = inputVectorSizes.size();
  auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  SmallVector<bool> inBoundsVal(readRank, true);

  if (useInBoundsInsteadOfMasking) {
    // Update the inBounds attribute.
    // FIXME: This computation is too weak - it ignores the read indices.
    for (int64_t i = 0; i < readRank; i++)
      inBoundsVal[i] = (sourceShape[i] == inputVectorSizes[i]) &&
                       !ShapedType::isDynamic(sourceShape[i]);
  }
  auto transferReadOp = builder.create<vector::TransferReadOp>(
      loc,
      /*vectorType=*/vectorType,
      /*source=*/source,
      /*indices=*/SmallVector<Value>(readRank, zero),
      /*padding=*/padValue,
      /*inBounds=*/inBoundsVal);

  if (llvm::equal(inputVectorSizes, sourceShape) || useInBoundsInsteadOfMasking)
    return transferReadOp;
  SmallVector<OpFoldResult> mixedSourceDims =
      tensor::getMixedSizes(builder, loc, source);

  auto maskType = VectorType::get(inputVectorSizes, builder.getI1Type());
  Value mask =
      builder.create<vector::CreateMaskOp>(loc, maskType, mixedSourceDims);
  return mlir::vector::maskOperation(builder, transferReadOp, mask)
      ->getResult(0);
}

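/// Example (illustrative): shape = [8, ?] with inputVectorSizes = [8, 16] is
/// valid (the dynamic dim is masked and 8 <= 8), whereas inputVectorSizes =
/// [4, 16] is not, because the static size 8 exceeds 4.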
LogicalResult
vector::isValidMaskedInputVector(ArrayRef<int64_t> shape,
                                 ArrayRef<int64_t> inputVectorSizes) {
  LDBG("Iteration space static sizes: " << llvm::interleaved(shape));

  if (inputVectorSizes.size() != shape.size()) {
    LDBG("Input vector sizes don't match the number of loops");
    return failure();
  }
  if (ShapedType::isDynamicShape(inputVectorSizes)) {
    LDBG("Input vector sizes can't have dynamic dimensions");
    return failure();
  }
  if (!llvm::all_of(llvm::zip(shape, inputVectorSizes),
                    [](std::tuple<int64_t, int64_t> sizePair) {
                      int64_t staticSize = std::get<0>(sizePair);
                      int64_t inputSize = std::get<1>(sizePair);
                      return ShapedType::isDynamic(staticSize) ||
                             staticSize <= inputSize;
                    })) {
    LDBG("Input vector sizes must be greater than or equal to iteration space "
         "static sizes");
    return failure();
  }
  return success();
}