| 1 | //===- LoopVersioning.cpp -------------------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | //===----------------------------------------------------------------------===// |
| 10 | /// \file |
| 11 | /// This pass looks for loops iterating over assumed-shape arrays, that can |
| 12 | /// be optimized by "guessing" that the stride is element-sized. |
| 13 | /// |
| 14 | /// This is done by creating two versions of the same loop: one which assumes |
| 15 | /// that the elements are contiguous (stride == size of element), and one that |
| 16 | /// is the original generic loop. |
| 17 | /// |
| 18 | /// As a side-effect of the assumed element size stride, the array is also |
| 19 | /// flattened to make it a 1D array - this is because the internal array |
| 20 | /// structure must be either 1D or have known sizes in all dimensions - and at |
| 21 | /// least one of the dimensions here is already unknown. |
| 22 | /// |
| 23 | /// There are two distinct benefits here: |
| 24 | /// 1. The loop that iterates over the elements is somewhat simplified by the |
| 25 | /// constant stride calculation. |
| 26 | /// 2. Since the compiler can understand the size of the stride, it can use |
| 27 | /// vector instructions, where an unknown (at compile time) stride does often |
| 28 | /// prevent vector operations from being used. |
| 29 | /// |
/// A known drawback is that the code-size is increased, in some cases that can
/// be quite substantial - 3-4x is quite plausible (this includes that the loop
/// gets vectorized, which in itself often more than doubles the size of the
/// code, because unless the loop size is known, there will be a modulo
/// vector-size remainder to deal with).
| 35 | /// |
/// TODO: Do we need some size limit where loops no longer get duplicated?
///       Maybe some sort of cost analysis.
/// TODO: Should some loop content - for example calls to functions and
///       subroutines - inhibit the versioning of the loops? Plausibly, this
///       could be part of the cost analysis above.
| 41 | //===----------------------------------------------------------------------===// |
| 42 | |
| 43 | #include "flang/Common/ISO_Fortran_binding_wrapper.h" |
| 44 | #include "flang/Optimizer/Builder/BoxValue.h" |
| 45 | #include "flang/Optimizer/Builder/FIRBuilder.h" |
| 46 | #include "flang/Optimizer/Builder/Runtime/Inquiry.h" |
| 47 | #include "flang/Optimizer/Dialect/FIRDialect.h" |
| 48 | #include "flang/Optimizer/Dialect/FIROps.h" |
| 49 | #include "flang/Optimizer/Dialect/FIRType.h" |
| 50 | #include "flang/Optimizer/Dialect/Support/FIRContext.h" |
| 51 | #include "flang/Optimizer/Dialect/Support/KindMapping.h" |
| 52 | #include "flang/Optimizer/Support/DataLayout.h" |
| 53 | #include "flang/Optimizer/Transforms/Passes.h" |
| 54 | #include "mlir/Dialect/DLTI/DLTI.h" |
| 55 | #include "mlir/Dialect/LLVMIR/LLVMDialect.h" |
| 56 | #include "mlir/IR/Dominance.h" |
| 57 | #include "mlir/IR/Matchers.h" |
| 58 | #include "mlir/IR/TypeUtilities.h" |
| 59 | #include "mlir/Pass/Pass.h" |
| 60 | #include "mlir/Transforms/DialectConversion.h" |
| 61 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" |
| 62 | #include "mlir/Transforms/RegionUtils.h" |
| 63 | #include "llvm/Support/Debug.h" |
| 64 | #include "llvm/Support/raw_ostream.h" |
| 65 | |
| 66 | #include <algorithm> |
| 67 | |
| 68 | namespace fir { |
| 69 | #define GEN_PASS_DEF_LOOPVERSIONING |
| 70 | #include "flang/Optimizer/Transforms/Passes.h.inc" |
| 71 | } // namespace fir |
| 72 | |
| 73 | #define DEBUG_TYPE "flang-loop-versioning" |
| 74 | |
| 75 | namespace { |
| 76 | |
| 77 | class LoopVersioningPass |
| 78 | : public fir::impl::LoopVersioningBase<LoopVersioningPass> { |
| 79 | public: |
| 80 | void runOnOperation() override; |
| 81 | }; |
| 82 | |
| 83 | /// @struct ArgInfo |
| 84 | /// A structure to hold an argument, the size of the argument and dimension |
| 85 | /// information. |
| 86 | struct ArgInfo { |
| 87 | mlir::Value arg; |
| 88 | size_t size; |
| 89 | unsigned rank; |
| 90 | fir::BoxDimsOp dims[CFI_MAX_RANK]; |
| 91 | }; |
| 92 | |
| 93 | /// @struct ArgsUsageInLoop |
| 94 | /// A structure providing information about the function arguments |
| 95 | /// usage by the instructions immediately nested in a loop. |
| 96 | struct ArgsUsageInLoop { |
| 97 | /// Mapping between the memref operand of an array indexing |
| 98 | /// operation (e.g. fir.coordinate_of) and the argument information. |
| 99 | llvm::DenseMap<mlir::Value, ArgInfo> usageInfo; |
| 100 | /// Some array indexing operations inside a loop cannot be transformed. |
| 101 | /// This vector holds the memref operands of such operations. |
| 102 | /// The vector is used to make sure that we do not try to transform |
| 103 | /// any outer loop, since this will imply the operation rewrite |
| 104 | /// in this loop. |
| 105 | llvm::SetVector<mlir::Value> cannotTransform; |
| 106 | |
| 107 | // Debug dump of the structure members assuming that |
| 108 | // the information has been collected for the given loop. |
| 109 | void dump(fir::DoLoopOp loop) const { |
| 110 | LLVM_DEBUG({ |
| 111 | mlir::OpPrintingFlags printFlags; |
| 112 | printFlags.skipRegions(); |
| 113 | llvm::dbgs() << "Arguments usage info for loop:\n" ; |
| 114 | loop.print(llvm::dbgs(), printFlags); |
| 115 | llvm::dbgs() << "\nUsed args:\n" ; |
| 116 | for (auto &use : usageInfo) { |
| 117 | mlir::Value v = use.first; |
| 118 | v.print(llvm::dbgs(), printFlags); |
| 119 | llvm::dbgs() << "\n" ; |
| 120 | } |
| 121 | llvm::dbgs() << "\nCannot transform args:\n" ; |
| 122 | for (mlir::Value arg : cannotTransform) { |
| 123 | arg.print(llvm::dbgs(), printFlags); |
| 124 | llvm::dbgs() << "\n" ; |
| 125 | } |
| 126 | llvm::dbgs() << "====\n" ; |
| 127 | }); |
| 128 | } |
| 129 | |
| 130 | // Erase usageInfo and cannotTransform entries for a set |
| 131 | // of given arguments. |
| 132 | void eraseUsage(const llvm::SetVector<mlir::Value> &args) { |
| 133 | for (auto &arg : args) |
| 134 | usageInfo.erase(arg); |
| 135 | cannotTransform.set_subtract(args); |
| 136 | } |
| 137 | |
| 138 | // Erase usageInfo and cannotTransform entries for a set |
| 139 | // of given arguments provided in the form of usageInfo map. |
| 140 | void eraseUsage(const llvm::DenseMap<mlir::Value, ArgInfo> &args) { |
| 141 | for (auto &arg : args) { |
| 142 | usageInfo.erase(arg.first); |
| 143 | cannotTransform.remove(arg.first); |
| 144 | } |
| 145 | } |
| 146 | }; |
| 147 | } // namespace |
| 148 | |
| 149 | static fir::SequenceType getAsSequenceType(mlir::Value v) { |
| 150 | mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v.getType())); |
| 151 | return mlir::dyn_cast<fir::SequenceType>(argTy); |
| 152 | } |
| 153 | |
| 154 | /// Return the rank and the element size (in bytes) of the given |
| 155 | /// value \p v. If it is not an array or the element type is not |
| 156 | /// supported, then return <0, 0>. Only trivial data types |
| 157 | /// are currently supported. |
| 158 | /// When \p isArgument is true, \p v is assumed to be a function |
| 159 | /// argument. If \p v's type does not look like a type of an assumed |
| 160 | /// shape array, then the function returns <0, 0>. |
| 161 | /// When \p isArgument is false, array types with known innermost |
| 162 | /// dimension are allowed to proceed. |
| 163 | static std::pair<unsigned, size_t> |
| 164 | getRankAndElementSize(const fir::KindMapping &kindMap, |
| 165 | const mlir::DataLayout &dl, mlir::Value v, |
| 166 | bool isArgument = false) { |
| 167 | if (auto seqTy = getAsSequenceType(v)) { |
| 168 | unsigned rank = seqTy.getDimension(); |
| 169 | if (rank > 0 && |
| 170 | (!isArgument || |
| 171 | seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent())) { |
| 172 | size_t typeSize = 0; |
| 173 | mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(v.getType()); |
| 174 | if (fir::isa_trivial(elementType)) { |
| 175 | auto [eleSize, eleAlign] = fir::getTypeSizeAndAlignmentOrCrash( |
| 176 | v.getLoc(), elementType, dl, kindMap); |
| 177 | typeSize = llvm::alignTo(eleSize, eleAlign); |
| 178 | } |
| 179 | if (typeSize) |
| 180 | return {rank, typeSize}; |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | LLVM_DEBUG(llvm::dbgs() << "Unsupported rank/type: " << v << '\n'); |
| 185 | return {0, 0}; |
| 186 | } |
| 187 | |
| 188 | /// If a value comes from a fir.declare of fir.pack_array, |
| 189 | /// follow it to the original source, otherwise return the value. |
| 190 | static mlir::Value unwrapPassThroughOps(mlir::Value val) { |
| 191 | // Instead of unwrapping fir.declare, we may try to start |
| 192 | // the analysis in this pass from fir.declare's instead |
| 193 | // of the function entry block arguments. This way the loop |
| 194 | // versioning would work even after FIR inlining. |
| 195 | while (true) { |
| 196 | if (fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>()) { |
| 197 | val = declare.getMemref(); |
| 198 | continue; |
| 199 | } |
| 200 | // fir.pack_array might be met before fir.declare - this is how |
| 201 | // it is orifinally generated. |
| 202 | // It might also be met after fir.declare - after the optimization |
| 203 | // passes that sink fir.pack_array closer to the uses. |
| 204 | if (auto packArray = val.getDefiningOp<fir::PackArrayOp>()) { |
| 205 | val = packArray.getArray(); |
| 206 | continue; |
| 207 | } |
| 208 | break; |
| 209 | } |
| 210 | return val; |
| 211 | } |
| 212 | |
| 213 | /// if a value comes from a fir.rebox, follow the rebox to the original source, |
| 214 | /// of the value, otherwise return the value |
| 215 | static mlir::Value unwrapReboxOp(mlir::Value val) { |
| 216 | while (fir::ReboxOp rebox = val.getDefiningOp<fir::ReboxOp>()) { |
| 217 | if (!fir::reboxPreservesContinuity(rebox, /*checkWhole=*/false)) { |
| 218 | LLVM_DEBUG(llvm::dbgs() << "REBOX may produce non-contiguous array: " |
| 219 | << rebox << '\n'); |
| 220 | break; |
| 221 | } |
| 222 | val = rebox.getBox(); |
| 223 | } |
| 224 | return val; |
| 225 | } |
| 226 | |
| 227 | /// normalize a value (removing fir.declare and fir.rebox) so that we can |
| 228 | /// more conveniently spot values which came from function arguments |
| 229 | static mlir::Value normaliseVal(mlir::Value val) { |
| 230 | return unwrapPassThroughOps(unwrapReboxOp(val)); |
| 231 | } |
| 232 | |
| 233 | /// some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift. |
| 234 | /// fir.shift and fir.shapeshift allow us to extract lower bounds |
| 235 | /// if lowerbounds cannot be found, return nullptr |
| 236 | static mlir::Value tryGetLowerBoundsFromShapeLike(mlir::Value shapeLike, |
| 237 | unsigned dim) { |
| 238 | mlir::Value lowerBound{nullptr}; |
| 239 | if (auto shift = shapeLike.getDefiningOp<fir::ShiftOp>()) |
| 240 | lowerBound = shift.getOrigins()[dim]; |
| 241 | if (auto shapeShift = shapeLike.getDefiningOp<fir::ShapeShiftOp>()) |
| 242 | lowerBound = shapeShift.getOrigins()[dim]; |
| 243 | return lowerBound; |
| 244 | } |
| 245 | |
| 246 | /// attempt to get the array lower bounds of dimension dim of the memref |
| 247 | /// argument to a fir.array_coor op |
| 248 | /// 0 <= dim < rank |
| 249 | /// May return nullptr if no lower bounds can be determined |
| 250 | static mlir::Value getLowerBound(fir::ArrayCoorOp coop, unsigned dim) { |
| 251 | // 1) try to get from the shape argument to fir.array_coor |
| 252 | if (mlir::Value shapeLike = coop.getShape()) |
| 253 | if (mlir::Value lb = tryGetLowerBoundsFromShapeLike(shapeLike, dim)) |
| 254 | return lb; |
| 255 | |
| 256 | // It is important not to try to read the lower bound from the box, because |
| 257 | // in the FIR lowering, boxes will sometimes contain incorrect lower bound |
| 258 | // information |
| 259 | |
| 260 | // out of ideas |
| 261 | return {}; |
| 262 | } |
| 263 | |
| 264 | /// gets the i'th index from array coordinate operation op |
| 265 | /// dim should range between 0 and rank - 1 |
| 266 | static mlir::Value getIndex(fir::FirOpBuilder &builder, mlir::Operation *op, |
| 267 | unsigned dim) { |
| 268 | if (fir::CoordinateOp coop = mlir::dyn_cast<fir::CoordinateOp>(op)) |
| 269 | return coop.getCoor()[dim]; |
| 270 | |
| 271 | fir::ArrayCoorOp coop = mlir::dyn_cast<fir::ArrayCoorOp>(op); |
| 272 | assert(coop && |
| 273 | "operation must be either fir.coordiante_of or fir.array_coor" ); |
| 274 | |
| 275 | // fir.coordinate_of indices start at 0: adjust these indices to match by |
| 276 | // subtracting the lower bound |
| 277 | mlir::Value index = coop.getIndices()[dim]; |
| 278 | mlir::Value lb = getLowerBound(coop, dim); |
| 279 | if (!lb) |
| 280 | // assume a default lower bound of one |
| 281 | lb = builder.createIntegerConstant(coop.getLoc(), index.getType(), 1); |
| 282 | |
| 283 | // index_0 = index - lb; |
| 284 | if (lb.getType() != index.getType()) |
| 285 | lb = builder.createConvert(coop.getLoc(), index.getType(), lb); |
| 286 | return builder.create<mlir::arith::SubIOp>(coop.getLoc(), index, lb); |
| 287 | } |
| 288 | |
| 289 | void LoopVersioningPass::runOnOperation() { |
| 290 | LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n" ); |
| 291 | mlir::func::FuncOp func = getOperation(); |
| 292 | |
| 293 | // First look for arguments with assumed shape = unknown extent in the lowest |
| 294 | // dimension. |
| 295 | LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n" ); |
| 296 | mlir::Block::BlockArgListType args = func.getArguments(); |
| 297 | mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>(); |
| 298 | fir::KindMapping kindMap = fir::getKindMapping(module); |
| 299 | mlir::SmallVector<ArgInfo, 4> argsOfInterest; |
| 300 | std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout( |
| 301 | module, /*allowDefaultLayout=*/false); |
| 302 | if (!dl) |
| 303 | mlir::emitError(module.getLoc(), |
| 304 | "data layout attribute is required to perform " DEBUG_TYPE |
| 305 | "pass" ); |
| 306 | for (auto &arg : args) { |
| 307 | // Optional arguments must be checked for IsPresent before |
| 308 | // looking for the bounds. They are unsupported for the time being. |
| 309 | if (func.getArgAttrOfType<mlir::UnitAttr>(arg.getArgNumber(), |
| 310 | fir::getOptionalAttrName())) { |
| 311 | LLVM_DEBUG(llvm::dbgs() << "OPTIONAL is not supported\n" ); |
| 312 | continue; |
| 313 | } |
| 314 | |
| 315 | auto [rank, typeSize] = |
| 316 | getRankAndElementSize(kindMap, *dl, arg, /*isArgument=*/true); |
| 317 | if (rank != 0 && typeSize != 0) |
| 318 | argsOfInterest.push_back({arg, typeSize, rank, {}}); |
| 319 | } |
| 320 | |
| 321 | if (argsOfInterest.empty()) { |
| 322 | LLVM_DEBUG(llvm::dbgs() |
| 323 | << "No suitable arguments.\n=== End " DEBUG_TYPE " ===\n" ); |
| 324 | return; |
| 325 | } |
| 326 | |
| 327 | // A list of all loops in the function in post-order. |
| 328 | mlir::SmallVector<fir::DoLoopOp> originalLoops; |
| 329 | // Information about the arguments usage by the instructions |
| 330 | // immediately nested in a loop. |
| 331 | llvm::DenseMap<fir::DoLoopOp, ArgsUsageInLoop> argsInLoops; |
| 332 | |
| 333 | auto &domInfo = getAnalysis<mlir::DominanceInfo>(); |
| 334 | |
| 335 | // Traverse the loops in post-order and see |
| 336 | // if those arguments are used inside any loop. |
| 337 | func.walk([&](fir::DoLoopOp loop) { |
| 338 | mlir::Block &body = *loop.getBody(); |
| 339 | auto &argsInLoop = argsInLoops[loop]; |
| 340 | originalLoops.push_back(loop); |
| 341 | body.walk([&](mlir::Operation *op) { |
| 342 | // Support either fir.array_coor or fir.coordinate_of. |
| 343 | if (!mlir::isa<fir::ArrayCoorOp, fir::CoordinateOp>(op)) |
| 344 | return; |
| 345 | // Process only operations immediately nested in the current loop. |
| 346 | if (op->getParentOfType<fir::DoLoopOp>() != loop) |
| 347 | return; |
| 348 | mlir::Value operand = op->getOperand(0); |
| 349 | for (auto a : argsOfInterest) { |
| 350 | if (a.arg == normaliseVal(operand)) { |
| 351 | // Use the reboxed value, not the block arg when re-creating the loop. |
| 352 | a.arg = operand; |
| 353 | |
| 354 | // Check that the operand dominates the loop? |
| 355 | // If this is the case, record such operands in argsInLoop.cannot- |
| 356 | // Transform, so that they disable the transformation for the parent |
| 357 | /// loops as well. |
| 358 | if (!domInfo.dominates(a.arg, loop)) |
| 359 | argsInLoop.cannotTransform.insert(a.arg); |
| 360 | |
| 361 | // No support currently for sliced arrays. |
| 362 | // This means that we cannot transform properly |
| 363 | // instructions referencing a.arg in the whole loop |
| 364 | // nest this loop is located in. |
| 365 | if (auto arrayCoor = mlir::dyn_cast<fir::ArrayCoorOp>(op)) |
| 366 | if (arrayCoor.getSlice()) |
| 367 | argsInLoop.cannotTransform.insert(a.arg); |
| 368 | |
| 369 | // We need to compute the rank and element size |
| 370 | // based on the operand, not the original argument, |
| 371 | // because array slicing may affect it. |
| 372 | std::tie(a.rank, a.size) = getRankAndElementSize(kindMap, *dl, a.arg); |
| 373 | if (a.rank == 0 || a.size == 0) |
| 374 | argsInLoop.cannotTransform.insert(a.arg); |
| 375 | |
| 376 | if (argsInLoop.cannotTransform.contains(a.arg)) { |
| 377 | // Remove any previously recorded usage, if any. |
| 378 | argsInLoop.usageInfo.erase(a.arg); |
| 379 | break; |
| 380 | } |
| 381 | |
| 382 | // Record the a.arg usage, if not recorded yet. |
| 383 | argsInLoop.usageInfo.try_emplace(a.arg, a); |
| 384 | break; |
| 385 | } |
| 386 | } |
| 387 | }); |
| 388 | }); |
| 389 | |
| 390 | // Dump loops info after initial collection. |
| 391 | LLVM_DEBUG({ |
| 392 | llvm::dbgs() << "Initial usage info:\n" ; |
| 393 | for (fir::DoLoopOp loop : originalLoops) { |
| 394 | auto &argsInLoop = argsInLoops[loop]; |
| 395 | argsInLoop.dump(loop); |
| 396 | } |
| 397 | }); |
| 398 | |
| 399 | // Clear argument usage for parent loops if an inner loop |
| 400 | // contains a non-transformable usage. |
| 401 | for (fir::DoLoopOp loop : originalLoops) { |
| 402 | auto &argsInLoop = argsInLoops[loop]; |
| 403 | if (argsInLoop.cannotTransform.empty()) |
| 404 | continue; |
| 405 | |
| 406 | fir::DoLoopOp parent = loop; |
| 407 | while ((parent = parent->getParentOfType<fir::DoLoopOp>())) |
| 408 | argsInLoops[parent].eraseUsage(argsInLoop.cannotTransform); |
| 409 | } |
| 410 | |
| 411 | // If an argument access can be optimized in a loop and |
| 412 | // its descendant loop, then it does not make sense to |
| 413 | // generate the contiguity check for the descendant loop. |
| 414 | // The check will be produced as part of the ancestor |
| 415 | // loop's transformation. So we can clear the argument |
| 416 | // usage for all descendant loops. |
| 417 | for (fir::DoLoopOp loop : originalLoops) { |
| 418 | auto &argsInLoop = argsInLoops[loop]; |
| 419 | if (argsInLoop.usageInfo.empty()) |
| 420 | continue; |
| 421 | |
| 422 | loop.getBody()->walk([&](fir::DoLoopOp dloop) { |
| 423 | argsInLoops[dloop].eraseUsage(argsInLoop.usageInfo); |
| 424 | }); |
| 425 | } |
| 426 | |
| 427 | LLVM_DEBUG({ |
| 428 | llvm::dbgs() << "Final usage info:\n" ; |
| 429 | for (fir::DoLoopOp loop : originalLoops) { |
| 430 | auto &argsInLoop = argsInLoops[loop]; |
| 431 | argsInLoop.dump(loop); |
| 432 | } |
| 433 | }); |
| 434 | |
| 435 | // Reduce the collected information to a list of loops |
| 436 | // with attached arguments usage information. |
| 437 | // The list must hold the loops in post order, so that |
| 438 | // the inner loops are transformed before the outer loops. |
| 439 | struct OpsWithArgs { |
| 440 | mlir::Operation *op; |
| 441 | mlir::SmallVector<ArgInfo, 4> argsAndDims; |
| 442 | }; |
| 443 | mlir::SmallVector<OpsWithArgs, 4> loopsOfInterest; |
| 444 | for (fir::DoLoopOp loop : originalLoops) { |
| 445 | auto &argsInLoop = argsInLoops[loop]; |
| 446 | if (argsInLoop.usageInfo.empty()) |
| 447 | continue; |
| 448 | OpsWithArgs info; |
| 449 | info.op = loop; |
| 450 | for (auto &arg : argsInLoop.usageInfo) |
| 451 | info.argsAndDims.push_back(arg.second); |
| 452 | loopsOfInterest.emplace_back(std::move(info)); |
| 453 | } |
| 454 | |
| 455 | if (loopsOfInterest.empty()) { |
| 456 | LLVM_DEBUG(llvm::dbgs() |
| 457 | << "No loops to transform.\n=== End " DEBUG_TYPE " ===\n" ); |
| 458 | return; |
| 459 | } |
| 460 | |
| 461 | // If we get here, there are loops to process. |
| 462 | fir::FirOpBuilder builder{module, std::move(kindMap)}; |
| 463 | mlir::Location loc = builder.getUnknownLoc(); |
| 464 | mlir::IndexType idxTy = builder.getIndexType(); |
| 465 | |
| 466 | LLVM_DEBUG(llvm::dbgs() << "Func Before transformation:\n" ); |
| 467 | LLVM_DEBUG(func->dump()); |
| 468 | |
| 469 | LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size() |
| 470 | << "\n" ); |
| 471 | for (auto op : loopsOfInterest) { |
| 472 | LLVM_DEBUG(op.op->dump()); |
| 473 | builder.setInsertionPoint(op.op); |
| 474 | |
| 475 | mlir::Value allCompares = nullptr; |
| 476 | // Ensure all of the arrays are unit-stride. |
| 477 | for (auto &arg : op.argsAndDims) { |
| 478 | // Fetch all the dimensions of the array, except the last dimension. |
| 479 | // Always fetch the first dimension, however, so set ndims = 1 if |
| 480 | // we have one dim |
| 481 | unsigned ndims = arg.rank; |
| 482 | for (unsigned i = 0; i < ndims; i++) { |
| 483 | mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); |
| 484 | arg.dims[i] = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, |
| 485 | arg.arg, dimIdx); |
| 486 | } |
| 487 | // We only care about lowest order dimension, here. |
| 488 | mlir::Value elemSize = |
| 489 | builder.createIntegerConstant(loc, idxTy, arg.size); |
| 490 | mlir::Value cmp = builder.create<mlir::arith::CmpIOp>( |
| 491 | loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2), |
| 492 | elemSize); |
| 493 | if (!allCompares) { |
| 494 | allCompares = cmp; |
| 495 | } else { |
| 496 | allCompares = |
| 497 | builder.create<mlir::arith::AndIOp>(loc, cmp, allCompares); |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | auto ifOp = |
| 502 | builder.create<fir::IfOp>(loc, op.op->getResultTypes(), allCompares, |
| 503 | /*withElse=*/true); |
| 504 | builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); |
| 505 | |
| 506 | LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n" ); |
| 507 | mlir::Operation *clonedLoop = op.op->clone(); |
| 508 | bool changed = false; |
| 509 | for (auto &arg : op.argsAndDims) { |
| 510 | fir::SequenceType::Shape newShape; |
| 511 | newShape.push_back(fir::SequenceType::getUnknownExtent()); |
| 512 | auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg.getType()); |
| 513 | mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); |
| 514 | mlir::Type boxArrTy = fir::BoxType::get(arrTy); |
| 515 | mlir::Type refArrTy = builder.getRefType(arrTy); |
| 516 | auto carg = builder.create<fir::ConvertOp>(loc, boxArrTy, arg.arg); |
| 517 | auto caddr = builder.create<fir::BoxAddrOp>(loc, refArrTy, carg); |
| 518 | auto insPt = builder.saveInsertionPoint(); |
| 519 | // Use caddr instead of arg. |
| 520 | clonedLoop->walk([&](mlir::Operation *coop) { |
| 521 | if (!mlir::isa<fir::CoordinateOp, fir::ArrayCoorOp>(coop)) |
| 522 | return; |
| 523 | // Reduce the multi-dimensioned index to a single index. |
| 524 | // This is required becase fir arrays do not support multiple dimensions |
| 525 | // with unknown dimensions at compile time. |
| 526 | // We then calculate the multidimensional array like this: |
| 527 | // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x) |
| 528 | // where stride is the distance between elements in the dimensions |
| 529 | // 0, 1 and 2 or x, y and z. |
| 530 | if (coop->getOperand(0) == arg.arg && coop->getOperands().size() >= 2) { |
| 531 | builder.setInsertionPoint(coop); |
| 532 | mlir::Value totalIndex; |
| 533 | for (unsigned i = arg.rank - 1; i > 0; i--) { |
| 534 | mlir::Value curIndex = |
| 535 | builder.createConvert(loc, idxTy, getIndex(builder, coop, i)); |
| 536 | // Multiply by the stride of this array. Later we'll divide by the |
| 537 | // element size. |
| 538 | mlir::Value scale = |
| 539 | builder.createConvert(loc, idxTy, arg.dims[i].getResult(2)); |
| 540 | curIndex = |
| 541 | builder.create<mlir::arith::MulIOp>(loc, scale, curIndex); |
| 542 | totalIndex = (totalIndex) ? builder.create<mlir::arith::AddIOp>( |
| 543 | loc, curIndex, totalIndex) |
| 544 | : curIndex; |
| 545 | } |
| 546 | // This is the lowest dimension - which doesn't need scaling |
| 547 | mlir::Value finalIndex = |
| 548 | builder.createConvert(loc, idxTy, getIndex(builder, coop, 0)); |
| 549 | if (totalIndex) { |
| 550 | assert(llvm::isPowerOf2_32(arg.size) && |
| 551 | "Expected power of two here" ); |
| 552 | unsigned bits = llvm::Log2_32(arg.size); |
| 553 | mlir::Value elemShift = |
| 554 | builder.createIntegerConstant(loc, idxTy, bits); |
| 555 | totalIndex = builder.create<mlir::arith::AddIOp>( |
| 556 | loc, |
| 557 | builder.create<mlir::arith::ShRSIOp>(loc, totalIndex, |
| 558 | elemShift), |
| 559 | finalIndex); |
| 560 | } else { |
| 561 | totalIndex = finalIndex; |
| 562 | } |
| 563 | auto newOp = builder.create<fir::CoordinateOp>( |
| 564 | loc, builder.getRefType(elementType), caddr, |
| 565 | mlir::ValueRange{totalIndex}); |
| 566 | LLVM_DEBUG(newOp->dump()); |
| 567 | coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); |
| 568 | coop->erase(); |
| 569 | changed = true; |
| 570 | } |
| 571 | }); |
| 572 | |
| 573 | builder.restoreInsertionPoint(insPt); |
| 574 | } |
| 575 | assert(changed && "Expected operations to have changed" ); |
| 576 | |
| 577 | builder.insert(clonedLoop); |
| 578 | // Forward the result(s), if any, from the loop operation to the |
| 579 | // |
| 580 | mlir::ResultRange results = clonedLoop->getResults(); |
| 581 | bool hasResults = (results.size() > 0); |
| 582 | if (hasResults) |
| 583 | builder.create<fir::ResultOp>(loc, results); |
| 584 | |
| 585 | // Add the original loop in the else-side of the if operation. |
| 586 | builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); |
| 587 | op.op->replaceAllUsesWith(ifOp); |
| 588 | op.op->remove(); |
| 589 | builder.insert(op.op); |
| 590 | // Rely on "cloned loop has results, so original loop also has results". |
| 591 | if (hasResults) { |
| 592 | builder.create<fir::ResultOp>(loc, op.op->getResults()); |
| 593 | } else { |
| 594 | // Use an assert to check this. |
| 595 | assert(op.op->getResults().size() == 0 && |
| 596 | "Weird, the cloned loop doesn't have results, but the original " |
| 597 | "does?" ); |
| 598 | } |
| 599 | } |
| 600 | |
| 601 | LLVM_DEBUG(llvm::dbgs() << "Func After transform:\n" ); |
| 602 | LLVM_DEBUG(func->dump()); |
| 603 | |
| 604 | LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n" ); |
| 605 | } |
| 606 | |