1//===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass generates an entirely new schedule tree from the data dependences
10// and iteration domains. The new schedule tree is computed in two steps:
11//
12// 1) The isl scheduling optimizer is run
13//
14// The isl scheduling optimizer creates a new schedule tree that maximizes
15// parallelism and tileability and minimizes data-dependence distances. The
16// algorithm used is a modified version of the ``Pluto'' algorithm:
17//
18// U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
19// A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
20// In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
21// Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
22//
23// 2) A set of post-scheduling transformations is applied on the schedule tree.
24//
25// These optimizations include:
26//
27// - Tiling of the innermost tilable bands
28// - Prevectorization - The choice of a possible outer loop that is strip-mined
29// to the innermost level to enable inner-loop
30// vectorization.
31// - Some optimizations for spatial locality are also planned.
32//
33// For a detailed description of the schedule tree itself please see section 6
34// of:
35//
36// Polyhedral AST generation is more than scanning polyhedra
37// Tobias Grosser, Sven Verdoolaege, Albert Cohen
38// ACM Transactions on Programming Languages and Systems (TOPLAS),
39// 37(4), July 2015
40// http://www.grosser.es/#pub-polyhedral-AST-generation
41//
42// This publication also contains a detailed discussion of the different options
43// for polyhedral loop unrolling, full/partial tile separation and other uses
44// of the schedule tree.
45//
46//===----------------------------------------------------------------------===//
47
48#include "polly/ScheduleOptimizer.h"
49#include "polly/CodeGen/CodeGeneration.h"
50#include "polly/DependenceInfo.h"
51#include "polly/ManualOptimizer.h"
52#include "polly/MatmulOptimizer.h"
53#include "polly/Options.h"
54#include "polly/ScheduleTreeTransform.h"
55#include "polly/Support/ISLOStream.h"
56#include "polly/Support/ISLTools.h"
57#include "llvm/ADT/Sequence.h"
58#include "llvm/ADT/Statistic.h"
59#include "llvm/Analysis/OptimizationRemarkEmitter.h"
60#include "llvm/InitializePasses.h"
61#include "llvm/Support/CommandLine.h"
62#include "isl/options.h"
63
64using namespace llvm;
65using namespace polly;
66
67namespace llvm {
68class Loop;
69class Module;
70} // namespace llvm
71
72#include "polly/Support/PollyDebug.h"
73#define DEBUG_TYPE "polly-opt-isl"
74
75static cl::opt<std::string>
76 OptimizeDeps("polly-opt-optimize-only",
77 cl::desc("Only a certain kind of dependences (all/raw)"),
78 cl::Hidden, cl::init(Val: "all"), cl::cat(PollyCategory));
79
80static cl::opt<std::string>
81 SimplifyDeps("polly-opt-simplify-deps",
82 cl::desc("Dependences should be simplified (yes/no)"),
83 cl::Hidden, cl::init(Val: "yes"), cl::cat(PollyCategory));
84
85static cl::opt<int> MaxConstantTerm(
86 "polly-opt-max-constant-term",
87 cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
88 cl::init(Val: 20), cl::cat(PollyCategory));
89
90static cl::opt<int> MaxCoefficient(
91 "polly-opt-max-coefficient",
92 cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
93 cl::init(Val: 20), cl::cat(PollyCategory));
94
95static cl::opt<std::string>
96 MaximizeBandDepth("polly-opt-maximize-bands",
97 cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
98 cl::init(Val: "yes"), cl::cat(PollyCategory));
99
100static cl::opt<int>
101 ScheduleComputeOut("polly-schedule-computeout",
102 cl::desc("Bound the scheduler by maximal amount"
103 "of computational steps. "),
104 cl::Hidden, cl::init(Val: 300000), cl::ZeroOrMore,
105 cl::cat(PollyCategory));
106
107static cl::opt<bool>
108 GreedyFusion("polly-loopfusion-greedy",
109 cl::desc("Aggressively try to fuse everything"), cl::Hidden,
110 cl::cat(PollyCategory));
111
112static cl::opt<std::string> OuterCoincidence(
113 "polly-opt-outer-coincidence",
114 cl::desc("Try to construct schedules where the outer member of each band "
115 "satisfies the coincidence constraints (yes/no)"),
116 cl::Hidden, cl::init(Val: "no"), cl::cat(PollyCategory));
117
118static cl::opt<int> PrevectorWidth(
119 "polly-prevect-width",
120 cl::desc(
121 "The number of loop iterations to strip-mine for pre-vectorization"),
122 cl::Hidden, cl::init(Val: 4), cl::cat(PollyCategory));
123
124static cl::opt<bool> FirstLevelTiling("polly-tiling",
125 cl::desc("Enable loop tiling"),
126 cl::init(Val: true), cl::cat(PollyCategory));
127
128static cl::opt<int> FirstLevelDefaultTileSize(
129 "polly-default-tile-size",
130 cl::desc("The default tile size (if not enough were provided by"
131 " --polly-tile-sizes)"),
132 cl::Hidden, cl::init(Val: 32), cl::cat(PollyCategory));
133
134static cl::list<int>
135 FirstLevelTileSizes("polly-tile-sizes",
136 cl::desc("A tile size for each loop dimension, filled "
137 "with --polly-default-tile-size"),
138 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
139
140static cl::opt<bool>
141 SecondLevelTiling("polly-2nd-level-tiling",
142 cl::desc("Enable a 2nd level loop of loop tiling"),
143 cl::cat(PollyCategory));
144
145static cl::opt<int> SecondLevelDefaultTileSize(
146 "polly-2nd-level-default-tile-size",
147 cl::desc("The default 2nd-level tile size (if not enough were provided by"
148 " --polly-2nd-level-tile-sizes)"),
149 cl::Hidden, cl::init(Val: 16), cl::cat(PollyCategory));
150
151static cl::list<int>
152 SecondLevelTileSizes("polly-2nd-level-tile-sizes",
153 cl::desc("A tile size for each loop dimension, filled "
154 "with --polly-default-tile-size"),
155 cl::Hidden, cl::CommaSeparated,
156 cl::cat(PollyCategory));
157
158static cl::opt<bool> RegisterTiling("polly-register-tiling",
159 cl::desc("Enable register tiling"),
160 cl::cat(PollyCategory));
161
162static cl::opt<int> RegisterDefaultTileSize(
163 "polly-register-tiling-default-tile-size",
164 cl::desc("The default register tile size (if not enough were provided by"
165 " --polly-register-tile-sizes)"),
166 cl::Hidden, cl::init(Val: 2), cl::cat(PollyCategory));
167
168static cl::list<int>
169 RegisterTileSizes("polly-register-tile-sizes",
170 cl::desc("A tile size for each loop dimension, filled "
171 "with --polly-register-tile-size"),
172 cl::Hidden, cl::CommaSeparated, cl::cat(PollyCategory));
173
174static cl::opt<bool> PragmaBasedOpts(
175 "polly-pragma-based-opts",
176 cl::desc("Apply user-directed transformation from metadata"),
177 cl::init(Val: true), cl::cat(PollyCategory));
178
179static cl::opt<bool> EnableReschedule("polly-reschedule",
180 cl::desc("Optimize SCoPs using ISL"),
181 cl::init(Val: true), cl::cat(PollyCategory));
182
183static cl::opt<bool>
184 PMBasedOpts("polly-pattern-matching-based-opts",
185 cl::desc("Perform optimizations based on pattern matching"),
186 cl::init(Val: true), cl::cat(PollyCategory));
187
188static cl::opt<bool>
189 EnablePostopts("polly-postopts",
190 cl::desc("Apply post-rescheduling optimizations such as "
191 "tiling (requires -polly-reschedule)"),
192 cl::init(Val: true), cl::cat(PollyCategory));
193
194static cl::opt<bool> OptimizedScops(
195 "polly-optimized-scops",
196 cl::desc("Polly - Dump polyhedral description of Scops optimized with "
197 "the isl scheduling optimizer and the set of post-scheduling "
198 "transformations is applied on the schedule tree"),
199 cl::cat(PollyCategory));
200
201STATISTIC(ScopsProcessed, "Number of scops processed");
202STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
203STATISTIC(ScopsOptimized, "Number of scops optimized");
204
205STATISTIC(NumAffineLoopsOptimized, "Number of affine loops optimized");
206STATISTIC(NumBoxedLoopsOptimized, "Number of boxed loops optimized");
207
208#define THREE_STATISTICS(VARNAME, DESC) \
209 static Statistic VARNAME[3] = { \
210 {DEBUG_TYPE, #VARNAME "0", DESC " (original)"}, \
211 {DEBUG_TYPE, #VARNAME "1", DESC " (after scheduler)"}, \
212 {DEBUG_TYPE, #VARNAME "2", DESC " (after optimizer)"}}
213
214THREE_STATISTICS(NumBands, "Number of bands");
215THREE_STATISTICS(NumBandMembers, "Number of band members");
216THREE_STATISTICS(NumCoincident, "Number of coincident band members");
217THREE_STATISTICS(NumPermutable, "Number of permutable bands");
218THREE_STATISTICS(NumFilters, "Number of filter nodes");
219THREE_STATISTICS(NumExtension, "Number of extension nodes");
220
221STATISTIC(FirstLevelTileOpts, "Number of first level tiling applied");
222STATISTIC(SecondLevelTileOpts, "Number of second level tiling applied");
223STATISTIC(RegisterTileOpts, "Number of register tiling applied");
224STATISTIC(PrevectOpts, "Number of strip-mining for prevectorization applied");
225STATISTIC(MatMulOpts,
226 "Number of matrix multiplication patterns detected and optimized");
227
228namespace {
229/// Additional parameters of the schedule optimizer.
230///
231/// Target Transform Info and the SCoP dependencies used by the schedule
232/// optimizer.
233struct OptimizerAdditionalInfoTy {
234 const llvm::TargetTransformInfo *TTI;
235 const Dependences *D;
236 bool PatternOpts;
237 bool Postopts;
238 bool Prevect;
239 bool &DepsChanged;
240};
241
242class ScheduleTreeOptimizer final {
243public:
244 /// Apply schedule tree transformations.
245 ///
246 /// This function takes an (possibly already optimized) schedule tree and
247 /// applies a set of additional optimizations on the schedule tree. The
248 /// transformations applied include:
249 ///
250 /// - Pattern-based optimizations
251 /// - Tiling
252 /// - Prevectorization
253 ///
254 /// @param Schedule The schedule object the transformations will be applied
255 /// to.
256 /// @param OAI Target Transform Info and the SCoP dependencies.
257 /// @returns The transformed schedule.
258 static isl::schedule
259 optimizeSchedule(isl::schedule Schedule,
260 const OptimizerAdditionalInfoTy *OAI = nullptr);
261
262 /// Apply schedule tree transformations.
263 ///
264 /// This function takes a node in an (possibly already optimized) schedule
265 /// tree and applies a set of additional optimizations on this schedule tree
266 /// node and its descendants. The transformations applied include:
267 ///
268 /// - Pattern-based optimizations
269 /// - Tiling
270 /// - Prevectorization
271 ///
272 /// @param Node The schedule object post-transformations will be applied to.
273 /// @param OAI Target Transform Info and the SCoP dependencies.
274 /// @returns The transformed schedule.
275 static isl::schedule_node
276 optimizeScheduleNode(isl::schedule_node Node,
277 const OptimizerAdditionalInfoTy *OAI = nullptr);
278
279 /// Decide if the @p NewSchedule is profitable for @p S.
280 ///
281 /// @param S The SCoP we optimize.
282 /// @param NewSchedule The new schedule we computed.
283 ///
284 /// @return True, if we believe @p NewSchedule is an improvement for @p S.
285 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule);
286
287 /// Isolate a set of partial tile prefixes.
288 ///
289 /// This set should ensure that it contains only partial tile prefixes that
290 /// have exactly VectorWidth iterations.
291 ///
292 /// @param Node A schedule node band, which is a parent of a band node,
293 /// that contains a vector loop.
294 /// @return Modified isl_schedule_node.
295 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node,
296 int VectorWidth);
297
298private:
299 /// Check if this node is a band node we want to tile.
300 ///
301 /// We look for innermost band nodes where individual dimensions are marked as
302 /// permutable.
303 ///
304 /// @param Node The node to check.
305 static bool isTileableBandNode(isl::schedule_node Node);
306
307 /// Check if this node is a band node we want to transform using pattern
308 /// matching.
309 ///
310 /// We look for innermost band nodes where individual dimensions are marked as
311 /// permutable. There is no restriction on the number of individual
312 /// dimensions.
313 ///
314 /// @param Node The node to check.
315 static bool isPMOptimizableBandNode(isl::schedule_node Node);
316
317 /// Pre-vectorizes one scheduling dimension of a schedule band.
318 ///
319 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
320 /// sinks the resulting point loop.
321 ///
322 /// Example (DimToVectorize=0, VectorWidth=4):
323 ///
324 /// | Before transformation:
325 /// |
326 /// | A[i,j] -> [i,j]
327 /// |
328 /// | for (i = 0; i < 128; i++)
329 /// | for (j = 0; j < 128; j++)
330 /// | A(i,j);
331 ///
332 /// | After transformation:
333 /// |
334 /// | for (it = 0; it < 32; it+=1)
335 /// | for (j = 0; j < 128; j++)
336 /// | for (ip = 0; ip <= 3; ip++)
337 /// | A(4 * it + ip,j);
338 ///
339 /// The goal of this transformation is to create a trivially vectorizable
340 /// loop. This means a parallel loop at the innermost level that has a
341 /// constant number of iterations corresponding to the target vector width.
342 ///
343 /// This transformation creates a loop at the innermost level. The loop has
344 /// a constant number of iterations, if the number of loop iterations at
345 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
346 /// currently constant and not yet target specific. This function does not
347 /// reason about parallelism.
348 static isl::schedule_node prevectSchedBand(isl::schedule_node Node,
349 unsigned DimToVectorize,
350 int VectorWidth);
351
352 /// Apply additional optimizations on the bands in the schedule tree.
353 ///
354 /// We are looking for an innermost band node and apply the following
355 /// transformations:
356 ///
357 /// - Tile the band
358 /// - if the band is tileable
359 /// - if the band has more than one loop dimension
360 ///
361 /// - Prevectorize the schedule of the band (or the point loop in case of
362 /// tiling).
363 /// - if vectorization is enabled
364 ///
365 /// @param Node The schedule node to (possibly) optimize.
366 /// @param User A pointer to forward some use information
367 /// (currently unused).
368 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User);
369
370 /// Apply tiling optimizations on the bands in the schedule tree.
371 ///
372 /// @param Node The schedule node to (possibly) optimize.
373 static isl::schedule_node applyTileBandOpt(isl::schedule_node Node);
374
375 /// Apply prevectorization on the bands in the schedule tree.
376 ///
377 /// @param Node The schedule node to (possibly) prevectorize.
378 static isl::schedule_node applyPrevectBandOpt(isl::schedule_node Node);
379};
380
381isl::schedule_node
382ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node,
383 int VectorWidth) {
384 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
385 Node = Node.child(pos: 0).child(pos: 0);
386 isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation();
387 isl::union_set ScheduleRangeUSet = SchedRelUMap.range();
388 isl::set ScheduleRange{ScheduleRangeUSet};
389 isl::set IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
390 auto AtomicOption = getDimOptions(Ctx: IsolateDomain.ctx(), Option: "atomic");
391 isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, OutDimsNum: 1);
392 Node = Node.parent().parent();
393 isl::union_set Options = IsolateOption.unite(uset2: AtomicOption);
394 isl::schedule_node_band Result =
395 Node.as<isl::schedule_node_band>().set_ast_build_options(Options);
396 return Result;
397}
398
399struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> {
400 isl::schedule_node visitBand(isl::schedule_node_band Band) {
401 isl::schedule_node Node = visitChildren(Node: Band);
402
403 // Only add SIMD markers to innermost bands.
404 if (!Node.first_child().isa<isl::schedule_node_leaf>())
405 return Node;
406
407 isl::id LoopMarker = isl::id::alloc(ctx: Band.ctx(), name: "SIMD", user: nullptr);
408 return Band.insert_mark(mark: LoopMarker);
409 }
410};
411
412isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand(
413 isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) {
414 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
415
416 auto Space = isl::manage(ptr: isl_schedule_node_band_get_space(node: Node.get()));
417 unsigned ScheduleDimensions = unsignedFromIslSize(Size: Space.dim(type: isl::dim::set));
418 assert(DimToVectorize < ScheduleDimensions);
419
420 if (DimToVectorize > 0) {
421 Node = isl::manage(
422 ptr: isl_schedule_node_band_split(node: Node.release(), pos: DimToVectorize));
423 Node = Node.child(pos: 0);
424 }
425 if (DimToVectorize < ScheduleDimensions - 1)
426 Node = isl::manage(ptr: isl_schedule_node_band_split(node: Node.release(), pos: 1));
427 Space = isl::manage(ptr: isl_schedule_node_band_get_space(node: Node.get()));
428 auto Sizes = isl::multi_val::zero(space: Space);
429 Sizes = Sizes.set_val(pos: 0, el: isl::val(Node.ctx(), VectorWidth));
430 Node =
431 isl::manage(ptr: isl_schedule_node_band_tile(node: Node.release(), sizes: Sizes.release()));
432 Node = isolateFullPartialTiles(Node, VectorWidth);
433 Node = Node.child(pos: 0);
434 // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
435 // we will have troubles to match it in the backend.
436 Node = Node.as<isl::schedule_node_band>().set_ast_build_options(
437 isl::union_set(Node.ctx(), "{ unroll[x]: 1 = 0 }"));
438
439 // Sink the inner loop into the smallest possible statements to make them
440 // represent a single vector instruction if possible.
441 Node = isl::manage(ptr: isl_schedule_node_band_sink(node: Node.release()));
442
443 // Add SIMD markers to those vector statements.
444 InsertSimdMarkers SimdMarkerInserter;
445 Node = SimdMarkerInserter.visit(Node);
446
447 PrevectOpts++;
448 return Node.parent();
449}
450
451static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
452 assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band);
453 assert(isl_schedule_node_n_children(Node.get()) == 1);
454
455 auto ChildType = isl_schedule_node_get_type(node: Node.child(pos: 0).get());
456
457 if (ChildType == isl_schedule_node_leaf)
458 return true;
459
460 if (ChildType != isl_schedule_node_sequence)
461 return false;
462
463 auto Sequence = Node.child(pos: 0);
464
465 for (int c = 0, nc = isl_schedule_node_n_children(node: Sequence.get()); c < nc;
466 ++c) {
467 auto Child = Sequence.child(pos: c);
468 if (isl_schedule_node_get_type(node: Child.get()) != isl_schedule_node_filter)
469 return false;
470 if (isl_schedule_node_get_type(node: Child.child(pos: 0).get()) !=
471 isl_schedule_node_leaf)
472 return false;
473 }
474 return true;
475}
476
477/// Check if this node is a band node, which has only one child.
478///
479/// @param Node The node to check.
480static bool isOneTimeParentBandNode(isl::schedule_node Node) {
481 if (isl_schedule_node_get_type(node: Node.get()) != isl_schedule_node_band)
482 return false;
483
484 if (isl_schedule_node_n_children(node: Node.get()) != 1)
485 return false;
486
487 return true;
488}
489
490bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
491 if (!isOneTimeParentBandNode(Node))
492 return false;
493
494 if (!isl_schedule_node_band_get_permutable(node: Node.get()))
495 return false;
496
497 auto Space = isl::manage(ptr: isl_schedule_node_band_get_space(node: Node.get()));
498
499 if (unsignedFromIslSize(Size: Space.dim(type: isl::dim::set)) <= 1u)
500 return false;
501
502 return isSimpleInnermostBand(Node);
503}
504
505bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
506 if (!isOneTimeParentBandNode(Node))
507 return false;
508
509 return Node.child(pos: 0).isa<isl::schedule_node_leaf>();
510}
511
512__isl_give isl::schedule_node
513ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
514 if (FirstLevelTiling) {
515 Node = tileNode(Node, Identifier: "1st level tiling", TileSizes: FirstLevelTileSizes,
516 DefaultTileSize: FirstLevelDefaultTileSize);
517 FirstLevelTileOpts++;
518 }
519
520 if (SecondLevelTiling) {
521 Node = tileNode(Node, Identifier: "2nd level tiling", TileSizes: SecondLevelTileSizes,
522 DefaultTileSize: SecondLevelDefaultTileSize);
523 SecondLevelTileOpts++;
524 }
525
526 if (RegisterTiling) {
527 Node =
528 applyRegisterTiling(Node, TileSizes: RegisterTileSizes, DefaultTileSize: RegisterDefaultTileSize);
529 RegisterTileOpts++;
530 }
531
532 return Node;
533}
534
535isl::schedule_node
536ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) {
537 auto Space = isl::manage(ptr: isl_schedule_node_band_get_space(node: Node.get()));
538 int Dims = unsignedFromIslSize(Size: Space.dim(type: isl::dim::set));
539
540 for (int i = Dims - 1; i >= 0; i--)
541 if (Node.as<isl::schedule_node_band>().member_get_coincident(pos: i)) {
542 Node = prevectSchedBand(Node, DimToVectorize: i, VectorWidth: PrevectorWidth);
543 break;
544 }
545
546 return Node;
547}
548
549__isl_give isl_schedule_node *
550ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
551 void *User) {
552 const OptimizerAdditionalInfoTy *OAI =
553 static_cast<const OptimizerAdditionalInfoTy *>(User);
554 assert(OAI && "Expecting optimization options");
555
556 isl::schedule_node Node = isl::manage(ptr: NodeArg);
557
558 if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
559 isl::schedule_node PatternOptimizedSchedule =
560 tryOptimizeMatMulPattern(Node, TTI: OAI->TTI, D: OAI->D);
561 if (!PatternOptimizedSchedule.is_null()) {
562 MatMulOpts++;
563 OAI->DepsChanged = true;
564 return PatternOptimizedSchedule.release();
565 }
566 }
567
568 if (!isTileableBandNode(Node))
569 return Node.release();
570
571 if (OAI->Postopts)
572 Node = applyTileBandOpt(Node);
573
574 if (OAI->Prevect) {
575 // FIXME: Prevectorization requirements are different from those checked by
576 // isTileableBandNode.
577 Node = applyPrevectBandOpt(Node);
578 }
579
580 return Node.release();
581}
582
583isl::schedule
584ScheduleTreeOptimizer::optimizeSchedule(isl::schedule Schedule,
585 const OptimizerAdditionalInfoTy *OAI) {
586 auto Root = Schedule.get_root();
587 Root = optimizeScheduleNode(Node: Root, OAI);
588 return Root.get_schedule();
589}
590
591isl::schedule_node ScheduleTreeOptimizer::optimizeScheduleNode(
592 isl::schedule_node Node, const OptimizerAdditionalInfoTy *OAI) {
593 Node = isl::manage(ptr: isl_schedule_node_map_descendant_bottom_up(
594 node: Node.release(), fn: optimizeBand,
595 user: const_cast<void *>(static_cast<const void *>(OAI))));
596 return Node;
597}
598
599bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
600 isl::schedule NewSchedule) {
601 // To understand if the schedule has been optimized we check if the schedule
602 // has changed at all.
603 // TODO: We can improve this by tracking if any necessarily beneficial
604 // transformations have been performed. This can e.g. be tiling, loop
605 // interchange, or ...) We can track this either at the place where the
606 // transformation has been performed or, in case of automatic ILP based
607 // optimizations, by comparing (yet to be defined) performance metrics
608 // before/after the scheduling optimizer
609 // (e.g., #stride-one accesses)
610 // FIXME: A schedule tree whose union_map-conversion is identical to the
611 // original schedule map may still allow for parallelization, i.e. can still
612 // be profitable.
613 auto NewScheduleMap = NewSchedule.get_map();
614 auto OldSchedule = S.getSchedule();
615 assert(!OldSchedule.is_null() &&
616 "Only IslScheduleOptimizer can insert extension nodes "
617 "that make Scop::getSchedule() return nullptr.");
618 bool changed = !OldSchedule.is_equal(umap2: NewScheduleMap);
619 return changed;
620}
621
622class IslScheduleOptimizerWrapperPass final : public ScopPass {
623public:
624 static char ID;
625
626 explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
627
628 /// Optimize the schedule of the SCoP @p S.
629 bool runOnScop(Scop &S) override;
630
631 /// Print the new schedule for the SCoP @p S.
632 void printScop(raw_ostream &OS, Scop &S) const override;
633
634 /// Register all analyses and transformation required.
635 void getAnalysisUsage(AnalysisUsage &AU) const override;
636
637 /// Release the internal memory.
638 void releaseMemory() override {
639 LastSchedule = {};
640 IslCtx.reset();
641 }
642
643private:
644 std::shared_ptr<isl_ctx> IslCtx;
645 isl::schedule LastSchedule;
646};
647
648char IslScheduleOptimizerWrapperPass::ID = 0;
649
650#ifndef NDEBUG
651static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule,
652 StringRef Desc) {
653 isl::ctx Ctx = Schedule.ctx();
654 isl_printer *P = isl_printer_to_str(ctx: Ctx.get());
655 P = isl_printer_set_yaml_style(p: P, ISL_YAML_STYLE_BLOCK);
656 P = isl_printer_print_schedule(p: P, schedule: Schedule.get());
657 char *Str = isl_printer_get_str(printer: P);
658 OS << Desc << ": \n" << Str << "\n";
659 free(ptr: Str);
660 isl_printer_free(printer: P);
661}
662#endif
663
664/// Collect statistics for the schedule tree.
665///
666/// @param Schedule The schedule tree to analyze. If not a schedule tree it is
667/// ignored.
668/// @param Version The version of the schedule tree that is analyzed.
669/// 0 for the original schedule tree before any transformation.
670/// 1 for the schedule tree after isl's rescheduling.
671/// 2 for the schedule tree after optimizations are applied
672/// (tiling, pattern matching)
673static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
674 auto Root = Schedule.get_root();
675 if (Root.is_null())
676 return;
677
678 isl_schedule_node_foreach_descendant_top_down(
679 node: Root.get(),
680 fn: [](__isl_keep isl_schedule_node *nodeptr, void *user) -> isl_bool {
681 isl::schedule_node Node = isl::manage_copy(ptr: nodeptr);
682 int Version = *static_cast<int *>(user);
683
684 switch (isl_schedule_node_get_type(node: Node.get())) {
685 case isl_schedule_node_band: {
686 NumBands[Version]++;
687 if (isl_schedule_node_band_get_permutable(node: Node.get()) ==
688 isl_bool_true)
689 NumPermutable[Version]++;
690
691 int CountMembers = isl_schedule_node_band_n_member(node: Node.get());
692 NumBandMembers[Version] += CountMembers;
693 for (int i = 0; i < CountMembers; i += 1) {
694 if (Node.as<isl::schedule_node_band>().member_get_coincident(pos: i))
695 NumCoincident[Version]++;
696 }
697 break;
698 }
699
700 case isl_schedule_node_filter:
701 NumFilters[Version]++;
702 break;
703
704 case isl_schedule_node_extension:
705 NumExtension[Version]++;
706 break;
707
708 default:
709 break;
710 }
711
712 return isl_bool_true;
713 },
714 user: &Version);
715}
716
717static void runIslScheduleOptimizer(
718 Scop &S,
719 function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
720 TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
721 isl::schedule &LastSchedule, bool &DepsChanged) {
722 // Skip empty SCoPs but still allow code generation as it will delete the
723 // loops present but not needed.
724 if (S.getSize() == 0) {
725 S.markAsOptimized();
726 return;
727 }
728
729 ScopsProcessed++;
730
731 // Schedule without optimizations.
732 isl::schedule Schedule = S.getScheduleTree();
733 walkScheduleTreeForStatistics(Schedule: S.getScheduleTree(), Version: 0);
734 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "Original schedule tree"));
735
736 bool HasUserTransformation = false;
737 if (PragmaBasedOpts) {
738 isl::schedule ManuallyTransformed = applyManualTransformations(
739 S: &S, Sched: Schedule, D: GetDeps(Dependences::AL_Statement), ORE);
740 if (ManuallyTransformed.is_null()) {
741 POLLY_DEBUG(dbgs() << "Error during manual optimization\n");
742 return;
743 }
744
745 if (ManuallyTransformed.get() != Schedule.get()) {
746 // User transformations have precedence over other transformations.
747 HasUserTransformation = true;
748 Schedule = std::move(ManuallyTransformed);
749 POLLY_DEBUG(
750 printSchedule(dbgs(), Schedule, "After manual transformations"));
751 }
752 }
753
754 // Only continue if either manual transformations have been applied or we are
755 // allowed to apply heuristics.
756 // TODO: Detect disabled heuristics and no user-directed transformation
757 // metadata earlier in ScopDetection.
758 if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
759 POLLY_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
760 return;
761 }
762
763 // Get dependency analysis.
764 const Dependences &D = GetDeps(Dependences::AL_Statement);
765 if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
766 POLLY_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
767 return;
768 }
769 if (!D.hasValidDependences()) {
770 POLLY_DEBUG(dbgs() << "Dependency information not available\n");
771 return;
772 }
773
774 // Apply ISL's algorithm only if not overriden by the user. Note that
775 // post-rescheduling optimizations (tiling, pattern-based, prevectorization)
776 // rely on the coincidence/permutable annotations on schedule tree bands that
777 // are added by the rescheduling analyzer. Therefore, disabling the
778 // rescheduler implicitly also disables these optimizations.
779 if (!EnableReschedule) {
780 POLLY_DEBUG(dbgs() << "Skipping rescheduling due to command line option\n");
781 } else if (HasUserTransformation) {
782 POLLY_DEBUG(
783 dbgs() << "Skipping rescheduling due to manual transformation\n");
784 } else {
785 // Build input data.
786 int ValidityKinds =
787 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
788 int ProximityKinds;
789
790 if (OptimizeDeps == "all")
791 ProximityKinds =
792 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
793 else if (OptimizeDeps == "raw")
794 ProximityKinds = Dependences::TYPE_RAW;
795 else {
796 errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
797 << " Falling back to optimizing all dependences.\n";
798 ProximityKinds =
799 Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
800 }
801
802 isl::union_set Domain = S.getDomains();
803
804 if (Domain.is_null())
805 return;
806
807 isl::union_map Validity = D.getDependences(Kinds: ValidityKinds);
808 isl::union_map Proximity = D.getDependences(Kinds: ProximityKinds);
809
810 // Simplify the dependences by removing the constraints introduced by the
811 // domains. This can speed up the scheduling time significantly, as large
812 // constant coefficients will be removed from the dependences. The
813 // introduction of some additional dependences reduces the possible
814 // transformations, but in most cases, such transformation do not seem to be
815 // interesting anyway. In some cases this option may stop the scheduler to
816 // find any schedule.
817 if (SimplifyDeps == "yes") {
818 Validity = Validity.gist_domain(uset: Domain);
819 Validity = Validity.gist_range(uset: Domain);
820 Proximity = Proximity.gist_domain(uset: Domain);
821 Proximity = Proximity.gist_range(uset: Domain);
822 } else if (SimplifyDeps != "no") {
823 errs()
824 << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
825 "or 'no'. Falling back to default: 'yes'\n";
826 }
827
828 POLLY_DEBUG(dbgs() << "\n\nCompute schedule from: ");
829 POLLY_DEBUG(dbgs() << "Domain := " << Domain << ";\n");
830 POLLY_DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
831 POLLY_DEBUG(dbgs() << "Validity := " << Validity << ";\n");
832
833 int IslMaximizeBands;
834 if (MaximizeBandDepth == "yes") {
835 IslMaximizeBands = 1;
836 } else if (MaximizeBandDepth == "no") {
837 IslMaximizeBands = 0;
838 } else {
839 errs()
840 << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
841 " or 'no'. Falling back to default: 'yes'\n";
842 IslMaximizeBands = 1;
843 }
844
845 int IslOuterCoincidence;
846 if (OuterCoincidence == "yes") {
847 IslOuterCoincidence = 1;
848 } else if (OuterCoincidence == "no") {
849 IslOuterCoincidence = 0;
850 } else {
851 errs() << "warning: Option -polly-opt-outer-coincidence should either be "
852 "'yes' or 'no'. Falling back to default: 'no'\n";
853 IslOuterCoincidence = 0;
854 }
855
856 isl_ctx *Ctx = S.getIslCtx().get();
857
858 isl_options_set_schedule_outer_coincidence(ctx: Ctx, val: IslOuterCoincidence);
859 isl_options_set_schedule_maximize_band_depth(ctx: Ctx, val: IslMaximizeBands);
860 isl_options_set_schedule_max_constant_term(ctx: Ctx, val: MaxConstantTerm);
861 isl_options_set_schedule_max_coefficient(ctx: Ctx, val: MaxCoefficient);
862 isl_options_set_tile_scale_tile_loops(ctx: Ctx, val: 0);
863
864 auto OnErrorStatus = isl_options_get_on_error(ctx: Ctx);
865 isl_options_set_on_error(ctx: Ctx, ISL_ON_ERROR_CONTINUE);
866
867 auto SC = isl::schedule_constraints::on_domain(domain: Domain);
868 SC = SC.set_proximity(Proximity);
869 SC = SC.set_validity(Validity);
870 SC = SC.set_coincidence(Validity);
871
872 {
873 IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut);
874 Schedule = SC.compute_schedule();
875
876 if (MaxOpGuard.hasQuotaExceeded())
877 POLLY_DEBUG(
878 dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
879 }
880
881 isl_options_set_on_error(ctx: Ctx, val: OnErrorStatus);
882
883 ScopsRescheduled++;
884 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling"));
885 }
886
887 walkScheduleTreeForStatistics(Schedule, Version: 1);
888
889 // In cases the scheduler is not able to optimize the code, we just do not
890 // touch the schedule.
891 if (Schedule.is_null())
892 return;
893
894 if (GreedyFusion) {
895 isl::union_map Validity = D.getDependences(
896 Kinds: Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW);
897 Schedule = applyGreedyFusion(Sched: Schedule, Deps: Validity);
898 assert(!Schedule.is_null());
899 }
900
901 // Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
902 const OptimizerAdditionalInfoTy OAI = {
903 .TTI: TTI,
904 .D: const_cast<Dependences *>(&D),
905 /*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
906 /*Postopts=*/!HasUserTransformation && EnablePostopts,
907 /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
908 .DepsChanged: DepsChanged};
909 if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
910 Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, OAI: &OAI);
911 Schedule = hoistExtensionNodes(Sched: Schedule);
912 POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations"));
913 walkScheduleTreeForStatistics(Schedule, Version: 2);
914 }
915
916 // Skip profitability check if user transformation(s) have been applied.
917 if (!HasUserTransformation &&
918 !ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule: Schedule))
919 return;
920
921 auto ScopStats = S.getStatistics();
922 ScopsOptimized++;
923 NumAffineLoopsOptimized += ScopStats.NumAffineLoops;
924 NumBoxedLoopsOptimized += ScopStats.NumBoxedLoops;
925 LastSchedule = Schedule;
926
927 S.setScheduleTree(Schedule);
928 S.markAsOptimized();
929
930 if (OptimizedScops)
931 errs() << S;
932}
933
934bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
935 releaseMemory();
936
937 Function &F = S.getFunction();
938 IslCtx = S.getSharedIslCtx();
939
940 auto getDependences =
941 [this](Dependences::AnalysisLevel) -> const Dependences & {
942 return getAnalysis<DependenceInfo>().getDependences(
943 Level: Dependences::AL_Statement);
944 };
945 OptimizationRemarkEmitter &ORE =
946 getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
947 TargetTransformInfo *TTI =
948 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
949
950 bool DepsChanged = false;
951 runIslScheduleOptimizer(S, GetDeps: getDependences, TTI, ORE: &ORE, LastSchedule,
952 DepsChanged);
953 if (DepsChanged)
954 getAnalysis<DependenceInfo>().abandonDependences();
955 return false;
956}
957
958static void runScheduleOptimizerPrinter(raw_ostream &OS,
959 isl::schedule LastSchedule) {
960 isl_printer *p;
961 char *ScheduleStr;
962
963 OS << "Calculated schedule:\n";
964
965 if (LastSchedule.is_null()) {
966 OS << "n/a\n";
967 return;
968 }
969
970 p = isl_printer_to_str(ctx: LastSchedule.ctx().get());
971 p = isl_printer_set_yaml_style(p, ISL_YAML_STYLE_BLOCK);
972 p = isl_printer_print_schedule(p, schedule: LastSchedule.get());
973 ScheduleStr = isl_printer_get_str(printer: p);
974 isl_printer_free(printer: p);
975
976 OS << ScheduleStr << "\n";
977
978 free(ptr: ScheduleStr);
979}
980
981void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const {
982 runScheduleOptimizerPrinter(OS, LastSchedule);
983}
984
985void IslScheduleOptimizerWrapperPass::getAnalysisUsage(
986 AnalysisUsage &AU) const {
987 ScopPass::getAnalysisUsage(AU);
988 AU.addRequired<DependenceInfo>();
989 AU.addRequired<TargetTransformInfoWrapperPass>();
990 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
991
992 AU.addPreserved<DependenceInfo>();
993 AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
994}
995
996} // namespace
997
998Pass *polly::createIslScheduleOptimizerWrapperPass() {
999 return new IslScheduleOptimizerWrapperPass();
1000}
1001
1002INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
1003 "Polly - Optimize schedule of SCoP", false, false);
1004INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
1005INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
1006INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
1007INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
1008INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl",
1009 "Polly - Optimize schedule of SCoP", false, false)
1010
1011static llvm::PreservedAnalyses
1012runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
1013 ScopStandardAnalysisResults &SAR, SPMUpdater &U,
1014 raw_ostream *OS) {
1015 DependenceAnalysis::Result &Deps = SAM.getResult<DependenceAnalysis>(IR&: S, ExtraArgs&: SAR);
1016 auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
1017 return Deps.getDependences(Level: Dependences::AL_Statement);
1018 };
1019 OptimizationRemarkEmitter ORE(&S.getFunction());
1020 TargetTransformInfo *TTI = &SAR.TTI;
1021 isl::schedule LastSchedule;
1022 bool DepsChanged = false;
1023 runIslScheduleOptimizer(S, GetDeps, TTI, ORE: &ORE, LastSchedule, DepsChanged);
1024 if (DepsChanged)
1025 Deps.abandonDependences();
1026
1027 if (OS) {
1028 *OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
1029 << S.getName() << "' in function '" << S.getFunction().getName()
1030 << "':\n";
1031 runScheduleOptimizerPrinter(OS&: *OS, LastSchedule);
1032 }
1033 return PreservedAnalyses::all();
1034}
1035
1036llvm::PreservedAnalyses
1037IslScheduleOptimizerPass::run(Scop &S, ScopAnalysisManager &SAM,
1038 ScopStandardAnalysisResults &SAR, SPMUpdater &U) {
1039 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, OS: nullptr);
1040}
1041
1042llvm::PreservedAnalyses
1043IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
1044 ScopStandardAnalysisResults &SAR,
1045 SPMUpdater &U) {
1046 return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, OS: &OS);
1047}
1048
1049//===----------------------------------------------------------------------===//
1050
1051namespace {
1052/// Print result from IslScheduleOptimizerWrapperPass.
1053class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
1054public:
1055 static char ID;
1056
1057 IslScheduleOptimizerPrinterLegacyPass()
1058 : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
1059 explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
1060 : ScopPass(ID), OS(OS) {}
1061
1062 bool runOnScop(Scop &S) override {
1063 IslScheduleOptimizerWrapperPass &P =
1064 getAnalysis<IslScheduleOptimizerWrapperPass>();
1065
1066 OS << "Printing analysis '" << P.getPassName() << "' for region: '"
1067 << S.getRegion().getNameStr() << "' in function '"
1068 << S.getFunction().getName() << "':\n";
1069 P.printScop(OS, S);
1070
1071 return false;
1072 }
1073
1074 void getAnalysisUsage(AnalysisUsage &AU) const override {
1075 ScopPass::getAnalysisUsage(AU);
1076 AU.addRequired<IslScheduleOptimizerWrapperPass>();
1077 AU.setPreservesAll();
1078 }
1079
1080private:
1081 llvm::raw_ostream &OS;
1082};
1083
1084char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
1085} // namespace
1086
1087Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
1088 return new IslScheduleOptimizerPrinterLegacyPass(OS);
1089}
1090
1091INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
1092 "polly-print-opt-isl",
1093 "Polly - Print optimizer schedule of SCoP", false, false);
1094INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
1095INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
1096 "polly-print-opt-isl",
1097 "Polly - Print optimizer schedule of SCoP", false, false)
1098

source code of polly/lib/Transform/ScheduleOptimizer.cpp