1 | //===- MatmulOptimizer.h -------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef POLLY_MATMULOPTIMIZER_H |
10 | #define POLLY_MATMULOPTIMIZER_H |
11 | |
12 | #include "isl/isl-noexceptions.h" |
13 | |
14 | namespace llvm { |
15 | class TargetTransformInfo; |
16 | } |
17 | |
18 | namespace polly { |
19 | class Dependences; |
20 | |
21 | /// Apply the BLIS matmul optimization pattern if possible. |
22 | /// |
23 | /// Make the loops containing the matrix multiplication be the innermost |
24 | /// loops and apply the BLIS matmul optimization pattern. BLIS implements |
25 | /// gemm as three nested loops around a macro-kernel, plus two packing |
26 | /// routines. The macro-kernel is implemented in terms of two additional |
27 | /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 |
28 | /// (i.e., outer product) update. |
29 | /// |
30 | /// For a detailed description please see [1]. |
31 | /// |
32 | /// The order of the loops defines the data reused in the BLIS implementation |
33 | /// of gemm ([1]). In particular, elements of the matrix B, the second |
34 | /// operand of matrix multiplication, are reused between iterations of the |
35 | /// innermost loop. To keep the reused data in cache, only elements of matrix |
36 | /// A, the first operand of matrix multiplication, should be evicted during |
37 | /// an iteration of the innermost loop. To provide such a cache replacement |
38 | /// policy, elements of the matrix A can, in particular, be loaded first and, |
39 | /// consequently, be least-recently-used. |
40 | /// |
41 | /// In our case matrices are stored in row-major order instead of |
42 | /// column-major order used in the BLIS implementation ([1]). It affects only |
43 | /// on the form of the BLIS micro kernel and the computation of its |
44 | /// parameters. In particular, reused elements of the matrix B are |
45 | /// successively multiplied by specific elements of the matrix A. |
46 | /// |
47 | /// Refs.: |
48 | /// [1] - Analytical Modeling is Enough for High Performance BLIS |
49 | /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti |
50 | /// Technical Report, 2014 |
51 | /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf |
52 | /// |
53 | /// @see ScheduleTreeOptimizer::createMicroKernel |
54 | /// @see ScheduleTreeOptimizer::createMacroKernel |
55 | /// @see getMicroKernelParams |
56 | /// @see getMacroKernelParams |
57 | /// |
58 | /// TODO: Implement the packing transformation. |
59 | /// |
60 | /// @param Node The node that contains a band to be optimized. The node |
61 | /// is required to successfully pass |
62 | /// ScheduleTreeOptimizer::isMatrMultPattern. |
63 | /// @param TTI Target Transform Info. |
64 | /// @param D The dependencies. |
65 | /// |
66 | /// @returns The transformed schedule or nullptr if the optimization |
67 | /// cannot be applied. |
68 | isl::schedule_node |
69 | tryOptimizeMatMulPattern(isl::schedule_node Node, |
70 | const llvm::TargetTransformInfo *TTI, |
71 | const Dependences *D); |
72 | |
73 | } // namespace polly |
74 | #endif // POLLY_MATMULOPTIMIZER_H |
75 | |