1 | //===- lib/CodeGen/MachineTraceMetrics.h - Super-scalar metrics -*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the interface for the MachineTraceMetrics analysis pass |
10 | // that estimates CPU resource usage and critical data dependency paths through |
11 | // preferred traces. This is useful for super-scalar CPUs where execution speed |
12 | // can be limited both by data dependencies and by limited execution resources. |
13 | // |
14 | // Out-of-order CPUs will often be executing instructions from multiple basic |
15 | // blocks at the same time. This makes it difficult to estimate the resource |
16 | // usage accurately in a single basic block. Resources can be estimated better |
17 | // by looking at a trace through the current basic block. |
18 | // |
19 | // For every block, the MachineTraceMetrics pass will pick a preferred trace |
20 | // that passes through the block. The trace is chosen based on loop structure, |
21 | // branch probabilities, and resource usage. The intention is to pick likely |
22 | // traces that would be the most affected by code transformations. |
23 | // |
24 | // It is expensive to compute a full arbitrary trace for every block, so to |
25 | // save some computations, traces are chosen to be convergent. This means that |
26 | // if the traces through basic blocks A and B ever cross when moving away from |
27 | // A and B, they never diverge again. This applies in both directions - If the |
28 | // traces meet above A and B, they won't diverge when going further back. |
29 | // |
30 | // Traces tend to align with loops. The trace through a block in an inner loop |
31 | // will begin at the loop entry block and end at a back edge. If there are |
32 | // nested loops, the trace may begin and end at those instead. |
33 | // |
34 | // For each trace, we compute the critical path length, which is the number of |
35 | // cycles required to execute the trace when execution is limited by data |
36 | // dependencies only. We also compute the resource height, which is the number |
37 | // of cycles required to execute all instructions in the trace when ignoring |
38 | // data dependencies. |
39 | // |
40 | // Every instruction in the current block has a slack - the number of cycles |
41 | // execution of the instruction can be delayed without extending the critical |
42 | // path. |
43 | // |
44 | //===----------------------------------------------------------------------===// |
45 | |
46 | #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H |
47 | #define LLVM_CODEGEN_MACHINETRACEMETRICS_H |
48 | |
49 | #include "llvm/ADT/SparseSet.h" |
50 | #include "llvm/ADT/ArrayRef.h" |
51 | #include "llvm/ADT/DenseMap.h" |
52 | #include "llvm/ADT/SmallVector.h" |
53 | #include "llvm/CodeGen/MachineBasicBlock.h" |
54 | #include "llvm/CodeGen/MachineFunctionPass.h" |
55 | #include "llvm/CodeGen/TargetSchedule.h" |
56 | |
57 | namespace llvm { |
58 | |
59 | class AnalysisUsage; |
60 | class MachineFunction; |
61 | class MachineInstr; |
62 | class MachineLoop; |
63 | class MachineLoopInfo; |
64 | class MachineRegisterInfo; |
65 | struct MCSchedClassDesc; |
66 | class raw_ostream; |
67 | class TargetInstrInfo; |
68 | class TargetRegisterInfo; |
69 | |
70 | // Keep track of physreg data dependencies by recording each live register unit. |
71 | // Associate each regunit with an instruction operand. Depending on the |
72 | // direction instructions are scanned, it could be the operand that defined the |
73 | // regunit, or the highest operand to read the regunit. |
74 | struct LiveRegUnit { |
75 | unsigned RegUnit; |
76 | unsigned Cycle = 0; |
77 | const MachineInstr *MI = nullptr; |
78 | unsigned Op = 0; |
79 | |
80 | unsigned getSparseSetIndex() const { return RegUnit; } |
81 | |
82 | LiveRegUnit(unsigned RU) : RegUnit(RU) {} |
83 | }; |
84 | |
85 | /// Strategies for selecting traces. |
86 | enum class MachineTraceStrategy { |
87 | /// Select the trace through a block that has the fewest instructions. |
88 | TS_MinInstrCount, |
89 | /// Select the trace that contains only the current basic block. For instance, |
90 | /// this strategy can be used by MachineCombiner to make better decisions when |
91 | /// we estimate critical path for in-order cores. |
92 | TS_Local, |
93 | TS_NumStrategies |
94 | }; |
95 | |
96 | class MachineTraceMetrics : public MachineFunctionPass { |
97 | const MachineFunction *MF = nullptr; |
98 | const TargetInstrInfo *TII = nullptr; |
99 | const TargetRegisterInfo *TRI = nullptr; |
100 | const MachineRegisterInfo *MRI = nullptr; |
101 | const MachineLoopInfo *Loops = nullptr; |
102 | TargetSchedModel SchedModel; |
103 | |
104 | public: |
105 | friend class Ensemble; |
106 | friend class Trace; |
107 | |
108 | class Ensemble; |
109 | |
110 | static char ID; |
111 | |
112 | MachineTraceMetrics(); |
113 | |
114 | void getAnalysisUsage(AnalysisUsage&) const override; |
115 | bool runOnMachineFunction(MachineFunction&) override; |
116 | void releaseMemory() override; |
117 | void verifyAnalysis() const override; |
118 | |
119 | /// Per-basic block information that doesn't depend on the trace through the |
120 | /// block. |
121 | struct FixedBlockInfo { |
122 | /// The number of non-trivial instructions in the block. |
123 | /// Doesn't count PHI and COPY instructions that are likely to be removed. |
124 | unsigned InstrCount = ~0u; |
125 | |
126 | /// True when the block contains calls. |
127 | bool HasCalls = false; |
128 | |
129 | FixedBlockInfo() = default; |
130 | |
131 | /// Returns true when resource information for this block has been computed. |
132 | bool hasResources() const { return InstrCount != ~0u; } |
133 | |
134 | /// Invalidate resource information. |
135 | void invalidate() { InstrCount = ~0u; } |
136 | }; |
137 | |
138 | /// Get the fixed resource information about MBB. Compute it on demand. |
139 | const FixedBlockInfo *getResources(const MachineBasicBlock*); |
140 | |
141 | /// Get the scaled number of cycles used per processor resource in MBB. |
142 | /// This is an array with SchedModel.getNumProcResourceKinds() entries. |
143 | /// The getResources() function above must have been called first. |
144 | /// |
145 | /// These numbers have already been scaled by SchedModel.getResourceFactor(). |
146 | ArrayRef<unsigned> getProcReleaseAtCycles(unsigned MBBNum) const; |
147 | |
148 | /// A virtual register or regunit required by a basic block or its trace |
149 | /// successors. |
150 | struct LiveInReg { |
151 | /// The virtual register required, or a register unit. |
152 | Register Reg; |
153 | |
154 | /// For virtual registers: Minimum height of the defining instruction. |
155 | /// For regunits: Height of the highest user in the trace. |
156 | unsigned Height; |
157 | |
158 | LiveInReg(Register Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {} |
159 | }; |
160 | |
161 | /// Per-basic block information that relates to a specific trace through the |
162 | /// block. Convergent traces means that only one of these is required per |
163 | /// block in a trace ensemble. |
164 | struct TraceBlockInfo { |
165 | /// Trace predecessor, or NULL for the first block in the trace. |
166 | /// Valid when hasValidDepth(). |
167 | const MachineBasicBlock *Pred = nullptr; |
168 | |
169 | /// Trace successor, or NULL for the last block in the trace. |
170 | /// Valid when hasValidHeight(). |
171 | const MachineBasicBlock *Succ = nullptr; |
172 | |
173 | /// The block number of the head of the trace. (When hasValidDepth()). |
174 | unsigned Head; |
175 | |
176 | /// The block number of the tail of the trace. (When hasValidHeight()). |
177 | unsigned Tail; |
178 | |
179 | /// Accumulated number of instructions in the trace above this block. |
180 | /// Does not include instructions in this block. |
181 | unsigned InstrDepth = ~0u; |
182 | |
183 | /// Accumulated number of instructions in the trace below this block. |
184 | /// Includes instructions in this block. |
185 | unsigned InstrHeight = ~0u; |
186 | |
187 | TraceBlockInfo() = default; |
188 | |
189 | /// Returns true if the depth resources have been computed from the trace |
190 | /// above this block. |
191 | bool hasValidDepth() const { return InstrDepth != ~0u; } |
192 | |
193 | /// Returns true if the height resources have been computed from the trace |
194 | /// below this block. |
195 | bool hasValidHeight() const { return InstrHeight != ~0u; } |
196 | |
197 | /// Invalidate depth resources when some block above this one has changed. |
198 | void invalidateDepth() { InstrDepth = ~0u; HasValidInstrDepths = false; } |
199 | |
200 | /// Invalidate height resources when a block below this one has changed. |
201 | void invalidateHeight() { InstrHeight = ~0u; HasValidInstrHeights = false; } |
202 | |
203 | /// Assuming that this is a dominator of TBI, determine if it contains |
204 | /// useful instruction depths. A dominating block can be above the current |
205 | /// trace head, and any dependencies from such a far away dominator are not |
206 | /// expected to affect the critical path. |
207 | /// |
208 | /// Also returns true when TBI == this. |
209 | bool isUsefulDominator(const TraceBlockInfo &TBI) const { |
210 | // The trace for TBI may not even be calculated yet. |
211 | if (!hasValidDepth() || !TBI.hasValidDepth()) |
212 | return false; |
213 | // Instruction depths are only comparable if the traces share a head. |
214 | if (Head != TBI.Head) |
215 | return false; |
216 | // It is almost always the case that TBI belongs to the same trace as |
217 | // this block, but rare convoluted cases involving irreducible control |
218 | // flow, a dominator may share a trace head without actually being on the |
219 | // same trace as TBI. This is not a big problem as long as it doesn't |
220 | // increase the instruction depth. |
221 | return HasValidInstrDepths && InstrDepth <= TBI.InstrDepth; |
222 | } |
223 | |
224 | // Data-dependency-related information. Per-instruction depth and height |
225 | // are computed from data dependencies in the current trace, using |
226 | // itinerary data. |
227 | |
228 | /// Instruction depths have been computed. This implies hasValidDepth(). |
229 | bool HasValidInstrDepths = false; |
230 | |
231 | /// Instruction heights have been computed. This implies hasValidHeight(). |
232 | bool HasValidInstrHeights = false; |
233 | |
234 | /// Critical path length. This is the number of cycles in the longest data |
235 | /// dependency chain through the trace. This is only valid when both |
236 | /// HasValidInstrDepths and HasValidInstrHeights are set. |
237 | unsigned CriticalPath; |
238 | |
239 | /// Live-in registers. These registers are defined above the current block |
240 | /// and used by this block or a block below it. |
241 | /// This does not include PHI uses in the current block, but it does |
242 | /// include PHI uses in deeper blocks. |
243 | SmallVector<LiveInReg, 4> LiveIns; |
244 | |
245 | void print(raw_ostream&) const; |
246 | }; |
247 | |
248 | /// InstrCycles represents the cycle height and depth of an instruction in a |
249 | /// trace. |
250 | struct InstrCycles { |
251 | /// Earliest issue cycle as determined by data dependencies and instruction |
252 | /// latencies from the beginning of the trace. Data dependencies from |
253 | /// before the trace are not included. |
254 | unsigned Depth; |
255 | |
256 | /// Minimum number of cycles from this instruction is issued to the of the |
257 | /// trace, as determined by data dependencies and instruction latencies. |
258 | unsigned Height; |
259 | }; |
260 | |
261 | /// A trace represents a plausible sequence of executed basic blocks that |
262 | /// passes through the current basic block one. The Trace class serves as a |
263 | /// handle to internal cached data structures. |
264 | class Trace { |
265 | Ensemble &TE; |
266 | TraceBlockInfo &TBI; |
267 | |
268 | unsigned getBlockNum() const { return &TBI - &TE.BlockInfo[0]; } |
269 | |
270 | public: |
271 | explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {} |
272 | |
273 | void print(raw_ostream&) const; |
274 | |
275 | /// Compute the total number of instructions in the trace. |
276 | unsigned getInstrCount() const { |
277 | return TBI.InstrDepth + TBI.InstrHeight; |
278 | } |
279 | |
280 | /// Return the resource depth of the top/bottom of the trace center block. |
281 | /// This is the number of cycles required to execute all instructions from |
282 | /// the trace head to the trace center block. The resource depth only |
283 | /// considers execution resources, it ignores data dependencies. |
284 | /// When Bottom is set, instructions in the trace center block are included. |
285 | unsigned getResourceDepth(bool Bottom) const; |
286 | |
287 | /// Return the resource length of the trace. This is the number of cycles |
288 | /// required to execute the instructions in the trace if they were all |
289 | /// independent, exposing the maximum instruction-level parallelism. |
290 | /// |
291 | /// Any blocks in Extrablocks are included as if they were part of the |
292 | /// trace. Likewise, extra resources required by the specified scheduling |
293 | /// classes are included. For the caller to account for extra machine |
294 | /// instructions, it must first resolve each instruction's scheduling class. |
295 | unsigned getResourceLength( |
296 | ArrayRef<const MachineBasicBlock *> = std::nullopt, |
297 | ArrayRef<const MCSchedClassDesc *> = std::nullopt, |
298 | ArrayRef<const MCSchedClassDesc *> RemoveInstrs = std::nullopt) const; |
299 | |
300 | /// Return the length of the (data dependency) critical path through the |
301 | /// trace. |
302 | unsigned getCriticalPath() const { return TBI.CriticalPath; } |
303 | |
304 | /// Return the depth and height of MI. The depth is only valid for |
305 | /// instructions in or above the trace center block. The height is only |
306 | /// valid for instructions in or below the trace center block. |
307 | InstrCycles getInstrCycles(const MachineInstr &MI) const { |
308 | return TE.Cycles.lookup(Val: &MI); |
309 | } |
310 | |
311 | /// Return the slack of MI. This is the number of cycles MI can be delayed |
312 | /// before the critical path becomes longer. |
313 | /// MI must be an instruction in the trace center block. |
314 | unsigned getInstrSlack(const MachineInstr &MI) const; |
315 | |
316 | /// Return the Depth of a PHI instruction in a trace center block successor. |
317 | /// The PHI does not have to be part of the trace. |
318 | unsigned getPHIDepth(const MachineInstr &PHI) const; |
319 | |
320 | /// A dependence is useful if the basic block of the defining instruction |
321 | /// is part of the trace of the user instruction. It is assumed that DefMI |
322 | /// dominates UseMI (see also isUsefulDominator). |
323 | bool isDepInTrace(const MachineInstr &DefMI, |
324 | const MachineInstr &UseMI) const; |
325 | }; |
326 | |
327 | /// A trace ensemble is a collection of traces selected using the same |
328 | /// strategy, for example 'minimum resource height'. There is one trace for |
329 | /// every block in the function. |
330 | class Ensemble { |
331 | friend class Trace; |
332 | |
333 | SmallVector<TraceBlockInfo, 4> BlockInfo; |
334 | DenseMap<const MachineInstr*, InstrCycles> Cycles; |
335 | SmallVector<unsigned, 0> ProcResourceDepths; |
336 | SmallVector<unsigned, 0> ProcResourceHeights; |
337 | |
338 | void computeTrace(const MachineBasicBlock*); |
339 | void computeDepthResources(const MachineBasicBlock*); |
340 | void computeHeightResources(const MachineBasicBlock*); |
341 | unsigned computeCrossBlockCriticalPath(const TraceBlockInfo&); |
342 | void computeInstrDepths(const MachineBasicBlock*); |
343 | void computeInstrHeights(const MachineBasicBlock*); |
344 | void addLiveIns(const MachineInstr *DefMI, unsigned DefOp, |
345 | ArrayRef<const MachineBasicBlock*> Trace); |
346 | |
347 | protected: |
348 | MachineTraceMetrics &MTM; |
349 | |
350 | explicit Ensemble(MachineTraceMetrics*); |
351 | |
352 | virtual const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) =0; |
353 | virtual const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) =0; |
354 | const MachineLoop *getLoopFor(const MachineBasicBlock*) const; |
355 | const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const; |
356 | const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const; |
357 | ArrayRef<unsigned> getProcResourceDepths(unsigned MBBNum) const; |
358 | ArrayRef<unsigned> getProcResourceHeights(unsigned MBBNum) const; |
359 | |
360 | public: |
361 | virtual ~Ensemble(); |
362 | |
363 | virtual const char *getName() const = 0; |
364 | void print(raw_ostream&) const; |
365 | void invalidate(const MachineBasicBlock *MBB); |
366 | void verify() const; |
367 | |
368 | /// Get the trace that passes through MBB. |
369 | /// The trace is computed on demand. |
370 | Trace getTrace(const MachineBasicBlock *MBB); |
371 | |
372 | /// Updates the depth of an machine instruction, given RegUnits. |
373 | void updateDepth(TraceBlockInfo &TBI, const MachineInstr&, |
374 | SparseSet<LiveRegUnit> &RegUnits); |
375 | void updateDepth(const MachineBasicBlock *, const MachineInstr&, |
376 | SparseSet<LiveRegUnit> &RegUnits); |
377 | |
378 | /// Updates the depth of the instructions from Start to End. |
379 | void updateDepths(MachineBasicBlock::iterator Start, |
380 | MachineBasicBlock::iterator End, |
381 | SparseSet<LiveRegUnit> &RegUnits); |
382 | |
383 | }; |
384 | |
385 | /// Get the trace ensemble representing the given trace selection strategy. |
386 | /// The returned Ensemble object is owned by the MachineTraceMetrics analysis, |
387 | /// and valid for the lifetime of the analysis pass. |
388 | Ensemble *getEnsemble(MachineTraceStrategy); |
389 | |
390 | /// Invalidate cached information about MBB. This must be called *before* MBB |
391 | /// is erased, or the CFG is otherwise changed. |
392 | /// |
393 | /// This invalidates per-block information about resource usage for MBB only, |
394 | /// and it invalidates per-trace information for any trace that passes |
395 | /// through MBB. |
396 | /// |
397 | /// Call Ensemble::getTrace() again to update any trace handles. |
398 | void invalidate(const MachineBasicBlock *MBB); |
399 | |
400 | private: |
401 | // One entry per basic block, indexed by block number. |
402 | SmallVector<FixedBlockInfo, 4> BlockInfo; |
403 | |
404 | // Cycles consumed on each processor resource per block. |
405 | // The number of processor resource kinds is constant for a given subtarget, |
406 | // but it is not known at compile time. The number of cycles consumed by |
407 | // block B on processor resource R is at ProcReleaseAtCycles[B*Kinds + R] |
408 | // where Kinds = SchedModel.getNumProcResourceKinds(). |
409 | SmallVector<unsigned, 0> ProcReleaseAtCycles; |
410 | |
411 | // One ensemble per strategy. |
412 | Ensemble |
413 | *Ensembles[static_cast<size_t>(MachineTraceStrategy::TS_NumStrategies)]; |
414 | |
415 | // Convert scaled resource usage to a cycle count that can be compared with |
416 | // latencies. |
417 | unsigned getCycles(unsigned Scaled) { |
418 | unsigned Factor = SchedModel.getLatencyFactor(); |
419 | return (Scaled + Factor - 1) / Factor; |
420 | } |
421 | }; |
422 | |
423 | inline raw_ostream &operator<<(raw_ostream &OS, |
424 | const MachineTraceMetrics::Trace &Tr) { |
425 | Tr.print(OS); |
426 | return OS; |
427 | } |
428 | |
429 | inline raw_ostream &operator<<(raw_ostream &OS, |
430 | const MachineTraceMetrics::Ensemble &En) { |
431 | En.print(OS); |
432 | return OS; |
433 | } |
434 | |
435 | } // end namespace llvm |
436 | |
437 | #endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H |
438 | |