1//===- bolt/Passes/FrameOptimizer.cpp -------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the FrameOptimizerPass class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "bolt/Passes/FrameOptimizer.h"
14#include "bolt/Core/ParallelUtilities.h"
15#include "bolt/Passes/BinaryFunctionCallGraph.h"
16#include "bolt/Passes/DataflowInfoManager.h"
17#include "bolt/Passes/ShrinkWrapping.h"
18#include "bolt/Passes/StackAvailableExpressions.h"
19#include "bolt/Passes/StackReachingUses.h"
20#include "bolt/Utils/CommandLineOpts.h"
21#include "llvm/Support/Timer.h"
22#include <deque>
23
24#define DEBUG_TYPE "fop"
25
26using namespace llvm;
27
28namespace opts {
29extern cl::opt<unsigned> Verbosity;
30extern cl::opt<bool> TimeOpts;
31extern cl::OptionCategory BoltOptCategory;
32
33using namespace bolt;
34
35cl::opt<FrameOptimizationType>
36FrameOptimization("frame-opt",
37 cl::init(Val: FOP_NONE),
38 cl::desc("optimize stack frame accesses"),
39 cl::values(
40 clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
41 clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
42 clEnumValN(FOP_ALL, "all", "perform FOP on all functions")),
43 cl::ZeroOrMore,
44 cl::cat(BoltOptCategory));
45
46cl::opt<bool> RemoveStores(
47 "frame-opt-rm-stores", cl::init(Val: FOP_NONE),
48 cl::desc("apply additional analysis to remove stores (experimental)"),
49 cl::cat(BoltOptCategory));
50
51} // namespace opts
52
53namespace llvm {
54namespace bolt {
55
56void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
57 const FrameAnalysis &FA,
58 BinaryFunction &BF) {
59 StackAvailableExpressions SAE(RA, FA, BF);
60 SAE.run();
61
62 LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
63 std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
64 bool Changed = false;
65 const auto ExprEnd = SAE.expr_end();
66 MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
67 for (BinaryBasicBlock &BB : BF) {
68 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
69 const MCInst *Prev = nullptr;
70 for (MCInst &Inst : BB) {
71 LLVM_DEBUG({
72 dbgs() << "\t\tNow at ";
73 Inst.dump();
74 for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
75 I != ExprEnd; ++I) {
76 dbgs() << "\t\t\tReached by: ";
77 (*I)->dump();
78 }
79 });
80 // if Inst is a load from stack and the current available expressions show
81 // this value is available in a register or immediate, replace this load
82 // with move from register or from immediate.
83 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
84 if (!FIEX) {
85 Prev = &Inst;
86 continue;
87 }
88 // FIXME: Change to remove IsSimple == 0. We're being conservative here,
89 // but once replaceMemOperandWithReg is ready, we should feed it with all
90 // sorts of complex instructions.
91 if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
92 FIEX->StackOffset >= 0) {
93 Prev = &Inst;
94 continue;
95 }
96
97 for (auto I = Prev ? SAE.expr_begin(Point: *Prev) : SAE.expr_begin(Point: BB);
98 I != ExprEnd; ++I) {
99 const MCInst *AvailableInst = *I;
100 ErrorOr<const FrameIndexEntry &> FIEY = FA.getFIEFor(Inst: *AvailableInst);
101 if (!FIEY)
102 continue;
103 assert(FIEY->IsStore && FIEY->IsSimple);
104 if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
105 continue;
106 // TODO: Change push/pops to stack adjustment instruction
107 if (MIB->isPop(Inst))
108 continue;
109
110 ++NumRedundantLoads;
111 FreqRedundantLoads += BB.getKnownExecutionCount();
112 Changed = true;
113 LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
114 LLVM_DEBUG(Inst.dump());
115 LLVM_DEBUG(dbgs() << "Related store instruction: ");
116 LLVM_DEBUG(AvailableInst->dump());
117 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
118 // Replace load
119 if (FIEY->IsStoreFromReg) {
120 if (!MIB->replaceMemOperandWithReg(Inst, RegNum: FIEY->RegOrImm)) {
121 LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
122 break;
123 }
124 FreqLoadsChangedToReg += BB.getKnownExecutionCount();
125 MIB->removeAnnotation(Inst, Name: "FrameAccessEntry");
126 LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
127 if (MIB->isRedundantMove(Inst)) {
128 ++NumLoadsDeleted;
129 FreqLoadsDeleted += BB.getKnownExecutionCount();
130 LLVM_DEBUG(dbgs() << "Created a redundant move\n");
131 // Delete it!
132 ToErase.push_front(x: std::make_pair(x: &BB, y: &Inst));
133 }
134 } else {
135 char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
136 support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
137 LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
138 if (!MIB->replaceMemOperandWithImm(Inst, ConstantData: StringRef(Buf, 8), Offset: 0)) {
139 LLVM_DEBUG(dbgs() << "FAILED\n");
140 } else {
141 FreqLoadsChangedToImm += BB.getKnownExecutionCount();
142 MIB->removeAnnotation(Inst, Name: "FrameAccessEntry");
143 LLVM_DEBUG(dbgs() << "Ok\n");
144 }
145 }
146 LLVM_DEBUG(dbgs() << "Changed to: ");
147 LLVM_DEBUG(Inst.dump());
148 break;
149 }
150 Prev = &Inst;
151 }
152 }
153 if (Changed)
154 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
155
156 // TODO: Implement an interface of eraseInstruction that works out the
157 // complete list of elements to remove.
158 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
159 I.first->eraseInstruction(II: I.first->findInstruction(Inst: I.second));
160}
161
162void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
163 BinaryFunction &BF) {
164 StackReachingUses SRU(FA, BF);
165 SRU.run();
166
167 LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
168 std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
169 bool Changed = false;
170 for (BinaryBasicBlock &BB : BF) {
171 LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
172 const MCInst *Prev = nullptr;
173 for (MCInst &Inst : llvm::reverse(C&: BB)) {
174 LLVM_DEBUG({
175 dbgs() << "\t\tNow at ";
176 Inst.dump();
177 for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
178 I != SRU.expr_end(); ++I) {
179 dbgs() << "\t\t\tReached by: ";
180 (*I)->dump();
181 }
182 });
183 ErrorOr<const FrameIndexEntry &> FIEX = FA.getFIEFor(Inst);
184 if (!FIEX) {
185 Prev = &Inst;
186 continue;
187 }
188 if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
189 Prev = &Inst;
190 continue;
191 }
192
193 if (SRU.isStoreUsed(StoreFIE: *FIEX,
194 Candidates: Prev ? SRU.expr_begin(Point: *Prev) : SRU.expr_begin(Point: BB))) {
195 Prev = &Inst;
196 continue;
197 }
198 // TODO: Change push/pops to stack adjustment instruction
199 if (BF.getBinaryContext().MIB->isPush(Inst))
200 continue;
201
202 ++NumRedundantStores;
203 FreqRedundantStores += BB.getKnownExecutionCount();
204 Changed = true;
205 LLVM_DEBUG(dbgs() << "Unused store instruction: ");
206 LLVM_DEBUG(Inst.dump());
207 LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
208 LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
209 << " size = " << (int)FIEX->Size << "\n");
210 // Delete it!
211 ToErase.emplace_back(args: &BB, args: &Inst);
212 Prev = &Inst;
213 }
214 }
215
216 for (std::pair<BinaryBasicBlock *, MCInst *> I : ToErase)
217 I.first->eraseInstruction(II: I.first->findInstruction(Inst: I.second));
218
219 if (Changed)
220 LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
221}
222
223Error FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
224 if (opts::FrameOptimization == FOP_NONE)
225 return Error::success();
226
227 std::unique_ptr<BinaryFunctionCallGraph> CG;
228 std::unique_ptr<FrameAnalysis> FA;
229 std::unique_ptr<RegAnalysis> RA;
230
231 {
232 NamedRegionTimer T1("callgraph", "create call graph", "FOP",
233 "FOP breakdown", opts::TimeOpts);
234 CG = std::make_unique<BinaryFunctionCallGraph>(args: buildCallGraph(BC));
235 }
236
237 {
238 NamedRegionTimer T1("frameanalysis", "frame analysis", "FOP",
239 "FOP breakdown", opts::TimeOpts);
240 FA = std::make_unique<FrameAnalysis>(args&: BC, args&: *CG);
241 }
242
243 {
244 NamedRegionTimer T1("reganalysis", "reg analysis", "FOP", "FOP breakdown",
245 opts::TimeOpts);
246 RA = std::make_unique<RegAnalysis>(args&: BC, args: &BC.getBinaryFunctions(), args: CG.get());
247 }
248
249 // Perform caller-saved register optimizations, then callee-saved register
250 // optimizations (shrink wrapping)
251 for (auto &I : BC.getBinaryFunctions()) {
252 if (!FA->hasFrameInfo(Func: I.second))
253 continue;
254 // Restrict pass execution if user asked to only run on hot functions
255 if (opts::FrameOptimization == FOP_HOT) {
256 if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
257 continue;
258 LLVM_DEBUG(
259 dbgs() << "Considering " << I.second.getPrintName()
260 << " for frame optimizations because its execution count ( "
261 << I.second.getKnownExecutionCount()
262 << " ) exceeds our hotness threshold ( "
263 << BC.getHotThreshold() << " )\n");
264 }
265
266 {
267 NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown",
268 opts::TimeOpts);
269 if (!FA->hasStackArithmetic(Func: I.second))
270 removeUnnecessaryLoads(RA: *RA, FA: *FA, BF&: I.second);
271 }
272
273 if (opts::RemoveStores) {
274 NamedRegionTimer T1("removestores", "remove stores", "FOP",
275 "FOP breakdown", opts::TimeOpts);
276 if (!FA->hasStackArithmetic(Func: I.second))
277 removeUnusedStores(FA: *FA, BF&: I.second);
278 }
279 // Don't even start shrink wrapping if no profiling info is available
280 if (I.second.getKnownExecutionCount() == 0)
281 continue;
282 }
283
284 {
285 NamedRegionTimer T1("shrinkwrapping", "shrink wrapping", "FOP",
286 "FOP breakdown", opts::TimeOpts);
287 if (Error E = performShrinkWrapping(RA: *RA, FA: *FA, BC))
288 return Error(std::move(E));
289 }
290
291 BC.outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
292 << " redundant load(s) and " << NumRedundantStores
293 << " unused store(s)\n";
294 BC.outs() << "BOLT-INFO: Frequency of redundant loads is "
295 << FreqRedundantLoads << " and frequency of unused stores is "
296 << FreqRedundantStores << "\n";
297 BC.outs() << "BOLT-INFO: Frequency of loads changed to use a register is "
298 << FreqLoadsChangedToReg
299 << " and frequency of loads changed to use an immediate is "
300 << FreqLoadsChangedToImm << "\n";
301 BC.outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted
302 << " load(s) (dyn count: " << FreqLoadsDeleted << ") and "
303 << NumRedundantStores << " store(s)\n";
304 FA->printStats();
305 ShrinkWrapping::printStats(BC);
306 return Error::success();
307}
308
309Error FrameOptimizerPass::performShrinkWrapping(const RegAnalysis &RA,
310 const FrameAnalysis &FA,
311 BinaryContext &BC) {
312 // Initialize necessary annotations to allow safe parallel accesses to
313 // annotation index in MIB
314 BC.MIB->getOrCreateAnnotationIndex(Name: CalleeSavedAnalysis::getSaveTagName());
315 BC.MIB->getOrCreateAnnotationIndex(Name: CalleeSavedAnalysis::getRestoreTagName());
316 BC.MIB->getOrCreateAnnotationIndex(Name: StackLayoutModifier::getTodoTagName());
317 BC.MIB->getOrCreateAnnotationIndex(Name: StackLayoutModifier::getSlotTagName());
318 BC.MIB->getOrCreateAnnotationIndex(
319 Name: StackLayoutModifier::getOffsetCFIRegTagName());
320 BC.MIB->getOrCreateAnnotationIndex(Name: "ReachingDefs");
321 BC.MIB->getOrCreateAnnotationIndex(Name: "ReachingUses");
322 BC.MIB->getOrCreateAnnotationIndex(Name: "LivenessAnalysis");
323 BC.MIB->getOrCreateAnnotationIndex(Name: "StackReachingUses");
324 BC.MIB->getOrCreateAnnotationIndex(Name: "PostDominatorAnalysis");
325 BC.MIB->getOrCreateAnnotationIndex(Name: "DominatorAnalysis");
326 BC.MIB->getOrCreateAnnotationIndex(Name: "StackPointerTracking");
327 BC.MIB->getOrCreateAnnotationIndex(Name: "StackPointerTrackingForInternalCalls");
328 BC.MIB->getOrCreateAnnotationIndex(Name: "StackAvailableExpressions");
329 BC.MIB->getOrCreateAnnotationIndex(Name: "StackAllocationAnalysis");
330 BC.MIB->getOrCreateAnnotationIndex(Name: "ShrinkWrap-Todo");
331 BC.MIB->getOrCreateAnnotationIndex(Name: "PredictiveStackPointerTracking");
332 BC.MIB->getOrCreateAnnotationIndex(Name: "ReachingInsnsBackward");
333 BC.MIB->getOrCreateAnnotationIndex(Name: "ReachingInsns");
334 BC.MIB->getOrCreateAnnotationIndex(Name: "AccessesDeletedPos");
335 BC.MIB->getOrCreateAnnotationIndex(Name: "DeleteMe");
336
337 std::vector<std::pair<uint64_t, const BinaryFunction *>> Top10Funcs;
338 auto LogFunc = [&](BinaryFunction &BF) {
339 auto Lower = llvm::lower_bound(
340 Range&: Top10Funcs, Value: BF.getKnownExecutionCount(),
341 C: [](const std::pair<uint64_t, const BinaryFunction *> &Elmt,
342 uint64_t Value) { return Elmt.first > Value; });
343 if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10)
344 return;
345 Top10Funcs.insert(position: Lower,
346 x: std::make_pair<>(x: BF.getKnownExecutionCount(), y: &BF));
347 if (Top10Funcs.size() > 10)
348 Top10Funcs.resize(new_size: 10);
349 };
350 (void)LogFunc;
351
352 ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
353 if (BF.getFunctionScore() == 0)
354 return true;
355
356 return false;
357 };
358
359 const bool HotOnly = opts::FrameOptimization == FOP_HOT;
360
361 Error SWError = Error::success();
362
363 ParallelUtilities::WorkFuncWithAllocTy WorkFunction =
364 [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) {
365 DataflowInfoManager Info(BF, &RA, &FA, AllocatorId);
366 ShrinkWrapping SW(FA, BF, Info, AllocatorId);
367
368 auto ChangedOrErr = SW.perform(HotOnly);
369 if (auto E = ChangedOrErr.takeError()) {
370 std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
371 SWError = joinErrors(E1: std::move(SWError), E2: Error(std::move(E)));
372 return;
373 }
374 const bool Changed = *ChangedOrErr;
375 if (Changed) {
376 std::lock_guard<std::mutex> Lock(FuncsChangedMutex);
377 FuncsChanged.insert(V: &BF);
378 LLVM_DEBUG(LogFunc(BF));
379 }
380 };
381
382 ParallelUtilities::runOnEachFunctionWithUniqueAllocId(
383 BC, SchedPolicy: ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction,
384 SkipPredicate, LogName: "shrink-wrapping");
385
386 if (!Top10Funcs.empty()) {
387 BC.outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n";
388 for (const auto &Elmt : Top10Funcs)
389 BC.outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n";
390 }
391 return SWError;
392}
393
394} // namespace bolt
395} // namespace llvm
396

source code of bolt/lib/Passes/FrameOptimizer.cpp