1 | //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the transformation that optimizes memory intrinsics |
10 | // such as memcpy using the size value profile. When memory intrinsic size |
11 | // value profile metadata is available, a single memory intrinsic is expanded |
12 | // to a sequence of guarded specialized versions that are called with the |
13 | // hottest size(s), for later expansion into more optimal inline sequences. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "llvm/ADT/ArrayRef.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include "llvm/ADT/Twine.h" |
21 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
22 | #include "llvm/Analysis/DomTreeUpdater.h" |
23 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
24 | #include "llvm/Analysis/TargetLibraryInfo.h" |
25 | #include "llvm/IR/BasicBlock.h" |
26 | #include "llvm/IR/DerivedTypes.h" |
27 | #include "llvm/IR/Dominators.h" |
28 | #include "llvm/IR/Function.h" |
29 | #include "llvm/IR/IRBuilder.h" |
30 | #include "llvm/IR/InstVisitor.h" |
31 | #include "llvm/IR/Instruction.h" |
32 | #include "llvm/IR/Instructions.h" |
33 | #include "llvm/IR/LLVMContext.h" |
34 | #include "llvm/IR/PassManager.h" |
35 | #include "llvm/IR/Type.h" |
36 | #include "llvm/ProfileData/InstrProf.h" |
37 | #define INSTR_PROF_VALUE_PROF_MEMOP_API |
38 | #include "llvm/ProfileData/InstrProfData.inc" |
39 | #include "llvm/Support/Casting.h" |
40 | #include "llvm/Support/CommandLine.h" |
41 | #include "llvm/Support/Debug.h" |
42 | #include "llvm/Support/ErrorHandling.h" |
43 | #include "llvm/Support/MathExtras.h" |
44 | #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" |
45 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
46 | #include <cassert> |
47 | #include <cstdint> |
48 | #include <vector> |
49 | |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE "pgo-memop-opt" |
53 | |
54 | STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized." ); |
55 | STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated." ); |
56 | |
57 | // The minimum call count to optimize memory intrinsic calls. |
58 | static cl::opt<unsigned> |
59 | MemOPCountThreshold("pgo-memop-count-threshold" , cl::Hidden, cl::init(Val: 1000), |
60 | cl::desc("The minimum count to optimize memory " |
61 | "intrinsic calls" )); |
62 | |
63 | // Command line option to disable memory intrinsic optimization. The default is |
64 | // false. This is for debug purpose. |
65 | static cl::opt<bool> DisableMemOPOPT("disable-memop-opt" , cl::init(Val: false), |
66 | cl::Hidden, cl::desc("Disable optimize" )); |
67 | |
68 | // The percent threshold to optimize memory intrinsic calls. |
69 | static cl::opt<unsigned> |
70 | MemOPPercentThreshold("pgo-memop-percent-threshold" , cl::init(Val: 40), |
71 | cl::Hidden, |
72 | cl::desc("The percentage threshold for the " |
73 | "memory intrinsic calls optimization" )); |
74 | |
75 | // Maximum number of versions for optimizing memory intrinsic call. |
76 | static cl::opt<unsigned> |
77 | MemOPMaxVersion("pgo-memop-max-version" , cl::init(Val: 3), cl::Hidden, |
78 | cl::desc("The max version for the optimized memory " |
79 | " intrinsic calls" )); |
80 | |
81 | // Scale the counts from the annotation using the BB count value. |
82 | static cl::opt<bool> |
83 | MemOPScaleCount("pgo-memop-scale-count" , cl::init(Val: true), cl::Hidden, |
84 | cl::desc("Scale the memop size counts using the basic " |
85 | " block count value" )); |
86 | |
87 | cl::opt<bool> |
88 | MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp" , cl::init(Val: true), |
89 | cl::Hidden, |
90 | cl::desc("Size-specialize memcmp and bcmp calls" )); |
91 | |
92 | static cl::opt<unsigned> |
93 | MemOpMaxOptSize("memop-value-prof-max-opt-size" , cl::Hidden, cl::init(Val: 128), |
94 | cl::desc("Optimize the memop size <= this value" )); |
95 | |
96 | namespace { |
97 | |
98 | static const char *getMIName(const MemIntrinsic *MI) { |
99 | switch (MI->getIntrinsicID()) { |
100 | case Intrinsic::memcpy: |
101 | return "memcpy" ; |
102 | case Intrinsic::memmove: |
103 | return "memmove" ; |
104 | case Intrinsic::memset: |
105 | return "memset" ; |
106 | default: |
107 | return "unknown" ; |
108 | } |
109 | } |
110 | |
111 | // A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). |
112 | struct MemOp { |
113 | Instruction *I; |
114 | MemOp(MemIntrinsic *MI) : I(MI) {} |
115 | MemOp(CallInst *CI) : I(CI) {} |
116 | MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(Val: I); } |
117 | CallInst *asCI() { return cast<CallInst>(Val: I); } |
118 | MemOp clone() { |
119 | if (auto MI = asMI()) |
120 | return MemOp(cast<MemIntrinsic>(Val: MI->clone())); |
121 | return MemOp(cast<CallInst>(Val: asCI()->clone())); |
122 | } |
123 | Value *getLength() { |
124 | if (auto MI = asMI()) |
125 | return MI->getLength(); |
126 | return asCI()->getArgOperand(i: 2); |
127 | } |
128 | void setLength(Value *Length) { |
129 | if (auto MI = asMI()) |
130 | return MI->setLength(Length); |
131 | asCI()->setArgOperand(i: 2, v: Length); |
132 | } |
133 | StringRef getFuncName() { |
134 | if (auto MI = asMI()) |
135 | return MI->getCalledFunction()->getName(); |
136 | return asCI()->getCalledFunction()->getName(); |
137 | } |
138 | bool isMemmove() { |
139 | if (auto MI = asMI()) |
140 | if (MI->getIntrinsicID() == Intrinsic::memmove) |
141 | return true; |
142 | return false; |
143 | } |
144 | bool isMemcmp(TargetLibraryInfo &TLI) { |
145 | LibFunc Func; |
146 | if (asMI() == nullptr && TLI.getLibFunc(CB: *asCI(), F&: Func) && |
147 | Func == LibFunc_memcmp) { |
148 | return true; |
149 | } |
150 | return false; |
151 | } |
152 | bool isBcmp(TargetLibraryInfo &TLI) { |
153 | LibFunc Func; |
154 | if (asMI() == nullptr && TLI.getLibFunc(CB: *asCI(), F&: Func) && |
155 | Func == LibFunc_bcmp) { |
156 | return true; |
157 | } |
158 | return false; |
159 | } |
160 | const char *getName(TargetLibraryInfo &TLI) { |
161 | if (auto MI = asMI()) |
162 | return getMIName(MI); |
163 | LibFunc Func; |
164 | if (TLI.getLibFunc(CB: *asCI(), F&: Func)) { |
165 | if (Func == LibFunc_memcmp) |
166 | return "memcmp" ; |
167 | if (Func == LibFunc_bcmp) |
168 | return "bcmp" ; |
169 | } |
170 | llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst" ); |
171 | return nullptr; |
172 | } |
173 | }; |
174 | |
175 | class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> { |
176 | public: |
177 | (Function &Func, BlockFrequencyInfo &BFI, |
178 | OptimizationRemarkEmitter &ORE, DominatorTree *DT, |
179 | TargetLibraryInfo &TLI) |
180 | : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { |
181 | ValueDataArray = |
182 | std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS); |
183 | } |
184 | bool isChanged() const { return Changed; } |
185 | void perform() { |
186 | WorkList.clear(); |
187 | visit(F&: Func); |
188 | |
189 | for (auto &MO : WorkList) { |
190 | ++NumOfPGOMemOPAnnotate; |
191 | if (perform(MO)) { |
192 | Changed = true; |
193 | ++NumOfPGOMemOPOpt; |
194 | LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() |
195 | << "is Transformed.\n" ); |
196 | } |
197 | } |
198 | } |
199 | |
200 | void visitMemIntrinsic(MemIntrinsic &MI) { |
201 | Value *Length = MI.getLength(); |
202 | // Not perform on constant length calls. |
203 | if (isa<ConstantInt>(Val: Length)) |
204 | return; |
205 | WorkList.push_back(x: MemOp(&MI)); |
206 | } |
207 | |
208 | void visitCallInst(CallInst &CI) { |
209 | LibFunc Func; |
210 | if (TLI.getLibFunc(CB: CI, F&: Func) && |
211 | (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && |
212 | !isa<ConstantInt>(Val: CI.getArgOperand(i: 2))) { |
213 | WorkList.push_back(x: MemOp(&CI)); |
214 | } |
215 | } |
216 | |
217 | private: |
218 | Function &Func; |
219 | BlockFrequencyInfo &BFI; |
220 | OptimizationRemarkEmitter &ORE; |
221 | DominatorTree *DT; |
222 | TargetLibraryInfo &TLI; |
223 | bool Changed; |
224 | std::vector<MemOp> WorkList; |
225 | // The space to read the profile annotation. |
226 | std::unique_ptr<InstrProfValueData[]> ValueDataArray; |
227 | bool perform(MemOp MO); |
228 | }; |
229 | |
230 | static bool isProfitable(uint64_t Count, uint64_t TotalCount) { |
231 | assert(Count <= TotalCount); |
232 | if (Count < MemOPCountThreshold) |
233 | return false; |
234 | if (Count < TotalCount * MemOPPercentThreshold / 100) |
235 | return false; |
236 | return true; |
237 | } |
238 | |
239 | static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, |
240 | uint64_t Denom) { |
241 | if (!MemOPScaleCount) |
242 | return Count; |
243 | bool Overflowed; |
244 | uint64_t ScaleCount = SaturatingMultiply(X: Count, Y: Num, ResultOverflowed: &Overflowed); |
245 | return ScaleCount / Denom; |
246 | } |
247 | |
248 | bool MemOPSizeOpt::perform(MemOp MO) { |
249 | assert(MO.I); |
250 | if (MO.isMemmove()) |
251 | return false; |
252 | if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) |
253 | return false; |
254 | |
255 | uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; |
256 | uint64_t TotalCount; |
257 | if (!getValueProfDataFromInst(Inst: *MO.I, ValueKind: IPVK_MemOPSize, MaxNumValueData: MaxNumVals, |
258 | ValueData: ValueDataArray.get(), ActualNumValueData&: NumVals, TotalC&: TotalCount)) |
259 | return false; |
260 | |
261 | uint64_t ActualCount = TotalCount; |
262 | uint64_t SavedTotalCount = TotalCount; |
263 | if (MemOPScaleCount) { |
264 | auto BBEdgeCount = BFI.getBlockProfileCount(BB: MO.I->getParent()); |
265 | if (!BBEdgeCount) |
266 | return false; |
267 | ActualCount = *BBEdgeCount; |
268 | } |
269 | |
270 | ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals); |
271 | LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " |
272 | << ActualCount << "\n" ); |
273 | LLVM_DEBUG( |
274 | for (auto &VD |
275 | : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n" ; }); |
276 | |
277 | if (ActualCount < MemOPCountThreshold) |
278 | return false; |
279 | // Skip if the total value profiled count is 0, in which case we can't |
280 | // scale up the counts properly (and there is no profitable transformation). |
281 | if (TotalCount == 0) |
282 | return false; |
283 | |
284 | TotalCount = ActualCount; |
285 | if (MemOPScaleCount) |
286 | LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount |
287 | << " denominator = " << SavedTotalCount << "\n" ); |
288 | |
289 | // Keeping track of the count of the default case: |
290 | uint64_t RemainCount = TotalCount; |
291 | uint64_t SavedRemainCount = SavedTotalCount; |
292 | SmallVector<uint64_t, 16> SizeIds; |
293 | SmallVector<uint64_t, 16> CaseCounts; |
294 | SmallDenseSet<uint64_t, 16> SeenSizeId; |
295 | uint64_t MaxCount = 0; |
296 | unsigned Version = 0; |
297 | // Default case is in the front -- save the slot here. |
298 | CaseCounts.push_back(Elt: 0); |
299 | SmallVector<InstrProfValueData, 24> RemainingVDs; |
300 | for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { |
301 | auto &VD = *I; |
302 | int64_t V = VD.Value; |
303 | uint64_t C = VD.Count; |
304 | if (MemOPScaleCount) |
305 | C = getScaledCount(Count: C, Num: ActualCount, Denom: SavedTotalCount); |
306 | |
307 | if (!InstrProfIsSingleValRange(Value: V) || V > MemOpMaxOptSize) { |
308 | RemainingVDs.push_back(Elt: VD); |
309 | continue; |
310 | } |
311 | |
312 | // ValueCounts are sorted on the count. Break at the first un-profitable |
313 | // value. |
314 | if (!isProfitable(Count: C, TotalCount: RemainCount)) { |
315 | RemainingVDs.insert(I: RemainingVDs.end(), From: I, To: E); |
316 | break; |
317 | } |
318 | |
319 | if (!SeenSizeId.insert(V).second) { |
320 | errs() << "warning: Invalid Profile Data in Function " << Func.getName() |
321 | << ": Two identical values in MemOp value counts.\n" ; |
322 | return false; |
323 | } |
324 | |
325 | SizeIds.push_back(Elt: V); |
326 | CaseCounts.push_back(Elt: C); |
327 | if (C > MaxCount) |
328 | MaxCount = C; |
329 | |
330 | assert(RemainCount >= C); |
331 | RemainCount -= C; |
332 | assert(SavedRemainCount >= VD.Count); |
333 | SavedRemainCount -= VD.Count; |
334 | |
335 | if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { |
336 | RemainingVDs.insert(I: RemainingVDs.end(), From: I + 1, To: E); |
337 | break; |
338 | } |
339 | } |
340 | |
341 | if (Version == 0) |
342 | return false; |
343 | |
344 | CaseCounts[0] = RemainCount; |
345 | if (RemainCount > MaxCount) |
346 | MaxCount = RemainCount; |
347 | |
348 | uint64_t SumForOpt = TotalCount - RemainCount; |
349 | |
350 | LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version |
351 | << " Versions (covering " << SumForOpt << " out of " |
352 | << TotalCount << ")\n" ); |
353 | |
354 | // mem_op(..., size) |
355 | // ==> |
356 | // switch (size) { |
357 | // case s1: |
358 | // mem_op(..., s1); |
359 | // goto merge_bb; |
360 | // case s2: |
361 | // mem_op(..., s2); |
362 | // goto merge_bb; |
363 | // ... |
364 | // default: |
365 | // mem_op(..., size); |
366 | // goto merge_bb; |
367 | // } |
368 | // merge_bb: |
369 | |
370 | BasicBlock *BB = MO.I->getParent(); |
371 | LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n" ); |
372 | LLVM_DEBUG(dbgs() << *BB << "\n" ); |
373 | auto OrigBBFreq = BFI.getBlockFreq(BB); |
374 | |
375 | BasicBlock *DefaultBB = SplitBlock(Old: BB, SplitPt: MO.I, DT); |
376 | BasicBlock::iterator It(*MO.I); |
377 | ++It; |
378 | assert(It != DefaultBB->end()); |
379 | BasicBlock *MergeBB = SplitBlock(Old: DefaultBB, SplitPt: &(*It), DT); |
380 | MergeBB->setName("MemOP.Merge" ); |
381 | BFI.setBlockFreq(BB: MergeBB, Freq: OrigBBFreq); |
382 | DefaultBB->setName("MemOP.Default" ); |
383 | |
384 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
385 | auto &Ctx = Func.getContext(); |
386 | IRBuilder<> IRB(BB); |
387 | BB->getTerminator()->eraseFromParent(); |
388 | Value *SizeVar = MO.getLength(); |
389 | SwitchInst *SI = IRB.CreateSwitch(V: SizeVar, Dest: DefaultBB, NumCases: SizeIds.size()); |
390 | Type *MemOpTy = MO.I->getType(); |
391 | PHINode *PHI = nullptr; |
392 | if (!MemOpTy->isVoidTy()) { |
393 | // Insert a phi for the return values at the merge block. |
394 | IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); |
395 | PHI = IRBM.CreatePHI(Ty: MemOpTy, NumReservedValues: SizeIds.size() + 1, Name: "MemOP.RVMerge" ); |
396 | MO.I->replaceAllUsesWith(V: PHI); |
397 | PHI->addIncoming(V: MO.I, BB: DefaultBB); |
398 | } |
399 | |
400 | // Clear the value profile data. |
401 | MO.I->setMetadata(KindID: LLVMContext::MD_prof, Node: nullptr); |
402 | // If all promoted, we don't need the MD.prof metadata. |
403 | if (SavedRemainCount > 0 || Version != NumVals) { |
404 | // Otherwise we need update with the un-promoted records back. |
405 | ArrayRef<InstrProfValueData> RemVDs(RemainingVDs); |
406 | annotateValueSite(M&: *Func.getParent(), Inst&: *MO.I, VDs: RemVDs, Sum: SavedRemainCount, |
407 | ValueKind: IPVK_MemOPSize, MaxMDCount: NumVals); |
408 | } |
409 | |
410 | LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n" ); |
411 | |
412 | std::vector<DominatorTree::UpdateType> Updates; |
413 | if (DT) |
414 | Updates.reserve(n: 2 * SizeIds.size()); |
415 | |
416 | for (uint64_t SizeId : SizeIds) { |
417 | BasicBlock *CaseBB = BasicBlock::Create( |
418 | Context&: Ctx, Name: Twine("MemOP.Case." ) + Twine(SizeId), Parent: &Func, InsertBefore: DefaultBB); |
419 | MemOp NewMO = MO.clone(); |
420 | // Fix the argument. |
421 | auto *SizeType = dyn_cast<IntegerType>(Val: NewMO.getLength()->getType()); |
422 | assert(SizeType && "Expected integer type size argument." ); |
423 | ConstantInt *CaseSizeId = ConstantInt::get(Ty: SizeType, V: SizeId); |
424 | NewMO.setLength(CaseSizeId); |
425 | NewMO.I->insertInto(ParentBB: CaseBB, It: CaseBB->end()); |
426 | IRBuilder<> IRBCase(CaseBB); |
427 | IRBCase.CreateBr(Dest: MergeBB); |
428 | SI->addCase(OnVal: CaseSizeId, Dest: CaseBB); |
429 | if (!MemOpTy->isVoidTy()) |
430 | PHI->addIncoming(V: NewMO.I, BB: CaseBB); |
431 | if (DT) { |
432 | Updates.push_back(x: {DominatorTree::Insert, CaseBB, MergeBB}); |
433 | Updates.push_back(x: {DominatorTree::Insert, BB, CaseBB}); |
434 | } |
435 | LLVM_DEBUG(dbgs() << *CaseBB << "\n" ); |
436 | } |
437 | DTU.applyUpdates(Updates); |
438 | Updates.clear(); |
439 | |
440 | if (MaxCount) |
441 | setProfMetadata(M: Func.getParent(), TI: SI, EdgeCounts: CaseCounts, MaxCount); |
442 | |
443 | LLVM_DEBUG(dbgs() << *BB << "\n" ); |
444 | LLVM_DEBUG(dbgs() << *DefaultBB << "\n" ); |
445 | LLVM_DEBUG(dbgs() << *MergeBB << "\n" ); |
446 | |
447 | ORE.emit(RemarkBuilder: [&]() { |
448 | using namespace ore; |
449 | return OptimizationRemark(DEBUG_TYPE, "memopt-opt" , MO.I) |
450 | << "optimized " << NV("Memop" , MO.getName(TLI)) << " with count " |
451 | << NV("Count" , SumForOpt) << " out of " << NV("Total" , TotalCount) |
452 | << " for " << NV("Versions" , Version) << " versions" ; |
453 | }); |
454 | |
455 | return true; |
456 | } |
457 | } // namespace |
458 | |
459 | static bool (Function &F, BlockFrequencyInfo &BFI, |
460 | OptimizationRemarkEmitter &ORE, |
461 | DominatorTree *DT, TargetLibraryInfo &TLI) { |
462 | if (DisableMemOPOPT) |
463 | return false; |
464 | |
465 | if (F.hasFnAttribute(Attribute::OptimizeForSize)) |
466 | return false; |
467 | MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); |
468 | MemOPSizeOpt.perform(); |
469 | return MemOPSizeOpt.isChanged(); |
470 | } |
471 | |
472 | PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, |
473 | FunctionAnalysisManager &FAM) { |
474 | auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(IR&: F); |
475 | auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
476 | auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F); |
477 | auto &TLI = FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
478 | bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); |
479 | if (!Changed) |
480 | return PreservedAnalyses::all(); |
481 | auto PA = PreservedAnalyses(); |
482 | PA.preserve<DominatorTreeAnalysis>(); |
483 | return PA; |
484 | } |
485 | |