1 | //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file |
10 | // Interface file for the IRSimilarityIdentifier for identifying similarities in |
11 | // IR including the IRInstructionMapper, which maps an Instruction to unsigned |
12 | // integers. |
13 | // |
14 | // Two sequences of instructions are called "similar" if they perform the same |
15 | // series of operations for all inputs. |
16 | // |
17 | // \code |
18 | // %1 = add i32 %a, 10 |
19 | // %2 = add i32 %a, %1 |
20 | // %3 = icmp slt icmp %1, %2 |
21 | // \endcode |
22 | // |
23 | // and |
24 | // |
25 | // \code |
26 | // %1 = add i32 11, %a |
27 | // %2 = sub i32 %a, %1 |
28 | // %3 = icmp sgt icmp %2, %1 |
29 | // \endcode |
30 | // |
31 | // ultimately have the same result, even if the inputs, and structure are |
32 | // slightly different. |
33 | // |
34 | // For instructions, we do not worry about operands that do not have fixed |
35 | // semantic meaning to the program. We consider the opcode that the instruction |
36 | // has, the types, parameters, and extra information such as the function name, |
37 | // or comparison predicate. These are used to create a hash to map instructions |
38 | // to integers to be used in similarity matching in sequences of instructions |
39 | // |
40 | // Terminology: |
41 | // An IRSimilarityCandidate is a region of IRInstructionData (wrapped |
42 | // Instructions), usually used to denote a region of similarity has been found. |
43 | // |
44 | // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally |
45 | // similar to one another. |
46 | // |
47 | //===----------------------------------------------------------------------===// |
48 | |
49 | #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
50 | #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
51 | |
52 | #include "llvm/IR/InstVisitor.h" |
53 | #include "llvm/IR/Instructions.h" |
54 | #include "llvm/IR/PassManager.h" |
55 | #include "llvm/Pass.h" |
56 | #include "llvm/Support/Allocator.h" |
57 | #include <optional> |
58 | |
59 | namespace llvm { |
60 | class Module; |
61 | |
62 | namespace IRSimilarity { |
63 | |
64 | struct IRInstructionDataList; |
65 | |
66 | /// This represents what is and is not supported when finding similarity in |
67 | /// Instructions. |
68 | /// |
69 | /// Legal Instructions are considered when looking at similarity between |
70 | /// Instructions. |
71 | /// |
72 | /// Illegal Instructions cannot be considered when looking for similarity |
73 | /// between Instructions. They act as boundaries between similarity regions. |
74 | /// |
75 | /// Invisible Instructions are skipped over during analysis. |
76 | // TODO: Shared with MachineOutliner |
77 | enum InstrType { Legal, Illegal, Invisible }; |
78 | |
79 | /// This provides the utilities for hashing an Instruction to an unsigned |
80 | /// integer. Two IRInstructionDatas produce the same hash value when their |
81 | /// underlying Instructions perform the same operation (even if they don't have |
82 | /// the same input operands.) |
83 | /// As a more concrete example, consider the following: |
84 | /// |
85 | /// \code |
86 | /// %add1 = add i32 %a, %b |
87 | /// %add2 = add i32 %c, %d |
88 | /// %add3 = add i64 %e, %f |
89 | /// \endcode |
90 | /// |
91 | // Then the IRInstructionData wrappers for these Instructions may be hashed like |
92 | /// so: |
93 | /// |
94 | /// \code |
95 | /// ; These two adds have the same types and operand types, so they hash to the |
96 | /// ; same number. |
97 | /// %add1 = add i32 %a, %b ; Hash: 1 |
98 | /// %add2 = add i32 %c, %d ; Hash: 1 |
99 | /// ; This add produces an i64. This differentiates it from %add1 and %add2. So, |
100 | /// ; it hashes to a different number. |
101 | /// %add3 = add i64 %e, %f; Hash: 2 |
102 | /// \endcode |
103 | /// |
104 | /// |
105 | /// This hashing scheme will be used to represent the program as a very long |
106 | /// string. This string can then be placed in a data structure which can be used |
107 | /// for similarity queries. |
108 | /// |
109 | /// TODO: Handle types of Instructions which can be equal even with different |
110 | /// operands. (E.g. comparisons with swapped predicates.) |
111 | /// TODO: Handle CallInsts, which are only checked for function type |
112 | /// by \ref isSameOperationAs. |
113 | /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the |
114 | /// exact same, and some do not. |
115 | struct IRInstructionData |
116 | : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> { |
117 | |
118 | /// The source Instruction that is being wrapped. |
119 | Instruction *Inst = nullptr; |
120 | /// The values of the operands in the Instruction. |
121 | SmallVector<Value *, 4> OperVals; |
122 | /// The legality of the wrapped instruction. This is informed by InstrType, |
123 | /// and is used when checking when two instructions are considered similar. |
124 | /// If either instruction is not legal, the instructions are automatically not |
125 | /// considered similar. |
126 | bool Legal = false; |
127 | |
128 | /// This is only relevant if we are wrapping a CmpInst where we needed to |
129 | /// change the predicate of a compare instruction from a greater than form |
130 | /// to a less than form. It is std::nullopt otherwise. |
131 | std::optional<CmpInst::Predicate> RevisedPredicate; |
132 | |
133 | /// This is only relevant if we are wrapping a CallInst. If we are requiring |
134 | /// that the function calls have matching names as well as types, and the |
135 | /// call is not an indirect call, this will hold the name of the function. If |
136 | /// it is an indirect string, it will be the empty string. However, if this |
137 | /// requirement is not in place it will be the empty string regardless of the |
138 | /// function call type. The value held here is used to create the hash of the |
139 | /// instruction, and check to make sure two instructions are close to one |
140 | /// another. |
141 | std::optional<std::string> CalleeName; |
142 | |
143 | /// This structure holds the distances of how far "ahead of" or "behind" the |
144 | /// target blocks of a branch, or the incoming blocks of a phi nodes are. |
145 | /// If the value is negative, it means that the block was registered before |
146 | /// the block of this instruction in terms of blocks in the function. |
147 | /// Code Example: |
148 | /// \code |
149 | /// block_1: |
150 | /// br i1 %0, label %block_2, label %block_3 |
151 | /// block_2: |
152 | /// br i1 %1, label %block_1, label %block_2 |
153 | /// block_3: |
154 | /// br i1 %2, label %block_2, label %block_1 |
155 | /// ; Replacing the labels with relative values, this becomes: |
156 | /// block_1: |
157 | /// br i1 %0, distance 1, distance 2 |
158 | /// block_2: |
159 | /// br i1 %1, distance -1, distance 0 |
160 | /// block_3: |
161 | /// br i1 %2, distance -1, distance -2 |
162 | /// \endcode |
163 | /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is |
164 | /// "ahead" of block_2. |
165 | SmallVector<int, 4> RelativeBlockLocations; |
166 | |
167 | /// Gather the information that is difficult to gather for an Instruction, or |
168 | /// is changed. i.e. the operands of an Instruction and the Types of those |
169 | /// operands. This extra information allows for similarity matching to make |
170 | /// assertions that allow for more flexibility when checking for whether an |
171 | /// Instruction performs the same operation. |
172 | IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL); |
173 | IRInstructionData(IRInstructionDataList &IDL); |
174 | |
175 | /// Fills data stuctures for IRInstructionData when it is constructed from a |
176 | // reference or a pointer. |
177 | void initializeInstruction(); |
178 | |
179 | /// Get the predicate that the compare instruction is using for hashing the |
180 | /// instruction. the IRInstructionData must be wrapping a CmpInst. |
181 | CmpInst::Predicate getPredicate() const; |
182 | |
183 | /// Get the callee name that the call instruction is using for hashing the |
184 | /// instruction. The IRInstructionData must be wrapping a CallInst. |
185 | StringRef getCalleeName() const; |
186 | |
187 | /// A function that swaps the predicates to their less than form if they are |
188 | /// in a greater than form. Otherwise, the predicate is unchanged. |
189 | /// |
190 | /// \param CI - The comparison operation to find a consistent preidcate for. |
191 | /// \return the consistent comparison predicate. |
192 | static CmpInst::Predicate predicateForConsistency(CmpInst *CI); |
193 | |
194 | /// For an IRInstructionData containing a branch, finds the |
195 | /// relative distances from the source basic block to the target by taking |
196 | /// the difference of the number assigned to the current basic block and the |
197 | /// target basic block of the branch. |
198 | /// |
199 | /// \param BasicBlockToInteger - The mapping of basic blocks to their location |
200 | /// in the module. |
201 | void |
202 | setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger); |
203 | |
204 | /// For an IRInstructionData containing a CallInst, set the function name |
205 | /// appropriately. This will be an empty string if it is an indirect call, |
206 | /// or we are not matching by name of the called function. It will be the |
207 | /// name of the function if \p MatchByName is true and it is not an indirect |
208 | /// call. We may decide not to match by name in order to expand the |
209 | /// size of the regions we can match. If a function name has the same type |
210 | /// signature, but the different name, the region of code is still almost the |
211 | /// same. Since function names can be treated as constants, the name itself |
212 | /// could be extrapolated away. However, matching by name provides a |
213 | /// specificity and more "identical" code than not matching by name. |
214 | /// |
215 | /// \param MatchByName - A flag to mark whether we are using the called |
216 | /// function name as a differentiating parameter. |
217 | void setCalleeName(bool MatchByName = true); |
218 | |
219 | /// For an IRInstructionData containing a PHINode, finds the |
220 | /// relative distances from the incoming basic block to the current block by |
221 | /// taking the difference of the number assigned to the current basic block |
222 | /// and the incoming basic block of the branch. |
223 | /// |
224 | /// \param BasicBlockToInteger - The mapping of basic blocks to their location |
225 | /// in the module. |
226 | void |
227 | setPHIPredecessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger); |
228 | |
229 | /// Get the BasicBlock based operands for PHINodes and BranchInsts. |
230 | /// |
231 | /// \returns A list of relevant BasicBlocks. |
232 | ArrayRef<Value *> getBlockOperVals(); |
233 | |
234 | /// Hashes \p Value based on its opcode, types, and operand types. |
235 | /// Two IRInstructionData instances produce the same hash when they perform |
236 | /// the same operation. |
237 | /// |
238 | /// As a simple example, consider the following instructions. |
239 | /// |
240 | /// \code |
241 | /// %add1 = add i32 %x1, %y1 |
242 | /// %add2 = add i32 %x2, %y2 |
243 | /// |
244 | /// %sub = sub i32 %x1, %y1 |
245 | /// |
246 | /// %add_i64 = add i64 %x2, %y2 |
247 | /// \endcode |
248 | /// |
249 | /// Because the first two adds operate the same types, and are performing the |
250 | /// same action, they will be hashed to the same value. |
251 | /// |
252 | /// However, the subtraction instruction is not the same as an addition, and |
253 | /// will be hashed to a different value. |
254 | /// |
255 | /// Finally, the last add has a different type compared to the first two add |
256 | /// instructions, so it will also be hashed to a different value that any of |
257 | /// the previous instructions. |
258 | /// |
259 | /// \param [in] ID - The IRInstructionData instance to be hashed. |
260 | /// \returns A hash_value of the IRInstructionData. |
261 | friend hash_code hash_value(const IRInstructionData &ID) { |
262 | SmallVector<Type *, 4> OperTypes; |
263 | for (Value *V : ID.OperVals) |
264 | OperTypes.push_back(Elt: V->getType()); |
265 | |
266 | if (isa<CmpInst>(Val: ID.Inst)) |
267 | return llvm::hash_combine( |
268 | args: llvm::hash_value(value: ID.Inst->getOpcode()), |
269 | args: llvm::hash_value(ptr: ID.Inst->getType()), |
270 | args: llvm::hash_value(value: ID.getPredicate()), |
271 | args: llvm::hash_combine_range(first: OperTypes.begin(), last: OperTypes.end())); |
272 | |
273 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: ID.Inst)) { |
274 | // To hash intrinsics, we use the opcode, and types like the other |
275 | // instructions, but also, the Intrinsic ID, and the Name of the |
276 | // intrinsic. |
277 | Intrinsic::ID IntrinsicID = II->getIntrinsicID(); |
278 | return llvm::hash_combine( |
279 | args: llvm::hash_value(value: ID.Inst->getOpcode()), |
280 | args: llvm::hash_value(ptr: ID.Inst->getType()), args: llvm::hash_value(value: IntrinsicID), |
281 | args: llvm::hash_value(arg: *ID.CalleeName), |
282 | args: llvm::hash_combine_range(first: OperTypes.begin(), last: OperTypes.end())); |
283 | } |
284 | |
285 | if (isa<CallInst>(Val: ID.Inst)) { |
286 | std::string FunctionName = *ID.CalleeName; |
287 | return llvm::hash_combine( |
288 | args: llvm::hash_value(value: ID.Inst->getOpcode()), |
289 | args: llvm::hash_value(ptr: ID.Inst->getType()), |
290 | args: llvm::hash_value(ptr: ID.Inst->getType()), args: llvm::hash_value(arg: FunctionName), |
291 | args: llvm::hash_combine_range(first: OperTypes.begin(), last: OperTypes.end())); |
292 | } |
293 | |
294 | return llvm::hash_combine( |
295 | args: llvm::hash_value(value: ID.Inst->getOpcode()), |
296 | args: llvm::hash_value(ptr: ID.Inst->getType()), |
297 | args: llvm::hash_combine_range(first: OperTypes.begin(), last: OperTypes.end())); |
298 | } |
299 | |
300 | IRInstructionDataList *IDL = nullptr; |
301 | }; |
302 | |
303 | struct IRInstructionDataList |
304 | : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {}; |
305 | |
306 | /// Compare one IRInstructionData class to another IRInstructionData class for |
307 | /// whether they are performing a the same operation, and can mapped to the |
308 | /// same value. For regular instructions if the hash value is the same, then |
309 | /// they will also be close. |
310 | /// |
311 | /// \param A - The first IRInstructionData class to compare |
312 | /// \param B - The second IRInstructionData class to compare |
313 | /// \returns true if \p A and \p B are similar enough to be mapped to the same |
314 | /// value. |
315 | bool isClose(const IRInstructionData &A, const IRInstructionData &B); |
316 | |
317 | struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { |
318 | static inline IRInstructionData *getEmptyKey() { return nullptr; } |
319 | static inline IRInstructionData *getTombstoneKey() { |
320 | return reinterpret_cast<IRInstructionData *>(-1); |
321 | } |
322 | |
323 | static unsigned getHashValue(const IRInstructionData *E) { |
324 | using llvm::hash_value; |
325 | assert(E && "IRInstructionData is a nullptr?" ); |
326 | return hash_value(ID: *E); |
327 | } |
328 | |
329 | static bool isEqual(const IRInstructionData *LHS, |
330 | const IRInstructionData *RHS) { |
331 | if (RHS == getEmptyKey() || RHS == getTombstoneKey() || |
332 | LHS == getEmptyKey() || LHS == getTombstoneKey()) |
333 | return LHS == RHS; |
334 | |
335 | assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?" ); |
336 | return isClose(A: *LHS, B: *RHS); |
337 | } |
338 | }; |
339 | |
340 | /// Helper struct for converting the Instructions in a Module into a vector of |
341 | /// unsigned integers. This vector of unsigned integers can be thought of as a |
342 | /// "numeric string". This numeric string can then be queried by, for example, |
343 | /// data structures that find repeated substrings. |
344 | /// |
345 | /// This hashing is done per BasicBlock in the module. To hash Instructions |
346 | /// based off of their operations, each Instruction is wrapped in an |
347 | /// IRInstructionData struct. The unsigned integer for an IRInstructionData |
348 | /// depends on: |
349 | /// - The hash provided by the IRInstructionData. |
350 | /// - Which member of InstrType the IRInstructionData is classified as. |
351 | // See InstrType for more details on the possible classifications, and how they |
352 | // manifest in the numeric string. |
353 | /// |
354 | /// The numeric string for an individual BasicBlock is terminated by an unique |
355 | /// unsigned integer. This prevents data structures which rely on repetition |
356 | /// from matching across BasicBlocks. (For example, the SuffixTree.) |
357 | /// As a concrete example, if we have the following two BasicBlocks: |
358 | /// \code |
359 | /// bb0: |
360 | /// %add1 = add i32 %a, %b |
361 | /// %add2 = add i32 %c, %d |
362 | /// %add3 = add i64 %e, %f |
363 | /// bb1: |
364 | /// %sub = sub i32 %c, %d |
365 | /// \endcode |
366 | /// We may hash the Instructions like this (via IRInstructionData): |
367 | /// \code |
368 | /// bb0: |
369 | /// %add1 = add i32 %a, %b ; Hash: 1 |
370 | /// %add2 = add i32 %c, %d; Hash: 1 |
371 | /// %add3 = add i64 %e, %f; Hash: 2 |
372 | /// bb1: |
373 | /// %sub = sub i32 %c, %d; Hash: 3 |
374 | /// %add4 = add i32 %c, %d ; Hash: 1 |
375 | /// \endcode |
376 | /// And produce a "numeric string representation" like so: |
377 | /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 |
378 | /// |
379 | /// TODO: This is very similar to the MachineOutliner, and should be |
380 | /// consolidated into the same interface. |
381 | struct IRInstructionMapper { |
382 | /// The starting illegal instruction number to map to. |
383 | /// |
384 | /// Set to -3 for compatibility with DenseMapInfo<unsigned>. |
385 | unsigned IllegalInstrNumber = static_cast<unsigned>(-3); |
386 | |
387 | /// The next available integer to assign to a legal Instruction to. |
388 | unsigned LegalInstrNumber = 0; |
389 | |
390 | /// Correspondence from IRInstructionData to unsigned integers. |
391 | DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits> |
392 | InstructionIntegerMap; |
393 | |
394 | /// A mapping for a basic block in a module to its assigned number/location |
395 | /// in the module. |
396 | DenseMap<BasicBlock *, unsigned> BasicBlockToInteger; |
397 | |
398 | /// Set if we added an illegal number in the previous step. |
399 | /// Since each illegal number is unique, we only need one of them between |
400 | /// each range of legal numbers. This lets us make sure we don't add more |
401 | /// than one illegal number per range. |
402 | bool AddedIllegalLastTime = false; |
403 | |
404 | /// Marks whether we found a illegal instruction in the previous step. |
405 | bool CanCombineWithPrevInstr = false; |
406 | |
407 | /// Marks whether we have found a set of instructions that is long enough |
408 | /// to be considered for similarity. |
409 | bool HaveLegalRange = false; |
410 | |
411 | /// Marks whether we should use exact function names, as well as types to |
412 | /// find similarity between calls. |
413 | bool EnableMatchCallsByName = false; |
414 | |
415 | /// This allocator pointer is in charge of holding on to the IRInstructionData |
416 | /// so it is not deallocated until whatever external tool is using it is done |
417 | /// with the information. |
418 | SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr; |
419 | |
420 | /// This allocator pointer is in charge of creating the IRInstructionDataList |
421 | /// so it is not deallocated until whatever external tool is using it is done |
422 | /// with the information. |
423 | SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr; |
424 | |
425 | /// Get an allocated IRInstructionData struct using the InstDataAllocator. |
426 | /// |
427 | /// \param I - The Instruction to wrap with IRInstructionData. |
428 | /// \param Legality - A boolean value that is true if the instruction is to |
429 | /// be considered for similarity, and false if not. |
430 | /// \param IDL - The InstructionDataList that the IRInstructionData is |
431 | /// inserted into. |
432 | /// \returns An allocated IRInstructionData struct. |
433 | IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality, |
434 | IRInstructionDataList &IDL); |
435 | |
436 | /// Get an empty allocated IRInstructionData struct using the |
437 | /// InstDataAllocator. |
438 | /// |
439 | /// \param IDL - The InstructionDataList that the IRInstructionData is |
440 | /// inserted into. |
441 | /// \returns An allocated IRInstructionData struct. |
442 | IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL); |
443 | |
444 | /// Get an allocated IRInstructionDataList object using the IDLAllocator. |
445 | /// |
446 | /// \returns An allocated IRInstructionDataList object. |
447 | IRInstructionDataList *allocateIRInstructionDataList(); |
448 | |
449 | IRInstructionDataList *IDL = nullptr; |
450 | |
451 | /// Assigns values to all the basic blocks in function \p F starting from |
452 | /// integer \p BBNumber. |
453 | /// |
454 | /// \param F - The function containing the basic blocks to assign numbers to. |
455 | /// \param BBNumber - The number to start from. |
456 | void initializeForBBs(Function &F, unsigned &BBNumber) { |
457 | for (BasicBlock &BB : F) |
458 | BasicBlockToInteger.insert(KV: std::make_pair(x: &BB, y: BBNumber++)); |
459 | } |
460 | |
461 | /// Assigns values to all the basic blocks in Module \p M. |
462 | /// \param M - The module containing the basic blocks to assign numbers to. |
463 | void initializeForBBs(Module &M) { |
464 | unsigned BBNumber = 0; |
465 | for (Function &F : M) |
466 | initializeForBBs(F, BBNumber); |
467 | } |
468 | |
469 | /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers |
470 | /// determined by \p InstrType. Two Instructions are mapped to the same value |
471 | /// if they are close as defined by the InstructionData class above. |
472 | /// |
473 | /// \param [in] BB - The BasicBlock to be mapped to integers. |
474 | /// \param [in,out] InstrList - Vector of IRInstructionData to append to. |
475 | /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to. |
476 | void convertToUnsignedVec(BasicBlock &BB, |
477 | std::vector<IRInstructionData *> &InstrList, |
478 | std::vector<unsigned> &IntegerMapping); |
479 | |
480 | /// Maps an Instruction to a legal integer. |
481 | /// |
482 | /// \param [in] It - The Instruction to be mapped to an integer. |
483 | /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to |
484 | /// append to. |
485 | /// \param [in,out] InstrListForBB - Vector of InstructionData to append to. |
486 | /// \returns The integer \p It was mapped to. |
487 | unsigned mapToLegalUnsigned(BasicBlock::iterator &It, |
488 | std::vector<unsigned> &IntegerMappingForBB, |
489 | std::vector<IRInstructionData *> &InstrListForBB); |
490 | |
491 | /// Maps an Instruction to an illegal integer. |
492 | /// |
493 | /// \param [in] It - The \p Instruction to be mapped to an integer. |
494 | /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to |
495 | /// append to. |
496 | /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to. |
497 | /// \param End - true if creating a dummy IRInstructionData at the end of a |
498 | /// basic block. |
499 | /// \returns The integer \p It was mapped to. |
500 | unsigned mapToIllegalUnsigned( |
501 | BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB, |
502 | std::vector<IRInstructionData *> &InstrListForBB, bool End = false); |
503 | |
504 | IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA, |
505 | SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA) |
506 | : InstDataAllocator(IDA), IDLAllocator(IDLA) { |
507 | // Make sure that the implementation of DenseMapInfo<unsigned> hasn't |
508 | // changed. |
509 | assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) && |
510 | "DenseMapInfo<unsigned>'s empty key isn't -1!" ); |
511 | assert(DenseMapInfo<unsigned>::getTombstoneKey() == |
512 | static_cast<unsigned>(-2) && |
513 | "DenseMapInfo<unsigned>'s tombstone key isn't -2!" ); |
514 | |
515 | IDL = new (IDLAllocator->Allocate()) |
516 | IRInstructionDataList(); |
517 | } |
518 | |
519 | /// Custom InstVisitor to classify different instructions for whether it can |
520 | /// be analyzed for similarity. |
521 | struct InstructionClassification |
522 | : public InstVisitor<InstructionClassification, InstrType> { |
523 | InstructionClassification() = default; |
524 | |
525 | // TODO: Determine a scheme to resolve when the label is similar enough. |
526 | InstrType visitBranchInst(BranchInst &BI) { |
527 | if (EnableBranches) |
528 | return Legal; |
529 | return Illegal; |
530 | } |
531 | InstrType visitPHINode(PHINode &PN) { |
532 | if (EnableBranches) |
533 | return Legal; |
534 | return Illegal; |
535 | } |
536 | // TODO: Handle allocas. |
537 | InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; } |
538 | // We exclude variable argument instructions since variable arguments |
539 | // requires extra checking of the argument list. |
540 | InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; } |
541 | // We exclude all exception handling cases since they are so context |
542 | // dependent. |
543 | InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; } |
544 | InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; } |
545 | // DebugInfo should be included in the regions, but should not be |
546 | // analyzed for similarity as it has no bearing on the outcome of the |
547 | // program. |
548 | InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; } |
549 | InstrType visitIntrinsicInst(IntrinsicInst &II) { |
550 | // These are disabled due to complications in the CodeExtractor when |
551 | // outlining these instructions. For instance, It is unclear what we |
552 | // should do when moving only the start or end lifetime instruction into |
553 | // an outlined function. Also, assume-like intrinsics could be removed |
554 | // from the region, removing arguments, causing discrepencies in the |
555 | // number of inputs between different regions. |
556 | if (II.isAssumeLikeIntrinsic()) |
557 | return Illegal; |
558 | return EnableIntrinsics ? Legal : Illegal; |
559 | } |
560 | // We only allow call instructions where the function has a name and |
561 | // is not an indirect call. |
562 | InstrType visitCallInst(CallInst &CI) { |
563 | Function *F = CI.getCalledFunction(); |
564 | bool IsIndirectCall = CI.isIndirectCall(); |
565 | if (IsIndirectCall && !EnableIndirectCalls) |
566 | return Illegal; |
567 | if (!F && !IsIndirectCall) |
568 | return Illegal; |
569 | // Functions marked with the swifttailcc and tailcc calling conventions |
570 | // require special handling when outlining musttail functions. The |
571 | // calling convention must be passed down to the outlined function as |
572 | // well. Further, there is special handling for musttail calls as well, |
573 | // requiring a return call directly after. For now, the outliner does not |
574 | // support this, so we do not handle matching this case either. |
575 | if ((CI.getCallingConv() == CallingConv::SwiftTail || |
576 | CI.getCallingConv() == CallingConv::Tail) && |
577 | !EnableMustTailCalls) |
578 | return Illegal; |
579 | if (CI.isMustTailCall() && !EnableMustTailCalls) |
580 | return Illegal; |
581 | return Legal; |
582 | } |
583 | // TODO: We do not current handle similarity that changes the control flow. |
584 | InstrType visitInvokeInst(InvokeInst &II) { return Illegal; } |
585 | // TODO: We do not current handle similarity that changes the control flow. |
586 | InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; } |
587 | // TODO: Handle interblock similarity. |
588 | InstrType visitTerminator(Instruction &I) { return Illegal; } |
589 | InstrType visitInstruction(Instruction &I) { return Legal; } |
590 | |
591 | // The flag variable that lets the classifier know whether we should |
592 | // allow branches to be checked for similarity. |
593 | bool EnableBranches = false; |
594 | |
595 | // The flag variable that lets the classifier know whether we should |
596 | // allow indirect calls to be considered legal instructions. |
597 | bool EnableIndirectCalls = false; |
598 | |
599 | // Flag that lets the classifier know whether we should allow intrinsics to |
600 | // be checked for similarity. |
601 | bool EnableIntrinsics = false; |
602 | |
603 | // Flag that lets the classifier know whether we should allow tail calls to |
604 | // be checked for similarity. |
605 | bool EnableMustTailCalls = false; |
606 | }; |
607 | |
608 | /// Maps an Instruction to a member of InstrType. |
609 | InstructionClassification InstClassifier; |
610 | }; |
611 | |
612 | /// This is a class that wraps a range of IRInstructionData from one point to |
613 | /// another in the vector of IRInstructionData, which is a region of the |
614 | /// program. It is also responsible for defining the structure within this |
615 | /// region of instructions. |
616 | /// |
617 | /// The structure of a region is defined through a value numbering system |
618 | /// assigned to each unique value in a region at the creation of the |
619 | /// IRSimilarityCandidate. |
620 | /// |
621 | /// For example, for each Instruction we add a mapping for each new |
622 | /// value seen in that Instruction. |
623 | /// IR: Mapping Added: |
624 | /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 |
625 | /// %add2 = add i32 %a, %1 %add2 -> 4 |
626 | /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 |
627 | /// |
628 | /// We can compare IRSimilarityCandidates against one another. |
629 | /// The \ref isSimilar function compares each IRInstructionData against one |
630 | /// another and if we have the same sequences of IRInstructionData that would |
631 | /// create the same hash, we have similar IRSimilarityCandidates. |
632 | /// |
633 | /// We can also compare the structure of IRSimilarityCandidates. If we can |
634 | /// create a mapping of registers in the region contained by one |
635 | /// IRSimilarityCandidate to the region contained by different |
636 | /// IRSimilarityCandidate, they can be considered structurally similar. |
637 | /// |
638 | /// IRSimilarityCandidate1: IRSimilarityCandidate2: |
639 | /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e |
640 | /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f |
641 | /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 |
642 | /// |
643 | /// Can have the following mapping from candidate to candidate of: |
644 | /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 |
645 | /// and can be considered similar. |
646 | /// |
647 | /// IRSimilarityCandidate1: IRSimilarityCandidate2: |
648 | /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 |
649 | /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f |
650 | /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 |
651 | /// |
652 | /// We cannot create the same mapping since the use of c4 is not used in the |
653 | /// same way as %b or c2. |
654 | class IRSimilarityCandidate { |
655 | private: |
656 | /// The start index of this IRSimilarityCandidate in the instruction list. |
657 | unsigned StartIdx = 0; |
658 | |
659 | /// The number of instructions in this IRSimilarityCandidate. |
660 | unsigned Len = 0; |
661 | |
662 | /// The first instruction in this IRSimilarityCandidate. |
663 | IRInstructionData *FirstInst = nullptr; |
664 | |
665 | /// The last instruction in this IRSimilarityCandidate. |
666 | IRInstructionData *LastInst = nullptr; |
667 | |
668 | /// Global Value Numbering structures |
669 | /// @{ |
670 | /// Stores the mapping of the value to the number assigned to it in the |
671 | /// IRSimilarityCandidate. |
672 | DenseMap<Value *, unsigned> ValueToNumber; |
673 | /// Stores the mapping of the number to the value assigned this number. |
674 | DenseMap<unsigned, Value *> NumberToValue; |
675 | /// Stores the mapping of a value's number to canonical numbering in the |
676 | /// candidate's respective similarity group. |
677 | DenseMap<unsigned, unsigned> NumberToCanonNum; |
678 | /// Stores the mapping of canonical number in the candidate's respective |
679 | /// similarity group to a value number. |
680 | DenseMap<unsigned, unsigned> CanonNumToNumber; |
681 | /// @} |
682 | |
683 | public: |
684 | /// \param StartIdx - The starting location of the region. |
685 | /// \param Len - The length of the region. |
686 | /// \param FirstInstIt - The starting IRInstructionData of the region. |
687 | /// \param LastInstIt - The ending IRInstructionData of the region. |
688 | IRSimilarityCandidate(unsigned StartIdx, unsigned Len, |
689 | IRInstructionData *FirstInstIt, |
690 | IRInstructionData *LastInstIt); |
691 | |
692 | /// \param A - The first IRInstructionCandidate to compare. |
693 | /// \param B - The second IRInstructionCandidate to compare. |
694 | /// \returns True when every IRInstructionData in \p A is similar to every |
695 | /// IRInstructionData in \p B. |
696 | static bool isSimilar(const IRSimilarityCandidate &A, |
697 | const IRSimilarityCandidate &B); |
698 | |
699 | /// \param [in] A - The first IRInstructionCandidate to compare. |
700 | /// \param [in] B - The second IRInstructionCandidate to compare. |
701 | /// \returns True when every IRInstructionData in \p A is structurally similar |
702 | /// to \p B. |
703 | static bool compareStructure(const IRSimilarityCandidate &A, |
704 | const IRSimilarityCandidate &B); |
705 | |
706 | /// \param [in] A - The first IRInstructionCandidate to compare. |
707 | /// \param [in] B - The second IRInstructionCandidate to compare. |
708 | /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from |
709 | /// candidate \p A to candidate \B. |
710 | /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from |
711 | /// candidate \p B to candidate \A. |
712 | /// \returns True when every IRInstructionData in \p A is structurally similar |
713 | /// to \p B. |
714 | static bool |
715 | compareStructure(const IRSimilarityCandidate &A, |
716 | const IRSimilarityCandidate &B, |
717 | DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA, |
718 | DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB); |
719 | |
720 | struct OperandMapping { |
721 | /// The IRSimilarityCandidate that holds the instruction the OperVals were |
722 | /// pulled from. |
723 | const IRSimilarityCandidate &IRSC; |
724 | |
725 | /// The operand values to be analyzed. |
726 | ArrayRef<Value *> &OperVals; |
727 | |
728 | /// The current mapping of global value numbers from one IRSimilarityCandidate |
729 | /// to another IRSimilarityCandidate. |
730 | DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping; |
731 | }; |
732 | |
733 | /// A helper struct to hold the candidate, for a branch instruction, the |
734 | /// relative location of a label, and the label itself. This is mostly to |
735 | /// group the values together before passing them as a bundle to a function. |
736 | struct RelativeLocMapping { |
737 | /// The IRSimilarityCandidate that holds the instruction the relative |
738 | /// location was pulled from. |
739 | const IRSimilarityCandidate &IRSC; |
740 | |
741 | /// The relative location to be analyzed. |
742 | int RelativeLocation; |
743 | |
744 | /// The corresponding value. |
745 | Value *OperVal; |
746 | }; |
747 | |
748 | /// Compare the operands in \p A and \p B and check that the current mapping |
749 | /// of global value numbers from \p A to \p B and \p B to \A is consistent. |
750 | /// |
751 | /// \param A - The first IRInstructionCandidate, operand values, and current |
752 | /// operand mappings to compare. |
753 | /// \param B - The second IRInstructionCandidate, operand values, and current |
754 | /// operand mappings to compare. |
755 | /// \returns true if the IRSimilarityCandidates operands are compatible. |
756 | static bool compareNonCommutativeOperandMapping(OperandMapping A, |
757 | OperandMapping B); |
758 | |
759 | /// Compare the operands in \p A and \p B and check that the current mapping |
760 | /// of global value numbers from \p A to \p B and \p B to \A is consistent |
761 | /// given that the operands are commutative. |
762 | /// |
763 | /// \param A - The first IRInstructionCandidate, operand values, and current |
764 | /// operand mappings to compare. |
765 | /// \param B - The second IRInstructionCandidate, operand values, and current |
766 | /// operand mappings to compare. |
767 | /// \returns true if the IRSimilarityCandidates operands are compatible. |
768 | static bool compareCommutativeOperandMapping(OperandMapping A, |
769 | OperandMapping B); |
770 | |
771 | /// Compare the GVN of the assignment value in corresponding instructions in |
772 | /// IRSimilarityCandidates \p A and \p B and check that there exists a mapping |
773 | /// between the values and replaces the mapping with a one-to-one value if |
774 | /// needed. |
775 | /// |
776 | /// \param InstValA - The assignment GVN from the first IRSimilarityCandidate. |
777 | /// \param InstValB - The assignment GVN from the second |
778 | /// IRSimilarityCandidate. |
779 | /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from |
780 | /// candidate \p A to candidate \B. |
781 | /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from |
782 | /// candidate \p B to candidate \A. |
783 | /// \returns true if the IRSimilarityCandidates assignments are compatible. |
784 | static bool compareAssignmentMapping( |
785 | const unsigned InstValA, const unsigned &InstValB, |
786 | DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA, |
787 | DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB); |
788 | |
789 | /// Compare the relative locations in \p A and \p B and check that the |
790 | /// distances match if both locations are contained in the region, and that |
791 | /// the branches both point outside the region if they do not. |
792 | /// Example Region: |
793 | /// \code |
794 | /// entry: |
795 | /// br i1 %0, label %block_1, label %block_3 |
796 | /// block_0: |
797 | /// br i1 %0, label %block_1, label %block_2 |
798 | /// block_1: |
799 | /// br i1 %0, label %block_2, label %block_3 |
800 | /// block_2: |
801 | /// br i1 %1, label %block_1, label %block_4 |
802 | /// block_3: |
803 | /// br i1 %2, label %block_2, label %block_5 |
804 | /// \endcode |
805 | /// If we compare the branches in block_0 and block_1 the relative values are |
806 | /// 1 and 2 for both, so we consider this a match. |
807 | /// |
808 | /// If we compare the branches in entry and block_0 the relative values are |
809 | /// 2 and 3, and 1 and 2 respectively. Since these are not the same we do not |
810 | /// consider them a match. |
811 | /// |
812 | /// If we compare the branches in block_1 and block_2 the relative values are |
813 | /// 1 and 2, and -1 and None respectively. As a result we do not consider |
814 | /// these to be the same |
815 | /// |
816 | /// If we compare the branches in block_2 and block_3 the relative values are |
817 | /// -1 and None for both. We do consider these to be a match. |
818 | /// |
819 | /// \param A - The first IRInstructionCandidate, relative location value, |
820 | /// and incoming block. |
821 | /// \param B - The second IRInstructionCandidate, relative location value, |
822 | /// and incoming block. |
823 | /// \returns true if the relative locations match. |
824 | static bool checkRelativeLocations(RelativeLocMapping A, |
825 | RelativeLocMapping B); |
826 | |
827 | /// Create a mapping from the value numbering to a different separate set of |
828 | /// numbers. This will serve as a guide for relating one candidate to another. |
829 | /// The canonical number gives use the ability identify which global value |
830 | /// number in one candidate relates to the global value number in the other. |
831 | /// |
832 | /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a |
833 | /// canonical numbering for. |
834 | static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand); |
835 | |
836 | /// Create a mapping for the value numbering of the calling |
837 | /// IRSimilarityCandidate, to a different separate set of numbers, based on |
838 | /// the canonical ordering in \p SourceCand. These are defined based on the |
839 | /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of |
840 | /// these relationships should have the same information, just in opposite |
841 | /// directions. |
842 | /// |
843 | /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a |
844 | /// canonical numbering from. |
845 | /// \param ToSourceMapping - The mapping of value numbers from this candidate |
846 | /// to \p SourceCand. |
847 | /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand |
848 | /// to this candidate. |
849 | void createCanonicalRelationFrom( |
850 | IRSimilarityCandidate &SourceCand, |
851 | DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping, |
852 | DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping); |
853 | |
854 | /// Create a mapping for the value numbering of the calling |
855 | /// IRSimilarityCandidate, to a different separate set of numbers, based on |
856 | /// the canonical ordering in \p SourceCand. These are defined based on the |
857 | /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of |
858 | /// these relationships should have the same information, just in opposite |
859 | /// directions. Uses the \p OneToOne mapping from target candidate to \p |
860 | /// SourceCand GVNs to determine the mapping first for values with multiple |
861 | /// mappings. This mapping is created by the ordering of operands in the |
862 | /// instruction they are first seen in the candidates. |
863 | /// |
864 | /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a |
865 | /// canonical numbering from. |
866 | /// \param [in,out] OneToOne - A mapping of value numbers from candidate |
867 | /// \p A to candidate \B using the structure of the original instructions. |
868 | /// \param ToSourceMapping - The mapping of value numbers from this candidate |
869 | /// to \p SourceCand. |
870 | /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand |
871 | /// to this candidate. |
872 | void createCanonicalRelationFrom( |
873 | IRSimilarityCandidate &SourceCand, |
874 | DenseMap<unsigned, unsigned> &OneToOne, |
875 | DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping, |
876 | DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping); |
877 | |
878 | /// Create a mapping for the value numbering of the calling |
879 | /// IRSimilarityCandidate, to a different separate set of numbers, based on |
880 | /// the canonical ordering in \p SourceCand. These are defined based on the |
881 | /// canonical mapping defined between \p SoureCandLarge and |
882 | /// \p TargetCandLarge. These IRSimilarityCandidates are already structurally |
883 | /// similar, and fully encapsulate the IRSimilarityCandidates in question. |
884 | /// These are used as a "bridge" from the \p SourceCand to the target. |
885 | /// |
886 | /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a |
887 | /// canonical numbering from. |
888 | /// \param SoureCandLarge - The IRSimilarityCandidate fully containing |
889 | /// \p SourceCand. |
890 | /// \param TargetCandLarge - The IRSimilarityCandidate fully containing |
891 | /// this Candidate. |
892 | void createCanonicalRelationFrom( |
893 | IRSimilarityCandidate &SourceCand, |
894 | IRSimilarityCandidate &SourceCandLarge, |
895 | IRSimilarityCandidate &TargetCandLarge); |
896 | |
897 | /// \param [in,out] BBSet - The set to track the basic blocks. |
898 | void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const { |
899 | for (IRInstructionData &ID : *this) { |
900 | BasicBlock *BB = ID.Inst->getParent(); |
901 | BBSet.insert(V: BB); |
902 | } |
903 | } |
904 | |
905 | /// \param [in,out] BBSet - The set to track the basic blocks. |
906 | /// \param [in,out] BBList - A list in order of use to track the basic blocks. |
907 | void getBasicBlocks(DenseSet<BasicBlock *> &BBSet, |
908 | SmallVector<BasicBlock *> &BBList) const { |
909 | for (IRInstructionData &ID : *this) { |
910 | BasicBlock *BB = ID.Inst->getParent(); |
911 | if (BBSet.insert(V: BB).second) |
912 | BBList.push_back(Elt: BB); |
913 | } |
914 | } |
915 | |
916 | /// Compare the start and end indices of the two IRSimilarityCandidates for |
917 | /// whether they overlap. If the start instruction of one |
918 | /// IRSimilarityCandidate is less than the end instruction of the other, and |
919 | /// the start instruction of one is greater than the start instruction of the |
920 | /// other, they overlap. |
921 | /// |
922 | /// \returns true if the IRSimilarityCandidates do not have overlapping |
923 | /// instructions. |
924 | static bool overlap(const IRSimilarityCandidate &A, |
925 | const IRSimilarityCandidate &B); |
926 | |
927 | /// \returns the number of instructions in this Candidate. |
928 | unsigned getLength() const { return Len; } |
929 | |
930 | /// \returns the start index of this IRSimilarityCandidate. |
931 | unsigned getStartIdx() const { return StartIdx; } |
932 | |
933 | /// \returns the end index of this IRSimilarityCandidate. |
934 | unsigned getEndIdx() const { return StartIdx + Len - 1; } |
935 | |
936 | /// \returns The first IRInstructionData. |
937 | IRInstructionData *front() const { return FirstInst; } |
938 | /// \returns The last IRInstructionData. |
939 | IRInstructionData *back() const { return LastInst; } |
940 | |
941 | /// \returns The first Instruction. |
942 | Instruction *frontInstruction() { return FirstInst->Inst; } |
943 | /// \returns The last Instruction |
944 | Instruction *backInstruction() { return LastInst->Inst; } |
945 | |
946 | /// \returns The BasicBlock the IRSimilarityCandidate starts in. |
947 | BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); } |
948 | /// \returns The BasicBlock the IRSimilarityCandidate ends in. |
949 | BasicBlock *getEndBB() { return LastInst->Inst->getParent(); } |
950 | |
951 | /// \returns The Function that the IRSimilarityCandidate is located in. |
952 | Function *getFunction() { return getStartBB()->getParent(); } |
953 | |
954 | /// Finds the positive number associated with \p V if it has been mapped. |
955 | /// \param [in] V - the Value to find. |
956 | /// \returns The positive number corresponding to the value. |
957 | /// \returns std::nullopt if not present. |
958 | std::optional<unsigned> getGVN(Value *V) { |
959 | assert(V != nullptr && "Value is a nullptr?" ); |
960 | DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(Val: V); |
961 | if (VNIt == ValueToNumber.end()) |
962 | return std::nullopt; |
963 | return VNIt->second; |
964 | } |
965 | |
966 | /// Finds the Value associate with \p Num if it exists. |
967 | /// \param [in] Num - the number to find. |
968 | /// \returns The Value associated with the number. |
969 | /// \returns std::nullopt if not present. |
970 | std::optional<Value *> fromGVN(unsigned Num) { |
971 | DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Val: Num); |
972 | if (VNIt == NumberToValue.end()) |
973 | return std::nullopt; |
974 | assert(VNIt->second != nullptr && "Found value is a nullptr!" ); |
975 | return VNIt->second; |
976 | } |
977 | |
978 | /// Find the canonical number from the global value number \p N stored in the |
979 | /// candidate. |
980 | /// |
981 | /// \param N - The global value number to find the canonical number for. |
982 | /// \returns An optional containing the value, and std::nullopt if it could |
983 | /// not be found. |
984 | std::optional<unsigned> getCanonicalNum(unsigned N) { |
985 | DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(Val: N); |
986 | if (NCIt == NumberToCanonNum.end()) |
987 | return std::nullopt; |
988 | return NCIt->second; |
989 | } |
990 | |
991 | /// Find the global value number from the canonical number \p N stored in the |
992 | /// candidate. |
993 | /// |
994 | /// \param N - The canonical number to find the global vlaue number for. |
995 | /// \returns An optional containing the value, and std::nullopt if it could |
996 | /// not be found. |
997 | std::optional<unsigned> fromCanonicalNum(unsigned N) { |
998 | DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(Val: N); |
999 | if (CNIt == CanonNumToNumber.end()) |
1000 | return std::nullopt; |
1001 | return CNIt->second; |
1002 | } |
1003 | |
1004 | /// \param RHS -The IRSimilarityCandidate to compare against |
1005 | /// \returns true if the IRSimilarityCandidate is occurs after the |
1006 | /// IRSimilarityCandidate in the program. |
1007 | bool operator<(const IRSimilarityCandidate &RHS) const { |
1008 | return getStartIdx() > RHS.getStartIdx(); |
1009 | } |
1010 | |
1011 | using iterator = IRInstructionDataList::iterator; |
1012 | iterator begin() const { return iterator(front()); } |
1013 | iterator end() const { return std::next(x: iterator(back())); } |
1014 | }; |
1015 | |
1016 | typedef DenseMap<IRSimilarityCandidate *, |
1017 | DenseMap<unsigned, DenseSet<unsigned>>> |
1018 | CandidateGVNMapping; |
1019 | typedef std::vector<IRSimilarityCandidate> SimilarityGroup; |
1020 | typedef std::vector<SimilarityGroup> SimilarityGroupList; |
1021 | |
1022 | /// This class puts all the pieces of the IRInstructionData, |
1023 | /// IRInstructionMapper, IRSimilarityCandidate together. |
1024 | /// |
1025 | /// It first feeds the Module or vector of Modules into the IRInstructionMapper, |
1026 | /// and puts all the mapped instructions into a single long list of |
1027 | /// IRInstructionData. |
1028 | /// |
1029 | /// The list of unsigned integers is given to the Suffix Tree or similar data |
1030 | /// structure to find repeated subsequences. We construct an |
1031 | /// IRSimilarityCandidate for each instance of the subsequence. We compare them |
1032 | /// against one another since These repeated subsequences can have different |
1033 | /// structure. For each different kind of structure found, we create a |
1034 | /// similarity group. |
1035 | /// |
1036 | /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are |
1037 | /// structurally similar to one another, while C is different we would have two |
1038 | /// SimilarityGroups: |
1039 | /// |
1040 | /// SimilarityGroup 1: SimilarityGroup 2 |
1041 | /// A, B, D C |
1042 | /// |
1043 | /// A list of the different similarity groups is then returned after |
1044 | /// analyzing the module. |
1045 | class IRSimilarityIdentifier { |
1046 | public: |
1047 | IRSimilarityIdentifier(bool MatchBranches = true, |
1048 | bool MatchIndirectCalls = true, |
1049 | bool MatchCallsWithName = false, |
1050 | bool MatchIntrinsics = true, |
1051 | bool MatchMustTailCalls = true) |
1052 | : Mapper(&InstDataAllocator, &InstDataListAllocator), |
1053 | EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls), |
1054 | EnableMatchingCallsByName(MatchCallsWithName), |
1055 | EnableIntrinsics(MatchIntrinsics), |
1056 | EnableMustTailCalls(MatchMustTailCalls) {} |
1057 | |
1058 | private: |
1059 | /// Map the instructions in the module to unsigned integers, using mapping |
1060 | /// already present in the Mapper if possible. |
1061 | /// |
1062 | /// \param [in] M Module - To map to integers. |
1063 | /// \param [in,out] InstrList - The vector to append IRInstructionData to. |
1064 | /// \param [in,out] IntegerMapping - The vector to append integers to. |
1065 | void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList, |
1066 | std::vector<unsigned> &IntegerMapping); |
1067 | |
1068 | /// Map the instructions in the modules vector to unsigned integers, using |
1069 | /// mapping already present in the mapper if possible. |
1070 | /// |
1071 | /// \param [in] Modules - The list of modules to use to populate the mapper |
1072 | /// \param [in,out] InstrList - The vector to append IRInstructionData to. |
1073 | /// \param [in,out] IntegerMapping - The vector to append integers to. |
1074 | void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules, |
1075 | std::vector<IRInstructionData *> &InstrList, |
1076 | std::vector<unsigned> &IntegerMapping); |
1077 | |
1078 | /// Find the similarity candidates in \p InstrList and corresponding |
1079 | /// \p UnsignedVec |
1080 | /// |
1081 | /// \param [in,out] InstrList - The vector to append IRInstructionData to. |
1082 | /// \param [in,out] IntegerMapping - The vector to append integers to. |
1083 | /// candidates found in the program. |
1084 | void findCandidates(std::vector<IRInstructionData *> &InstrList, |
1085 | std::vector<unsigned> &IntegerMapping); |
1086 | |
1087 | public: |
1088 | // Find the IRSimilarityCandidates in the \p Modules and group by structural |
1089 | // similarity in a SimilarityGroup, each group is returned in a |
1090 | // SimilarityGroupList. |
1091 | // |
1092 | // \param [in] Modules - the modules to analyze. |
1093 | // \returns The groups of similarity ranges found in the modules. |
1094 | SimilarityGroupList & |
1095 | findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules); |
1096 | |
1097 | // Find the IRSimilarityCandidates in the given Module grouped by structural |
1098 | // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. |
1099 | // |
1100 | // \param [in] M - the module to analyze. |
1101 | // \returns The groups of similarity ranges found in the module. |
1102 | SimilarityGroupList &findSimilarity(Module &M); |
1103 | |
1104 | // Clears \ref SimilarityCandidates if it is already filled by a previous run. |
1105 | void resetSimilarityCandidates() { |
1106 | // If we've already analyzed a Module or set of Modules, so we must clear |
1107 | // the SimilarityCandidates to make sure we do not have only old values |
1108 | // hanging around. |
1109 | if (SimilarityCandidates) |
1110 | SimilarityCandidates->clear(); |
1111 | else |
1112 | SimilarityCandidates = SimilarityGroupList(); |
1113 | } |
1114 | |
1115 | // \returns The groups of similarity ranges found in the most recently passed |
1116 | // set of modules. |
1117 | std::optional<SimilarityGroupList> &getSimilarity() { |
1118 | return SimilarityCandidates; |
1119 | } |
1120 | |
1121 | private: |
1122 | /// The allocator for IRInstructionData. |
1123 | SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator; |
1124 | |
1125 | /// The allocator for IRInstructionDataLists. |
1126 | SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator; |
1127 | |
1128 | /// Map Instructions to unsigned integers and wraps the Instruction in an |
1129 | /// instance of IRInstructionData. |
1130 | IRInstructionMapper Mapper; |
1131 | |
1132 | /// The flag variable that marks whether we should check branches for |
1133 | /// similarity, or only look within basic blocks. |
1134 | bool EnableBranches = true; |
1135 | |
1136 | /// The flag variable that marks whether we allow indirect calls to be checked |
1137 | /// for similarity, or exclude them as a legal instruction. |
1138 | bool EnableIndirectCalls = true; |
1139 | |
1140 | /// The flag variable that marks whether we allow calls to be marked as |
1141 | /// similar if they do not have the same name, only the same calling |
1142 | /// convention, attributes and type signature. |
1143 | bool EnableMatchingCallsByName = true; |
1144 | |
1145 | /// The flag variable that marks whether we should check intrinsics for |
1146 | /// similarity. |
1147 | bool EnableIntrinsics = true; |
1148 | |
1149 | // The flag variable that marks whether we should allow tailcalls |
1150 | // to be checked for similarity. |
1151 | bool EnableMustTailCalls = false; |
1152 | |
1153 | /// The SimilarityGroups found with the most recent run of \ref |
1154 | /// findSimilarity. std::nullopt if there is no recent run. |
1155 | std::optional<SimilarityGroupList> SimilarityCandidates; |
1156 | }; |
1157 | |
1158 | } // end namespace IRSimilarity |
1159 | |
1160 | /// An analysis pass based on legacy pass manager that runs and returns |
1161 | /// IRSimilarityIdentifier run on the Module. |
1162 | class IRSimilarityIdentifierWrapperPass : public ModulePass { |
1163 | std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI; |
1164 | |
1165 | public: |
1166 | static char ID; |
1167 | IRSimilarityIdentifierWrapperPass(); |
1168 | |
1169 | IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; } |
1170 | const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; } |
1171 | |
1172 | bool doInitialization(Module &M) override; |
1173 | bool doFinalization(Module &M) override; |
1174 | bool runOnModule(Module &M) override; |
1175 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
1176 | AU.setPreservesAll(); |
1177 | } |
1178 | }; |
1179 | |
1180 | /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the |
1181 | /// Module. |
1182 | class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { |
1183 | public: |
1184 | typedef IRSimilarity::IRSimilarityIdentifier Result; |
1185 | |
1186 | Result run(Module &M, ModuleAnalysisManager &); |
1187 | |
1188 | private: |
1189 | friend AnalysisInfoMixin<IRSimilarityAnalysis>; |
1190 | static AnalysisKey Key; |
1191 | }; |
1192 | |
1193 | /// Printer pass that uses \c IRSimilarityAnalysis. |
1194 | class IRSimilarityAnalysisPrinterPass |
1195 | : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { |
1196 | raw_ostream &OS; |
1197 | |
1198 | public: |
1199 | explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {} |
1200 | PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); |
1201 | static bool isRequired() { return true; } |
1202 | }; |
1203 | |
1204 | } // end namespace llvm |
1205 | |
1206 | #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H |
1207 | |