blob: 9e97541e542b3a0ba00051bc257700cbd7f43213 [file] [log] [blame]
//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// Interface file for the IRSimilarityIdentifier for identifying similarities in
// IR including the IRInstructionMapper, which maps an Instruction to unsigned
// integers.
//
// Two sequences of instructions are called "similar" if they perform the same
// series of operations for all inputs.
//
// \code
// %1 = add i32 %a, 10
// %2 = add i32 %a, %1
// %3 = icmp slt icmp %1, %2
// \endcode
//
// and
//
// \code
// %1 = add i32 11, %a
// %2 = sub i32 %a, %1
// %3 = icmp sgt icmp %2, %1
// \endcode
//
// ultimately have the same result, even if the inputs, and structure are
// slightly different.
//
// For instructions, we do not worry about operands that do not have fixed
// semantic meaning to the program. We consider the opcode that the instruction
// has, the types, parameters, and extra information such as the function name,
// or comparison predicate. These are used to create a hash to map instructions
// to integers to be used in similarity matching in sequences of instructions
//
// Terminology:
// An IRSimilarityCandidate is a region of IRInstructionData (wrapped
// Instructions), usually used to denote a region of similarity has been found.
//
// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
// similar to one another.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
namespace llvm {
namespace IRSimilarity {
struct IRInstructionDataList;
/// This represents what is and is not supported when finding similarity in
/// Instructions.
///
/// Legal Instructions are considered when looking at similarity between
/// Instructions.
///
/// Illegal Instructions cannot be considered when looking for similarity
/// between Instructions. They act as boundaries between similarity regions.
///
/// Invisible Instructions are skipped over during analysis.
// TODO: Shared with MachineOutliner
enum InstrType { Legal, Illegal, Invisible };
/// This provides the utilities for hashing an Instruction to an unsigned
/// integer. Two IRInstructionDatas produce the same hash value when their
/// underlying Instructions perform the same operation (even if they don't have
/// the same input operands.)
/// As a more concrete example, consider the following:
///
/// \code
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// \endcode
///
// Then the IRInstructionData wrappers for these Instructions may be hashed like
/// so:
///
/// \code
/// ; These two adds have the same types and operand types, so they hash to the
/// ; same number.
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d ; Hash: 1
/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
/// ; it hashes to a different number.
/// %add3 = add i64 %e, %f; Hash: 2
/// \endcode
///
///
/// This hashing scheme will be used to represent the program as a very long
/// string. This string can then be placed in a data structure which can be used
/// for similarity queries.
///
/// TODO: Handle types of Instructions which can be equal even with different
/// operands. (E.g. comparisons with swapped predicates.)
/// TODO: Handle CallInsts, which are only checked for function type
/// by \ref isSameOperationAs.
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
/// exact same, and some do not.
struct IRInstructionData : ilist_node<IRInstructionData> {
/// The source Instruction that is being wrapped.
Instruction *Inst = nullptr;
/// The values of the operands in the Instruction.
SmallVector<Value *, 4> OperVals;
/// The legality of the wrapped instruction. This is informed by InstrType,
/// and is used when checking when two instructions are considered similar.
/// If either instruction is not legal, the instructions are automatically not
/// considered similar.
bool Legal;
/// This is only relevant if we are wrapping a CmpInst where we needed to
/// change the predicate of a compare instruction from a greater than form
/// to a less than form. It is None otherwise.
Optional<CmpInst::Predicate> RevisedPredicate;
/// Gather the information that is difficult to gather for an Instruction, or
/// is changed. i.e. the operands of an Instruction and the Types of those
/// operands. This extra information allows for similarity matching to make
/// assertions that allow for more flexibility when checking for whether an
/// Instruction performs the same operation.
IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
/// Get the predicate that the compare instruction is using for hashing the
/// instruction. the IRInstructionData must be wrapping a CmpInst.
CmpInst::Predicate getPredicate() const;
/// A function that swaps the predicates to their less than form if they are
/// in a greater than form. Otherwise, the predicate is unchanged.
///
/// \param CI - The comparison operation to find a consistent preidcate for.
/// \return the consistent comparison predicate.
static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
/// Hashes \p Value based on its opcode, types, and operand types.
/// Two IRInstructionData instances produce the same hash when they perform
/// the same operation.
///
/// As a simple example, consider the following instructions.
///
/// \code
/// %add1 = add i32 %x1, %y1
/// %add2 = add i32 %x2, %y2
///
/// %sub = sub i32 %x1, %y1
///
/// %add_i64 = add i64 %x2, %y2
/// \endcode
///
/// Because the first two adds operate the same types, and are performing the
/// same action, they will be hashed to the same value.
///
/// However, the subtraction instruction is not the same as an addition, and
/// will be hashed to a different value.
///
/// Finally, the last add has a different type compared to the first two add
/// instructions, so it will also be hashed to a different value that any of
/// the previous instructions.
///
/// \param [in] ID - The IRInstructionData instance to be hashed.
/// \returns A hash_value of the IRInstructionData.
friend hash_code hash_value(const IRInstructionData &ID) {
SmallVector<Type *, 4> OperTypes;
for (Value *V : ID.OperVals)
OperTypes.push_back(V->getType());
if (isa<CmpInst>(ID.Inst))
return llvm::hash_combine(
llvm::hash_value(ID.Inst->getOpcode()),
llvm::hash_value(ID.Inst->getType()),
llvm::hash_value(ID.getPredicate()),
llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
else if (CallInst *CI = dyn_cast<CallInst>(ID.Inst))
return llvm::hash_combine(
llvm::hash_value(ID.Inst->getOpcode()),
llvm::hash_value(ID.Inst->getType()),
llvm::hash_value(CI->getCalledFunction()->getName().str()),
llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
return llvm::hash_combine(
llvm::hash_value(ID.Inst->getOpcode()),
llvm::hash_value(ID.Inst->getType()),
llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
}
IRInstructionDataList *IDL = nullptr;
};
struct IRInstructionDataList : simple_ilist<IRInstructionData> {};
/// Compare one IRInstructionData class to another IRInstructionData class for
/// whether they are performing a the same operation, and can mapped to the
/// same value. For regular instructions if the hash value is the same, then
/// they will also be close.
///
/// \param A - The first IRInstructionData class to compare
/// \param B - The second IRInstructionData class to compare
/// \returns true if \p A and \p B are similar enough to be mapped to the same
/// value.
bool isClose(const IRInstructionData &A, const IRInstructionData &B);
struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
static inline IRInstructionData *getEmptyKey() { return nullptr; }
static inline IRInstructionData *getTombstoneKey() {
return reinterpret_cast<IRInstructionData *>(-1);
}
static unsigned getHashValue(const IRInstructionData *E) {
using llvm::hash_value;
assert(E && "IRInstructionData is a nullptr?");
return hash_value(*E);
}
static bool isEqual(const IRInstructionData *LHS,
const IRInstructionData *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
LHS == getEmptyKey() || LHS == getTombstoneKey())
return LHS == RHS;
assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
return isClose(*LHS, *RHS);
}
};
/// Helper struct for converting the Instructions in a Module into a vector of
/// unsigned integers. This vector of unsigned integers can be thought of as a
/// "numeric string". This numeric string can then be queried by, for example,
/// data structures that find repeated substrings.
///
/// This hashing is done per BasicBlock in the module. To hash Instructions
/// based off of their operations, each Instruction is wrapped in an
/// IRInstructionData struct. The unsigned integer for an IRInstructionData
/// depends on:
/// - The hash provided by the IRInstructionData.
/// - Which member of InstrType the IRInstructionData is classified as.
// See InstrType for more details on the possible classifications, and how they
// manifest in the numeric string.
///
/// The numeric string for an individual BasicBlock is terminated by an unique
/// unsigned integer. This prevents data structures which rely on repetition
/// from matching across BasicBlocks. (For example, the SuffixTree.)
/// As a concrete example, if we have the following two BasicBlocks:
/// \code
/// bb0:
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// bb1:
/// %sub = sub i32 %c, %d
/// \endcode
/// We may hash the Instructions like this (via IRInstructionData):
/// \code
/// bb0:
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d; Hash: 1
/// %add3 = add i64 %e, %f; Hash: 2
/// bb1:
/// %sub = sub i32 %c, %d; Hash: 3
/// %add4 = add i32 %c, %d ; Hash: 1
/// \endcode
/// And produce a "numeric string representation" like so:
/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
///
/// TODO: This is very similar to the MachineOutliner, and should be
/// consolidated into the same interface.
struct IRInstructionMapper {
/// The starting illegal instruction number to map to.
///
/// Set to -3 for compatibility with DenseMapInfo<unsigned>.
unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
/// The next available integer to assign to a legal Instruction to.
unsigned LegalInstrNumber = 0;
/// Correspondence from IRInstructionData to unsigned integers.
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
InstructionIntegerMap;
/// Set if we added an illegal number in the previous step.
/// Since each illegal number is unique, we only need one of them between
/// each range of legal numbers. This lets us make sure we don't add more
/// than one illegal number per range.
bool AddedIllegalLastTime = false;
/// Marks whether we found a illegal instruction in the previous step.
bool CanCombineWithPrevInstr = false;
/// Marks whether we have found a set of instructions that is long enough
/// to be considered for similarity.
bool HaveLegalRange = false;
/// This allocator pointer is in charge of holding on to the IRInstructionData
/// so it is not deallocated until whatever external tool is using it is done
/// with the information.
SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
/// This allocator pointer is in charge of creating the IRInstructionDataList
/// so it is not deallocated until whatever external tool is using it is done
/// with the information.
SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
/// Get an allocated IRInstructionData struct using the InstDataAllocator.
///
/// \param I - The Instruction to wrap with IRInstructionData.
/// \param Legality - A boolean value that is true if the instruction is to
/// be considered for similarity, and false if not.
/// \param IDL - The InstructionDataList that the IRInstructionData is
/// inserted into.
/// \returns An allocated IRInstructionData struct.
IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
IRInstructionDataList &IDL);
/// Get an allocated IRInstructionDataList object using the IDLAllocator.
///
/// \returns An allocated IRInstructionDataList object.
IRInstructionDataList *allocateIRInstructionDataList();
IRInstructionDataList *IDL = nullptr;
/// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
/// determined by \p InstrType. Two Instructions are mapped to the same value
/// if they are close as defined by the InstructionData class above.
///
/// \param [in] BB - The BasicBlock to be mapped to integers.
/// \param [in,out] InstrList - Vector of IRInstructionData to append to.
/// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
void convertToUnsignedVec(BasicBlock &BB,
std::vector<IRInstructionData *> &InstrList,
std::vector<unsigned> &IntegerMapping);
/// Maps an Instruction to a legal integer.
///
/// \param [in] It - The Instruction to be mapped to an integer.
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
/// append to.
/// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
/// \returns The integer \p It was mapped to.
unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
std::vector<unsigned> &IntegerMappingForBB,
std::vector<IRInstructionData *> &InstrListForBB);
/// Maps an Instruction to an illegal integer.
///
/// \param [in] It - The \p Instruction to be mapped to an integer.
/// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
/// append to.
/// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
/// \param End - true if creating a dummy IRInstructionData at the end of a
/// basic block.
/// \returns The integer \p It was mapped to.
unsigned mapToIllegalUnsigned(
BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
: InstDataAllocator(IDA), IDLAllocator(IDLA) {
// Make sure that the implementation of DenseMapInfo<unsigned> hasn't
// changed.
assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
"DenseMapInfo<unsigned>'s empty key isn't -1!");
assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
static_cast<unsigned>(-2) &&
"DenseMapInfo<unsigned>'s tombstone key isn't -2!");
IDL = new (IDLAllocator->Allocate())
IRInstructionDataList();
}
/// Custom InstVisitor to classify different instructions for whether it can
/// be analyzed for similarity.
struct InstructionClassification
: public InstVisitor<InstructionClassification, InstrType> {
InstructionClassification() {}
// TODO: Determine a scheme to resolve when the label is similar enough.
InstrType visitBranchInst(BranchInst &BI) { return Illegal; }
// TODO: Determine a scheme to resolve when the labels are similar enough.
InstrType visitPHINode(PHINode &PN) { return Illegal; }
// TODO: Handle allocas.
InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
// We exclude variable argument instructions since variable arguments
// requires extra checking of the argument list.
InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
// We exclude all exception handling cases since they are so context
// dependent.
InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
// DebugInfo should be included in the regions, but should not be
// analyzed for similarity as it has no bearing on the outcome of the
// program.
InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
// TODO: Handle specific intrinsics.
InstrType visitIntrinsicInst(IntrinsicInst &II) { return Illegal; }
// We only allow call instructions where the function has a name and
// is not an indirect call.
InstrType visitCallInst(CallInst &CI) {
Function *F = CI.getCalledFunction();
if (!F || CI.isIndirectCall() || !F->hasName())
return Illegal;
return Legal;
}
// TODO: We do not current handle similarity that changes the control flow.
InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
// TODO: We do not current handle similarity that changes the control flow.
InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
// TODO: Handle interblock similarity.
InstrType visitTerminator(Instruction &I) { return Illegal; }
InstrType visitInstruction(Instruction &I) { return Legal; }
};
/// Maps an Instruction to a member of InstrType.
InstructionClassification InstClassifier;
};
/// This is a class that wraps a range of IRInstructionData from one point to
/// another in the vector of IRInstructionData, which is a region of the
/// program. It is also responsible for defining the structure within this
/// region of instructions.
///
/// The structure of a region is defined through a value numbering system
/// assigned to each unique value in a region at the creation of the
/// IRSimilarityCandidate.
///
/// For example, for each Instruction we add a mapping for each new
/// value seen in that Instruction.
/// IR: Mapping Added:
/// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2
/// %add2 = add i32 %a, %1 %add2 -> 4
/// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5
///
/// We can compare IRSimilarityCandidates against one another.
/// The \ref isSimilar function compares each IRInstructionData against one
/// another and if we have the same sequences of IRInstructionData that would
/// create the same hash, we have similar IRSimilarityCandidates.
///
/// We can also compare the structure of IRSimilarityCandidates. If we can
/// create a mapping of registers in the region contained by one
/// IRSimilarityCandidate to the region contained by different
/// IRSimilarityCandidate, they can be considered structurally similar.
///
/// IRSimilarityCandidate1: IRSimilarityCandidate2:
/// %add1 = add i32 %a, %b %add1 = add i32 %d, %e
/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
///
/// Can have the following mapping from candidate to candidate of:
/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
/// and can be considered similar.
///
/// IRSimilarityCandidate1: IRSimilarityCandidate2:
/// %add1 = add i32 %a, %b %add1 = add i32 %d, c4
/// %add2 = add i32 %a, %c %add2 = add i32 %d, %f
/// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4
///
/// We cannot create the same mapping since the use of c4 is not used in the
/// same way as %b or c2.
class IRSimilarityCandidate {
private:
/// The start index of this IRSimilarityCandidate in the instruction list.
unsigned StartIdx = 0;
/// The number of instructions in this IRSimilarityCandidate.
unsigned Len = 0;
/// The first instruction in this IRSimilarityCandidate.
IRInstructionData *FirstInst = nullptr;
/// The last instruction in this IRSimilarityCandidate.
IRInstructionData *LastInst = nullptr;
/// Global Value Numbering structures
/// @{
/// Stores the mapping of the value to the number assigned to it in the
/// IRSimilarityCandidate.
DenseMap<Value *, unsigned> ValueToNumber;
/// Stores the mapping of the number to the value assigned this number.
DenseMap<unsigned, Value *> NumberToValue;
/// @}
public:
/// \param StartIdx - The starting location of the region.
/// \param Len - The length of the region.
/// \param FirstInstIt - The starting IRInstructionData of the region.
/// \param LastInstIt - The ending IRInstructionData of the region.
IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
IRInstructionData *FirstInstIt,
IRInstructionData *LastInstIt);
/// \param A - The first IRInstructionCandidate to compare.
/// \param B - The second IRInstructionCandidate to compare.
/// \returns True when every IRInstructionData in \p A is similar to every
/// IRInstructionData in \p B.
static bool isSimilar(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B);
/// \param A - The first IRInstructionCandidate to compare.
/// \param B - The second IRInstructionCandidate to compare.
/// \returns True when every IRInstructionData in \p A is structurally similar
/// to \p B.
static bool compareStructure(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B);
struct OperandMapping {
/// The IRSimilarityCandidate that holds the instruction the OperVals were
/// pulled from.
const IRSimilarityCandidate &IRSC;
/// The operand values to be analyzed.
ArrayRef<Value *> &OperVals;
/// The current mapping of global value numbers from one IRSimilarityCandidate
/// to another IRSimilarityCandidate.
DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
};
/// Compare the operands in \p A and \p B and check that the current mapping
/// of global value numbers from \p A to \p B and \p B to \A is consistent.
///
/// \param A - The first IRInstructionCandidate, operand values, and current
/// operand mappings to compare.
/// \param B - The second IRInstructionCandidate, operand values, and current
/// operand mappings to compare.
/// \returns true if the IRSimilarityCandidates operands are compatible.
static bool compareNonCommutativeOperandMapping(OperandMapping A,
OperandMapping B);
/// Compare the operands in \p A and \p B and check that the current mapping
/// of global value numbers from \p A to \p B and \p B to \A is consistent
/// given that the operands are commutative.
///
/// \param A - The first IRInstructionCandidate, operand values, and current
/// operand mappings to compare.
/// \param B - The second IRInstructionCandidate, operand values, and current
/// operand mappings to compare.
/// \returns true if the IRSimilarityCandidates operands are compatible.
static bool compareCommutativeOperandMapping(OperandMapping A,
OperandMapping B);
/// Compare the start and end indices of the two IRSimilarityCandidates for
/// whether they overlap. If the start instruction of one
/// IRSimilarityCandidate is less than the end instruction of the other, and
/// the start instruction of one is greater than the start instruction of the
/// other, they overlap.
///
/// \returns true if the IRSimilarityCandidates do not have overlapping
/// instructions.
static bool overlap(const IRSimilarityCandidate &A,
const IRSimilarityCandidate &B);
/// \returns the number of instructions in this Candidate.
unsigned getLength() const { return Len; }
/// \returns the start index of this IRSimilarityCandidate.
unsigned getStartIdx() const { return StartIdx; }
/// \returns the end index of this IRSimilarityCandidate.
unsigned getEndIdx() const { return StartIdx + Len - 1; }
/// \returns The first IRInstructionData.
IRInstructionData *front() const { return FirstInst; }
/// \returns The last IRInstructionData.
IRInstructionData *back() const { return LastInst; }
/// \returns The first Instruction.
Instruction *frontInstruction() { return FirstInst->Inst; }
/// \returns The last Instruction
Instruction *backInstruction() { return LastInst->Inst; }
/// \returns The BasicBlock the IRSimilarityCandidate starts in.
BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
/// \returns The BasicBlock the IRSimilarityCandidate ends in.
BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
/// \returns The Function that the IRSimilarityCandidate is located in.
Function *getFunction() { return getStartBB()->getParent(); }
/// Finds the positive number associated with \p V if it has been mapped.
/// \param [in] V - the Value to find.
/// \returns The positive number corresponding to the value.
/// \returns None if not present.
Optional<unsigned> getGVN(Value *V) {
assert(V != nullptr && "Value is a nullptr?");
DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
if (VNIt == ValueToNumber.end())
return None;
return VNIt->second;
}
/// Finds the Value associate with \p Num if it exists.
/// \param [in] Num - the number to find.
/// \returns The Value associated with the number.
/// \returns None if not present.
Optional<Value *> fromGVN(unsigned Num) {
DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
if (VNIt == NumberToValue.end())
return None;
assert(VNIt->second != nullptr && "Found value is a nullptr!");
return VNIt->second;
}
/// \param RHS -The IRSimilarityCandidate to compare against
/// \returns true if the IRSimilarityCandidate is occurs after the
/// IRSimilarityCandidate in the program.
bool operator<(const IRSimilarityCandidate &RHS) const {
return getStartIdx() > RHS.getStartIdx();
}
using iterator = IRInstructionDataList::iterator;
iterator begin() const { return iterator(front()); }
iterator end() const { return std::next(iterator(back())); }
};
typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
typedef std::vector<SimilarityGroup> SimilarityGroupList;
/// This class puts all the pieces of the IRInstructionData,
/// IRInstructionMapper, IRSimilarityCandidate together.
///
/// It first feeds the Module or vector of Modules into the IRInstructionMapper,
/// and puts all the mapped instructions into a single long list of
/// IRInstructionData.
///
/// The list of unsigned integers is given to the Suffix Tree or similar data
/// structure to find repeated subsequences. We construct an
/// IRSimilarityCandidate for each instance of the subsequence. We compare them
/// against one another since These repeated subsequences can have different
/// structure. For each different kind of structure found, we create a
/// similarity group.
///
/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
/// structurally similar to one another, while C is different we would have two
/// SimilarityGroups:
///
/// SimilarityGroup 1: SimilarityGroup 2
/// A, B, D C
///
/// A list of the different similarity groups is then returned after
/// analyzing the module.
class IRSimilarityIdentifier {
public:
IRSimilarityIdentifier()
: Mapper(&InstDataAllocator, &InstDataListAllocator) {}
/// \param M the module to find similarity in.
explicit IRSimilarityIdentifier(Module &M)
: Mapper(&InstDataAllocator, &InstDataListAllocator) {
findSimilarity(M);
}
private:
/// Map the instructions in the module to unsigned integers, using mapping
/// already present in the Mapper if possible.
///
/// \param [in] M Module - To map to integers.
/// \param [in,out] InstrList - The vector to append IRInstructionData to.
/// \param [in,out] IntegerMapping - The vector to append integers to.
void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
std::vector<unsigned> &IntegerMapping);
/// Map the instructions in the modules vector to unsigned integers, using
/// mapping already present in the mapper if possible.
///
/// \param [in] Modules - The list of modules to use to populate the mapper
/// \param [in,out] InstrList - The vector to append IRInstructionData to.
/// \param [in,out] IntegerMapping - The vector to append integers to.
void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
std::vector<IRInstructionData *> &InstrList,
std::vector<unsigned> &IntegerMapping);
/// Find the similarity candidates in \p InstrList and corresponding
/// \p UnsignedVec
///
/// \param [in,out] InstrList - The vector to append IRInstructionData to.
/// \param [in,out] IntegerMapping - The vector to append integers to.
/// candidates found in the program.
void findCandidates(std::vector<IRInstructionData *> &InstrList,
std::vector<unsigned> &IntegerMapping);
public:
// Find the IRSimilarityCandidates in the \p Modules and group by structural
// similarity in a SimilarityGroup, each group is returned in a
// SimilarityGroupList.
//
// \param [in] Modules - the modules to analyze.
// \returns The groups of similarity ranges found in the modules.
SimilarityGroupList &
findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
// Find the IRSimilarityCandidates in the given Module grouped by structural
// similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
//
// \param [in] M - the module to analyze.
// \returns The groups of similarity ranges found in the module.
SimilarityGroupList &findSimilarity(Module &M);
// Clears \ref SimilarityCandidates if it is already filled by a previous run.
void resetSimilarityCandidates() {
// If we've already analyzed a Module or set of Modules, so we must clear
// the SimilarityCandidates to make sure we do not have only old values
// hanging around.
if (SimilarityCandidates.hasValue())
SimilarityCandidates->clear();
else
SimilarityCandidates = SimilarityGroupList();
}
// \returns The groups of similarity ranges found in the most recently passed
// set of modules.
Optional<SimilarityGroupList> &getSimilarity() {
return SimilarityCandidates;
}
private:
/// The allocator for IRInstructionData.
SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
/// The allocator for IRInstructionDataLists.
SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
/// Map Instructions to unsigned integers and wraps the Instruction in an
/// instance of IRInstructionData.
IRInstructionMapper Mapper;
/// The SimilarityGroups found with the most recent run of \ref
/// findSimilarity. None if there is no recent run.
Optional<SimilarityGroupList> SimilarityCandidates;
};
} // end namespace IRSimilarity
/// An analysis pass based on legacy pass manager that runs and returns
/// IRSimilarityIdentifier run on the Module.
class IRSimilarityIdentifierWrapperPass : public ModulePass {
std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
public:
static char ID;
IRSimilarityIdentifierWrapperPass();
IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
bool doInitialization(Module &M) override;
bool doFinalization(Module &M) override;
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
};
/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
/// Module.
class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
public:
typedef IRSimilarity::IRSimilarityIdentifier Result;
Result run(Module &M, ModuleAnalysisManager &);
private:
friend AnalysisInfoMixin<IRSimilarityAnalysis>;
static AnalysisKey Key;
};
/// Printer pass that uses \c IRSimilarityAnalysis.
class IRSimilarityAnalysisPrinterPass
: public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
raw_ostream &OS;
public:
explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
} // end namespace llvm
#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H