| //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // The machine combiner pass uses machine trace metrics to ensure the combined |
| // instructions does not lengthen the critical path or the resource depth. |
| //===----------------------------------------------------------------------===// |
| #define DEBUG_TYPE "machine-combiner" |
| |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/CodeGen/MachineDominators.h" |
| #include "llvm/CodeGen/MachineFunction.h" |
| #include "llvm/CodeGen/MachineFunctionPass.h" |
| #include "llvm/CodeGen/MachineInstrBuilder.h" |
| #include "llvm/CodeGen/MachineLoopInfo.h" |
| #include "llvm/CodeGen/MachineRegisterInfo.h" |
| #include "llvm/CodeGen/MachineTraceMetrics.h" |
| #include "llvm/CodeGen/Passes.h" |
| #include "llvm/CodeGen/TargetSchedule.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetInstrInfo.h" |
| #include "llvm/Target/TargetRegisterInfo.h" |
| #include "llvm/Target/TargetSubtargetInfo.h" |
| |
| using namespace llvm; |
| |
| STATISTIC(NumInstCombined, "Number of machineinst combined"); |
| |
| namespace { |
| class MachineCombiner : public MachineFunctionPass { |
| const TargetInstrInfo *TII; |
| const TargetRegisterInfo *TRI; |
| MCSchedModel SchedModel; |
| MachineRegisterInfo *MRI; |
| MachineTraceMetrics *Traces; |
| MachineTraceMetrics::Ensemble *MinInstr; |
| |
| TargetSchedModel TSchedModel; |
| |
| /// True if optimizing for code size. |
| bool OptSize; |
| |
| public: |
| static char ID; |
| MachineCombiner() : MachineFunctionPass(ID) { |
| initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); |
| } |
| void getAnalysisUsage(AnalysisUsage &AU) const override; |
| bool runOnMachineFunction(MachineFunction &MF) override; |
| const char *getPassName() const override { return "Machine InstCombiner"; } |
| |
| private: |
| bool doSubstitute(unsigned NewSize, unsigned OldSize); |
| bool combineInstructions(MachineBasicBlock *); |
| MachineInstr *getOperandDef(const MachineOperand &MO); |
| unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
| DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, |
| MachineTraceMetrics::Trace BlockTrace); |
| unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
| MachineTraceMetrics::Trace BlockTrace); |
| bool |
| preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root, |
| MachineTraceMetrics::Trace BlockTrace, |
| SmallVectorImpl<MachineInstr *> &InsInstrs, |
| DenseMap<unsigned, unsigned> &InstrIdxForVirtReg); |
| bool preservesResourceLen(MachineBasicBlock *MBB, |
| MachineTraceMetrics::Trace BlockTrace, |
| SmallVectorImpl<MachineInstr *> &InsInstrs, |
| SmallVectorImpl<MachineInstr *> &DelInstrs); |
| void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs, |
| SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC); |
| }; |
| } |
| |
| char MachineCombiner::ID = 0; |
| char &llvm::MachineCombinerID = MachineCombiner::ID; |
| |
| INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", |
| "Machine InstCombiner", false, false) |
| INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) |
| INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", |
| false, false) |
| |
| void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { |
| AU.setPreservesCFG(); |
| AU.addPreserved<MachineDominatorTree>(); |
| AU.addPreserved<MachineLoopInfo>(); |
| AU.addRequired<MachineTraceMetrics>(); |
| AU.addPreserved<MachineTraceMetrics>(); |
| MachineFunctionPass::getAnalysisUsage(AU); |
| } |
| |
| MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { |
| MachineInstr *DefInstr = nullptr; |
| // We need a virtual register definition. |
| if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) |
| DefInstr = MRI->getUniqueVRegDef(MO.getReg()); |
| // PHI's have no depth etc. |
| if (DefInstr && DefInstr->isPHI()) |
| DefInstr = nullptr; |
| return DefInstr; |
| } |
| |
| /// Computes depth of instructions in vector \InsInstr. |
| /// |
| /// \param InsInstrs is a vector of machine instructions |
| /// \param InstrIdxForVirtReg is a dense map of virtual register to index |
| /// of defining machine instruction in \p InsInstrs |
| /// \param BlockTrace is a trace of machine instructions |
| /// |
| /// \returns Depth of last instruction in \InsInstrs ("NewRoot") |
| unsigned |
| MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, |
| DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, |
| MachineTraceMetrics::Trace BlockTrace) { |
| |
| SmallVector<unsigned, 16> InstrDepth; |
| assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); |
| |
| // For each instruction in the new sequence compute the depth based on the |
| // operands. Use the trace information when possible. For new operands which |
| // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth |
| for (auto *InstrPtr : InsInstrs) { // for each Use |
| unsigned IDepth = 0; |
| DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";); |
| for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) { |
| const MachineOperand &MO = InstrPtr->getOperand(i); |
| // Check for virtual register operand. |
| if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) |
| continue; |
| if (!MO.isUse()) |
| continue; |
| unsigned DepthOp = 0; |
| unsigned LatencyOp = 0; |
| DenseMap<unsigned, unsigned>::iterator II = |
| InstrIdxForVirtReg.find(MO.getReg()); |
| if (II != InstrIdxForVirtReg.end()) { |
| // Operand is new virtual register not in trace |
| assert(II->second < InstrDepth.size() && "Bad Index"); |
| MachineInstr *DefInstr = InsInstrs[II->second]; |
| assert(DefInstr && |
| "There must be a definition for a new virtual register"); |
| DepthOp = InstrDepth[II->second]; |
| LatencyOp = TSchedModel.computeOperandLatency( |
| DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), |
| InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); |
| } else { |
| MachineInstr *DefInstr = getOperandDef(MO); |
| if (DefInstr) { |
| DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth; |
| LatencyOp = TSchedModel.computeOperandLatency( |
| DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), |
| InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); |
| } |
| } |
| IDepth = std::max(IDepth, DepthOp + LatencyOp); |
| } |
| InstrDepth.push_back(IDepth); |
| } |
| unsigned NewRootIdx = InsInstrs.size() - 1; |
| return InstrDepth[NewRootIdx]; |
| } |
| |
| /// Computes instruction latency as max of latency of defined operands. |
| /// |
| /// \param Root is a machine instruction that could be replaced by NewRoot. |
| /// It is used to compute a more accurate latency information for NewRoot in |
| /// case there is a dependent instruction in the same trace (\p BlockTrace) |
| /// \param NewRoot is the instruction for which the latency is computed |
| /// \param BlockTrace is a trace of machine instructions |
| /// |
| /// \returns Latency of \p NewRoot |
| unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, |
| MachineTraceMetrics::Trace BlockTrace) { |
| |
| assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); |
| |
| // Check each definition in NewRoot and compute the latency |
| unsigned NewRootLatency = 0; |
| |
| for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) { |
| const MachineOperand &MO = NewRoot->getOperand(i); |
| // Check for virtual register operand. |
| if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) |
| continue; |
| if (!MO.isDef()) |
| continue; |
| // Get the first instruction that uses MO |
| MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); |
| RI++; |
| MachineInstr *UseMO = RI->getParent(); |
| unsigned LatencyOp = 0; |
| if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) { |
| LatencyOp = TSchedModel.computeOperandLatency( |
| NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, |
| UseMO->findRegisterUseOperandIdx(MO.getReg())); |
| } else { |
| LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); |
| } |
| NewRootLatency = std::max(NewRootLatency, LatencyOp); |
| } |
| return NewRootLatency; |
| } |
| |
| /// True when the new instruction sequence does not |
| /// lengthen the critical path. The DAGCombine code sequence ends in MI |
| /// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A |
| /// necessary condition for the new sequence to replace the old sequence is that |
| /// it cannot lengthen the critical path. This is decided by the formula |
| /// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)). |
| /// The slack is the number of cycles Root can be delayed before the critical |
| /// patch becomes longer. |
| bool MachineCombiner::preservesCriticalPathLen( |
| MachineBasicBlock *MBB, MachineInstr *Root, |
| MachineTraceMetrics::Trace BlockTrace, |
| SmallVectorImpl<MachineInstr *> &InsInstrs, |
| DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { |
| |
| assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n"); |
| // NewRoot is the last instruction in the \p InsInstrs vector |
| // Get depth and latency of NewRoot |
| unsigned NewRootIdx = InsInstrs.size() - 1; |
| MachineInstr *NewRoot = InsInstrs[NewRootIdx]; |
| unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); |
| unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace); |
| |
| // Get depth, latency and slack of Root |
| unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; |
| unsigned RootLatency = TSchedModel.computeInstrLatency(Root); |
| unsigned RootSlack = BlockTrace.getInstrSlack(Root); |
| |
| DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; |
| dbgs() << " NewRootDepth: " << NewRootDepth |
| << " NewRootLatency: " << NewRootLatency << "\n"; |
| dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency |
| << " RootSlack: " << RootSlack << "\n"; |
| dbgs() << " NewRootDepth + NewRootLatency " |
| << NewRootDepth + NewRootLatency << "\n"; |
| dbgs() << " RootDepth + RootLatency + RootSlack " |
| << RootDepth + RootLatency + RootSlack << "\n";); |
| |
| /// True when the new sequence does not lenghten the critical path. |
| return ((NewRootDepth + NewRootLatency) <= |
| (RootDepth + RootLatency + RootSlack)); |
| } |
| |
| /// helper routine to convert instructions into SC |
| void MachineCombiner::instr2instrSC( |
| SmallVectorImpl<MachineInstr *> &Instrs, |
| SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) { |
| for (auto *InstrPtr : Instrs) { |
| unsigned Opc = InstrPtr->getOpcode(); |
| unsigned Idx = TII->get(Opc).getSchedClass(); |
| const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx); |
| InstrsSC.push_back(SC); |
| } |
| } |
| /// True when the new instructions do not increase resource length |
| bool MachineCombiner::preservesResourceLen( |
| MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, |
| SmallVectorImpl<MachineInstr *> &InsInstrs, |
| SmallVectorImpl<MachineInstr *> &DelInstrs) { |
| |
| // Compute current resource length |
| |
| //ArrayRef<const MachineBasicBlock *> MBBarr(MBB); |
| SmallVector <const MachineBasicBlock *, 1> MBBarr; |
| MBBarr.push_back(MBB); |
| unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr); |
| |
| // Deal with SC rather than Instructions. |
| SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC; |
| SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC; |
| |
| instr2instrSC(InsInstrs, InsInstrsSC); |
| instr2instrSC(DelInstrs, DelInstrsSC); |
| |
| ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC); |
| ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC); |
| |
| // Compute new resource length |
| unsigned ResLenAfterCombine = |
| BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr); |
| |
| DEBUG(dbgs() << "RESOURCE DATA: \n"; |
| dbgs() << " resource len before: " << ResLenBeforeCombine |
| << " after: " << ResLenAfterCombine << "\n";); |
| |
| return ResLenAfterCombine <= ResLenBeforeCombine; |
| } |
| |
| /// \returns true when new instruction sequence should be generated |
| /// independent if it lengthens critical path or not |
| bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { |
| if (OptSize && (NewSize < OldSize)) |
| return true; |
| if (!TSchedModel.hasInstrSchedModel()) |
| return true; |
| return false; |
| } |
| |
| /// Substitute a slow code sequence with a faster one by |
| /// evaluating instruction combining pattern. |
| /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction |
| /// combining based on machine trace metrics. Only combine a sequence of |
| /// instructions when this neither lengthens the critical path nor increases |
| /// resource pressure. When optimizing for codesize always combine when the new |
| /// sequence is shorter. |
| bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { |
| bool Changed = false; |
| DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); |
| |
| auto BlockIter = MBB->begin(); |
| |
| while (BlockIter != MBB->end()) { |
| auto &MI = *BlockIter++; |
| |
| DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); |
| SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Pattern; |
| // The motivating example is: |
| // |
| // MUL Other MUL_op1 MUL_op2 Other |
| // \ / \ | / |
| // ADD/SUB => MADD/MSUB |
| // (=Root) (=NewRoot) |
| |
| // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is |
| // usually beneficial for code size it unfortunately can hurt performance |
| // when the ADD is on the critical path, but the MUL is not. With the |
| // substitution the MUL becomes part of the critical path (in form of the |
| // MADD) and can lengthen it on architectures where the MADD latency is |
| // longer than the ADD latency. |
| // |
| // For each instruction we check if it can be the root of a combiner |
| // pattern. Then for each pattern the new code sequence in form of MI is |
| // generated and evaluated. When the efficiency criteria (don't lengthen |
| // critical path, don't use more resources) is met the new sequence gets |
| // hooked up into the basic block before the old sequence is removed. |
| // |
| // The algorithm does not try to evaluate all patterns and pick the best. |
| // This is only an artificial restriction though. In practice there is |
| // mostly one pattern and hasPattern() can order patterns based on an |
| // internal cost heuristic. |
| |
| if (TII->hasPattern(MI, Pattern)) { |
| for (auto P : Pattern) { |
| SmallVector<MachineInstr *, 16> InsInstrs; |
| SmallVector<MachineInstr *, 16> DelInstrs; |
| DenseMap<unsigned, unsigned> InstrIdxForVirtReg; |
| if (!MinInstr) |
| MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); |
| MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); |
| Traces->verifyAnalysis(); |
| TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, |
| InstrIdxForVirtReg); |
| // Found pattern, but did not generate alternative sequence. |
| // This can happen e.g. when an immediate could not be materialized |
| // in a single instruction. |
| if (!InsInstrs.size()) |
| continue; |
| // Substitute when we optimize for codesize and the new sequence has |
| // fewer instructions OR |
| // the new sequence neither lenghten the critical path nor increases |
| // resource pressure. |
| if (doSubstitute(InsInstrs.size(), DelInstrs.size()) || |
| (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, |
| InstrIdxForVirtReg) && |
| preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { |
| for (auto *InstrPtr : InsInstrs) |
| MBB->insert((MachineBasicBlock::iterator) & MI, |
| (MachineInstr *)InstrPtr); |
| for (auto *InstrPtr : DelInstrs) |
| InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); |
| |
| Changed = true; |
| ++NumInstCombined; |
| |
| Traces->invalidate(MBB); |
| Traces->verifyAnalysis(); |
| // Eagerly stop after the first pattern fired |
| break; |
| } else { |
| // Cleanup instructions of the alternative code sequence. There is no |
| // use for them. |
| for (auto *InstrPtr : InsInstrs) { |
| MachineFunction *MF = MBB->getParent(); |
| MF->DeleteMachineInstr((MachineInstr *)InstrPtr); |
| } |
| } |
| InstrIdxForVirtReg.clear(); |
| } |
| } |
| } |
| |
| return Changed; |
| } |
| |
| bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { |
| const TargetSubtargetInfo &STI = MF.getSubtarget(); |
| TII = STI.getInstrInfo(); |
| TRI = STI.getRegisterInfo(); |
| SchedModel = STI.getSchedModel(); |
| TSchedModel.init(SchedModel, &STI, TII); |
| MRI = &MF.getRegInfo(); |
| Traces = &getAnalysis<MachineTraceMetrics>(); |
| MinInstr = 0; |
| |
| OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); |
| |
| DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); |
| if (!TII->useMachineCombiner()) { |
| DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n"); |
| return false; |
| } |
| |
| bool Changed = false; |
| |
| // Try to combine instructions. |
| for (auto &MBB : MF) |
| Changed |= combineInstructions(&MBB); |
| |
| return Changed; |
| } |