blob: 2b512284f9409aa810f99d2599cdf3617e75f4b7 [file] [log] [blame]
//===-- BenchmarkRunner.cpp -------------------------------------*- C++ -*-===//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <cmath>
#include <memory>
#include <string>
#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "Error.h"
#include "MCInstrDescView.h"
#include "MmapUtils.h"
#include "PerfHelper.h"
#include "SubprocessMemory.h"
#include "Target.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SystemZ/zOSSupport.h"
#ifdef __linux__
#include <perfmon/perf_event.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#if defined(__GLIBC__) && __has_include(<sys/rseq.h>) && defined(HAVE_BUILTIN_THREAD_POINTER)
#include <sys/rseq.h>
#if defined(RSEQ_SIG) && defined(SYS_rseq)
#endif // __linux__
namespace llvm {
namespace exegesis {
// Constructs a runner by storing the LLVM state, the benchmark mode, the
// phase selector, the execution mode and the requested validation counters,
// and by allocating a fresh ScratchSpace via std::make_unique.
// NOTE(review): ValCounters is an ArrayRef (a non-owning view); whether the
// ValidationCounters member copies the data depends on its declared type,
// which is not visible here -- confirm the caller's storage outlives the
// runner if it is also a view.
BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode,
BenchmarkPhaseSelectorE BenchmarkPhaseSelector,
ExecutionModeE ExecutionMode,
ArrayRef<ValidationEvent> ValCounters)
: State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector),
ExecutionMode(ExecutionMode), ValidationCounters(ValCounters),
Scratch(std::make_unique<ScratchSpace>()) {}
// Destructor defined out of line and defaulted.
BenchmarkRunner::~BenchmarkRunner() = default;
// Adds NewValues element-wise into *Result. When *Result is shorter than
// NewValues it is first extended with zeros so that every incoming value has
// a slot; entries of *Result past NewValues.size() are left untouched.
void BenchmarkRunner::FunctionExecutor::accumulateCounterValues(
    const llvm::SmallVectorImpl<int64_t> &NewValues,
    llvm::SmallVectorImpl<int64_t> *Result) {
  const size_t IncomingCount = NewValues.size();
  if (Result->size() < IncomingCount)
    Result->resize(IncomingCount, 0);
  size_t Slot = 0;
  for (const int64_t Value : NewValues)
    (*Result)[Slot++] += Value;
// NOTE(review): the declarator line that names this function appears to be
// missing between the return type and the parameter list below -- confirm
// against the upstream file.
//
// Splits Counters on '+', trims each resulting name, measures each one through
// runWithCounter, and returns the element-wise sum of the per-counter results.
// The first error produced by runWithCounter is returned immediately.
// ValidationCounters and ValidationCounterValues are forwarded unchanged to
// every runWithCounter call.
Expected<llvm::SmallVector<int64_t, 4>>
const char *Counters, ArrayRef<const char *> ValidationCounters,
SmallVectorImpl<int64_t> &ValidationCounterValues) const {
// We sum counts when there are several counters for a single ProcRes
// (e.g. P23 on SandyBridge).
llvm::SmallVector<int64_t, 4> CounterValues;
SmallVector<StringRef, 2> CounterNames;
StringRef(Counters).split(CounterNames, '+');
for (auto &CounterName : CounterNames) {
// Strip surrounding whitespace so that "A + B" and "A+B" name the same
// counters.
CounterName = CounterName.trim();
Expected<SmallVector<int64_t, 4>> ValueOrError = runWithCounter(
CounterName, ValidationCounters, ValidationCounterValues);
if (!ValueOrError)
return ValueOrError.takeError();
accumulateCounterValues(ValueOrError.get(), &CounterValues);
return CounterValues;
namespace {
// Function executor that measures a snippet from within the current process.
// The measured run is wrapped in a CrashRecoveryContext so that a crash is
// reported as a SnippetSignal error.
// NOTE(review): access specifiers, closing braces, the body of the RunSafely
// lambda and the LLVM_ON_UNIX #ifdef/#else lines are not present in this
// view -- confirm against the upstream file before editing the code.
class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
// Factory: builds an ExecutableFunction from Obj using a target machine
// created from State and wraps it in a heap-allocated executor. Any error
// from ExecutableFunction::create is forwarded to the caller.
static Expected<std::unique_ptr<InProcessFunctionExecutorImpl>>
create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
BenchmarkRunner::ScratchSpace *Scratch) {
Expected<ExecutableFunction> EF =
ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
if (!EF)
return EF.takeError();
return std::unique_ptr<InProcessFunctionExecutorImpl>(
new InProcessFunctionExecutorImpl(State, std::move(*EF), Scratch));
// Stores the state reference, takes the executable function by move and
// keeps the caller-supplied scratch pointer as-is.
InProcessFunctionExecutorImpl(const LLVMState &State,
ExecutableFunction Function,
BenchmarkRunner::ScratchSpace *Scratch)
: State(State), Function(std::move(Function)), Scratch(Scratch) {}
// Adds NewValues element-wise into *Result, growing *Result with zeros first
// if it is shorter than NewValues.
static void
accumulateCounterValues(const llvm::SmallVector<int64_t, 4> &NewValues,
llvm::SmallVector<int64_t, 4> *Result) {
const size_t NumValues = std::max(NewValues.size(), Result->size());
if (NumValues > Result->size())
Result->resize(NumValues, 0);
for (size_t I = 0, End = NewValues.size(); I < End; ++I)
(*Result)[I] += NewValues[I];
// Creates a counter group for CounterName (together with ValidationCounters)
// through the exegesis target, performs the run under crash recovery, then
// copies the validation counter readings into ValidationCounterValues and
// returns the result of Counter->readOrError(...).
// ValidationCounterValues is written by index and never resized here, so it
// must already hold at least as many elements as the counter group reports.
Expected<llvm::SmallVector<int64_t, 4>> runWithCounter(
StringRef CounterName, ArrayRef<const char *> ValidationCounters,
SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
const ExegesisTarget &ET = State.getExegesisTarget();
char *const ScratchPtr = Scratch->ptr();
auto CounterOrError =
ET.createCounter(CounterName, State, ValidationCounters);
if (!CounterOrError)
return CounterOrError.takeError();
// Non-owning view of the counter group; CounterOrError keeps ownership for
// the rest of this function.
pfm::CounterGroup *Counter = CounterOrError.get().get();
// NOTE(review): presumably a guard that restores target state when it goes
// out of scope -- confirm against ExegesisTarget::withSavedState.
auto PS = ET.withSavedState();
CrashRecoveryContext CRC;
const bool Crashed = !CRC.RunSafely([this, Counter, ScratchPtr]() {
if (Crashed) {
// See "Exit Status for Commands":
// The signal number reported is CRC.RetCode minus kSigOffset.
constexpr const int kSigOffset = 128;
return make_error<SnippetSignal>(CRC.RetCode - kSigOffset);
// The exit code of the process on windows is not meaningful as a
// signal, so simply pass in -1 as the signal into the error.
return make_error<SnippetSignal>(-1);
#endif // LLVM_ON_UNIX
auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
if (!ValidationValuesOrErr)
return ValidationValuesOrErr.takeError();
ArrayRef RealValidationValues = *ValidationValuesOrErr;
for (size_t I = 0; I < RealValidationValues.size(); ++I)
ValidationCounterValues[I] = RealValidationValues[I];
return Counter->readOrError(Function.getFunctionBytes());
// Reference to the caller's LLVMState; it must outlive this executor.
const LLVMState &State;
// The executable snippet, held by value.
const ExecutableFunction Function;
// Scratch space pointer supplied at construction; it is only dereferenced
// (never deleted) in the code visible here.
BenchmarkRunner::ScratchSpace *const Scratch;
#ifdef __linux__
// The following class implements a function executor that executes the
// benchmark code within a subprocess rather than within the main llvm-exegesis
// process. This allows for much more control over the execution context of the
// snippet, particularly with regard to memory. This class performs all the
// necessary functions to create the subprocess, execute the snippet in the
// subprocess, and report results/handle errors.
class SubProcessFunctionExecutorImpl
: public BenchmarkRunner::FunctionExecutor {
// Factory: builds an ExecutableFunction from Obj using a target machine
// created from State and wraps it, together with Key, in a heap-allocated
// executor. Any error from ExecutableFunction::create is forwarded.
static Expected<std::unique_ptr<SubProcessFunctionExecutorImpl>>
create(const LLVMState &State, object::OwningBinary<object::ObjectFile> Obj,
const BenchmarkKey &Key) {
Expected<ExecutableFunction> EF =
ExecutableFunction::create(State.createTargetMachine(), std::move(Obj));
if (!EF)
return EF.takeError();
return std::unique_ptr<SubProcessFunctionExecutorImpl>(
new SubProcessFunctionExecutorImpl(State, std::move(*EF), Key));
// Stores the state reference, takes the executable function by move and binds
// Key by reference (the Key member is declared as a const reference), so the
// BenchmarkKey passed in must outlive this executor.
SubProcessFunctionExecutorImpl(const LLVMState &State,
ExecutableFunction Function,
const BenchmarkKey &Key)
: State(State), Function(std::move(Function)), Key(Key) {}
// Exit codes the benchmarking child process uses to report which setup step
// failed; they are translated to text by childProcessExitCodeToString.
// NOTE(review): only the first enumerator is present in this view, although
// RSeqDisableFailed, FunctionDataMappingFailed and AuxiliaryMemorySetupFailed
// are referenced by childProcessExitCodeToString -- confirm against upstream.
enum ChildProcessExitCodeE {
CounterFDReadFailed = 1,
// Maps a child-process exit code to a human-readable description. Codes that
// match none of the listed ChildProcessExitCodeE values yield the generic
// "unknown exit code" text.
// NOTE(review): the `default:` label and closing braces are not present in
// this view.
StringRef childProcessExitCodeToString(int ExitCode) const {
switch (ExitCode) {
case ChildProcessExitCodeE::CounterFDReadFailed:
return "Counter file descriptor read failed";
case ChildProcessExitCodeE::RSeqDisableFailed:
return "Disabling restartable sequences failed";
case ChildProcessExitCodeE::FunctionDataMappingFailed:
return "Failed to map memory for assembled snippet";
case ChildProcessExitCodeE::AuxiliaryMemorySetupFailed:
return "Failed to setup auxiliary memory";
return "Child process returned with unknown exit code";
// Sends the file descriptor FD over the socket SocketFD as SCM_RIGHTS
// ancillary data using sendmsg. No regular payload (iovec) is attached; the
// message consists of the control block only.
// Returns a Failure error if sendmsg reports an error, success otherwise.
// NOTE(review): the tail of the error-message expression is missing from this
// view.
Error sendFileDescriptorThroughSocket(int SocketFD, int FD) const {
struct msghdr Message = {};
// Control buffer sized for exactly one int-sized ancillary payload.
char Buffer[CMSG_SPACE(sizeof(FD))];
memset(Buffer, 0, sizeof(Buffer));
Message.msg_control = Buffer;
Message.msg_controllen = sizeof(Buffer);
struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
ControlMessage->cmsg_level = SOL_SOCKET;
ControlMessage->cmsg_type = SCM_RIGHTS;
ControlMessage->cmsg_len = CMSG_LEN(sizeof(FD));
memcpy(CMSG_DATA(ControlMessage), &FD, sizeof(FD));
Message.msg_controllen = CMSG_SPACE(sizeof(FD));
ssize_t BytesWritten = sendmsg(SocketFD, &Message, 0);
if (BytesWritten < 0)
return make_error<Failure>("Failed to write FD to socket: " +
return Error::success();
// Receives one message from SocketFD with recvmsg and extracts an int-sized
// file descriptor from its first control message.
// Returns a Failure error if recvmsg fails or if the control message length
// is not CMSG_LEN(sizeof(int)); otherwise returns the received descriptor.
// NOTE(review): the tail of the first error-message expression is missing
// from this view.
Expected<int> getFileDescriptorFromSocket(int SocketFD) const {
struct msghdr Message = {};
char ControlBuffer[256];
Message.msg_control = ControlBuffer;
Message.msg_controllen = sizeof(ControlBuffer);
ssize_t BytesRead = recvmsg(SocketFD, &Message, 0);
if (BytesRead < 0)
return make_error<Failure>("Failed to read FD from socket: " +
// NOTE(review): CMSG_FIRSTHDR yields a null pointer when the message carries
// no control data; the result is dereferenced below without a null check --
// consider guarding it.
struct cmsghdr *ControlMessage = CMSG_FIRSTHDR(&Message);
int FD;
if (ControlMessage->cmsg_len != CMSG_LEN(sizeof(FD)))
return make_error<Failure>("Failed to get correct number of bytes for "
"file descriptor from socket.");
memcpy(&FD, CMSG_DATA(ControlMessage), sizeof(FD));
return FD;
// Forks a child process that runs the snippet, measures CounterName on that
// child from the parent, and reports the outcome.
//
// Parent-side sequence visible here: create a socket pair, set up subprocess
// memory from Key.MemoryValues, fork, create the counter group for the child
// PID, ptrace-attach and continue the child, send it the counter file
// descriptor, then wait for it to finish.
//
// On a zero exit status the counter reading is moved into CounterValues and
// the validation readings are copied by index into ValidationCounterValues
// (which is not resized here). A non-zero exit status or a signal in the child
// is turned into an Error.
// NOTE(review): most error-message expressions and all closing braces are
// truncated/missing in this view -- confirm against upstream before changing
// the code.
Error createSubProcessAndRunBenchmark(
StringRef CounterName, SmallVectorImpl<int64_t> &CounterValues,
ArrayRef<const char *> ValidationCounters,
SmallVectorImpl<int64_t> &ValidationCounterValues) const {
// Despite the name, this is an AF_UNIX datagram socket pair rather than a
// pipe.
int PipeFiles[2];
int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles);
if (PipeSuccessOrErr != 0) {
return make_error<Failure>(
"Failed to create a pipe for interprocess communication between "
"llvm-exegesis and the benchmarking subprocess: " +
SubprocessMemory SPMemory;
Error MemoryInitError = SPMemory.initializeSubprocessMemory(getpid());
if (MemoryInitError)
return MemoryInitError;
Error AddMemDefError =
SPMemory.addMemoryDefinition(Key.MemoryValues, getpid());
if (AddMemDefError)
return AddMemDefError;
// fork() returns 0 in the child, the child's PID in the parent, and -1 on
// failure.
pid_t ParentOrChildPID = fork();
if (ParentOrChildPID == -1) {
return make_error<Failure>("Failed to create child process: " +
if (ParentOrChildPID == 0) {
// We are in the child process, close the write end of the pipe
// Unregister handlers, signal handling is now handled through ptrace in
// the host process
prepareAndRunBenchmark(PipeFiles[0], Key);
// The child process terminates in the above function, so we should never
// get to this point.
llvm_unreachable("Child process didn't exit when expected.");
// From here on we are in the parent; ParentOrChildPID is the child's PID.
const ExegesisTarget &ET = State.getExegesisTarget();
auto CounterOrError = ET.createCounter(
CounterName, State, ValidationCounters, ParentOrChildPID);
if (!CounterOrError)
return CounterOrError.takeError();
pfm::CounterGroup *Counter = CounterOrError.get().get();
// Make sure to attach to the process (and wait for the sigstop to be
// delivered and for the process to continue) before we write to the counter
// file descriptor. Attaching to the process before writing to the socket
// ensures that the subprocess at most has blocked on the read call. If we
// attach afterwards, the subprocess might exit before we get to the attach
// call due to effects like scheduler contention, introducing transient
// failures.
if (ptrace(PTRACE_ATTACH, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>("Failed to attach to the child process: " +
if (wait(NULL) == -1) {
return make_error<Failure>(
"Failed to wait for child process to stop after attaching: " +
if (ptrace(PTRACE_CONT, ParentOrChildPID, NULL, NULL) != 0)
return make_error<Failure>(
"Failed to continue execution of the child process: " +
int CounterFileDescriptor = Counter->getFileDescriptor();
Error SendError =
sendFileDescriptorThroughSocket(PipeFiles[1], CounterFileDescriptor);
if (SendError)
return SendError;
int ChildStatus;
if (wait(&ChildStatus) == -1) {
return make_error<Failure>(
"Waiting for the child process to complete failed: " +
if (WIFEXITED(ChildStatus)) {
int ChildExitCode = WEXITSTATUS(ChildStatus);
if (ChildExitCode == 0) {
// The child exited successfully, read counter values and return
// success
auto CounterValueOrErr = Counter->readOrError();
if (!CounterValueOrErr)
return CounterValueOrErr.takeError();
CounterValues = std::move(*CounterValueOrErr);
auto ValidationValuesOrErr = Counter->readValidationCountersOrError();
if (!ValidationValuesOrErr)
return ValidationValuesOrErr.takeError();
ArrayRef RealValidationValues = *ValidationValuesOrErr;
for (size_t I = 0; I < RealValidationValues.size(); ++I)
ValidationCounterValues[I] = RealValidationValues[I];
return Error::success();
// The child exited, but not successfully
return make_error<Failure>(
"Child benchmarking process exited with non-zero exit code: " +
// An error was encountered running the snippet, process it
siginfo_t ChildSignalInfo;
if (ptrace(PTRACE_GETSIGINFO, ParentOrChildPID, NULL, &ChildSignalInfo) ==
-1) {
return make_error<Failure>("Getting signal info from the child failed: " +
// SIGSEGV gets a dedicated error type; every other signal is reported as a
// generic SnippetSignal.
if (ChildSignalInfo.si_signo == SIGSEGV)
return make_error<SnippetSegmentationFault>(
return make_error<SnippetSignal>(ChildSignalInfo.si_signo);
// Disables core dumps for the calling process by lowering the soft
// RLIMIT_CORE limit to zero. This is best effort: if either system call
// fails the limits are simply left as they were.
void disableCoreDumps() const {
  // Read the existing limits first so that rlim_max holds a defined value.
  // Handing setrlimit an indeterminate rlim_max could make the call fail
  // (for example by appearing to raise the hard limit) and silently leave
  // core dumps enabled.
  struct rlimit CoreLimit;
  if (getrlimit(RLIMIT_CORE, &CoreLimit) != 0)
    return;
  CoreLimit.rlim_cur = 0;
  setrlimit(RLIMIT_CORE, &CoreLimit);
// Child-side entry point; declared [[noreturn]].
//
// Steps visible here: obtain the counter file descriptor, unregister the
// glibc rseq area, map a private copy of the snippet bytes (at
// Key.SnippetAddress when that is non-zero), make the copy read+execute, set
// up auxiliary memory from Key.MemoryValues, and finally call the mapped code
// through a `void (*)(size_t, int)` function pointer.
// NOTE(review): several statements are truncated in this view (the
// right-hand sides of the Expected<int> initialisers, the bodies of the
// failure branches, the MapFlags definition, the memcpy source and the second
// call argument) -- confirm against upstream before changing the code.
[[noreturn]] void prepareAndRunBenchmark(int Pipe,
const BenchmarkKey &Key) const {
// Disable core dumps in the child process as otherwise every time we
// encounter an execution failure like a segmentation fault, we will create
// a core dump. We report the information directly rather than require the
// user to inspect a core dump.
// The following occurs within the benchmarking subprocess
pid_t ParentPID = getppid();
Expected<int> CounterFileDescriptorOrError =
if (!CounterFileDescriptorOrError)
int CounterFileDescriptor = *CounterFileDescriptorOrError;
// Glibc versions greater than 2.35 automatically call rseq during
// initialization. Unmapping the region that glibc sets up for this causes
// segfaults in the program. Unregister the rseq region so that we can safely
// unmap it later.
long RseqDisableOutput =
syscall(SYS_rseq, (intptr_t)__builtin_thread_pointer() + __rseq_offset,
if (RseqDisableOutput != 0)
// The frontend that generates the memory annotation structures should
// validate that the address to map the snippet in at is a multiple of
// the page size. Assert that this is true here.
assert(Key.SnippetAddress % getpagesize() == 0 &&
"The snippet address needs to be aligned to a page boundary.");
size_t FunctionDataCopySize = this->Function.FunctionBytes.size();
// A null hint lets the kernel choose the address unless the key requests a
// specific one.
void *MapAddress = NULL;
if (Key.SnippetAddress != 0) {
MapAddress = reinterpret_cast<void *>(Key.SnippetAddress);
// Map the copy writable first so the snippet bytes can be copied in.
char *FunctionDataCopy =
(char *)mmap(MapAddress, FunctionDataCopySize, PROT_READ | PROT_WRITE,
MapFlags, 0, 0);
// mmap signals failure with MAP_FAILED, i.e. (void *)-1.
if ((intptr_t)FunctionDataCopy == -1)
memcpy(FunctionDataCopy, this->,
// Drop write permission and add execute permission before running the copy.
mprotect(FunctionDataCopy, FunctionDataCopySize, PROT_READ | PROT_EXEC);
Expected<int> AuxMemFDOrError =
Key.MemoryValues, ParentPID, CounterFileDescriptor);
if (!AuxMemFDOrError)
// Jump into the mapped snippet, passing the size of the mapping as the first
// argument.
((void (*)(size_t, int))(intptr_t)FunctionDataCopy)(FunctionDataCopySize,
// Measures CounterName by running the snippet in a subprocess (see
// createSubProcessAndRunBenchmark). Returns the counter values on success,
// or the error reported by the subprocess run. Validation counter readings
// are written into ValidationCounterValues by the callee.
Expected<llvm::SmallVector<int64_t, 4>> runWithCounter(
    StringRef CounterName, ArrayRef<const char *> ValidationCounters,
    SmallVectorImpl<int64_t> &ValidationCounterValues) const override {
  // Starts as a single zero; replaced by the real reading when the run
  // succeeds.
  SmallVector<int64_t, 4> Measured(1, 0);
  if (Error RunError = createSubProcessAndRunBenchmark(
          CounterName, Measured, ValidationCounters, ValidationCounterValues))
    return std::move(RunError);
  return Measured;
// Reference to the caller's LLVMState; it must outlive this executor.
const LLVMState &State;
// The executable snippet, held by value.
const ExecutableFunction Function;
// Reference to the caller's benchmark key (not a copy); it must outlive this
// executor.
const BenchmarkKey &Key;
#endif // __linux__
} // namespace
// Assembles the instructions of BC.Key, repeated by Repetitor according to
// MinInstructions and LoopBodySize, into an in-memory buffer via
// assembleToStream. Returns the buffer on success or the assembler's error.
// NOTE(review): the argument list of Repetitor.Repeat(...) is truncated in
// this view -- confirm against upstream.
Expected<SmallString<0>> BenchmarkRunner::assembleSnippet(
const BenchmarkCode &BC, const SnippetRepetitor &Repetitor,
unsigned MinInstructions, unsigned LoopBodySize,
bool GenerateMemoryInstructions) const {
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
SmallString<0> Buffer;
// OS writes straight into Buffer.
raw_svector_ostream OS(Buffer);
if (Error E = assembleToStream(
State.getExegesisTarget(), State.createTargetMachine(), BC.LiveIns,
Repetitor.Repeat(Instructions, MinInstructions, LoopBodySize,
OS, BC.Key, GenerateMemoryInstructions)) {
return std::move(E);
return Buffer;
// NOTE(review): the return type and the declarator line naming this function
// are missing from this view, as are the right-hand sides of the CpuName and
// LLVMTriple assignments and part of the getBenchmarkFunctionBytes call --
// confirm against upstream.
//
// Fills a RunnableConfiguration for BC: records mode, repetition count, info
// and key in its BenchmarkResult, then, depending on BenchmarkPhaseSelector,
// assembles a short snippet for debug/analysis and the full NumRepetitions
// snippet whose object file is stored in RC.ObjectFile. Any assembly error is
// returned instead of the configuration.
const BenchmarkCode &BC, unsigned NumRepetitions, unsigned LoopBodySize,
const SnippetRepetitor &Repetitor) const {
RunnableConfiguration RC;
Benchmark &BenchmarkResult = RC.BenchmarkResult;
BenchmarkResult.Mode = Mode;
BenchmarkResult.CpuName =
BenchmarkResult.LLVMTriple =
BenchmarkResult.NumRepetitions = NumRepetitions;
BenchmarkResult.Info = BC.Info;
const std::vector<MCInst> &Instructions = BC.Key.Instructions;
// Memory instructions are generated only for the subprocess execution mode.
bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess;
BenchmarkResult.Key = BC.Key;
// Assemble at least MinInstructionsForSnippet instructions by repeating
// the snippet for debug/analysis. This is so that the user clearly
// understands that the inside instructions are repeated.
if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareSnippet) {
const int MinInstructionsForSnippet = 4 * Instructions.size();
const int LoopBodySizeForSnippet = 2 * Instructions.size();
auto Snippet =
assembleSnippet(BC, Repetitor, MinInstructionsForSnippet,
LoopBodySizeForSnippet, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
if (auto Err = getBenchmarkFunctionBytes(*Snippet,
return std::move(Err);
// Assemble NumRepetitions repetitions of the snippet's instructions for
// measurements.
if (BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto Snippet =
assembleSnippet(BC, Repetitor, BenchmarkResult.NumRepetitions,
LoopBodySize, GenerateMemoryInstructions);
if (Error E = Snippet.takeError())
return std::move(E);
RC.ObjectFile = getObjectFromBuffer(*Snippet);
return std::move(RC);
// NOTE(review): the return type and the declarator line naming this function
// are missing from this view, as are the #else/#endif lines of the __linux__
// conditional -- confirm against upstream.
//
// Creates the executor matching ExecutionMode: an in-process executor that
// uses this runner's scratch space, or (on Linux) a subprocess executor bound
// to Key. Errors from the respective create() factory are forwarded; off
// Linux the subprocess mode yields a Failure error.
object::OwningBinary<object::ObjectFile> ObjectFile,
const BenchmarkKey &Key) const {
switch (ExecutionMode) {
case ExecutionModeE::InProcess: {
auto InProcessExecutorOrErr = InProcessFunctionExecutorImpl::create(
State, std::move(ObjectFile), Scratch.get());
if (!InProcessExecutorOrErr)
return InProcessExecutorOrErr.takeError();
return std::move(*InProcessExecutorOrErr);
case ExecutionModeE::SubProcess: {
#ifdef __linux__
auto SubProcessExecutorOrErr = SubProcessFunctionExecutorImpl::create(
State, std::move(ObjectFile), Key);
if (!SubProcessExecutorOrErr)
return SubProcessExecutorOrErr.takeError();
return std::move(*SubProcessExecutorOrErr);
return make_error<Failure>(
"The subprocess execution mode is only supported on Linux");
llvm_unreachable("ExecutionMode is outside expected range");
// Runs a prepared configuration and returns (error, benchmark result). The
// result is always moved out of RC, including on failure.
//
// If DumpFile is set and the phase selector is past
// PrepareAndAssembleSnippet, the object file is written to disk and an
// objdump hint is printed to stdout. If the phase selector is below Measure,
// the result is returned with Error set to "actual measurements skipped.".
// Otherwise an executor is created, measurements are taken and scaled, and
// stored in the result.
// NOTE(review): the divisor of the std::ceil(...) expression and all closing
// braces are missing from this view.
std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const {
Benchmark &BenchmarkResult = RC.BenchmarkResult;
object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile;
if (DumpFile && BenchmarkPhaseSelector >
BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) {
auto ObjectFilePath =
writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile);
if (Error E = ObjectFilePath.takeError()) {
return {std::move(E), std::move(BenchmarkResult)};
outs() << "Check generated assembly with: /usr/bin/objdump -d "
<< *ObjectFilePath << "\n";
if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
BenchmarkResult.Error = "actual measurements skipped.";
return {Error::success(), std::move(BenchmarkResult)};
Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor =
createFunctionExecutor(std::move(ObjectFile), RC.BenchmarkResult.Key);
if (!Executor)
return {Executor.takeError(), std::move(BenchmarkResult)};
auto NewMeasurements = runMeasurements(**Executor);
if (Error E = NewMeasurements.takeError()) {
return {std::move(E), std::move(BenchmarkResult)};
// NumRepetitions is used as a divisor below.
assert(BenchmarkResult.NumRepetitions > 0 && "invalid NumRepetitions");
for (BenchmarkMeasure &BM : *NewMeasurements) {
// Scale the measurements by instruction.
BM.PerInstructionValue /= BenchmarkResult.NumRepetitions;
// Scale the measurements by snippet.
BM.PerSnippetValue /=
std::ceil(BenchmarkResult.NumRepetitions /
BenchmarkResult.Measurements = std::move(*NewMeasurements);
return {Error::success(), std::move(BenchmarkResult)};
// Writes Buffer to an object file and returns the path written to.
// When FileName is empty a temporary file with prefix "snippet" and suffix
// "o" is created; otherwise FileName is opened for read/write with
// CD_CreateAlways. File-system errors are converted to an Error and returned.
// NOTE(review): the return-type line, the remaining arguments of
// openFileForReadWrite and the first argument of OFS.write are missing from
// this view -- confirm against upstream.
BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const {
int ResultFD = 0;
SmallString<256> ResultPath = FileName;
if (Error E = errorCodeToError(
FileName.empty() ? sys::fs::createTemporaryFile("snippet", "o",
ResultFD, ResultPath)
: sys::fs::openFileForReadWrite(
FileName, ResultFD, sys::fs::CD_CreateAlways,
return std::move(E);
// The stream takes ownership of ResultFD and closes it on destruction.
raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/);
OFS.write(, Buffer.size());
return std::string(ResultPath);
// Ordering predicate used with lower_bound: compares the ValidationEvent of
// a (ValidationEvent, const char *) pair against a bare ValidationEvent,
// both converted to int.
static bool EventLessThan(const std::pair<ValidationEvent, const char *> LHS,
                          const ValidationEvent RHS) {
  const int LHSRank = static_cast<int>(LHS.first);
  const int RHSRank = static_cast<int>(RHS);
  return LHSRank < RHSRank;
// Looks up each requested validation event in the target's validation-event
// table (from State.getPfmCounters()) using a binary search, which relies on
// that table being sorted by event value. Returns a Failure error as soon as
// a requested event is not found in the table; returns success otherwise.
// NOTE(review): the second constructor argument of TargetValidationEvents
// and the statement that appends to ValCountersToRun are not present in this
// view -- confirm against upstream.
Error BenchmarkRunner::getValidationCountersToRun(
SmallVector<const char *> &ValCountersToRun) const {
const PfmCountersInfo &PCI = State.getPfmCounters();
ArrayRef TargetValidationEvents(PCI.ValidationEvents,
for (const ValidationEvent RequestedValEvent : ValidationCounters) {
auto ValCounterIt =
lower_bound(TargetValidationEvents, RequestedValEvent, EventLessThan);
if (ValCounterIt == TargetValidationEvents.end() ||
ValCounterIt->first != RequestedValEvent)
return make_error<Failure>("Cannot create validation counter");
assert(ValCounterIt->first == RequestedValEvent &&
"The array of validation events from the target should be sorted");
return Error::success();
// Destructor defined out of line; it performs no work of its own.
BenchmarkRunner::FunctionExecutor::~FunctionExecutor() = default;
} // namespace exegesis
} // namespace llvm