| //===-- working_set.cpp ---------------------------------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file is a part of EfficiencySanitizer, a family of performance tuners. |
| // |
| // This file contains working-set-specific code. |
| //===----------------------------------------------------------------------===// |
| |
| #include "working_set.h" |
| #include "esan.h" |
| #include "esan_circular_buffer.h" |
| #include "esan_flags.h" |
| #include "esan_shadow.h" |
| #include "esan_sideline.h" |
| #include "sanitizer_common/sanitizer_procmaps.h" |
| |
| // We shadow every cache line of app memory with one shadow byte. |
| // - The highest bit of each shadow byte indicates whether the corresponding |
| // cache line has ever been accessed. |
| // - The lowest bit of each shadow byte indicates whether the corresponding |
| // cache line was accessed since the last sample. |
| // - The other bits are used for working set snapshots at successively |
| // lower frequencies, each bit to the left from the lowest bit stepping |
| // down the frequency by 2 to the power of getFlags()->snapshot_step. |
| // Thus we have something like this: |
| // Bit 0: Since last sample |
| // Bit 1: Since last 2^2 samples |
| // Bit 2: Since last 2^4 samples |
| // Bit 3: ... |
| // Bit 7: Ever accessed. |
| // We live with races in accessing each shadow byte. |
| typedef unsigned char byte; |
| |
| namespace __esan { |
| |
| // Our shadow memory assumes that the line size is 64. |
| static const u32 CacheLineSize = 64; |
| |
| // See the shadow byte layout description above. |
| static const u32 TotalWorkingSetBitIdx = 7; |
| // We accumulate to the left until we hit this bit. |
| // We don't need to accumulate to the final bit as it's set on each ref |
| // by the compiler instrumentation. |
| static const u32 MaxAccumBitIdx = 6; |
| static const u32 CurWorkingSetBitIdx = 0; |
| static const byte ShadowAccessedVal = |
| (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); |
| |
| static SidelineThread Thread; |
| // If we use real-time-based timer samples this won't overflow in any realistic |
| // scenario, but if we switch to some other unit (such as memory accesses) we |
| // may want to consider a 64-bit int. |
| static u32 SnapshotNum; |
| |
| // We store the wset size for each of 8 different sampling frequencies. |
| static const u32 NumFreq = 8; // One for each bit of our shadow bytes. |
| // We cannot use static objects as the global destructor is called |
| // prior to our finalize routine. |
| // These are each circular buffers, sized up front. |
| CircularBuffer<u32> SizePerFreq[NumFreq]; |
| // We cannot rely on static initializers (they may run too late) but |
| // we record the size here for clarity: |
| u32 CircularBufferSizes[NumFreq] = { |
| // These are each mmap-ed so our minimum is one page. |
| 32*1024, |
| 16*1024, |
| 8*1024, |
| 4*1024, |
| 4*1024, |
| 4*1024, |
| 4*1024, |
| 4*1024, |
| }; |
| |
| void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, |
| bool IsWrite) { |
| if (Size == 0) |
| return; |
| SIZE_T I = 0; |
| uptr LineSize = getFlags()->cache_line_size; |
| // As Addr+Size could overflow at the top of a 32-bit address space, |
| // we avoid the simpler formula that rounds the start and end. |
| SIZE_T NumLines = Size / LineSize + |
| // Add any extra at the start or end adding on an extra line: |
| (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; |
| byte *Shadow = (byte *)appToShadow(Addr); |
| // Write shadow bytes until we're word-aligned. |
| while (I < NumLines && (uptr)Shadow % 4 != 0) { |
| if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) |
| *Shadow |= ShadowAccessedVal; |
| ++Shadow; |
| ++I; |
| } |
| // Write whole shadow words at a time. |
| // Using a word-stride loop improves the runtime of a microbenchmark of |
| // memset calls by 10%. |
| u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | |
| ShadowAccessedVal << 16 | ShadowAccessedVal << 24; |
| while (I + 4 <= NumLines) { |
| if ((*(u32*)Shadow & WordValue) != WordValue) |
| *(u32*)Shadow |= WordValue; |
| Shadow += 4; |
| I += 4; |
| } |
| // Write any trailing shadow bytes. |
| while (I < NumLines) { |
| if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) |
| *Shadow |= ShadowAccessedVal; |
| ++Shadow; |
| ++I; |
| } |
| } |
| |
| // This routine will word-align ShadowStart and ShadowEnd prior to scanning. |
| // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit |
| // measures the access during the entire execution and should never be cleared. |
| static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, |
| uptr ShadowEnd) { |
| u32 WorkingSetSize = 0; |
| u32 ByteValue = 0x1 << BitIdx; |
| u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | |
| ByteValue << 24; |
| // Get word aligned start. |
| ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); |
| bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; |
| // Do not clear the bit that measures access during the entire execution. |
| bool Clear = BitIdx < TotalWorkingSetBitIdx; |
| for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { |
| if ((*Ptr & WordValue) != 0) { |
| byte *BytePtr = (byte *)Ptr; |
| for (u32 j = 0; j < sizeof(u32); ++j) { |
| if (BytePtr[j] & ByteValue) { |
| ++WorkingSetSize; |
| if (Accum) { |
| // Accumulate to the lower-frequency bit to the left. |
| BytePtr[j] |= (ByteValue << 1); |
| } |
| } |
| } |
| if (Clear) { |
| // Clear this bit from every shadow byte. |
| *Ptr &= ~WordValue; |
| } |
| } |
| } |
| return WorkingSetSize; |
| } |
| |
| // Scan shadow memory to calculate the number of cache lines being accessed, |
| // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. |
| // We also clear the lowest bits (most recent working set snapshot). |
| // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit |
| // measures the access during the entire execution and should never be cleared. |
| static u32 computeWorkingSizeAndReset(u32 BitIdx) { |
| u32 WorkingSetSize = 0; |
| MemoryMappingLayout MemIter(true/*cache*/); |
| uptr Start, End, Prot; |
| while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/, |
| 0/*file size*/, &Prot)) { |
| VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", |
| __FUNCTION__, Start, End, Prot, isAppMem(Start), |
| isShadowMem(Start)); |
| if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) { |
| VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End); |
| WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End); |
| } |
| } |
| return WorkingSetSize; |
| } |
| |
| // This is invoked from a signal handler but in a sideline thread doing nothing |
| // else so it is a little less fragile than a typical signal handler. |
| static void takeSample(void *Arg) { |
| u32 BitIdx = CurWorkingSetBitIdx; |
| u32 Freq = 1; |
| ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. |
| while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { |
| u32 NumLines = computeWorkingSizeAndReset(BitIdx); |
| VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, |
| SnapshotNum, BitIdx, Freq, NumLines); |
| SizePerFreq[BitIdx].push_back(NumLines); |
| Freq = Freq << getFlags()->snapshot_step; |
| BitIdx++; |
| } |
| } |
| |
| // Initialization that must be done before any instrumented code is executed. |
| void initializeShadowWorkingSet() { |
| CHECK(getFlags()->cache_line_size == CacheLineSize); |
| registerMemoryFaultHandler(); |
| } |
| |
| void initializeWorkingSet() { |
| if (getFlags()->record_snapshots) { |
| for (u32 i = 0; i < NumFreq; ++i) |
| SizePerFreq[i].initialize(CircularBufferSizes[i]); |
| Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); |
| } |
| } |
| |
| static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { |
| if (MilliSec > 600000) { |
| Unit = "min"; |
| return MilliSec / 60000; |
| } else if (MilliSec > 10000) { |
| Unit = "sec"; |
| return MilliSec / 1000; |
| } else { |
| Unit = "ms"; |
| return MilliSec; |
| } |
| } |
| |
| static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { |
| // We need a constant to avoid software divide support: |
| static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize; |
| static const u32 MegabyteCachelines = KilobyteCachelines << 10; |
| |
| if (NumOfCachelines > 10 * MegabyteCachelines) { |
| Unit = "MB"; |
| return NumOfCachelines / MegabyteCachelines; |
| } else if (NumOfCachelines > 10 * KilobyteCachelines) { |
| Unit = "KB"; |
| return NumOfCachelines / KilobyteCachelines; |
| } else { |
| Unit = "Bytes"; |
| return NumOfCachelines * CacheLineSize; |
| } |
| } |
| |
| void reportWorkingSet() { |
| const char *Unit; |
| if (getFlags()->record_snapshots) { |
| u32 Freq = 1; |
| Report(" Total number of samples: %u\n", SnapshotNum); |
| for (u32 i = 0; i < NumFreq; ++i) { |
| u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); |
| Report(" Samples array #%d at period %u %s\n", i, Time, Unit); |
| // FIXME: report whether we wrapped around and thus whether we |
| // have data on the whole run or just the last N samples. |
| for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { |
| u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); |
| Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, |
| SizePerFreq[i][j]); |
| } |
| Freq = Freq << getFlags()->snapshot_step; |
| } |
| } |
| |
| // Get the working set size for the entire execution. |
| u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); |
| u32 Size = getSizeForPrinting(NumOfCachelines, Unit); |
| Report(" %s: the total working set size: %u %s (%u cache lines)\n", |
| SanitizerToolName, Size, Unit, NumOfCachelines); |
| } |
| |
| int finalizeWorkingSet() { |
| if (getFlags()->record_snapshots) |
| Thread.joinThread(); |
| reportWorkingSet(); |
| if (getFlags()->record_snapshots) { |
| for (u32 i = 0; i < NumFreq; ++i) |
| SizePerFreq[i].free(); |
| } |
| return 0; |
| } |
| |
| } // namespace __esan |