| #include "Python.h" |
| #include "pycore_ceval.h" // _PyPerf_Callbacks |
| #include "pycore_frame.h" |
| #include "pycore_interp.h" |
| |
| |
| #ifdef PY_HAVE_PERF_TRAMPOLINE |
| |
| #include <fcntl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/mman.h> // mmap() |
| #include <sys/types.h> |
| #include <unistd.h> // sysconf() |
| #include <sys/time.h> // gettimeofday() |
| #include <sys/syscall.h> |
| |
| // ---------------------------------- |
| // Perf jitdump API |
| // ---------------------------------- |
| |
// Process-wide state for the perf jitdump trampoline backend.
typedef struct {
    FILE* perf_map;              // jitdump file (/tmp/jit-<pid>.dump), opened lazily by perf_map_jit_init()
    PyThread_type_lock map_lock; // guards the file during teardown; see perf_map_jit_fini()
    void* mapped_buffer;         // mmap'ed first page of the dump file (signals jitdump usage to perf)
    size_t mapped_size;          // size of mapped_buffer (one page)
    int code_id;                 // monotonically increasing id for each emitted code load event
} PerfMapJitState;

// Singleton instance; the callbacks below all operate on this.
static PerfMapJitState perf_jit_map_state;
| |
| /* |
| Usually the binary and libraries are mapped in separate region like below: |
| |
| address -> |
| --+---------------------+--//--+---------------------+-- |
| | .text | .data | ... | | .text | .data | ... | |
| --+---------------------+--//--+---------------------+-- |
| myprog libc.so |
| |
| So it'd be easy and straight-forward to find a mapped binary or library from an |
| address. |
| |
For JIT code, however, the code arena only cares about the code section, while the
resulting DSOs (which are generated by `perf inject -j`) also contain ELF headers and
unwind info. This produces the following address space with synthesized
MMAP events. Let's say there is a sample between addresses B and C.
| |
| sample |
| | |
| address -> A B v C |
| --------------------------------------------------------------------------------------------------- |
| /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
| ... |
| --------------------------------------------------------------------------------------------------- |
| |
| If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see |
| the unwind info. If it maps both .text section and unwind sections, the sample |
| could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing |
| which one is right. So to make perf happy we have non-overlapping ranges for each |
| DSO: |
| |
| address -> |
| ------------------------------------------------------------------------------------------------------- |
| /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
| ... |
| ------------------------------------------------------------------------------------------------------- |
| |
| As the trampolines are constant, we add a constant padding but in general the padding needs to have the |
| size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50 |
| */ |
| |
// Padding appended after each jitted DSO so the synthesized DSOs
// (headers + .text + unwind info) occupy non-overlapping address ranges.
// Must be >= the unwind data size rounded to 16 bytes (see diagram above).
#define PERF_JIT_CODE_PADDING 0x100
// Shorthand for the runtime-wide perf trampoline API state.
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

typedef uint64_t uword;           // unsigned machine-word type used for addresses/sizes
typedef const char* CodeComments;

#define Pd "d"
#define MB (1024 * 1024)

// ELF e_machine values (as in <elf.h>) used for the jitdump header.
#define EM_386 3
#define EM_X86_64 62
#define EM_ARM 40
#define EM_AARCH64 183
#define EM_RISCV 243

// Target-architecture selector flags; all zero here because the actual
// architecture is chosen at compile time via __x86_64__/__aarch64__ below.
#define TARGET_ARCH_IA32 0
#define TARGET_ARCH_X64 0
#define TARGET_ARCH_ARM 0
#define TARGET_ARCH_ARM64 0
#define TARGET_ARCH_RISCV32 0
#define TARGET_ARCH_RISCV64 0

// Feature flags kept for parity with the original jitdump writer; unused here.
#define FLAG_generate_perf_jitdump 0
#define FLAG_write_protect_code 0
#define FLAG_write_protect_vm_isolate 0
#define FLAG_code_comments 0

// NOTE(review): expands to nothing, so "unreachable" error paths simply
// fall through to the statement that follows the macro.
#define UNREACHABLE()
| |
// Map the compile-time target architecture to its ELF e_machine value for
// the jitdump header. With all TARGET_ARCH_* defined to 0 above, this
// always takes the #else branch and returns 0.
static uword GetElfMachineArchitecture(void) {
#if TARGET_ARCH_IA32
    return EM_386;
#elif TARGET_ARCH_X64
    return EM_X86_64;
#elif TARGET_ARCH_ARM
    return EM_ARM;
#elif TARGET_ARCH_ARM64
    return EM_AARCH64;
#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64
    return EM_RISCV;
#else
    UNREACHABLE();
    return 0;
#endif
}
| |
// Jitdump file header; written exactly once, at the start of the file.
typedef struct {
    uint32_t magic;           // "JiTD" magic (0x4A695444)
    uint32_t version;         // jitdump format version (1)
    uint32_t size;            // sizeof(Header)
    uint32_t elf_mach_target; // ELF e_machine of this build's target
    uint32_t reserved;
    uint32_t process_id;
    uint64_t time_stamp;      // wall-clock microseconds at creation
    uint64_t flags;
} Header;

// Record types understood by `perf inject -j`.
enum PerfEvent {
    PerfLoad = 0,
    PerfMove = 1,
    PerfDebugInfo = 2,
    PerfClose = 3,
    PerfUnwindingInfo = 4
};

// Common prefix of every jitdump record.
struct BaseEvent {
    uint32_t event;      // one of enum PerfEvent
    uint32_t size;       // total record size, payload and padding included
    uint64_t time_stamp; // monotonic nanoseconds
};

// Code-load record: announces a region of jitted machine code. Followed in
// the file by the NUL-terminated symbol name and the code bytes themselves.
typedef struct {
    struct BaseEvent base;
    uint32_t process_id;
    uint32_t thread_id;
    uint64_t vma;          // virtual address of the code
    uint64_t code_address; // load address (same as vma here)
    uint64_t code_size;
    uint64_t code_id;      // unique increasing id (perf_jit_map_state.code_id)
} CodeLoadEvent;

// Unwinding-info record: carries the eh_frame data for the next code load.
typedef struct {
    struct BaseEvent base;
    uint64_t unwind_data_size;  // sizeof(EhFrameHeader) + eh_frame bytes
    uint64_t eh_frame_hdr_size; // sizeof(EhFrameHeader)
    uint64_t mapped_size;       // unwind_data_size rounded up to 16
} CodeUnwindingInfoEvent;
| |
static const intptr_t nanoseconds_per_second = 1000000000;

// DWARF pointer-encoding constants used in the synthesized .eh_frame_hdr.

static const uint8_t DwarfUData4 = 0x03;  // unsigned 4-byte value
static const uint8_t DwarfSData4 = 0x0b;  // signed 4-byte value
static const uint8_t DwarfPcRel = 0x10;   // PC-relative
static const uint8_t DwarfDataRel = 0x30; // data/section-relative
// static uint8_t DwarfOmit = 0xff;

// Layout of a minimal .eh_frame_hdr with a single FDE search-table entry,
// matching what perf_map_jit_write_entry() emits after the eh_frame bytes.
typedef struct {
    unsigned char version;
    unsigned char eh_frame_ptr_enc; // encoding of eh_frame_ptr
    unsigned char fde_count_enc;    // encoding of eh_fde_count
    unsigned char table_enc;        // encoding of the search-table entries
    int32_t eh_frame_ptr;           // offset to the start of .eh_frame
    int32_t eh_fde_count;           // number of FDEs (always 1 here)
    int32_t from;                   // search table: initial PC offset
    int32_t to;                     // search table: FDE offset
} EhFrameHeader;
| |
| static int64_t get_current_monotonic_ticks(void) { |
| struct timespec ts; |
| if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
| UNREACHABLE(); |
| return 0; |
| } |
| // Convert to nanoseconds. |
| int64_t result = ts.tv_sec; |
| result *= nanoseconds_per_second; |
| result += ts.tv_nsec; |
| return result; |
| } |
| |
// Wall-clock time in microseconds since the epoch; used for the jitdump
// file header timestamp. gettimeofday() has microsecond resolution.
static int64_t get_current_time_microseconds(void) {
    struct timeval tv;
    if (gettimeofday(&tv, NULL) < 0) {
        UNREACHABLE();
        return 0;
    }
    int64_t micros = (int64_t)tv.tv_sec * 1000000;
    micros += tv.tv_usec;
    return micros;
}
| |
| |
// Round `value` up to the next multiple of `multiple`.
// A `multiple` of zero returns `value` unchanged (avoids division by zero).
static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple == 0) {
        return value;
    }

    int64_t remainder = value % multiple;

    // Already aligned: nothing to add; otherwise bump to the next multiple.
    return (remainder == 0)
               ? (size_t)value
               : (size_t)(value + (multiple - remainder));
}
| |
| |
| static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
| FILE* out_file = perf_jit_map_state.perf_map; |
| const char* ptr = (const char*)(buffer); |
| while (size > 0) { |
| const size_t written = fwrite(ptr, 1, size, out_file); |
| if (written == 0) { |
| UNREACHABLE(); |
| break; |
| } |
| size -= written; |
| ptr += written; |
| } |
| } |
| |
// Emit the jitdump Header record. NOTE(review): the out_file parameter is
// unused — bytes go through perf_map_jit_write_fully(), which writes to the
// global perf_jit_map_state.perf_map.
static void perf_map_jit_write_header(int pid, FILE* out_file) {
    Header header;
    header.magic = 0x4A695444;  // "JiTD"
    header.version = 1;
    header.size = sizeof(Header);
    header.elf_mach_target = GetElfMachineArchitecture();
    header.process_id = pid;
    header.time_stamp = get_current_time_microseconds();
    header.flags = 0;
    perf_map_jit_write_fully(&header, sizeof(header));
}
| |
| static void* perf_map_jit_init(void) { |
| char filename[100]; |
| int pid = getpid(); |
| snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
| const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
| if (fd == -1) { |
| return NULL; |
| } |
| |
| const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int) |
| if (page_size == -1) { |
| close(fd); |
| return NULL; |
| } |
| |
| // The perf jit interface forces us to map the first page of the file |
| // to signal that we are using the interface. |
| perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); |
| if (perf_jit_map_state.mapped_buffer == NULL) { |
| close(fd); |
| return NULL; |
| } |
| perf_jit_map_state.mapped_size = page_size; |
| perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
| if (perf_jit_map_state.perf_map == NULL) { |
| close(fd); |
| return NULL; |
| } |
| setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
| perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
| |
| perf_jit_map_state.map_lock = PyThread_allocate_lock(); |
| if (perf_jit_map_state.map_lock == NULL) { |
| fclose(perf_jit_map_state.perf_map); |
| return NULL; |
| } |
| perf_jit_map_state.code_id = 0; |
| |
| // trampoline_api.code_padding = PERF_JIT_CODE_PADDING; |
| return &perf_jit_map_state; |
| } |
| |
/* DWARF definitions. */

#define DWRF_CIE_VERSION 1

// Call-frame instruction opcodes (DWARF DW_CFA_*).
enum {
    DWRF_CFA_nop = 0x0,
    DWRF_CFA_offset_extended = 0x5,
    DWRF_CFA_def_cfa = 0xc,
    DWRF_CFA_def_cfa_offset = 0xe,
    DWRF_CFA_offset_extended_sf = 0x11,
    DWRF_CFA_advance_loc = 0x40, // high-2-bit opcode: delta in the low 6 bits
    DWRF_CFA_offset = 0x80       // high-2-bit opcode: register in the low 6 bits
};

// Pointer encodings for .eh_frame augmentation data (DW_EH_PE_*).
enum
{
    DWRF_EH_PE_absptr = 0x00,
    DWRF_EH_PE_omit = 0xff,

    /* FDE data encoding. */
    DWRF_EH_PE_uleb128 = 0x01,
    DWRF_EH_PE_udata2 = 0x02,
    DWRF_EH_PE_udata4 = 0x03,
    DWRF_EH_PE_udata8 = 0x04,
    DWRF_EH_PE_sleb128 = 0x09,
    DWRF_EH_PE_sdata2 = 0x0a,
    DWRF_EH_PE_sdata4 = 0x0b,
    DWRF_EH_PE_sdata8 = 0x0c,
    DWRF_EH_PE_signed = 0x08,

    /* FDE flags. */
    DWRF_EH_PE_pcrel = 0x10,
    DWRF_EH_PE_textrel = 0x20,
    DWRF_EH_PE_datarel = 0x30,
    DWRF_EH_PE_funcrel = 0x40,
    DWRF_EH_PE_aligned = 0x50,

    DWRF_EH_PE_indirect = 0x80
};

// Debug-info tag / attribute / form / line-number-program constants
// (DW_TAG_*, DW_AT_*, DW_FORM_*, DW_LNS_*, DW_LNE_*).
enum { DWRF_TAG_compile_unit = 0x11 };

enum { DWRF_children_no = 0, DWRF_children_yes = 1 };

enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 };

enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 };

enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 };

enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 };
| |
// DWARF register numbers for the compile-time target architecture.
enum {
#ifdef __x86_64__
    /* Yes, the order is strange, but correct: the System V x86-64 ABI's
       DWARF register numbering (rax=0, rdx=1, rcx=2, rbx=3, ...). */
    DWRF_REG_AX,
    DWRF_REG_DX,
    DWRF_REG_CX,
    DWRF_REG_BX,
    DWRF_REG_SI,
    DWRF_REG_DI,
    DWRF_REG_BP,
    DWRF_REG_SP,
    DWRF_REG_8,
    DWRF_REG_9,
    DWRF_REG_10,
    DWRF_REG_11,
    DWRF_REG_12,
    DWRF_REG_13,
    DWRF_REG_14,
    DWRF_REG_15,
    DWRF_REG_RA,  // return-address column (16)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    DWRF_REG_SP = 31,  // sp
    DWRF_REG_RA = 30,  // x30 (link register)
#else
# error "Unsupported target architecture"
#endif
};
| |
// Cursor state for emitting DWARF data into a caller-provided buffer.
typedef struct ELFObjectContext
{
    uint8_t* p;          /* Pointer to next address in obj.space. */
    uint8_t* startp;     /* Pointer to start address in obj.space. */
    uint8_t* eh_frame_p; /* Start of the FDE; set once the CIE is emitted. */
    uint32_t code_size;  /* Size of machine code. */
} ELFObjectContext;
| |
| /* Append a null-terminated string. */ |
| static uint32_t |
| elfctx_append_string(ELFObjectContext* ctx, const char* str) |
| { |
| uint8_t* p = ctx->p; |
| uint32_t ofs = (uint32_t)(p - ctx->startp); |
| do { |
| *p++ = (uint8_t)*str; |
| } while (*str++); |
| ctx->p = p; |
| return ofs; |
| } |
| |
| /* Append a SLEB128 value. */ |
| static void |
| elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) |
| { |
| uint8_t* p = ctx->p; |
| for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { |
| *p++ = (uint8_t)((v & 0x7f) | 0x80); |
| } |
| *p++ = (uint8_t)(v & 0x7f); |
| ctx->p = p; |
| } |
| |
| /* Append a ULEB128 to buffer. */ |
| static void |
| elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) |
| { |
| uint8_t* p = ctx->p; |
| for (; v >= 0x80; v >>= 7) { |
| *p++ = (char)((v & 0x7f) | 0x80); |
| } |
| *p++ = (char)v; |
| ctx->p = p; |
| } |
| |
/* Shortcuts to generate DWARF structures. They expect a local `uint8_t* p`
   cursor (and `ctx` for the UV/SV/STR variants) and advance it as they emit. */
#define DWRF_U8(x) (*p++ = (x))
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
// NOTE(review): U16/U32/ADDR store through casted pointers, assuming the
// cursor is suitably aligned and native endianness is acceptable — TODO
// confirm this holds for every buffer these macros are used on.
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
// Pad with DW_CFA_nop bytes until the cursor is aligned to `s` bytes.
#define DWRF_ALIGNNOP(s) \
    while ((uintptr_t)p & ((s)-1)) { \
        *p++ = DWRF_CFA_nop; \
    }
// Emit a length-prefixed section: reserve a 32-bit length slot, run `stmt`,
// then backpatch the length (which excludes the length field itself).
#define DWRF_SECTION(name, stmt) \
    { \
        uint32_t* szp_##name = (uint32_t*)p; \
        p += 4; \
        stmt; \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
    }
| |
/* Initialize .eh_frame section: emit one CIE followed by one FDE that
   describes the trampoline's frame, so perf can unwind through jitted
   frames. Reads ctx->code_size; advances ctx->p past the emitted data and
   records the FDE start in ctx->eh_frame_p. */
static void
elf_init_ehframe(ELFObjectContext* ctx)
{
    uint8_t* p = ctx->p;
    uint8_t* framep = p;  // CIE start, referenced by the FDE's back-pointer

    /* Emit DWARF EH CIE. */
    DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */
                 DWRF_U8(DWRF_CIE_VERSION);
                 DWRF_STR("zR"); /* Augmentation. */
                 DWRF_UV(1); /* Code alignment factor. */
                 DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */
                 DWRF_U8(DWRF_REG_RA); /* Return address register. */
                 DWRF_UV(1); /* Augmentation data length. */
                 DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */
                 DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t));
                 DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1);
                 DWRF_ALIGNNOP(sizeof(uintptr_t));
    )

    ctx->eh_frame_p = p;  // the FDE starts here

    /* Emit DWARF EH FDE. */
    DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. */
                 DWRF_U32(-0x30); /* Machine code offset relative to .text. */
                 DWRF_U32(ctx->code_size); /* Machine code length. */
                 DWRF_U8(0); /* Augmentation data. */
                 /* Registers saved in CFRAME. */
#ifdef __x86_64__
                 DWRF_U8(DWRF_CFA_advance_loc | 4);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
                 DWRF_U8(DWRF_CFA_advance_loc | 6);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
                 /* Extra registers saved for JIT-compiled code. */
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
                 DWRF_U8(DWRF_CFA_advance_loc | 1);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
                 DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
                 DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
                 DWRF_U8(DWRF_CFA_advance_loc | 3);
                 DWRF_U8(DWRF_CFA_offset | -(64 - 29));
                 DWRF_U8(DWRF_CFA_offset | -(64 - 30));
                 DWRF_U8(DWRF_CFA_def_cfa_offset);
                 DWRF_UV(0);
#else
# error "Unsupported target architecture"
#endif
                 DWRF_ALIGNNOP(sizeof(uintptr_t));)

    ctx->p = p;
}
| |
| static void perf_map_jit_write_entry(void *state, const void *code_addr, |
| unsigned int code_size, PyCodeObject *co) |
| { |
| |
| if (perf_jit_map_state.perf_map == NULL) { |
| void* ret = perf_map_jit_init(); |
| if(ret == NULL){ |
| return; |
| } |
| } |
| |
| const char *entry = ""; |
| if (co->co_qualname != NULL) { |
| entry = PyUnicode_AsUTF8(co->co_qualname); |
| } |
| const char *filename = ""; |
| if (co->co_filename != NULL) { |
| filename = PyUnicode_AsUTF8(co->co_filename); |
| } |
| |
| |
| size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
| char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
| if (perf_map_entry == NULL) { |
| return; |
| } |
| snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
| |
| const size_t name_length = strlen(perf_map_entry); |
| uword base = (uword)code_addr; |
| uword size = code_size; |
| |
| // Write the code unwinding info event. |
| |
| // Create unwinding information (eh frame) |
| ELFObjectContext ctx; |
| char buffer[1024]; |
| ctx.code_size = code_size; |
| ctx.startp = ctx.p = (uint8_t*)buffer; |
| elf_init_ehframe(&ctx); |
| int eh_frame_size = ctx.p - ctx.startp; |
| |
| // Populate the unwind info event for perf |
| CodeUnwindingInfoEvent ev2; |
| ev2.base.event = PerfUnwindingInfo; |
| ev2.base.time_stamp = get_current_monotonic_ticks(); |
| ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
| // Ensure we have enough space between DSOs when perf maps them |
| assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); |
| ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
| ev2.mapped_size = round_up(ev2.unwind_data_size, 16); |
| int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; |
| int padding_size = round_up(content_size, 8) - content_size; |
| ev2.base.size = content_size + padding_size; |
| perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
| |
| |
| // Populate the eh Frame header |
| EhFrameHeader f; |
| f.version = 1; |
| f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; |
| f.fde_count_enc = DwarfUData4; |
| f.table_enc = DwarfSData4 | DwarfDataRel; |
| f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); |
| f.eh_fde_count = 1; |
| f.from = -(round_up(code_size, 8) + eh_frame_size); |
| int cie_size = ctx.eh_frame_p - ctx.startp; |
| f.to = -(eh_frame_size - cie_size); |
| |
| perf_map_jit_write_fully(ctx.startp, eh_frame_size); |
| perf_map_jit_write_fully(&f, sizeof(f)); |
| |
| char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
| perf_map_jit_write_fully(&padding_bytes, padding_size); |
| |
| // Write the code load event. |
| CodeLoadEvent ev; |
| ev.base.event = PerfLoad; |
| ev.base.size = sizeof(ev) + (name_length+1) + size; |
| ev.base.time_stamp = get_current_monotonic_ticks(); |
| ev.process_id = getpid(); |
| ev.thread_id = syscall(SYS_gettid); |
| ev.vma = base; |
| ev.code_address = base; |
| ev.code_size = size; |
| perf_jit_map_state.code_id += 1; |
| ev.code_id = perf_jit_map_state.code_id; |
| |
| perf_map_jit_write_fully(&ev, sizeof(ev)); |
| perf_map_jit_write_fully(perf_map_entry, name_length+1); |
| perf_map_jit_write_fully((void*)(base), size); |
| return; |
| } |
| |
| static int perf_map_jit_fini(void* state) { |
| if (perf_jit_map_state.perf_map != NULL) { |
| // close the file |
| PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); |
| fclose(perf_jit_map_state.perf_map); |
| PyThread_release_lock(perf_jit_map_state.map_lock); |
| |
| // clean up the lock and state |
| PyThread_free_lock(perf_jit_map_state.map_lock); |
| perf_jit_map_state.perf_map = NULL; |
| } |
| if (perf_jit_map_state.mapped_buffer != NULL) { |
| munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
| } |
| trampoline_api.state = NULL; |
| return 0; |
| } |
| |
// Callback table handed to the ceval perf-trampoline machinery:
// {init, write_entry, fini} for the jitdump backend.
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    &perf_map_jit_init,
    &perf_map_jit_write_entry,
    &perf_map_jit_fini,
};
| |
| #endif |