| #include "Python.h" |
| #include "pycore_ceval.h" // _PyPerf_Callbacks |
| #include "pycore_frame.h" |
| #include "pycore_interp.h" |
| |
| |
| #ifdef PY_HAVE_PERF_TRAMPOLINE |
| |
| #include <fcntl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/mman.h> // mmap() |
| #include <sys/types.h> |
| #include <unistd.h> // sysconf() |
| #include <sys/time.h> // gettimeofday() |
| #include <sys/syscall.h> |
| |
| // ---------------------------------- |
| // Perf jitdump API |
| // ---------------------------------- |
| |
// Process-wide state for the perf jitdump trampoline backend.
typedef struct {
    FILE* perf_map;              // jitdump file (/tmp/jit-<pid>.dump), opened lazily by perf_map_jit_init()
    PyThread_type_lock map_lock; // guards the file during teardown; see perf_map_jit_fini()
    void* mapped_buffer;         // mmap'ed first page of the dump file (signals jitdump usage to perf)
    size_t mapped_size;          // size of mapped_buffer (one page)
    int code_id;                 // monotonically increasing id for each emitted code load event
} PerfMapJitState;

// Singleton instance; the callbacks below all operate on this.
static PerfMapJitState perf_jit_map_state;
| |
| /* |
| Usually the binary and libraries are mapped in separate region like below: |
| |
| address -> |
| --+---------------------+--//--+---------------------+-- |
| | .text | .data | ... | | .text | .data | ... | |
| --+---------------------+--//--+---------------------+-- |
| myprog libc.so |
| |
| So it'd be easy and straight-forward to find a mapped binary or library from an |
| address. |
| |
For JIT code, however, the code arena only cares about the code section, while the
resulting DSOs (which are generated by `perf inject -j`) also contain ELF headers and
unwind info. This produces the following address space with synthesized
MMAP events. Let's say there is a sample between addresses B and C.
| |
| sample |
| | |
| address -> A B v C |
| --------------------------------------------------------------------------------------------------- |
| /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
| ... |
| --------------------------------------------------------------------------------------------------- |
| |
| If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see |
| the unwind info. If it maps both .text section and unwind sections, the sample |
| could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing |
| which one is right. So to make perf happy we have non-overlapping ranges for each |
| DSO: |
| |
| address -> |
| ------------------------------------------------------------------------------------------------------- |
| /tmp/jitted-PID-0.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-1.so | (headers) | .text | unwind info | |
| /tmp/jitted-PID-2.so | (headers) | .text | unwind info | |
| ... |
| ------------------------------------------------------------------------------------------------------- |
| |
| As the trampolines are constant, we add a constant padding but in general the padding needs to have the |
| size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50 |
| */ |
| |
// Padding appended after each jitted DSO so the synthesized DSOs
// (headers + .text + unwind info) occupy non-overlapping address ranges.
// Must be >= the unwind data size rounded to 16 bytes (see diagram above).
#define PERF_JIT_CODE_PADDING 0x100
// Shorthand for the runtime-wide perf trampoline API state.
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

typedef uint64_t uword;           // unsigned machine-word type used for addresses/sizes
typedef const char* CodeComments;

#define Pd "d"
#define MB (1024 * 1024)

// ELF e_machine values (as in <elf.h>) used for the jitdump header.
#define EM_386 3
#define EM_X86_64 62
#define EM_ARM 40
#define EM_AARCH64 183
#define EM_RISCV 243

// Target-architecture selector flags; all zero here because the actual
// architecture is chosen at compile time via __x86_64__/__aarch64__ below.
#define TARGET_ARCH_IA32 0
#define TARGET_ARCH_X64 0
#define TARGET_ARCH_ARM 0
#define TARGET_ARCH_ARM64 0
#define TARGET_ARCH_RISCV32 0
#define TARGET_ARCH_RISCV64 0

// Feature flags kept for parity with the original jitdump writer; unused here.
#define FLAG_generate_perf_jitdump 0
#define FLAG_write_protect_code 0
#define FLAG_write_protect_vm_isolate 0
#define FLAG_code_comments 0

// NOTE(review): expands to nothing, so "unreachable" error paths simply
// fall through to the statement that follows the macro.
#define UNREACHABLE()
| |
// Map the compile-time target architecture to its ELF e_machine value for
// the jitdump header. With all TARGET_ARCH_* defined to 0 above, this
// always takes the #else branch and returns 0.
static uword GetElfMachineArchitecture(void) {
#if TARGET_ARCH_IA32
    return EM_386;
#elif TARGET_ARCH_X64
    return EM_X86_64;
#elif TARGET_ARCH_ARM
    return EM_ARM;
#elif TARGET_ARCH_ARM64
    return EM_AARCH64;
#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64
    return EM_RISCV;
#else
    UNREACHABLE();
    return 0;
#endif
}
| |
// Jitdump file header; written exactly once, at the start of the file.
typedef struct {
    uint32_t magic;           // "JiTD" magic (0x4A695444)
    uint32_t version;         // jitdump format version (1)
    uint32_t size;            // sizeof(Header)
    uint32_t elf_mach_target; // ELF e_machine of this build's target
    uint32_t reserved;
    uint32_t process_id;
    uint64_t time_stamp;      // wall-clock microseconds at creation
    uint64_t flags;
} Header;

// Record types understood by `perf inject -j`.
enum PerfEvent {
    PerfLoad = 0,
    PerfMove = 1,
    PerfDebugInfo = 2,
    PerfClose = 3,
    PerfUnwindingInfo = 4
};

// Common prefix of every jitdump record.
struct BaseEvent {
    uint32_t event;      // one of enum PerfEvent
    uint32_t size;       // total record size, payload and padding included
    uint64_t time_stamp; // monotonic nanoseconds
};

// Code-load record: announces a region of jitted machine code. Followed in
// the file by the NUL-terminated symbol name and the code bytes themselves.
typedef struct {
    struct BaseEvent base;
    uint32_t process_id;
    uint32_t thread_id;
    uint64_t vma;          // virtual address of the code
    uint64_t code_address; // load address (same as vma here)
    uint64_t code_size;
    uint64_t code_id;      // unique increasing id (perf_jit_map_state.code_id)
} CodeLoadEvent;

// Unwinding-info record: carries the eh_frame data for the next code load.
typedef struct {
    struct BaseEvent base;
    uint64_t unwind_data_size;  // sizeof(EhFrameHeader) + eh_frame bytes
    uint64_t eh_frame_hdr_size; // sizeof(EhFrameHeader)
    uint64_t mapped_size;       // unwind_data_size rounded up to 16
} CodeUnwindingInfoEvent;
| |
static const intptr_t nanoseconds_per_second = 1000000000;

// DWARF pointer-encoding constants used in the synthesized .eh_frame_hdr.

static const uint8_t DwarfUData4 = 0x03;  // unsigned 4-byte value
static const uint8_t DwarfSData4 = 0x0b;  // signed 4-byte value
static const uint8_t DwarfPcRel = 0x10;   // PC-relative
static const uint8_t DwarfDataRel = 0x30; // data/section-relative
// static uint8_t DwarfOmit = 0xff;

// Layout of a minimal .eh_frame_hdr with a single FDE search-table entry,
// matching what perf_map_jit_write_entry() emits after the eh_frame bytes.
typedef struct {
    unsigned char version;
    unsigned char eh_frame_ptr_enc; // encoding of eh_frame_ptr
    unsigned char fde_count_enc;    // encoding of eh_fde_count
    unsigned char table_enc;        // encoding of the search-table entries
    int32_t eh_frame_ptr;           // offset to the start of .eh_frame
    int32_t eh_fde_count;           // number of FDEs (always 1 here)
    int32_t from;                   // search table: initial PC offset
    int32_t to;                     // search table: FDE offset
} EhFrameHeader;
| |
| static int64_t get_current_monotonic_ticks(void) { |
| struct timespec ts; |
| if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
| UNREACHABLE(); |
| return 0; |
| } |
| // Convert to nanoseconds. |
| int64_t result = ts.tv_sec; |
| result *= nanoseconds_per_second; |
| result += ts.tv_nsec; |
| return result; |
| } |
| |
// Wall-clock time in microseconds since the epoch; used for the jitdump
// file header timestamp. gettimeofday() has microsecond resolution.
static int64_t get_current_time_microseconds(void) {
    struct timeval tv;
    if (gettimeofday(&tv, NULL) < 0) {
        UNREACHABLE();
        return 0;
    }
    int64_t micros = (int64_t)tv.tv_sec * 1000000;
    micros += tv.tv_usec;
    return micros;
}
| |
| |
// Round `value` up to the next multiple of `multiple`.
// A `multiple` of zero returns `value` unchanged (avoids division by zero).
static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple == 0) {
        return value;
    }

    int64_t remainder = value % multiple;

    // Already aligned: nothing to add; otherwise bump to the next multiple.
    return (remainder == 0)
               ? (size_t)value
               : (size_t)(value + (multiple - remainder));
}
| |
| |
| static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
| FILE* out_file = perf_jit_map_state.perf_map; |
| const char* ptr = (const char*)(buffer); |
| while (size > 0) { |
| const size_t written = fwrite(ptr, 1, size, out_file); |
| if (written == 0) { |
| UNREACHABLE(); |
| break; |
| } |
| size -= written; |
| ptr += written; |
| } |
| } |
| |
// Emit the jitdump Header record. NOTE(review): the out_file parameter is
// unused — bytes go through perf_map_jit_write_fully(), which writes to the
// global perf_jit_map_state.perf_map.
static void perf_map_jit_write_header(int pid, FILE* out_file) {
    Header header;
    header.magic = 0x4A695444;  // "JiTD"
    header.version = 1;
    header.size = sizeof(Header);
    header.elf_mach_target = GetElfMachineArchitecture();
    header.process_id = pid;
    header.time_stamp = get_current_time_microseconds();
    header.flags = 0;
    perf_map_jit_write_fully(&header, sizeof(header));
}
| |
| static void* perf_map_jit_init(void) { |
| char filename[100]; |
| int pid = getpid(); |
| snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
| const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
| if (fd == -1) { |
| return NULL; |
| } |
| |
| const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int) |
| if (page_size == -1) { |
| close(fd); |
| return NULL; |
| } |
| |
| // The perf jit interface forces us to map the first page of the file |
| // to signal that we are using the interface. |
| perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); |
| if (perf_jit_map_state.mapped_buffer == NULL) { |
| close(fd); |
| return NULL; |
| } |
| perf_jit_map_state.mapped_size = page_size; |
| perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
| if (perf_jit_map_state.perf_map == NULL) { |
| close(fd); |
| return NULL; |
| } |
| setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
| perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
| |
| perf_jit_map_state.map_lock = PyThread_allocate_lock(); |
| if (perf_jit_map_state.map_lock == NULL) { |
| fclose(perf_jit_map_state.perf_map); |
| return NULL; |
| } |
| perf_jit_map_state.code_id = 0; |
| |
| // trampoline_api.code_padding = PERF_JIT_CODE_PADDING; |
| return &perf_jit_map_state; |
| } |
| |
/* DWARF definitions. */

#define DWRF_CIE_VERSION 1

// Call-frame instruction opcodes (DWARF DW_CFA_*).
enum {
    DWRF_CFA_nop = 0x0,
    DWRF_CFA_offset_extended = 0x5,
    DWRF_CFA_def_cfa = 0xc,
    DWRF_CFA_def_cfa_offset = 0xe,
    DWRF_CFA_offset_extended_sf = 0x11,
    DWRF_CFA_advance_loc = 0x40, // high-2-bit opcode: delta in the low 6 bits
    DWRF_CFA_offset = 0x80       // high-2-bit opcode: register in the low 6 bits
};

// Pointer encodings for .eh_frame augmentation data (DW_EH_PE_*).
enum
{
    DWRF_EH_PE_absptr = 0x00,
    DWRF_EH_PE_omit = 0xff,

    /* FDE data encoding. */
    DWRF_EH_PE_uleb128 = 0x01,
    DWRF_EH_PE_udata2 = 0x02,
    DWRF_EH_PE_udata4 = 0x03,
    DWRF_EH_PE_udata8 = 0x04,
    DWRF_EH_PE_sleb128 = 0x09,
    DWRF_EH_PE_sdata2 = 0x0a,
    DWRF_EH_PE_sdata4 = 0x0b,
    DWRF_EH_PE_sdata8 = 0x0c,
    DWRF_EH_PE_signed = 0x08,

    /* FDE flags. */
    DWRF_EH_PE_pcrel = 0x10,
    DWRF_EH_PE_textrel = 0x20,
    DWRF_EH_PE_datarel = 0x30,
    DWRF_EH_PE_funcrel = 0x40,
    DWRF_EH_PE_aligned = 0x50,

    DWRF_EH_PE_indirect = 0x80
};

// Debug-info tag / attribute / form / line-number-program constants
// (DW_TAG_*, DW_AT_*, DW_FORM_*, DW_LNS_*, DW_LNE_*).
enum { DWRF_TAG_compile_unit = 0x11 };

enum { DWRF_children_no = 0, DWRF_children_yes = 1 };

enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 };

enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 };

enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 };

enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 };
| |
// DWARF register numbers for the compile-time target architecture.
enum {
#ifdef __x86_64__
    /* Yes, the order is strange, but correct: the System V x86-64 ABI's
       DWARF register numbering (rax=0, rdx=1, rcx=2, rbx=3, ...). */
    DWRF_REG_AX,
    DWRF_REG_DX,
    DWRF_REG_CX,
    DWRF_REG_BX,
    DWRF_REG_SI,
    DWRF_REG_DI,
    DWRF_REG_BP,
    DWRF_REG_SP,
    DWRF_REG_8,
    DWRF_REG_9,
    DWRF_REG_10,
    DWRF_REG_11,
    DWRF_REG_12,
    DWRF_REG_13,
    DWRF_REG_14,
    DWRF_REG_15,
    DWRF_REG_RA,  // return-address column (16)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    DWRF_REG_SP = 31,  // sp
    DWRF_REG_RA = 30,  // x30 (link register)
#else
# error "Unsupported target architecture"
#endif
};
| |
// Cursor state for emitting DWARF data into a caller-provided buffer.
typedef struct ELFObjectContext
{
    uint8_t* p;          /* Pointer to next address in obj.space. */
    uint8_t* startp;     /* Pointer to start address in obj.space. */
    uint8_t* eh_frame_p; /* Start of the FDE; set once the CIE is emitted. */
    uint32_t code_size;  /* Size of machine code. */
} ELFObjectContext;
| |
| /* Append a null-terminated string. */ |
| static uint32_t |
| elfctx_append_string(ELFObjectContext* ctx, const char* str) |
| { |
| uint8_t* p = ctx->p; |
| uint32_t ofs = (uint32_t)(p - ctx->startp); |
| do { |
| *p++ = (uint8_t)*str; |
| } while (*str++); |
| ctx->p = p; |
| return ofs; |
| } |
| |
| /* Append a SLEB128 value. */ |
| static void |
| elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) |
| { |
| uint8_t* p = ctx->p; |
| for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { |
| *p++ = (uint8_t)((v & 0x7f) | 0x80); |
| } |
| *p++ = (uint8_t)(v & 0x7f); |
| ctx->p = p; |
| } |
| |
| /* Append a ULEB128 to buffer. */ |
| static void |
| elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) |
| { |
| uint8_t* p = ctx->p; |
| for (; v >= 0x80; v >>= 7) { |
| *p++ = (char)((v & 0x7f) | 0x80); |
| } |
| *p++ = (char)v; |
| ctx->p = p; |
| } |
| |
/* Shortcuts to generate DWARF structures. They expect a local `uint8_t* p`
   cursor (and `ctx` for the UV/SV/STR variants) and advance it as they emit. */
#define DWRF_U8(x) (*p++ = (x))
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
// NOTE(review): U16/U32/ADDR store through casted pointers, assuming the
// cursor is suitably aligned and native endianness is acceptable — TODO
// confirm this holds for every buffer these macros are used on.
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
// Pad with DW_CFA_nop bytes until the cursor is aligned to `s` bytes.
#define DWRF_ALIGNNOP(s) \
    while ((uintptr_t)p & ((s)-1)) { \
        *p++ = DWRF_CFA_nop; \
    }
// Emit a length-prefixed section: reserve a 32-bit length slot, run `stmt`,
// then backpatch the length (which excludes the length field itself).
#define DWRF_SECTION(name, stmt) \
    { \
        uint32_t* szp_##name = (uint32_t*)p; \
        p += 4; \
        stmt; \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
    }
| |
/* Initialize .eh_frame section: emit one CIE followed by one FDE that
   describes the trampoline's frame, so perf can unwind through jitted
   frames. Reads ctx->code_size; advances ctx->p past the emitted data and
   records the FDE start in ctx->eh_frame_p. */
static void
elf_init_ehframe(ELFObjectContext* ctx)
{
    uint8_t* p = ctx->p;
    uint8_t* framep = p;  // CIE start, referenced by the FDE's back-pointer

    /* Emit DWARF EH CIE. */
    DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */
                 DWRF_U8(DWRF_CIE_VERSION);
                 DWRF_STR("zR"); /* Augmentation. */
                 DWRF_UV(1); /* Code alignment factor. */
                 DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */
                 DWRF_U8(DWRF_REG_RA); /* Return address register. */
                 DWRF_UV(1); /* Augmentation data length. */
                 DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */
                 DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t));
                 DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1);
                 DWRF_ALIGNNOP(sizeof(uintptr_t));
    )

    ctx->eh_frame_p = p;  // the FDE starts here

    /* Emit DWARF EH FDE. */
    DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. */
                 DWRF_U32(-0x30); /* Machine code offset relative to .text. */
                 DWRF_U32(ctx->code_size); /* Machine code length. */
                 DWRF_U8(0); /* Augmentation data. */
                 /* Registers saved in CFRAME. */
#ifdef __x86_64__
                 DWRF_U8(DWRF_CFA_advance_loc | 4);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
                 DWRF_U8(DWRF_CFA_advance_loc | 6);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
                 /* Extra registers saved for JIT-compiled code. */
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
                 DWRF_U8(DWRF_CFA_advance_loc | 1);
                 DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
                 DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
                 DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
                 DWRF_U8(DWRF_CFA_advance_loc | 3);
                 DWRF_U8(DWRF_CFA_offset | -(64 - 29));
                 DWRF_U8(DWRF_CFA_offset | -(64 - 30));
                 DWRF_U8(DWRF_CFA_def_cfa_offset);
                 DWRF_UV(0);
#else
# error "Unsupported target architecture"
#endif
                 DWRF_ALIGNNOP(sizeof(uintptr_t));)

    ctx->p = p;
}
| |
| static void perf_map_jit_write_entry(void *state, const void *code_addr, |
| unsigned int code_size, PyCodeObject *co) |
| { |
| |
| if (perf_jit_map_state.perf_map == NULL) { |
| void* ret = perf_map_jit_init(); |
| if(ret == NULL){ |
| return; |
| } |
| } |
| |
| const char *entry = ""; |
| if (co->co_qualname != NULL) { |
| entry = PyUnicode_AsUTF8(co->co_qualname); |
| } |
| const char *filename = ""; |
| if (co->co_filename != NULL) { |
| filename = PyUnicode_AsUTF8(co->co_filename); |
| } |
| |
| |
| size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
| char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
| if (perf_map_entry == NULL) { |
| return; |
| } |
| snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
| |
| const size_t name_length = strlen(perf_map_entry); |
| uword base = (uword)code_addr; |
| uword size = code_size; |
| |
| // Write the code unwinding info event. |
| |
| // Create unwinding information (eh frame) |
| ELFObjectContext ctx; |
| char buffer[1024]; |
| ctx.code_size = code_size; |
| ctx.startp = ctx.p = (uint8_t*)buffer; |
| elf_init_ehframe(&ctx); |
| int eh_frame_size = ctx.p - ctx.startp; |
| |
| // Populate the unwind info event for perf |
| CodeUnwindingInfoEvent ev2; |
| ev2.base.event = PerfUnwindingInfo; |
| ev2.base.time_stamp = get_current_monotonic_ticks(); |
| ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
| // Ensure we have enough space between DSOs when perf maps them |
| assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); |
| ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
| ev2.mapped_size = round_up(ev2.unwind_data_size, 16); |
| int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; |
| int padding_size = round_up(content_size, 8) - content_size; |
| ev2.base.size = content_size + padding_size; |
| perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
| |
| |
| // Populate the eh Frame header |
| EhFrameHeader f; |
| f.version = 1; |
| f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; |
| f.fde_count_enc = DwarfUData4; |
| f.table_enc = DwarfSData4 | DwarfDataRel; |
| f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); |
| f.eh_fde_count = 1; |
| f.from = -(round_up(code_size, 8) + eh_frame_size); |
| int cie_size = ctx.eh_frame_p - ctx.startp; |
| f.to = -(eh_frame_size - cie_size); |
| |
| perf_map_jit_write_fully(ctx.startp, eh_frame_size); |
| perf_map_jit_write_fully(&f, sizeof(f)); |
| |
| char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
| perf_map_jit_write_fully(&padding_bytes, padding_size); |
| |
| // Write the code load event. |
| CodeLoadEvent ev; |
| ev.base.event = PerfLoad; |
| ev.base.size = sizeof(ev) + (name_length+1) + size; |
| ev.base.time_stamp = get_current_monotonic_ticks(); |
| ev.process_id = getpid(); |
| ev.thread_id = syscall(SYS_gettid); |
| ev.vma = base; |
| ev.code_address = base; |
| ev.code_size = size; |
| perf_jit_map_state.code_id += 1; |
| ev.code_id = perf_jit_map_state.code_id; |
| |
| perf_map_jit_write_fully(&ev, sizeof(ev)); |
| perf_map_jit_write_fully(perf_map_entry, name_length+1); |
| perf_map_jit_write_fully((void*)(base), size); |
| return; |
| } |
| |
| static int perf_map_jit_fini(void* state) { |
| if (perf_jit_map_state.perf_map != NULL) { |
| // close the file |
| PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); |
| fclose(perf_jit_map_state.perf_map); |
| PyThread_release_lock(perf_jit_map_state.map_lock); |
| |
| // clean up the lock and state |
| PyThread_free_lock(perf_jit_map_state.map_lock); |
| perf_jit_map_state.perf_map = NULL; |
| } |
| if (perf_jit_map_state.mapped_buffer != NULL) { |
| munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
| } |
| trampoline_api.state = NULL; |
| return 0; |
| } |
| |
// Callback table handed to the ceval perf-trampoline machinery:
// {init, write_entry, fini} for the jitdump backend.
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
    &perf_map_jit_init,
    &perf_map_jit_write_entry,
    &perf_map_jit_fini,
};
| |
| #endif |