/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <inttypes.h>
#include <string.h>

#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/hooks.h>
#include <executorch/runtime/platform/platform.h>
#include <executorch/runtime/platform/profiler.h>
namespace torch {
namespace executor {
namespace {
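// The profiling buffer is carved into MAX_PROFILE_BLOCKS fixed-size blocks of
// prof_buf_size bytes each. A rough sketch of one block's layout, inferred
// from the offsets used below (the actual offset values live in profiler.h):
//
//   +prof_header_offset            -> prof_header_t      (block metadata)
//   +prof_events_offset            -> prof_event_t[]     (timing events)
//   +prof_mem_alloc_info_offset    -> prof_allocator_t[] (allocator names/ids)
//   +prof_mem_alloc_events_offset  -> mem_prof_event_t[] (allocation events)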
static uint8_t prof_buf[prof_buf_size * MAX_PROFILE_BLOCKS];
// Base pointer for header
static prof_header_t* prof_header =
(prof_header_t*)((uintptr_t)prof_buf + prof_header_offset);
// Base pointer for profiling entries
static prof_event_t* prof_arr =
(prof_event_t*)((uintptr_t)prof_buf + prof_events_offset);
// Base pointer for memory allocator info array
static prof_allocator_t* mem_allocator_arr =
(prof_allocator_t*)((uintptr_t)prof_buf + prof_mem_alloc_info_offset);
// Base pointer for memory profiling entries
static mem_prof_event_t* mem_prof_arr =
(mem_prof_event_t*)((uintptr_t)prof_buf + prof_mem_alloc_events_offset);
static uint32_t num_blocks = 0;
static bool prof_stats_dumped = false;
// Current profiling state (chain and instruction indices) for the executing
// thread. Declared thread_local to match the *_tls accessors below.
thread_local prof_state_t profile_state_tls{-1, 0u};
} // namespace
const prof_state_t& get_profile_tls_state() {
return profile_state_tls;
}
void set_profile_tls_state(const prof_state_t& state) {
profile_state_tls = state;
}
ExecutorchProfilerInstructionScope::ExecutorchProfilerInstructionScope(
const prof_state_t& state)
: old_state_(get_profile_tls_state()) {
set_profile_tls_state(state);
}
ExecutorchProfilerInstructionScope::~ExecutorchProfilerInstructionScope() {
set_profile_tls_state(old_state_);
}
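// Usage sketch (indices are illustrative): events recorded while a scope
// object is alive are tagged with its chain/instruction pair, and the previous
// state is restored when the scope is destroyed.
//
//   {
//     ExecutorchProfilerInstructionScope scope(
//         {/*chain_idx=*/0, /*instruction_idx=*/3});
//     uint32_t tok = begin_profiling("op_add"); // tagged with chain 0, instr 3
//     // ... execute the instruction ...
//     end_profiling(tok);
//   } // prior chain/instruction state restored here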
uint32_t begin_profiling(const char* name) {
ET_CHECK_MSG(
prof_header->prof_entries < MAX_PROFILE_EVENTS,
"Out of profiling buffer space. Increase MAX_PROFILE_EVENTS and re-compile.");
uint32_t curr_counter = prof_header->prof_entries;
prof_header->prof_entries++;
prof_arr[curr_counter].end_time = 0;
prof_arr[curr_counter].name_str = name;
prof_state_t state = get_profile_tls_state();
prof_arr[curr_counter].chain_idx = state.chain_idx;
prof_arr[curr_counter].instruction_idx = state.instruction_idx;
// Set the start time last so that the overhead of this function is not
// captured in the measurement.
prof_arr[curr_counter].start_time = et_pal_current_ticks();
return curr_counter;
}
void end_profiling(uint32_t token_id) {
ET_CHECK_MSG(token_id < MAX_PROFILE_EVENTS, "Invalid token id.");
prof_arr[token_id].end_time = et_pal_current_ticks();
}
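// Usage sketch: begin_profiling returns a token that must be passed back to
// end_profiling; the ExecutorchProfiler RAII wrapper at the bottom of this
// file does the pairing automatically. The event name here is illustrative.
//
//   uint32_t tok = begin_profiling("model_load");
//   // ... work being measured ...
//   end_profiling(tok);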
void dump_profile_stats(prof_result_t* prof_result) {
prof_result->prof_data = (uint8_t*)prof_buf;
prof_result->num_bytes = num_blocks * prof_buf_size;
prof_result->num_blocks = num_blocks;
if (!prof_stats_dumped) {
for (size_t i = 0; i < num_blocks; i++) {
prof_header_t* prof_header_local =
(prof_header_t*)(prof_buf + prof_buf_size * i);
prof_event_t* prof_event_local =
(prof_event_t*)(prof_buf + prof_buf_size * i + prof_events_offset);
// Copy over the string names into the space allocated in prof_event_t. We
// avoided doing this earlier to keep the overhead in begin_profiling and
// end_profiling as low as possible.
for (size_t j = 0; j < prof_header_local->prof_entries; j++) {
const char* str_ptr = prof_event_local[j].name_str;
size_t str_len = strlen(str_ptr);
// Truncate to the fixed-size name field. The field is zeroed first, so
// names shorter than PROF_NAME_MAX_LEN stay NUL-terminated.
size_t copy_len = str_len > PROF_NAME_MAX_LEN ? PROF_NAME_MAX_LEN : str_len;
memset(prof_event_local[j].name, 0, PROF_NAME_MAX_LEN);
memcpy(prof_event_local[j].name, str_ptr, copy_len);
}
}
}
prof_stats_dumped = true;
}
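// Usage sketch (hypothetical host-side caller): the returned pointer covers
// all blocks used so far, and the bytes can be written out verbatim for the
// post-processing tool.
//
//   prof_result_t result;
//   dump_profile_stats(&result);
//   fwrite(result.prof_data, 1, result.num_bytes, fp); // fp: caller's FILE*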
void reset_profile_stats() {
prof_stats_dumped = false;
prof_header->prof_entries = 0;
prof_header->allocator_entries = 0;
prof_header->mem_prof_entries = 0;
}
void track_allocation(int32_t id, uint32_t size) {
if (id == -1)
return;
ET_CHECK_MSG(
prof_header->mem_prof_entries < MAX_MEM_PROFILE_EVENTS,
"Out of memory profiling buffer space; %" PRIu32
" entries are already in use. Increase MAX_MEM_PROFILE_EVENTS and re-compile.",
prof_header->mem_prof_entries);
mem_prof_arr[prof_header->mem_prof_entries].allocator_id = id;
mem_prof_arr[prof_header->mem_prof_entries].allocation_size = size;
prof_header->mem_prof_entries++;
}
uint32_t track_allocator(const char* name) {
ET_CHECK_MSG(
prof_header->allocator_entries < MEM_PROFILE_MAX_ALLOCATORS,
"Out of allocator tracking space; %" PRIu32
" allocators are already registered. Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile.",
prof_header->allocator_entries);
size_t str_len = strlen(name);
size_t num_allocators = prof_header->allocator_entries;
// Truncate to the fixed-size name field; zero it first so shorter names stay
// NUL-terminated.
size_t copy_len = str_len > PROF_NAME_MAX_LEN ? PROF_NAME_MAX_LEN : str_len;
memset(mem_allocator_arr[num_allocators].name, 0, PROF_NAME_MAX_LEN);
memcpy(mem_allocator_arr[num_allocators].name, name, copy_len);
mem_allocator_arr[num_allocators].allocator_id = num_allocators;
return prof_header->allocator_entries++;
}
void profiling_create_block(const char* name) {
// If the current profiling block is still unused, keep using it; otherwise
// move on to the next block.
if (prof_header->prof_entries != 0 || prof_header->mem_prof_entries != 0 ||
prof_header->allocator_entries != 0 || num_blocks == 0) {
num_blocks += 1;
ET_CHECK_MSG(
num_blocks <= MAX_PROFILE_BLOCKS,
"Only %d blocks are supported and all of them have been used, but block %" PRIu32
" was requested. Increase MAX_PROFILE_BLOCKS and re-compile.",
MAX_PROFILE_BLOCKS,
num_blocks);
}
// Copy over the name of this profiling block.
size_t name_len = strlen(name);
size_t str_len = name_len >= PROF_NAME_MAX_LEN ? PROF_NAME_MAX_LEN : name_len;
uintptr_t base = (uintptr_t)prof_buf + (num_blocks - 1) * prof_buf_size;
prof_header = (prof_header_t*)(base + prof_header_offset);
memset(prof_header->name, 0, PROF_NAME_MAX_LEN);
memcpy(prof_header->name, name, str_len);
// Set the profiler version for compatibility checks in the post-processing
// tool.
prof_header->prof_ver = ET_PROF_VER;
// Set the maximum number of entries that this block can support.
prof_header->max_prof_entries = MAX_PROFILE_EVENTS;
prof_header->max_allocator_entries = MEM_PROFILE_MAX_ALLOCATORS;
prof_header->max_mem_prof_entries = MAX_MEM_PROFILE_EVENTS;
reset_profile_stats();
// Set the base addresses for the various profiling entries arrays.
prof_arr = (prof_event_t*)(base + prof_events_offset);
mem_allocator_arr = (prof_allocator_t*)(base + prof_mem_alloc_info_offset);
mem_prof_arr = (mem_prof_event_t*)(base + prof_mem_alloc_events_offset);
}
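// Usage sketch: each call starts (or, if the current block is untouched,
// reuses) a named block; subsequent profiling events land in that block until
// the next call. Block names are illustrative.
//
//   profiling_create_block("inference_0");
//   // ... events recorded here belong to "inference_0" ...
//   profiling_create_block("inference_1");
//   // ... events recorded here belong to "inference_1" ...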
void profiler_init(void) {
profiling_create_block("default");
}
ExecutorchProfiler::ExecutorchProfiler(const char* name) {
prof_tok = begin_profiling(name);
}
ExecutorchProfiler::~ExecutorchProfiler() {
end_profiling(prof_tok);
}
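// Usage sketch: the RAII wrapper ties one timing event to a C++ scope.
//
//   {
//     ExecutorchProfiler profiler("dequantize"); // begin_profiling("dequantize")
//     // ... work being measured ...
//   } // end_profiling(prof_tok) runs here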
} // namespace executor
} // namespace torch