| #include <c10/core/impl/alloc_cpu.h> |
| #include <c10/mobile/CPUProfilingAllocator.h> |
| #include <c10/util/irange.h> |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <limits> |
| #include <map> |
| #include <set> |
| #include <string> |
| #include <vector> |
| |
| namespace c10 { |
| |
| namespace { |
| thread_local AllocationPlanner* allocation_planner{nullptr}; |
| thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; |
| |
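| // A half-open interval [start_offset, end_offset) within the planned memory blob. |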
| struct MemBlock { |
| uint64_t start_offset, end_offset; |
| MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} |
| bool operator<(const MemBlock& other) const { |
| return start_offset < other.start_offset; |
| } |
| }; |
| |
| enum class EventType { Allocate = 0, Free, Invalid }; |
| |
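| // A single allocation or deallocation event in the recorded trace. `time` is a |
| // logical timestamp derived from the order in which allocations were recorded. |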
| struct MemEvent { |
| uint64_t time; |
| uint64_t allocation_id; |
| uint64_t size; |
| EventType type{EventType::Invalid}; |
| MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) |
| : time(t), allocation_id(id), size(s), type(e) {} |
| }; |
| |
| bool overlaps(const MemBlock& a, const MemBlock& b) { |
| // Two blocks do not overlap iff one ends at or before the other begins: |
| // [---- a ----) [---- b ----) |
| // start_a end_a <= start_b end_b |
| // e.g. [0, 64) and [64, 96) do not overlap; [0, 64) and [32, 96) do. |
| return !( |
| (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset)); |
| } |
| |
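| // Replays the recorded events against the proposed offsets and checks that no |
| // two simultaneously live allocations are assigned overlapping memory ranges. |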
| bool validate_allocation_plan( |
| const std::vector<MemEvent>& alloc_events, |
| const std::vector<uint64_t>& allocation_offsets) { |
| std::set<MemBlock> allocations; |
| for (const auto& event : alloc_events) { |
| auto alloc_id = event.allocation_id; |
| // Skip allocations not managed by AllocationPlan |
| if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) { |
| continue; |
| } |
| auto start_offset = allocation_offsets[alloc_id]; |
| auto end_offset = allocation_offsets[alloc_id] + event.size; |
| MemBlock mem_block(start_offset, end_offset); |
| if (event.type == EventType::Allocate) { |
| auto it = allocations.lower_bound(mem_block); |
| if (it != allocations.end()) { |
| auto next_block = *it; |
| if (overlaps(next_block, mem_block)) { |
| return false; |
| } |
| } |
| if (it != allocations.begin()) { |
| auto prev_block = *(--it); |
| if (overlaps(prev_block, mem_block)) { |
| return false; |
| } |
| } |
| allocations.emplace(mem_block); |
| } else if (event.type == EventType::Free) { |
| auto it = allocations.find(mem_block); |
| TORCH_CHECK( |
| it != allocations.end(), |
| "ProfilingAllocator: Allocate event " |
| "must have preceded deallocate event."); |
| TORCH_CHECK( |
| (*it).end_offset == end_offset, |
| "End offset of allocation being freed must match the one recorded."); |
| allocations.erase(it); |
| } else { |
| TORCH_CHECK(false, "ProfilingAllocator: Invalid event type."); |
| } |
| } |
| return true; |
| } |
| |
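| // Expands per-allocation sizes and lifetimes into a time-ordered event trace. |
| // Each allocation i that is managed by the plan produces an Allocate event at |
| // time i and a Free event at time allocation_lifetimes[i]. |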
| std::vector<MemEvent> create_and_sort_mem_events( |
| const std::vector<uint64_t>& allocation_sizes, |
| const std::vector<uint64_t>& allocation_lifetimes) { |
| std::vector<MemEvent> events; |
| for (const auto i : c10::irange(allocation_sizes.size())) { |
| // If an observed allocation is freed outside the scope of observation, |
| // then it is not managed by the AllocationPlan. |
| if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) { |
| continue; |
| } |
| events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); |
| events.emplace_back( |
| allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); |
| } |
| std::sort( |
| events.begin(), |
| events.end(), |
| [](const MemEvent& a, const MemEvent& b) -> bool { |
| return a.time < b.time; |
| }); |
| return events; |
| } |
| |
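| // Returns, for each recorded allocation, its byte offset into the shared |
| // memory blob. Allocations whose lifetime extends beyond the observed scope |
| // keep the sentinel value uint64_t max and are not managed by the plan. |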
| std::vector<uint64_t> formulate_greedy_allocation_plan( |
| const std::vector<uint64_t>& allocation_sizes, |
| const std::vector<uint64_t>& allocation_lifetimes) { |
| // Step 1. Construct all allocation/free events. |
| // Sort these events by timestamp. |
| // Step 2. Iterate through all events. |
| // 2.1 If allocate event: |
| // Find all candidates in the free_size_to_offset map. |
| // Greedily pick the first one. |
| // Remove the entry from the free_size_to_offset map. |
| // new_offset = offset + request_size |
| // new_size = size - request_size |
| // Add new entries to both maps. |
| // 2.2 If free event: |
| // Check if the returned block merges with an adjacent free chunk. |
| // If so, merge until no more merging is possible. |
| // If the returned block does not merge, then |
| // just insert it as a free chunk. |
| |
| // lower_bound on this map finds the smallest free chunk whose size is |
| // at least the requested allocation size. |
| std::map<uint64_t, uint64_t> free_size_to_offset; |
| // These reverse maps provide fast lookup when we want to insert a freed |
| // block back, especially when we want to merge it with adjacent free blocks. |
| ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
| free_start_offset_to_size_iter; |
| ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> |
| free_end_offset_to_size_iter; |
| // Upon free: end_ptr = offset + size. |
| // If end_ptr exists as the start of a free block, merge the freed allocation |
| // with it: find the corresponding entry in free_size_to_offset, remove it, and |
| // reinsert with the new size and offset. |
| // If end_ptr does not exist, just insert (offset, size) into the map and, |
| // correspondingly, (size, offset) into the other map. |
| // Merging is done on both sides of the freed block; a single merge per side |
| // suffices because free blocks are always kept maximally coalesced. |
| // After the last free we should have only one entry left in these maps. |
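| // |
| // Illustrative example (hypothetical sizes/lifetimes, not from a real trace): |
| // allocation_sizes = {64, 32}, allocation_lifetimes = {2, 2} |
| // Sorted events (time, type, id, size): |
| // (0, Allocate, 0, 64) -> no free block, gets offset 0, max_offset = 64 |
| // (1, Allocate, 1, 32) -> no free block, gets offset 64, max_offset = 96 |
| // (2, Free, 0, 64) and (2, Free, 1, 32) -> blocks [0, 64) and [64, 96) are |
| // returned and coalesced into a single free block [0, 96) |
| // Result: allocation_offsets = {0, 64}, required blob size (total_size) = 96. |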
| |
| std::vector<uint64_t> allocation_offsets( |
| allocation_sizes.size(), std::numeric_limits<uint64_t>::max()); |
| auto mem_events = |
| create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); |
| uint64_t max_offset{0}; |
| for (const auto& mem_event : mem_events) { |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| uint64_t alloc_offset; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| uint64_t new_offset, new_size; |
| if (mem_event.type == EventType::Allocate) { |
| auto it = free_size_to_offset.lower_bound(mem_event.size); |
| if (it == free_size_to_offset.end()) { |
| // If there is no contiguous block of the size requested |
| // allocate a new one. |
| alloc_offset = max_offset; |
| max_offset += mem_event.size; |
| } else { |
| // We found a free block at least as large as the request: |
| // 1. Allocate out of that block: |
| // 1.1 Erase the block's reverse-map entries. |
| // 1.2 Erase the block from free_size_to_offset. |
| // 2. If the block still has space left, insert the remainder back into |
| // the map, including its reverse-map entries. |
| alloc_offset = it->second; |
| new_offset = alloc_offset + mem_event.size; |
| new_size = it->first - mem_event.size; |
| // Erase the reverse-map entries before `it` is invalidated. |
| free_start_offset_to_size_iter.erase(alloc_offset); |
| free_end_offset_to_size_iter.erase(alloc_offset + it->first); |
| free_size_to_offset.erase(it); |
| if (new_size > 0) { |
| auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first; |
| free_start_offset_to_size_iter.emplace(new_offset, ref_it); |
| free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); |
| } |
| } |
| allocation_offsets[mem_event.allocation_id] = alloc_offset; |
| } else { |
| // 1. Check if a free block starts where the freed block ends. This is done |
| // by looking up end_offset in free_start_offset_to_size_iter. |
| // If such a block is found, remove it and grow the block being freed. |
| // 2. Similarly, check if a free block ends where the freed block starts. |
| // This is done by looking up freed_offset in free_end_offset_to_size_iter. |
| // If such a block is found, remove it, grow the block being freed, and |
| // adjust its start offset. |
| // 3. Insert the (possibly merged) freed block into the maps. |
| auto freed_offset = allocation_offsets[mem_event.allocation_id]; |
| auto freed_size = mem_event.size; |
| auto end_offset = freed_offset + freed_size; |
| // Merge when another free block exists at the end of this block. |
| auto end_it = free_start_offset_to_size_iter.find(end_offset); |
| if (end_it != free_start_offset_to_size_iter.end()) { |
| auto merge_block_iter = end_it->second; |
| auto merge_block_size = merge_block_iter->first; |
| freed_size += merge_block_size; |
| free_size_to_offset.erase(merge_block_iter); |
| free_start_offset_to_size_iter.erase(end_it); |
| // If the block is being merged then also remove it from |
| // free_end_offset_to_size_iter |
| free_end_offset_to_size_iter.erase(end_offset + merge_block_size); |
| } |
| // Merge when the freed block sits at the end of another free block. |
| auto start_it = free_end_offset_to_size_iter.find(freed_offset); |
| if (start_it != free_end_offset_to_size_iter.end()) { |
| auto merge_block_iter = start_it->second; |
| auto merge_block_size = merge_block_iter->first; |
| freed_size += merge_block_size; |
| freed_offset -= merge_block_size; |
| free_size_to_offset.erase(merge_block_iter); |
| free_end_offset_to_size_iter.erase(start_it); |
| // If the block is being merged then also remove it from |
| // free_start_offset_to_size_iter |
| free_start_offset_to_size_iter.erase(freed_offset); |
| } |
| auto freed_block_it = |
| free_size_to_offset.emplace(freed_size, freed_offset).first; |
| free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); |
| free_end_offset_to_size_iter.emplace( |
| freed_offset + freed_size, freed_block_it); |
| } |
| } |
| TORCH_CHECK( |
| validate_allocation_plan(mem_events, allocation_offsets), |
| "ProfilingAllocator: Allocation plan invalid."); |
| return allocation_offsets; |
| } |
| |
| } // namespace |
| |
| void AllocationPlan::clear() { |
| allocation_sizes.clear(); |
| allocation_lifetimes.clear(); |
| allocation_offsets.clear(); |
| } |
| |
| void AllocationPlanner::record_allocation( |
| const uint64_t size, |
| const void* ptr) { |
| if (validation_mode_) { |
| validation_success = validation_success && validate_allocation(size, ptr); |
| return; |
| } |
| allocation_plan_->allocation_sizes.push_back(size); |
| allocation_plan_->allocation_lifetimes.push_back( |
| std::numeric_limits<uint64_t>::max()); |
| allocation_ptr_to_id_[ptr] = allocation_id_; |
| allocation_id_++; |
| } |
| |
| void AllocationPlanner::record_free(const void* ptr) { |
| if (validation_mode_) { |
| validation_success = validation_success && validate_free(ptr); |
| return; |
| } |
| auto it = allocation_ptr_to_id_.find(ptr); |
| if (it == allocation_ptr_to_id_.end()) { |
| // The pointer being freed was allocated outside of WithProfileAllocationsGuard. |
| return; |
| } |
| auto id = it->second; |
| TORCH_CHECK( |
| id < allocation_plan_->allocation_lifetimes.size(), |
| "Allocation must have been recorded during record_allocation."); |
| allocation_plan_->allocation_lifetimes[id] = allocation_id_; |
| } |
| |
| bool AllocationPlanner::validate_allocation( |
| const uint64_t size, |
| const void* ptr) { |
| if (allocation_id_ >= allocation_plan_->allocation_sizes.size() || |
| allocation_plan_->allocation_sizes[allocation_id_] != size) { |
| TORCH_WARN( |
| "Allocation request does not match plan. ", |
| "Allocation id: ", |
| allocation_id_, |
| ", number of recorded allocations: ", |
| allocation_plan_->allocation_sizes.size(), |
| ", recorded size of the requested allocation: ", |
| allocation_id_ < allocation_plan_->allocation_sizes.size() |
| ? std::to_string(allocation_plan_->allocation_sizes[allocation_id_]) |
| : "<out of range>", |
| ", but got: ", |
| size); |
| return false; |
| } |
| allocation_ptr_to_id_[ptr] = allocation_id_; |
| allocation_id_++; |
| return true; |
| } |
| |
| bool AllocationPlanner::validate_free(const void* ptr) { |
| auto it = allocation_ptr_to_id_.find(ptr); |
| if (it == allocation_ptr_to_id_.end()) { |
| // Allocation that was made outside the validation scope is being freed here |
| return true; |
| } |
| auto id = (*it).second; |
| TORCH_CHECK( |
| id < allocation_plan_->allocation_lifetimes.size(), |
| "Allocation must have been recorded during validate_allocation."); |
| auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; |
| return (lifetime_id == allocation_id_); |
| } |
| |
| void AllocationPlanner::formulate_plan() { |
| allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan( |
| allocation_plan_->allocation_sizes, |
| allocation_plan_->allocation_lifetimes); |
| allocation_plan_->total_size = 0; |
| for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) { |
| if (allocation_plan_->allocation_lifetimes[i] == |
| std::numeric_limits<uint64_t>::max()) { |
| continue; |
| } |
| auto limit = allocation_plan_->allocation_offsets[i] + |
| allocation_plan_->allocation_sizes[i]; |
| allocation_plan_->total_size = |
| std::max(allocation_plan_->total_size, limit); |
| } |
| } |
| |
| void AllocationPlanner::clear() { |
| allocation_plan_->clear(); |
| allocation_ptr_to_id_.clear(); |
| } |
| |
| void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { |
| TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr."); |
| plan_ = plan; |
| allocation_id_ = 0; |
| allocation_ptr_to_id_.clear(); |
| if (current_size_ < plan->total_size) { |
| // Free existing memory and reallocate for larger size. |
| c10::free_cpu(blob_); |
| blob_ = c10::alloc_cpu(plan->total_size); |
| current_size_ = plan->total_size; |
| } |
| } |
| |
| void CPUProfilingAllocator::unset_plan() { |
| allocation_id_ = 0; |
| allocation_ptr_to_id_.clear(); |
| plan_ = nullptr; |
| } |
| |
| void* CPUProfilingAllocator::allocate(const size_t bytes) { |
| TORCH_CHECK( |
| bytes == plan_->allocation_sizes[allocation_id_], |
| "Got allocation request that does not match with the plan."); |
| if (plan_->allocation_lifetimes[allocation_id_] == |
| std::numeric_limits<uint64_t>::max()) { |
| // This allocation is not managed by ProfilingAllocator. |
| allocation_id_++; |
| return c10::alloc_cpu(bytes); |
| } |
| void* ptr = reinterpret_cast<uint8_t*>(blob_) + |
| plan_->allocation_offsets[allocation_id_]; |
| allocation_ptr_to_id_[ptr] = allocation_id_; |
| allocation_id_++; |
| return ptr; |
| } |
| |
| void CPUProfilingAllocator::free(void* const ptr) { |
| auto it = allocation_ptr_to_id_.find(ptr); |
| if (it == allocation_ptr_to_id_.end()) { |
| // Either |
| // 1. the pointer being freed was allocated outside of the profiling |
| // allocator's scope, or |
| // 2. the allocation is not managed by the profiling allocator. |
| // Example of the second case: |
| // Tensor out; |
| // for (....) { |
| // { |
| // WithProfilingAllocatorGuard guard(...); |
| // out = ...some op; // this also frees the memory previously held by out |
| // } |
| // out is used.. |
| // } |
| c10::free_cpu(ptr); |
| return; |
| } |
| auto id = it->second; |
| TORCH_CHECK( |
| id < plan_->allocation_lifetimes.size(), |
| "Freeing allocation that is not accordingly to the plan."); |
| auto lifetime_id = plan_->allocation_lifetimes[id]; |
| TORCH_CHECK( |
| lifetime_id == allocation_id_, |
| "Lifetime of allocations do not match: allocation_id ", |
| id, |
| ", expected:", |
| lifetime_id, |
| ", got:", |
| allocation_id_); |
| } |
| |
| CPUProfilingAllocator::~CPUProfilingAllocator() { |
| c10::free_cpu(blob_); |
| } |
| |
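| // Usage sketch (illustrative only; `run_model` is a hypothetical stand-in for |
| // whatever workload performs the allocations): profile once to build a plan, |
| // optionally validate it, then replay with the profiling allocator. |
| // |
| // AllocationPlan plan; |
| // { |
| // WithProfileAllocationsGuard profile_guard(&plan); |
| // run_model(); // allocations/frees are recorded into `plan` |
| // } |
| // bool success = false; |
| // { |
| // WithValidateAllocationPlanGuard validate_guard(&plan, &success); |
| // run_model(); // re-run to check that the plan still matches |
| // } |
| // if (success) { |
| // CPUProfilingAllocator allocator; |
| // WithProfilingAllocatorGuard guard(&allocator, &plan); |
| // run_model(); // managed allocations now come out of one pre-sized blob |
| // } |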
| WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) { |
| // Nesting of allocation profiling does not seem meaningful. |
| TORCH_CHECK( |
| allocation_planner == nullptr, |
| "Nesting profiling allocations is not supported."); |
| planner_ = std::make_unique<AllocationPlanner>(plan); |
| planner_->clear(); |
| allocation_planner = planner_.get(); |
| } |
| |
| WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { |
| planner_->formulate_plan(); |
| allocation_planner = nullptr; |
| } |
| |
| WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( |
| AllocationPlan* plan, |
| bool* success) { |
| // Nesting of allocation profiling does not seem meaningful. |
| TORCH_CHECK( |
| allocation_planner == nullptr, |
| "Nesting profiling allocations is not supported."); |
| planner_ = std::make_unique<AllocationPlanner>(plan, true); |
| success_ = success; |
| allocation_planner = planner_.get(); |
| } |
| |
| WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { |
| *success_ = planner_->validation_success; |
| allocation_planner = nullptr; |
| } |
| |
| AllocationPlanner* GetThreadLocalAllocationPlanner() { |
| return allocation_planner; |
| } |
| |
| WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( |
| CPUProfilingAllocator* allocator, |
| const AllocationPlan* plan) { |
| // Nesting of profiling allocator is not supported. |
| TORCH_CHECK( |
| profiling_allocator == nullptr, |
| "Nesting profiling allocators is not supported."); |
| profiling_allocator = allocator; |
| profiling_allocator->set_plan(plan); |
| } |
| |
| WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { |
| profiling_allocator->unset_plan(); |
| profiling_allocator = nullptr; |
| } |
| |
| CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { |
| return profiling_allocator; |
| } |
| |
| } // namespace c10 |