#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>
#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <set>
#include <vector>
namespace c10 {
namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};
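// A MemBlock describes the half-open interval [start_offset, end_offset) of
// a single allocation within the planned arena. Ordering by start_offset lets
// the std::set used during validation find the neighbors of a candidate block
// for the overlap check below.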
struct MemBlock {
uint64_t start_offset, end_offset;
MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
bool operator<(const MemBlock& other) const {
return start_offset < other.start_offset;
}
};
enum class EventType { Allocate = 0, Free, Invalid };
struct MemEvent {
uint64_t time;
uint64_t allocation_id;
uint64_t size;
EventType type{EventType::Invalid};
MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
: time(t), allocation_id(id), size(s), type(e) {}
};
bool overlaps(const MemBlock& a, const MemBlock& b) {
// Two blocks don't overlap if one ends at or before the other starts:
// |---a--------|--------------b--------|
// start_a    end_a <= start_b       end_b
return !(
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}
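// Replays the recorded events in order and checks that no two allocations
// that are live at the same time were assigned overlapping offset ranges by
// the plan. Allocations not managed by the plan (offset == uint64_t max) are
// skipped.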
bool validate_allocation_plan(
const std::vector<MemEvent>& alloc_events,
const std::vector<uint64_t>& allocation_offsets) {
std::set<MemBlock> allocations;
for (const auto& event : alloc_events) {
auto alloc_id = event.allocation_id;
// Skip allocations not managed by AllocationPlan
if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
continue;
}
auto start_offset = allocation_offsets[alloc_id];
auto end_offset = allocation_offsets[alloc_id] + event.size;
MemBlock mem_block(start_offset, end_offset);
if (event.type == EventType::Allocate) {
auto it = allocations.lower_bound(mem_block);
if (it != allocations.end()) {
auto next_block = *it;
if (overlaps(next_block, mem_block)) {
return false;
}
}
if (it != allocations.begin()) {
auto prev_block = *(--it);
if (overlaps(prev_block, mem_block)) {
return false;
}
}
allocations.emplace(mem_block);
} else if (event.type == EventType::Free) {
auto it = allocations.find(mem_block);
TORCH_CHECK(
it != allocations.end(),
"ProfilingAllocator: Allocate event "
"must have preceded deallocate event.");
TORCH_CHECK(
(*it).end_offset == end_offset,
"End offset of allocation being freed must match the one recorded.");
allocations.erase(it);
} else {
TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
}
}
return true;
}
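// Produces one Allocate and one Free event per managed allocation: the
// allocation index serves as the Allocate timestamp and the recorded lifetime
// as the Free timestamp. Events are then sorted by timestamp.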
std::vector<MemEvent> create_and_sort_mem_events(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
std::vector<MemEvent> events;
for (const auto i : c10::irange(allocation_sizes.size())) {
// If an observed allocation is freed outside the scope of
// observation, then it is not managed by the
// AllocationPlan.
if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
continue;
}
events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
events.emplace_back(
allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
}
std::sort(
events.begin(),
events.end(),
[](const MemEvent& a, const MemEvent& b) -> bool {
return a.time < b.time;
});
return events;
}
std::vector<uint64_t> formulate_greedy_allocation_plan(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
// Step 1. Construct all allocation/free events.
//         Sort these events by timestamp.
// Step 2. Iterate through all events.
//   2.1 If allocate event:
//       Find all candidates in the free_size_to_offset map.
//       Greedily pick the first one.
//       Remove the entry from the free_size_to_offset map.
//       new_offset = offset + request_size
//       new_size = size - request_size
//       Add new entries to both maps.
//   2.2 If free event:
//       Check if the returned offset merges with another chunk.
//       If so, merge until no more merging is possible.
//       If the returned offset does not merge, then
//       just return it as a chunk.
// lower_bound on this map will get all candidates of
// the right size for an allocation.
std::map<uint64_t, uint64_t> free_size_to_offset;
// This provides fast lookup when we want to insert a freed block
// back, especially when we want to merge blocks.
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
free_start_offset_to_size_iter;
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
free_end_offset_to_size_iter;
// Upon free, end_ptr = offset + size.
// If end_ptr exists as the start of a free block, merge the freed allocation
// with it: find the corresponding entry in free_size_to_offset, remove it,
// and reinsert it with the new size and offset.
// If end_ptr does not exist, then just insert offset, size
// into the map and, correspondingly, size, offset into the other map.
// Merging should always be done until no more mergeable chunks can be found;
// because adjacent free blocks are merged eagerly, one merge on each side of
// the freed block suffices.
// After the last free we should have only one entry left in these maps.
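// As an illustrative sketch (hypothetical trace, not taken from a real run):
// with allocation_sizes = {64, 32, 16} and allocation_lifetimes = {2, 3, max},
// allocation 2 is unmanaged and keeps offset max. Allocation 0 gets offset 0
// (max_offset becomes 64) and allocation 1 gets offset 64 (max_offset becomes
// 96). Freeing allocation 0 inserts the free block [0, 64); freeing
// allocation 1 merges [64, 96) with it into a single [0, 96) block, so the
// resulting offsets are {0, 64, max} and the plan needs a 96-byte arena.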
std::vector<uint64_t> allocation_offsets(
allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
auto mem_events =
create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
uint64_t max_offset{0};
for (const auto& mem_event : mem_events) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint64_t alloc_offset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint64_t new_offset, new_size;
if (mem_event.type == EventType::Allocate) {
auto it = free_size_to_offset.lower_bound(mem_event.size);
if (it == free_size_to_offset.end()) {
// If there is no contiguous block of the size requested,
// allocate a new one.
alloc_offset = max_offset;
max_offset += mem_event.size;
} else {
// If we have found a block of the size we want
// 1. change the block by allocating out of it.
// 1.1 Erase the entire block
// 1.2 Erase the reverse map entries
// 2. If block still has space left insert the remainder back in map.
// Including reverse map entries.
alloc_offset = it->second;
new_offset = alloc_offset + mem_event.size;
new_size = it->first - mem_event.size;
// Erase the reverse-map entries first: erasing `it` from
// free_size_to_offset invalidates it, and it->first is still needed
// to compute the end offset.
free_start_offset_to_size_iter.erase(alloc_offset);
free_end_offset_to_size_iter.erase(alloc_offset + it->first);
free_size_to_offset.erase(it);
if (new_size > 0) {
auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
free_start_offset_to_size_iter.emplace(new_offset, ref_it);
free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
}
}
allocation_offsets[mem_event.allocation_id] = alloc_offset;
} else {
// 1. Check if an existing free block starts right where the freed block
// ends. This is done by checking free_start_offset_to_size_iter.
// If we find such a block, remove it and grow the block being freed.
// 2. Similarly, check if an existing free block ends right where the freed
// block starts. This is done by checking free_end_offset_to_size_iter.
// If we find such a block, remove it, grow the block being freed, and
// move its start offset back.
// 3. Insert the (possibly merged) freed block into the maps.
auto freed_offset = allocation_offsets[mem_event.allocation_id];
auto freed_size = mem_event.size;
auto end_offset = freed_offset + freed_size;
// Merge when another free block exists at the end of this block
auto end_it = free_start_offset_to_size_iter.find(end_offset);
if (end_it != free_start_offset_to_size_iter.end()) {
auto merge_block_iter = end_it->second;
auto merge_block_size = merge_block_iter->first;
freed_size += merge_block_size;
free_size_to_offset.erase(merge_block_iter);
free_start_offset_to_size_iter.erase(end_it);
// If the block is being merged then also remove it from
// free_end_offset_to_size_iter
free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
}
// Merge when the freed block exists at the end of another free block
auto start_it = free_end_offset_to_size_iter.find(freed_offset);
if (start_it != free_end_offset_to_size_iter.end()) {
auto merge_block_iter = start_it->second;
auto merge_block_size = merge_block_iter->first;
freed_size += merge_block_size;
freed_offset -= merge_block_size;
free_size_to_offset.erase(merge_block_iter);
free_end_offset_to_size_iter.erase(start_it);
// If the block is being merged then also remove it from
// free_start_offset_to_size_iter
free_start_offset_to_size_iter.erase(freed_offset);
}
auto freed_block_it =
free_size_to_offset.emplace(freed_size, freed_offset).first;
free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
free_end_offset_to_size_iter.emplace(
freed_offset + freed_size, freed_block_it);
}
}
TORCH_CHECK(
validate_allocation_plan(mem_events, allocation_offsets),
"ProfilingAllocator: Allocation plan invalid.");
return allocation_offsets;
}
} // namespace
void AllocationPlan::clear() {
allocation_sizes.clear();
allocation_lifetimes.clear();
allocation_offsets.clear();
}
void AllocationPlanner::record_allocation(
const uint64_t size,
const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_allocation(size, ptr);
return;
}
allocation_plan_->allocation_sizes.push_back(size);
allocation_plan_->allocation_lifetimes.push_back(
std::numeric_limits<uint64_t>::max());
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
}
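// Note: allocation_id_ counts the allocations recorded so far and doubles as
// a logical timestamp. record_free stores the current counter as the freed
// allocation's lifetime, i.e. the allocation is treated as live until that
// many allocations have been observed.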
void AllocationPlanner::record_free(const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_free(ptr);
return;
}
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Free being recorded was allocated outside of WithProfileAllocationGuard
return;
}
auto id = it->second;
TORCH_CHECK(
id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during record_allocation.");
allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}
bool AllocationPlanner::validate_allocation(
const uint64_t size,
const void* ptr) {
if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
TORCH_WARN(
"Allocation request does not match plan:",
"Allocation id:",
allocation_id_,
" is out of range. Number of recorded allocations:",
allocation_plan_->allocation_sizes.size());
return false;
}
if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
TORCH_WARN(
"Allocation request does not match plan:",
"Allocation id:",
allocation_id_,
", Recorded size of the requested allocation:",
allocation_plan_->allocation_sizes[allocation_id_],
", but got:",
size);
return false;
}
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
return true;
}
bool AllocationPlanner::validate_free(const void* ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Allocation that was made outside the validation scope is being freed here
return true;
}
auto id = (*it).second;
TORCH_CHECK(
id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during validate_allocation.");
auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
return (lifetime_id == allocation_id_);
}
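// Computes offsets for all managed allocations via the greedy planner and
// records total_size as the furthest end offset among them.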
void AllocationPlanner::formulate_plan() {
allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
allocation_plan_->allocation_sizes,
allocation_plan_->allocation_lifetimes);
allocation_plan_->total_size = 0;
for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
if (allocation_plan_->allocation_lifetimes[i] ==
std::numeric_limits<uint64_t>::max()) {
continue;
}
auto limit = allocation_plan_->allocation_offsets[i] +
allocation_plan_->allocation_sizes[i];
allocation_plan_->total_size =
std::max(allocation_plan_->total_size, limit);
}
}
void AllocationPlanner::clear() {
allocation_plan_->clear();
allocation_ptr_to_id_.clear();
}
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
plan_ = plan;
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
if (current_size_ < plan->total_size) {
// Free existing memory and reallocate for larger size.
c10::free_cpu(blob_);
blob_ = c10::alloc_cpu(plan->total_size);
current_size_ = plan->total_size;
}
}
void CPUProfilingAllocator::unset_plan() {
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
plan_ = nullptr;
}
void* CPUProfilingAllocator::allocate(const size_t bytes) {
TORCH_CHECK(
allocation_id_ < plan_->allocation_sizes.size() &&
bytes == plan_->allocation_sizes[allocation_id_],
"Got allocation request that does not match with the plan.");
if (plan_->allocation_lifetimes[allocation_id_] ==
std::numeric_limits<uint64_t>::max()) {
// This allocation is not managed by ProfilingAllocator.
allocation_id_++;
return c10::alloc_cpu(bytes);
}
void* ptr = reinterpret_cast<uint8_t*>(blob_) +
plan_->allocation_offsets[allocation_id_];
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
return ptr;
}
void CPUProfilingAllocator::free(void* const ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Either
// 1. Allocation that was made outside the validation scope is being freed
// here or
// 2. Allocation that is not managed by profiling allocator is being freed.
// Example of the second type:
// Tensor out;
// for (....) {
//   {
//     WithProfilingAllocatorGuard guard(&allocator, &plan);
//     out = ...some op (this also frees the previous memory held by out)
//   }
//   out is used..
// }
c10::free_cpu(ptr);
return;
}
auto id = it->second;
TORCH_CHECK(
id < plan_->allocation_lifetimes.size(),
"Freeing allocation that is not accordingly to the plan.");
auto lifetime_id = plan_->allocation_lifetimes[id];
TORCH_CHECK(
lifetime_id == allocation_id_,
"Lifetime of allocations do not match: allocation_id ",
id,
", expected:",
lifetime_id,
", got:",
allocation_id_);
}
CPUProfilingAllocator::~CPUProfilingAllocator() {
c10::free_cpu(blob_);
}
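// A minimal usage sketch of the guards defined below (variable names are
// hypothetical; the sketch only illustrates the intended profile ->
// validate -> reuse flow):
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     // run the workload once to record allocation sizes and lifetimes
//   }
//   bool plan_valid = false;
//   {
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_valid);
//     // run the workload again to check that allocations match the plan
//   }
//   CPUProfilingAllocator allocator;
//   {
//     WithProfilingAllocatorGuard alloc_guard(&allocator, &plan);
//     // subsequent runs can be served out of the preallocated blob
//   }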
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(
allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan);
planner_->clear();
allocation_planner = planner_.get();
}
WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
planner_->formulate_plan();
allocation_planner = nullptr;
}
WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
AllocationPlan* plan,
bool* success) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(
allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan, true);
success_ = success;
allocation_planner = planner_.get();
}
WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
*success_ = planner_->validation_success;
allocation_planner = nullptr;
}
AllocationPlanner* GetThreadLocalAllocationPlanner() {
return allocation_planner;
}
WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
CPUProfilingAllocator* allocator,
const AllocationPlan* plan) {
// Nesting of profiling allocator is not supported.
TORCH_CHECK(
profiling_allocator == nullptr,
"Nesting profiling allocators is not supported.");
profiling_allocator = allocator;
profiling_allocator->set_plan(plan);
}
WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
profiling_allocator->unset_plan();
profiling_allocator = nullptr;
}
CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
return profiling_allocator;
}
} // namespace c10