/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include <atomic>
#include "absl/strings/string_view.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/file_system.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stacktrace.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
namespace tensorflow {
constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;
constexpr uint64 BFCAllocator::kMemDebugHistorySize;
BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
bool allow_growth, const string& name,
bool garbage_collection)
: garbage_collection_(garbage_collection),
coalesce_regions_(sub_allocator->SupportsCoalescing()),
sub_allocator_(sub_allocator),
name_(name),
free_chunks_list_(kInvalidChunkHandle),
next_allocation_id_(1),
action_counter_(0) {
if (allow_growth) {
// 2MiB smallest initial allocation, unless total memory available
// is less.
curr_region_allocation_bytes_ =
RoundedBytes(std::min(total_memory, size_t{2 << 20}));
} else {
curr_region_allocation_bytes_ = RoundedBytes(total_memory);
}
// Allocate the requested amount of memory.
memory_limit_ = total_memory;
stats_.bytes_limit = static_cast<int64>(total_memory);
  // Create a bunch of bins of various good sizes.
  // The bins cover every allocation size from 256 bytes up to (and
  // including) the memory limit, so any request that fits within
  // memory_limit_ maps to some bin.
for (BinNum b = 0; b < kNumBins; b++) {
size_t bin_size = BinNumToSize(b);
VLOG(1) << "Creating bin of max chunk size "
<< strings::HumanReadableNumBytes(bin_size);
new (BinFromIndex(b)) Bin(this, bin_size);
CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
if (b + 1 < kNumBins) {
CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
}
}
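  // Illustrative example (assuming BinNumToSize(b) == 256 << b, as defined in
  // bfc_allocator.h): bin 0 collects free chunks of size [256B, 512B), bin 1
  // collects [512B, 1KiB), and so on, with the highest bin also collecting
  // everything at or above its bin_size. A 300-byte chunk therefore lands in
  // bin 0 and a 700-byte chunk in bin 1, which is exactly what the CHECKs
  // above verify at each doubling boundary.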
}
BFCAllocator::~BFCAllocator() {
// Return memory back.
VLOG(2) << "Number of regions allocated: "
<< region_manager_.regions().size();
for (const auto& region : region_manager_.regions()) {
sub_allocator_->Free(region.ptr(), region.memory_size());
}
for (BinNum b = 0; b < kNumBins; b++) {
BinFromIndex(b)->~Bin();
}
}
BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
DCHECK_GE(h, 0);
DCHECK_LT(h, static_cast<int>(chunks_.size()));
return &(chunks_[h]);
}
const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
DCHECK_GE(h, 0);
DCHECK_LT(h, static_cast<int>(chunks_.size()));
return &(chunks_[h]);
}
bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
// Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;
// Do we have enough space to handle the client's request?
// If not, fail immediately.
if (rounded_bytes > available_bytes) {
return false;
}
  // If curr_region_allocation_bytes_ is not enough to satisfy the
  // allocation, keep doubling it until it is sufficient.
bool increased_allocation = false;
while (rounded_bytes > curr_region_allocation_bytes_) {
curr_region_allocation_bytes_ *= 2;
increased_allocation = true;
}
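  // Example: with curr_region_allocation_bytes_ == 2MiB and rounded_bytes ==
  // 5MiB, the loop above doubles the target region size to 4MiB and then
  // 8MiB before the allocation attempt below.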
// Try allocating.
size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
size_t bytes_received;
void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
if (mem_addr == nullptr && !started_backpedal_) {
// Only backpedal once.
started_backpedal_ = true;
static constexpr float kBackpedalFactor = 0.9;
// Try allocating less memory.
while (mem_addr == nullptr) {
bytes = RoundedBytes(bytes * kBackpedalFactor);
if (bytes < rounded_bytes) break;
mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
}
}
if (mem_addr == nullptr) {
return false;
}
if (!increased_allocation) {
// Increase the region size of the next required allocation.
curr_region_allocation_bytes_ *= 2;
}
VLOG(1) << "Extending allocation by "
<< strings::HumanReadableNumBytes(bytes_received) << " bytes.";
total_region_allocated_bytes_ += bytes_received;
VLOG(1) << "Total allocated bytes: "
<< strings::HumanReadableNumBytes(total_region_allocated_bytes_);
VLOG(1) << "Allocated memory at " << mem_addr << " to "
<< static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
AllocationRegion* maybe_extended_region = nullptr;
if (coalesce_regions_) {
maybe_extended_region =
region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
} else {
region_manager_.AddAllocationRegion(mem_addr, bytes_received);
}
// Create one large chunk for the whole memory space that will
// be chunked later.
ChunkHandle h = AllocateChunk();
BFCAllocator::Chunk* c = ChunkFromHandle(h);
c->ptr = mem_addr;
c->size = bytes_received;
c->allocation_id = -1;
c->prev = kInvalidChunkHandle;
c->next = kInvalidChunkHandle;
c->freed_at_count = 0;
region_manager_.set_handle(c->ptr, h);
// If the region was extended, then there exists a previous chunk that should
// be linked to the new chunk.
if (maybe_extended_region != nullptr) {
ChunkHandle prev =
maybe_extended_region->get_handle(maybe_extended_region->ptr());
BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
// Find the last recorded chunk in the extended region.
while (prev_chunk->next != kInvalidChunkHandle) {
prev = prev_chunk->next;
prev_chunk = ChunkFromHandle(prev);
}
c->prev = prev;
prev_chunk->next = h;
}
// Maybe merge adjacent chunks and insert the chunk into the right bin.
InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
return true;
}
BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
if (free_chunks_list_ != kInvalidChunkHandle) {
ChunkHandle h = free_chunks_list_;
Chunk* c = ChunkFromHandle(h);
free_chunks_list_ = c->next;
return h;
} else {
ChunkHandle h = chunks_.size();
chunks_.resize(h + 1);
return h;
}
}
void BFCAllocator::DeallocateChunk(ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
c->allocation_id = -1;
c->bin_num = kInvalidBinNum;
c->next = free_chunks_list_;
free_chunks_list_ = h;
}
void* BFCAllocator::AllocateRawInternalWithRetry(
size_t unused_alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) {
// Fast path: Try once to allocate without getting the retry_helper_ involved
uint64 freed_by_count = 0;
if (allocation_attr.freed_by_func != nullptr) {
freed_by_count = (*allocation_attr.freed_by_func)();
}
void* r =
AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
if (r != nullptr) {
return r;
} else {
static const int64 kMaxMillisToWait = 10000; // 10 seconds
r = retry_helper_.AllocateRaw(
[this, &allocation_attr](size_t a, size_t nb, bool v) {
uint64 freed_by_count = 0;
if (allocation_attr.freed_by_func != nullptr) {
freed_by_count = (*allocation_attr.freed_by_func)();
}
return AllocateRawInternal(a, nb, v, freed_by_count);
},
kMaxMillisToWait, unused_alignment, num_bytes);
return r;
}
}
void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
const AllocationAttributes& allocation_attr) {
VLOG(1) << "AllocateRaw " << Name() << " " << num_bytes;
if (!allocation_attr.retry_on_failure) {
// Return immediately upon the first failure if this is for allocating an
// optional scratch space.
bool dump_log_on_failure = VLOG_IS_ON(2);
uint64 freed_by_count = 0;
if (allocation_attr.freed_by_func != nullptr) {
freed_by_count = (*allocation_attr.freed_by_func)();
}
void* result = AllocateRawInternal(unused_alignment, num_bytes,
dump_log_on_failure, freed_by_count);
if (result == nullptr) {
static std::atomic<int32> log_counter{0};
int32 counter_value = log_counter.load(std::memory_order_relaxed);
if (counter_value < 10) {
log_counter.store(counter_value + 1, std::memory_order_relaxed);
LOG(WARNING)
<< "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " with freed_by_count=" << freed_by_count
<< ". The caller indicates that this is not a failure, but"
<< " may mean that there could be performance gains if more"
<< " memory were available.";
}
}
return result;
} else {
return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
allocation_attr);
}
}
// static
size_t BFCAllocator::RoundedBytes(size_t bytes) {
size_t rounded_bytes =
(kMinAllocationSize *
((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
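  // Illustrative example (assuming kMinAllocationSize == 256, as defined in
  // bfc_allocator.h): RoundedBytes(1) == 256, RoundedBytes(256) == 256, and
  // RoundedBytes(257) == 512.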
DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
return rounded_bytes;
}
bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
// Do nothing if garbage collection is off.
if (!garbage_collection_) {
return false;
}
// Searching for free regions.
absl::flat_hash_set<void*> free_region_ptrs;
size_t total_free_bytes = 0;
for (const AllocationRegion& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
bool any_use = false;
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
any_use = true;
break;
}
h = c->next;
}
if (!any_use) {
VLOG(2) << "Found free region with ptr = " << region.ptr();
free_region_ptrs.insert(region.ptr());
total_free_bytes += region.memory_size();
}
}
if (total_free_bytes == 0) {
return false;
}
// Rough estimation to check whether deallocation can help.
size_t available_bytes =
memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
if (rounded_bytes > available_bytes) {
return false;
}
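  // Illustrative example: with memory_limit_ == 8GiB and
  // total_region_allocated_bytes_ == 7GiB, of which a single fully-free
  // region holds 2GiB, a request rounded to 2.5GiB sees
  // available_bytes == 8 - 7 + 2 == 3GiB, so the free regions are released
  // below and a subsequent Extend() can ask the sub-allocator for a fresh
  // region large enough for the request.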
LOG(WARNING) << "Garbage collection: deallocate free memory regions"
<< " (i.e., allocations) so that we can re-allocate a larger"
<< " region to avoid OOM due to memory fragmentation. If you"
<< " see this message frequently, you are running near the"
<< " threshold of the available device memory and re-allocation"
<< " may incur great performance overhead. You may try smaller"
<< " batch sizes to observe the performance impact."
<< " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
<< " disable this feature.";
// Deallocate free regions.
DeallocateRegions(free_region_ptrs);
return true;
}
void BFCAllocator::DeallocateRegions(
const absl::flat_hash_set<void*>& region_ptrs)
TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
// Explicitly remove the const qualifier as some compilers disallow passing
// const_iterator to std::vector::erase(), which is used in
// RemoveAllocationRegion().
auto regions =
const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
auto it = regions->begin();
while (it != regions->end()) {
if (!region_ptrs.contains(it->ptr())) {
++it;
continue;
}
VLOG(2) << "Deallocate region with ptr = " << it->ptr();
// Remove all chunk registrations from Bins.
ChunkHandle h = region_manager_.get_handle(it->ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (c->bin_num != kInvalidBinNum) {
RemoveFreeChunkFromBin(h);
}
auto h_to_delete = h;
h = c->next;
DeleteChunk(h_to_delete);
}
// Deallocate the memory.
sub_allocator_->Free(it->ptr(), it->memory_size());
total_region_allocated_bytes_ -= it->memory_size();
it = region_manager_.RemoveAllocationRegion(it);
}
}
void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
size_t num_bytes,
bool dump_log_on_failure,
uint64 freed_before) {
if (num_bytes == 0) {
VLOG(2) << "tried to allocate 0 bytes";
return nullptr;
}
  // First, always allocate memory of at least kMinAllocationSize
  // bytes, and always allocate multiples of kMinAllocationSize bytes,
  // so that all chunk addresses stay aligned to kMinAllocationSize.
size_t rounded_bytes = RoundedBytes(num_bytes);
// The BFC allocator tries to find the best fit first.
BinNum bin_num = BinNumForSize(rounded_bytes);
mutex_lock l(lock_);
if (!timestamped_chunks_.empty()) {
// Merge timestamped chunks whose counts have become safe for general use.
MergeTimestampedChunks(0);
}
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
if (ptr != nullptr) {
AddTraceMe("MemoryAllocation", ptr);
return ptr;
}
// Try to extend
if (Extend(unused_alignment, rounded_bytes)) {
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
if (ptr != nullptr) {
AddTraceMe("MemoryAllocation", ptr);
return ptr;
}
}
if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
// We're unable to satisfy an allocation request without a specific
// timestamp requirement. Rather than fail, try merging any held-out
// timestamped chunks more aggressively until a free chunk of the necessary
// size is formed.
if (MergeTimestampedChunks(rounded_bytes)) {
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
if (ptr != nullptr) {
AddTraceMe("MemoryAllocation", ptr);
return ptr;
}
}
}
// Reaching this point means that no chunks can satisfy the request. Also,
// the unallocated bytes cannot satisfy the request. Before giving up, let's
  // try deallocating free regions so that the suballocator can combine them
  // with the unallocated bytes and form a larger region.
if (DeallocateFreeRegions(rounded_bytes) &&
Extend(unused_alignment, rounded_bytes)) {
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
if (ptr != nullptr) {
AddTraceMe("MemoryAllocation", ptr);
return ptr;
}
}
  // We searched all bins for an existing free chunk to use and
  // couldn't find one. This means we must have run out of memory.
  // Dump the memory log for analysis.
MaybeWriteMemoryMap();
if (dump_log_on_failure) {
LOG(WARNING)
<< "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " (rounded to " << rounded_bytes << ")"
<< "requested by op "
<< ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
<< "\nIf the cause is memory fragmentation maybe the environment "
<< "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
<< "improve the situation. \nCurrent allocation summary follows."
<< "\nCurrent allocation summary follows.";
DumpMemoryLog(rounded_bytes);
LOG(WARNING) << RenderOccupancy();
}
return nullptr;
}
int64 BFCAllocator::LargestFreeChunk() {
for (int i = kNumBins - 1; i >= 0; i--) {
if (!BinFromIndex(i)->free_chunks.empty()) {
return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
}
}
return 0;
}
double BFCAllocator::GetFragmentation() {
int64 bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
DCHECK_GT(bytes_available, 0);
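  // Illustrative example: with 1GiB of regions allocated, 200MiB in use and
  // a largest free chunk of 600MiB, bytes_available == 824MiB and the
  // returned fragmentation is (824 - 600) / 824, roughly 0.27.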
return static_cast<double>(bytes_available - LargestFreeChunk()) /
bytes_available;
}
void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
}
void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
const void* chunk_ptr, int64 req_bytes,
int64 alloc_bytes) {
tensorflow::profiler::TraceMe::InstantActivity(
[this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
TF_NO_THREAD_SAFETY_ANALYSIS {
int64 bytes_available =
memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
const auto& annotation =
ScopedMemoryDebugAnnotation::CurrentAnnotation();
std::string tensor_shape;
if (annotation.pending_shape) {
tensor_shape = annotation.pending_shape->DebugString();
}
return tensorflow::profiler::TraceMeEncode(
traceme_name, {{"allocator_name", name_},
{"bytes_reserved", stats_.bytes_reserved},
{"bytes_allocated", stats_.bytes_in_use},
{"bytes_available", bytes_available},
{"fragmentation", GetFragmentation()},
{"peak_bytes_in_use", stats_.peak_bytes_in_use},
{"requested_bytes", req_bytes},
{"allocation_bytes", alloc_bytes},
{"addr", reinterpret_cast<uint64>(chunk_ptr)},
{"tf_op", annotation.pending_op_name},
{"id", annotation.pending_step_id},
{"region_type", annotation.pending_region_type},
{"data_type", annotation.pending_data_type},
{"shape", tensor_shape}});
},
/*level=*/profiler::TraceMeLevel::kInfo);
}
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
size_t num_bytes, uint64 freed_before) {
// First identify the first bin that could satisfy rounded_bytes.
for (; bin_num < kNumBins; bin_num++) {
// Start searching from the first bin for the smallest chunk that fits
// rounded_bytes.
Bin* b = BinFromIndex(bin_num);
for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
++citer) {
const BFCAllocator::ChunkHandle h = (*citer);
BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
DCHECK(!chunk->in_use());
if (freed_before > 0 && freed_before < chunk->freed_at_count) {
continue;
}
if (chunk->size >= rounded_bytes) {
        // We found an existing free chunk that is large enough, so remove it
        // from the free bin structure prior to using it.
RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
// If we can break the size of the chunk into two reasonably large
// pieces, do so. In any case don't waste more than
// kMaxInternalFragmentation bytes on padding this alloc.
        const int64 kMaxInternalFragmentation = 128 << 20;  // 128 MiB
if (chunk->size >= rounded_bytes * 2 ||
static_cast<int64>(chunk->size) - rounded_bytes >=
kMaxInternalFragmentation) {
SplitChunk(h, rounded_bytes);
chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved
}
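        // Illustrative examples of the split decision above: a 3MiB chunk
        // serving a 1MiB request is split (3MiB >= 2 * 1MiB); a 300MiB chunk
        // serving a 200MiB request is not split (300MiB < 400MiB and the
        // 100MiB of padding is below kMaxInternalFragmentation); a 350MiB
        // chunk serving the same 200MiB request is split, because 150MiB of
        // padding would exceed kMaxInternalFragmentation.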
        // The requested size of the returned chunk is what the user asked
        // for; the chunk itself may be larger due to rounding and the split
        // policy above.
chunk->requested_size = num_bytes;
// Assign a unique id and increment the id counter, marking the
// chunk as being in use.
chunk->allocation_id = next_allocation_id_++;
// Update stats.
++stats_.num_allocs;
stats_.bytes_in_use += chunk->size;
stats_.peak_bytes_in_use =
std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
stats_.largest_alloc_size =
std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
if (ShouldRecordOpName()) {
const auto& annotation =
ScopedMemoryDebugAnnotation::CurrentAnnotation();
chunk->op_name = annotation.pending_op_name;
if (!annotation.pending_op_name) {
VLOG(2) << "missing pending_op_name for " << Name()
<< " reading addr "
<< static_cast<const void*>(&annotation.pending_op_name)
<< "\n"
<< CurrentStackTrace();
}
chunk->step_id = annotation.pending_step_id;
chunk->action_count = ++action_counter_;
uint64 slot = chunk->action_count % kMemDebugHistorySize;
size_history_[slot] = stats_.bytes_in_use;
}
VLOG(4) << "Returning: " << chunk->ptr;
if (VLOG_IS_ON(4)) {
LOG(INFO) << "A: " << RenderOccupancy();
}
return chunk->ptr;
}
}
}
return nullptr;
}
void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
  // Allocate the new chunk before we do any ChunkFromHandle: AllocateChunk
  // may grow chunks_ and invalidate previously obtained Chunk pointers.
ChunkHandle h_new_chunk = AllocateChunk();
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
// Create a new chunk starting num_bytes after c
BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
region_manager_.set_handle(new_chunk->ptr, h_new_chunk);
// Set the new sizes of the chunks.
new_chunk->size = c->size - num_bytes;
c->size = num_bytes;
// The new chunk is not in use.
new_chunk->allocation_id = -1;
// It inherits the freed time.
new_chunk->freed_at_count = c->freed_at_count;
// Maintain the pointers.
// c <-> c_neighbor becomes
// c <-> new_chunk <-> c_neighbor
BFCAllocator::ChunkHandle h_neighbor = c->next;
new_chunk->prev = h;
new_chunk->next = h_neighbor;
c->next = h_new_chunk;
if (h_neighbor != kInvalidChunkHandle) {
Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
c_neighbor->prev = h_new_chunk;
}
// Add the newly free chunk to the free bin.
InsertFreeChunkIntoBin(h_new_chunk);
}
void BFCAllocator::DeallocateRaw(void* ptr) {
VLOG(1) << "DeallocateRaw " << Name() << " "
<< (ptr ? RequestedSize(ptr) : 0);
DeallocateRawInternal(ptr);
retry_helper_.NotifyDealloc();
}
void BFCAllocator::DeallocateRawInternal(void* ptr) {
if (ptr == nullptr) {
VLOG(2) << "tried to deallocate nullptr";
return;
}
mutex_lock l(lock_);
// Find the chunk from the ptr.
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle);
// Record chunk information before it's freed.
Chunk* chunk = ChunkFromHandle(h);
void* chunk_ptr = chunk->ptr;
int64 req_bytes = chunk->requested_size;
int64 alloc_bytes = chunk->size;
MarkFree(h);
// Consider coalescing it.
if (timing_counter_) {
InsertFreeChunkIntoBin(h);
timestamped_chunks_.push_back(h);
} else {
InsertFreeChunkIntoBin(TryToCoalesce(h, false));
}
// TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
// correct aggregation stats (bytes_in_use, fragmentation).
AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
if (VLOG_IS_ON(4)) {
LOG(INFO) << "F: " << RenderOccupancy();
}
}
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
BFCAllocator::ChunkHandle h2) {
Chunk* c1 = ChunkFromHandle(h1);
Chunk* c2 = ChunkFromHandle(h2);
// We can only merge chunks that are not in use.
CHECK(!c1->in_use() && !c2->in_use());
// c1's prev doesn't change, still points to the same ptr, and is
// still not in use.
// Fix up neighbor pointers
//
// c1 <-> c2 <-> c3 should become
// c1 <-> c3
BFCAllocator::ChunkHandle h3 = c2->next;
c1->next = h3;
CHECK(c2->prev == h1);
if (h3 != kInvalidChunkHandle) {
BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
c3->prev = h1;
}
// Set the new size
c1->size += c2->size;
// Pick latest free time.
c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);
DeleteChunk(h2);
}
void BFCAllocator::DeleteChunk(ChunkHandle h) {
// Delete h and cleanup all state
Chunk* c = ChunkFromHandle(h);
// VLOG(4) << "Removing: " << c->ptr;
region_manager_.erase(c->ptr);
DeallocateChunk(h);
}
void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
BinNum bin_num = BinNumForSize(c->size);
Bin* new_bin = BinFromIndex(bin_num);
c->bin_num = bin_num;
new_bin->free_chunks.insert(h);
}
void BFCAllocator::RemoveFreeChunkIterFromBin(
BFCAllocator::Bin::FreeChunkSet* free_chunks,
const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
ChunkHandle h = *citer;
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
free_chunks->erase(citer);
c->bin_num = kInvalidBinNum;
}
void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
<< "Could not find chunk in bin";
c->bin_num = kInvalidBinNum;
}
void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
Chunk* c = ChunkFromHandle(h);
CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
// Mark the chunk as no longer in use.
c->allocation_id = -1;
// Optionally record the free time.
if (timing_counter_) {
c->freed_at_count = timing_counter_->next();
}
// Updates the stats.
stats_.bytes_in_use -= c->size;
if (ShouldRecordOpName()) {
c->action_count = ++action_counter_;
uint64 slot = c->action_count % kMemDebugHistorySize;
size_history_[slot] = stats_.bytes_in_use;
}
}
BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
bool ignore_freed_at) {
Chunk* c = ChunkFromHandle(h);
if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
ChunkHandle coalesced_chunk = h;
// If the next chunk is free, merge it into c and delete it.
if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
Chunk* n = ChunkFromHandle(c->next);
if ((n->freed_at_count == 0) || ignore_freed_at) {
VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
RemoveFreeChunkFromBin(c->next);
Merge(h, c->next);
}
}
// If the previous chunk is free, merge c into it and delete c.
if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
Chunk* n = ChunkFromHandle(c->prev);
if ((n->freed_at_count == 0) || ignore_freed_at) {
VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
coalesced_chunk = c->prev;
RemoveFreeChunkFromBin(c->prev);
Merge(c->prev, h);
}
}
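  // Illustrative example: if both neighbors of h are free with
  // freed_at_count == 0, the next chunk is first merged into h, then h is
  // merged into its predecessor, and the returned handle refers to a single
  // chunk spanning all three original chunks.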
return coalesced_chunk;
}
void BFCAllocator::SetSafeFrontier(uint64 count) {
uint64 current = safe_frontier_.load(std::memory_order_relaxed);
while (count > current) {
if (safe_frontier_.compare_exchange_strong(current, count)) {
retry_helper_.NotifyDealloc();
return;
} else {
current = safe_frontier_.load(std::memory_order_relaxed);
}
}
}
bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
<< " required_bytes=" << required_bytes;
bool satisfied = (required_bytes == 0);
std::vector<void*> to_merge;
std::deque<ChunkHandle> new_ts_queue;
while (!timestamped_chunks_.empty()) {
ChunkHandle h = timestamped_chunks_.front();
timestamped_chunks_.pop_front();
DCHECK_NE(h, kInvalidChunkHandle);
Chunk* c = ChunkFromHandle(h);
// It's possible this chunk has already been merged so refetch and retest
// the handle.
h = region_manager_.get_handle(c->ptr);
if (h == kInvalidChunkHandle) {
continue;
}
if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
// This chunk has already been reallocated.
continue;
}
if (c->freed_at_count == 0) {
to_merge.push_back(c->ptr);
continue;
}
// Chunk should be free and assigned to a bin.
DCHECK_NE(c->bin_num, kInvalidBinNum);
if (c->freed_at_count < safe_frontier_) {
c->freed_at_count = 0;
to_merge.push_back(c->ptr);
} else if (required_bytes > 0) {
to_merge.push_back(c->ptr);
} else {
new_ts_queue.push_back(h);
}
}
DCHECK(timestamped_chunks_.empty());
std::swap(timestamped_chunks_, new_ts_queue);
// At this point all candidate chunks have been moved from timestamped_chunks_
// to to_merge. If this is a standard merge (required_bytes == 0) then
// merge them all, otherwise merge just until a Chunk of the required size
// is produced.
for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
void* ptr = to_merge[ci];
// It's possible that the Chunk associated with this memory location got
// merged and deallocated in a prior iteration so refetch the handle and
// retest.
ChunkHandle h = region_manager_.get_handle(ptr);
if (h == kInvalidChunkHandle) continue;
if (required_bytes == 0 || !satisfied) {
Chunk* c = ChunkFromHandle(h);
DCHECK_NE(c->bin_num, kInvalidBinNum);
DCHECK(!c->in_use());
RemoveFreeChunkFromBin(h);
ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
InsertFreeChunkIntoBin(new_h);
if (required_bytes > 0) {
c = ChunkFromHandle(new_h);
if (new_h != h && c->freed_at_count > 0) {
timestamped_chunks_.push_back(new_h);
}
if (c->size >= required_bytes) {
satisfied = true;
}
}
} else {
// We were force merging Chunks with unsafe timestamps, but managed
// to create a satisfying Chunk so just requeue the rest.
timestamped_chunks_.push_back(h);
}
}
return satisfied;
}
bool BFCAllocator::TracksAllocationSizes() const { return true; }
size_t BFCAllocator::RequestedSize(const void* ptr) const {
CHECK(ptr);
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for requested size of pointer we never allocated: " << ptr;
const BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->requested_size;
}
size_t BFCAllocator::AllocatedSize(const void* ptr) const {
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocated size of pointer we never allocated: " << ptr;
const BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->size;
}
int64 BFCAllocator::AllocationId(const void* ptr) const {
mutex_lock l(lock_);
BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
CHECK(h != kInvalidChunkHandle)
<< "Asked for allocation id of pointer we never allocated: " << ptr;
const BFCAllocator::Chunk* c = ChunkFromHandle(h);
return c->allocation_id;
}
namespace {
void RenderRegion(char* rendered, const size_t resolution,
const size_t total_render_size, const size_t offset,
const void* base_ptr, const void* ptr, const size_t size,
const char c) {
const char* base_ptr_c = static_cast<const char*>(base_ptr);
const char* ptr_c = static_cast<const char*>(ptr);
size_t start_location =
((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
CHECK_GE(start_location, 0);
CHECK_LT(start_location, resolution);
size_t end_location =
((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
total_render_size;
CHECK_GE(end_location, 0);
CHECK_LT(end_location, resolution);
for (size_t i = start_location; i <= end_location; ++i) {
rendered[i] = c;
}
}
} // namespace
string BFCAllocator::RenderOccupancy() {
// Make a buffer for the ASCII-art representation.
const size_t resolution = 100;
char rendered[resolution];
// Compute the total region size to render over
size_t total_region_size = 0;
for (const auto& region : region_manager_.regions()) {
total_region_size += region.memory_size();
}
if (total_region_size == 0) {
return "<allocator contains no memory>";
}
// Start out with everything empty
RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
total_region_size, '_');
size_t region_offset = 0;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
// Then render each chunk left to right.
while (h != kInvalidChunkHandle) {
Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
// Render the wasted space
size_t wasted = c->size - c->requested_size;
if (wasted > 0) {
RenderRegion(rendered, resolution, total_region_size,
region_offset + c->requested_size, region.ptr(), c->ptr,
wasted, 'x');
}
// Then the occupied space
RenderRegion(rendered, resolution, total_region_size, region_offset,
region.ptr(), c->ptr, c->requested_size, '*');
}
h = c->next;
}
region_offset += region.memory_size();
}
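  // Illustrative output: each of the 100 characters covers
  // total_region_size / 100 bytes, where '*' marks the client-requested part
  // of an in-use chunk, 'x' marks padding within an in-use chunk, and '_'
  // marks free space, e.g. "****x__**x_____...".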
return string(rendered, resolution);
}
void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
LOG(INFO) << "BFCAllocator dump for " << Name();
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
Bin* b = BinFromIndex(bin_num);
const BinDebugInfo& bin_info = bin_infos[bin_num];
CHECK_EQ(b->free_chunks.size(),
bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
LOG(INFO) << "Bin (" << b->bin_size
<< "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
<< ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
<< strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
<< " allocated for chunks. "
<< strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
<< " in use in bin. "
<< strings::HumanReadableNumBytes(
bin_info.total_requested_bytes_in_use)
<< " client-requested in use in bin.";
}
// Find the bin that we would have liked to allocate in, so we
// can get some further analysis about fragmentation.
Bin* b = BinForSize(num_bytes);
LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
<< " was " << strings::HumanReadableNumBytes(b->bin_size)
<< ", Chunk State: ";
for (ChunkHandle h : b->free_chunks) {
Chunk* c = ChunkFromHandle(h);
LOG(INFO) << c->DebugString(this, true);
}
// Next show the chunks that are in use, and also summarize their
// number by size.
std::map<size_t, int> in_use_by_size;
for (const auto& region : region_manager_.regions()) {
LOG(INFO) << "Next region of size " << region.memory_size();
ChunkHandle h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
if (c->in_use()) {
in_use_by_size[c->size]++;
}
string buf = strings::StrCat(
(c->in_use() ? "InUse" : "Free "), " at ",
strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
if (ShouldRecordOpName()) {
strings::StrAppend(&buf, " by op ", c->GetDebugOpName(),
" action_count ", c->action_count, " step ",
c->step_id);
}
strings::StrAppend(&buf, " next ", c->next);
if (timing_counter_) {
strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
}
LOG(INFO) << buf;
h = c->next;
}
}
LOG(INFO) << " Summary of in-use Chunks by size: ";
size_t total_bytes = 0;
for (auto& it : in_use_by_size) {
LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
<< strings::HumanReadableNumBytes(it.first * it.second);
total_bytes += (it.first * it.second);
}
LOG(INFO) << "Sum Total of in-use chunks: "
<< strings::HumanReadableNumBytes(total_bytes);
LOG(INFO) << "total_region_allocated_bytes_: "
<< total_region_allocated_bytes_
<< " memory_limit_: " << memory_limit_ << " available bytes: "
<< (memory_limit_ - total_region_allocated_bytes_)
<< " curr_region_allocation_bytes_: "
<< curr_region_allocation_bytes_;
LOG(INFO) << "Stats: \n" << stats_.DebugString();
}
void BFCAllocator::MaybeWriteMemoryMap() {
const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
if (gpu_memory_map_file != nullptr) {
std::unique_ptr<WritableFile> dump_file;
string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
Env::Default()->NowMicros());
Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
if (!status.ok()) {
LOG(ERROR) << "Failed to open file " << file_name;
return;
}
MemoryDump md = RecordMemoryMapInternal();
status = dump_file->Append(md.SerializeAsString());
if (!status.ok()) {
LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
<< status;
}
}
}
MemoryDump BFCAllocator::RecordMemoryMap() {
mutex_lock l(lock_);
return RecordMemoryMapInternal();
}
MemoryDump BFCAllocator::RecordMemoryMapInternal() {
MemoryDump md;
md.set_allocator_name(Name());
// Record the general stats
MemAllocatorStats* mas = md.mutable_stats();
mas->set_num_allocs(stats_.num_allocs);
mas->set_bytes_in_use(stats_.bytes_in_use);
mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
mas->set_largest_alloc_size(stats_.largest_alloc_size);
// Record summary data for every bin.
const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
Bin* b = BinFromIndex(bin_num);
const BinDebugInfo& bin_info = bin_infos[bin_num];
DCHECK_EQ(b->free_chunks.size(),
bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
BinSummary* bs = md.add_bin_summary();
bs->set_bin(bin_num);
bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
}
// Record state of every defined Chunk.
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
MemChunk* mc = md.add_chunk();
mc->set_in_use(c->in_use());
mc->set_address(reinterpret_cast<uint64>(c->ptr));
mc->set_size(c->size);
mc->set_requested_size(c->requested_size);
mc->set_bin(c->bin_num);
mc->set_op_name(c->GetDebugOpName());
mc->set_step_id(c->step_id);
mc->set_action_count(c->action_count);
if (timing_counter_) {
mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
}
h = c->next;
}
}
mas->set_fragmentation_metric(GetFragmentation());
// Record the recent size history
uint64 history_len = std::min(action_counter_, kMemDebugHistorySize);
for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) {
SnapShot* ss = md.add_snap_shot();
ss->set_action_count(i);
uint64 slot = i % kMemDebugHistorySize;
ss->set_size(size_history_[slot]);
}
return md;
}
absl::optional<AllocatorStats> BFCAllocator::GetStats() {
mutex_lock l(lock_);
return stats_;
}
bool BFCAllocator::ClearStats() {
mutex_lock l(lock_);
stats_.num_allocs = 0;
stats_.peak_bytes_in_use = stats_.bytes_in_use;
stats_.largest_alloc_size = 0;
return true;
}
std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
BFCAllocator::get_bin_debug_info() {
std::array<BinDebugInfo, kNumBins> bin_infos;
for (const auto& region : region_manager_.regions()) {
ChunkHandle h = region_manager_.get_handle(region.ptr());
while (h != kInvalidChunkHandle) {
const Chunk* c = ChunkFromHandle(h);
BinNum bin_num = BinNumForSize(c->size);
BinDebugInfo& bin_info = bin_infos[bin_num];
bin_info.total_bytes_in_bin += c->size;
bin_info.total_chunks_in_bin++;
if (c->in_use()) {
bin_info.total_bytes_in_use += c->size;
bin_info.total_requested_bytes_in_use += c->requested_size;
bin_info.total_chunks_in_use++;
} else {
Bin* bin = BinFromIndex(bin_num);
CHECK_EQ(bin->free_chunks.count(h), 1);
CHECK_EQ(c->bin_num, bin_num);
}
h = c->next;
}
}
return bin_infos;
}
} // namespace tensorflow