blob: e4c834b30d1eb251bfe2eb5d02441da9e6e5ec46 [file] [log] [blame]
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cmath>
#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
// Number of int64 words in one guard mask.
#define MASK_WORDS 2
// Size in bytes of one guard mask. GPUDebugAllocator reserves this many bytes
// both before and after the region it hands out.
#define MASK_BYTES (MASK_WORDS * sizeof(int64))
namespace tensorflow {
namespace {
// Allocates a host array of MASK_WORDS int64 values, sets every element to
// `word`, and returns it. The caller receives a pointer obtained from
// new[]; nothing in this function releases it.
int64* NewMask(int64 word) {
  int64* const words = new int64[MASK_WORDS];
  for (int64* cursor = words; cursor != words + MASK_WORDS; ++cursor) {
    *cursor = word;
  }
  return words;
}
// Host-side pattern that GPUDebugAllocator::AllocateRaw writes at the very
// start of each underlying allocation (the header guard).
int64* before_mask = NewMask(0xabababababababab);
// Host-side pattern that GPUDebugAllocator::AllocateRaw writes into the last
// MASK_BYTES of each underlying allocation (the footer guard).
int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
// Copies MASK_BYTES from `ptr` (wrapped as device memory) back to the host
// through `exec` and compares them, word by word, against `mask`.
//
// Returns true only if all MASK_WORDS words are equal. An error is logged for
// each word that differs. If the copy itself fails the process is terminated
// via LOG(FATAL).
bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  int64 tmp[MASK_WORDS];
  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
    LOG(FATAL) << "Could not copy debug mask";
  }
  bool ok = true;
  for (int i = 0; i < MASK_WORDS; ++i) {
    // Report only the words that actually differ. Testing the accumulated
    // `ok` flag here would also flag intact words that merely follow a
    // corrupted one, which makes the log misleading.
    if (mask[i] != tmp[i]) {
      ok = false;
      // The words are cast to pointers purely so they are printed in hex.
      LOG(ERROR) << "i=" << i
                 << " mask=" << reinterpret_cast<const void*>(mask[i])
                 << " field=" << reinterpret_cast<const void*>(tmp[i]);
    }
  }
  return ok;
}
// Copies the MASK_BYTES-long host pattern `mask` to `ptr` (wrapped as device
// memory) through `exec`. Terminates via LOG(FATAL) if the copy fails.
void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
  se::DeviceMemory<int64> device_mask{se::DeviceMemoryBase{ptr, MASK_BYTES}};
  const bool copied = exec->SynchronousMemcpy(&device_mask, mask, MASK_BYTES);
  if (copied) return;
  LOG(FATAL) << "Could not copy debug mask";
}
} // namespace
// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
// Wraps `allocator` and looks up the stream executor for `cuda_gpu_id`.
// ValueOrDie() is used on the lookup result, so a failed lookup is fatal.
GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
                                     CudaGpuId cuda_gpu_id)
    : base_allocator_(allocator) {
  auto executor_or = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id);
  stream_exec_ = executor_or.ValueOrDie();
}
// Deletes the wrapped allocator that was passed to the constructor.
GPUDebugAllocator::~GPUDebugAllocator() {
  delete base_allocator_;
}
// Allocates `num_bytes` plus room for two guard masks from the wrapped
// allocator, writes before_mask at the start of the block and after_mask into
// its last MASK_BYTES, and returns the address just past the header.
// Returns nullptr if the wrapped allocator returns nullptr.
void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  const size_t padded_bytes = num_bytes + (2 * MASK_BYTES);
  char* const block =
      static_cast<char*>(base_allocator_->AllocateRaw(alignment, padded_bytes));
  if (block == nullptr) return nullptr;

  // Header guard occupies the first MASK_BYTES of the block.
  InitMask(stream_exec_, block, before_mask);

  // Footer guard sits at the end of the size the wrapped allocator reports as
  // requested for this block.
  const size_t block_size = base_allocator_->RequestedSize(block);
  InitMask(stream_exec_, block + block_size - MASK_BYTES, after_mask);

  // Callers get the region that starts right after the header.
  return block + MASK_BYTES;
}
// Verifies both guard masks around `ptr` (CHECK-failing if either was
// modified) and then releases the underlying block, which begins MASK_BYTES
// before `ptr`. A null `ptr` is forwarded to the wrapped allocator unchanged.
void GPUDebugAllocator::DeallocateRaw(void* ptr) {
  if (ptr == nullptr) {
    base_allocator_->DeallocateRaw(ptr);
    return;
  }
  CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
  CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
  // Step back over the header to recover the address the wrapped allocator
  // originally returned.
  void* const block = static_cast<char*>(ptr) - MASK_BYTES;
  base_allocator_->DeallocateRaw(block);
}
// Registers `visitor` with the wrapped allocator as an allocation visitor.
void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
  base_allocator_->AddAllocVisitor(visitor);
}
// Registers `visitor` with the wrapped allocator as a free visitor.
void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
  base_allocator_->AddFreeVisitor(visitor);
}
// Always reports true. AllocateRaw and CheckFooter rely on the wrapped
// allocator's RequestedSize() to locate the footer guard.
bool GPUDebugAllocator::TracksAllocationSizes() {
  return true;
}
// Returns the size requested for `ptr`, excluding the two guard masks.
// `ptr` must be an address previously returned by this allocator's
// AllocateRaw; the underlying block starts MASK_BYTES earlier.
size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
  const char* const block = static_cast<const char*>(ptr) - MASK_BYTES;
  const auto padded_size = base_allocator_->RequestedSize(block);
  return padded_size - 2 * MASK_BYTES;
}
// Returns the wrapped allocator's allocated size for the block underlying
// `ptr`. The value is passed through as-is, without subtracting the guard
// masks.
size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
  const char* const block = static_cast<const char*>(ptr) - MASK_BYTES;
  return base_allocator_->AllocatedSize(block);
}
// Returns the wrapped allocator's allocation id for the block underlying
// `ptr`, i.e. the address MASK_BYTES before `ptr`.
int64 GPUDebugAllocator::AllocationId(const void* ptr) {
  const char* const block = static_cast<const char*>(ptr) - MASK_BYTES;
  return base_allocator_->AllocationId(block);
}
// Fills `stats` by delegating to the wrapped allocator.
void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}
// Clears statistics by delegating to the wrapped allocator.
void GPUDebugAllocator::ClearStats() {
  base_allocator_->ClearStats();
}
// Returns true if the MASK_BYTES immediately preceding `ptr` still equal
// before_mask.
bool GPUDebugAllocator::CheckHeader(void* ptr) {
  char* const header = static_cast<char*>(ptr) - MASK_BYTES;
  return CheckMask(stream_exec_, header, before_mask);
}
// Returns true if the last MASK_BYTES of the block underlying `ptr` still
// equal after_mask. The block's extent is taken from the wrapped allocator's
// RequestedSize().
bool GPUDebugAllocator::CheckFooter(void* ptr) {
  char* const block = static_cast<char*>(ptr) - MASK_BYTES;
  const size_t block_size = base_allocator_->RequestedSize(block);
  char* const footer = block + block_size - MASK_BYTES;
  return CheckMask(stream_exec_, footer, after_mask);
}
// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
// Wraps `allocator` and looks up the stream executor for `cuda_gpu_id`.
// ValueOrDie() is used on the lookup result, so a failed lookup is fatal.
GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
                                           CudaGpuId cuda_gpu_id)
    : base_allocator_(allocator) {
  auto executor_or = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id);
  stream_exec_ = executor_or.ValueOrDie();
}
// Deletes the wrapped allocator that was passed to the constructor.
GPUNanResetAllocator::~GPUNanResetAllocator() {
  delete base_allocator_;
}
// Allocates from the wrapped allocator and fills the new buffer with float
// NaNs before returning it.
//
// Returns nullptr if the wrapped allocator returns nullptr. A failed fill is
// logged as an error but the buffer is still returned.
void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Initialize the buffer to Nans
  const size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  // Nothing to fill for a zero-sized request; this also avoids taking the
  // address of the first element of an empty vector below.
  if (req_size == 0) return allocated_ptr;

  // Round the element count up so the host buffer holds at least req_size
  // bytes even when req_size is not a multiple of sizeof(float).
  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                          std::nanf(""));
  se::DeviceMemory<float> nan_ptr{
      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, nans.data(), req_size)) {
    LOG(ERROR) << "Could not initialize to NaNs";
  }
  return allocated_ptr;
}
// Overwrites the buffer at `ptr` with float NaNs and then releases it through
// the wrapped allocator.
//
// A null `ptr` skips the overwrite and is forwarded unchanged. A failed
// overwrite is logged as an error; the buffer is released regardless.
void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Reset the buffer to Nans
    const size_t req_size = base_allocator_->RequestedSize(ptr);
    // Skip zero-sized buffers: there is nothing to overwrite, and the host
    // vector would be empty, so it has no first element to point at.
    if (req_size > 0) {
      // Round the element count up so the host buffer holds at least
      // req_size bytes even when req_size is not a multiple of sizeof(float).
      std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                              std::nanf(""));
      se::DeviceMemory<float> nan_ptr{
          se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
      if (!stream_exec_->SynchronousMemcpy(&nan_ptr, nans.data(), req_size)) {
        LOG(ERROR) << "Could not initialize to NaNs";
      }
    }
  }
  // Deallocate the memory
  base_allocator_->DeallocateRaw(ptr);
}
// Registers `visitor` with the wrapped allocator as an allocation visitor.
void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
  base_allocator_->AddAllocVisitor(visitor);
}
// Registers `visitor` with the wrapped allocator as a free visitor.
void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
  base_allocator_->AddFreeVisitor(visitor);
}
// Returns the wrapped allocator's requested size for `ptr`, unchanged.
size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
  const size_t requested = base_allocator_->RequestedSize(ptr);
  return requested;
}
// Returns the wrapped allocator's allocated size for `ptr`, unchanged.
size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
  const size_t allocated = base_allocator_->AllocatedSize(ptr);
  return allocated;
}
// Fills `stats` by delegating to the wrapped allocator.
void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}
// Clears statistics by delegating to the wrapped allocator.
void GPUNanResetAllocator::ClearStats() {
  base_allocator_->ClearStats();
}
} // namespace tensorflow