/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
#include "tensorflow/core/platform/logging.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
#endif  // GOOGLE_TENSORRT
#endif  // GOOGLE_CUDA

namespace tensorflow {
namespace tensorrt {

// std::align is not supported, so this function mimics its behavior.
//
// NOTE(aaroey): according to the TensorRT API,
// nvinfer1::IGpuAllocator::allocate() uses uint64_t type for size and alignment
// parameters, so here we use the same type to make it compatible.
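//
// Illustrative usage (the buffer name here is hypothetical): carve a
// 256-byte-aligned block of 100 bytes out of a 1 KB buffer:
//   void* p = buf;
//   uint64_t space = 1024;
//   if (Align(256, 100, p, space) != nullptr) {
//     // p is now 256-byte aligned and points into buf.
//   }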
void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
QCHECK_GT(alignment, 0ul) << "alignment must be greater than 0.";
  QCHECK_EQ(0, alignment & (alignment - 1))
      << "alignment must be a power of 2.";
QCHECK_GT(size, 0ul) << "size must be greater than 0.";
QCHECK(ptr) << "ptr must not be nullptr.";
QCHECK_GT(space, 0ul) << "space must be greater than 0.";
const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr);
QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows.";
if (size > space) return nullptr;
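  // Round ptr_val up to the next multiple of alignment. Since alignment is a
  // power of two, -alignment (in unsigned arithmetic) is a mask with the low
  // log2(alignment) bits cleared.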
const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment);
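  // Reject if an aligned block of `size` bytes would not fit in the
  // remaining `space`.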
if (aligned_ptr_val > ptr_val + space - size) return nullptr;
ptr = reinterpret_cast<void*>(aligned_ptr_val);
const uintptr_t diff = aligned_ptr_val - ptr_val;
space -= diff;
return ptr;
}

}  // namespace tensorrt
}  // namespace tensorflow

#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#if NV_TENSORRT_MAJOR > 2
namespace tensorflow {
namespace tensorrt {
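
// Hands TensorRT raw GPU memory obtained directly from cudaMalloc/cudaFree.
// Note (assumption based on documented CUDA behavior): cudaMalloc returns
// allocations aligned to at least 256 bytes, so the requested alignment is
// only sanity-checked here rather than explicitly enforced.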
void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
                                 uint32_t flags) {
  assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
  void* memory = nullptr;
  // Return nullptr on allocation failure instead of an uninitialized pointer.
  if (cudaMalloc(&memory, size) != cudaSuccess) return nullptr;
  return memory;
}

void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
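
// Allocates device memory for TensorRT out of the wrapped TensorFlow
// allocator. The block is over-allocated and then aligned with Align() above.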
void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
uint32_t flags) {
if (size == 0) return nullptr;
  // WAR for allocator alignment requirement: certain CUDA API calls require
  // GPU memory that is aligned to cudaDeviceProp::textureAlignment, so the
  // requested alignment is overridden with 512 bytes here.
  // See issue #20856.
  alignment = 512;
assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
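  // Over-allocate by `alignment` bytes so that an aligned sub-block of
  // `size` bytes is guaranteed to exist within the allocation.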
uint64_t total_size = size + alignment;
// TODO(aaroey): AllocateRaw takes size_t size as input, so it'll produce
// unexpected result when TRT tries to allocate more bytes than size_t can
// carry. Fix this.
void* mem = allocator_->AllocateRaw(alignment, total_size);
if (!mem) return nullptr;
void* alloc_mem = mem;
QCHECK(Align(alignment, size, mem, total_size));
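  // If Align() moved the pointer, record a mapping from the aligned pointer
  // back to the pointer returned by AllocateRaw() so that free() can recover
  // it.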
if (mem != alloc_mem) {
QCHECK(mem_map_.insert({mem, alloc_mem}).second);
}
VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem
<< "; aligned to " << size << " bytes @" << mem << " with alignment "
<< alignment;
return mem;
}

TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
: allocator_(allocator) {
VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
}

void TRTDeviceAllocator::free(void* memory) {
VLOG(2) << "Deallocating @ " << memory;
  // If this pointer was adjusted for alignment in allocate(), restore the
  // original pointer returned by AllocateRaw() before deallocating.
if (memory) {
auto alloc_mem = mem_map_.find(memory);
if (alloc_mem != mem_map_.end()) {
memory = alloc_mem->second;
      mem_map_.erase(alloc_mem);
}
allocator_->DeallocateRaw(memory);
}
}

}  // namespace tensorrt
}  // namespace tensorflow

#endif  // NV_TENSORRT_MAJOR > 2
#endif  // GOOGLE_TENSORRT
#endif  // GOOGLE_CUDA