// THCCachingHostAllocator.cpp - platform/external/pytorch - Git at Google

 #include "THCCachingHostAllocator.h"

 #include <cuda_runtime_api.h>
 #include <deque>
 #include <mutex>
 #include <set>
 #include <stdint.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>


 namespace {

 // Minimal (size, pointer) pair. It is the element type of the allocator's
 // ordered free set, and can be built from a size alone (ptr defaults to NULL)
 // to serve as a search key.
 struct BlockSize
 {
   size_t  size; // number of bytes in the allocation
   void*   ptr;  // address of the host memory

   BlockSize(size_t size, void* ptr=NULL)
       : size(size)
       , ptr(ptr)
   {
   }
 };

 // Bookkeeping record for one host allocation: the inherited size/pointer
 // pair plus its allocation flag, its count of pending CUDA events, and the
 // set of streams recorded against it.
 struct Block : public BlockSize
 {
   bool  allocated;    // set while the block is handed out to a caller
   int   event_count;  // CUDA events recorded for this block and not yet retired
   std::unordered_set<THCStream *> streams;

   // A new block starts with no pending events and an empty stream set.
   Block(size_t size, void* ptr, bool allocated)
       : BlockSize(size, ptr)
       , allocated(allocated)
       , event_count(0)
   {
   }
 };

 struct BlockStreamCleaner {
   std::unordered_set<THCStream *> &streams;

   BlockStreamCleaner(std::unordered_set<THCStream *> &streams) : streams(streams) {}
   ~BlockStreamCleaner() {
     for(auto it = streams.begin(); it != streams.end(); ++it) {
       if (*it != NULL) {
         THCStream_free(*it);
       }
     }
     streams.clear();
   }
 };
 // Strict ordering for BlockSize values: smaller size first, and for equal
 // sizes the numerically lower pointer first.
 static bool BlockComparator(const BlockSize& a, const BlockSize& b)
 {
   return (a.size == b.size)
       ? ((uintptr_t)a.ptr < (uintptr_t)b.ptr)
       : (a.size < b.size);
 }

 // Caching allocator for host memory obtained from cudaHostAlloc.
 //
 // Freed blocks are kept and handed out again by malloc(). A freed block only
 // becomes available once every CUDA event recorded for it at free() time has
 // completed. malloc(), free(), recordEvent() and emptyCache() take `mutex`;
 // processEvents() does not lock and is called with the lock already held.
 struct HostAllocator
 {
   typedef bool (*Comparison)(const BlockSize&, const BlockSize&);

   // lock around all operations
   std::mutex mutex;

   // blocks by pointer
   std::unordered_map<void*, Block> blocks;

   // pointers that are ready to be allocated (event_count=0)
   std::set<BlockSize, Comparison> available;

   // outstanding cuda events, paired with the pointer of the block they guard
   std::deque<std::pair<cudaEvent_t, void*>> cuda_events;

   HostAllocator() : available(BlockComparator) {}

   // Stores in *ptr a host block of at least `size` bytes.
   // Reuses the smallest available cached block that is large enough, and
   // falls back to cudaHostAlloc otherwise. Returns the first CUDA error
   // encountered, or cudaSuccess.
   cudaError_t malloc(void** ptr, size_t size)
   {
     std::lock_guard<std::mutex> lock(mutex);

     // process outstanding cuda events which may have occurred
     cudaError_t err = processEvents();
     if (err != cudaSuccess) {
       return err;
     }

     // search for the smallest block which can hold this allocation
     BlockSize search_key(size);
     auto it = available.lower_bound(search_key);
     if (it != available.end()) {
       Block& block = blocks.at(it->ptr);
       THAssert(!block.allocated && block.event_count == 0);
       block.allocated = true;
       *ptr = block.ptr;
       available.erase(it);
       return cudaSuccess;
     }

     // note that cudaHostAlloc may not touch pointer if size is 0
     *ptr = 0;

     // allocate a new block if no cached allocation is found
     err = cudaHostAlloc(ptr, size, cudaHostAllocDefault);
     if (err != cudaSuccess) {
       return err;
     }

     // Do not track a NULL result: free(NULL) returns early, so such an entry
     // could never be released, and recordEvent(NULL, ...) would retain
     // streams against it that are never freed.
     if (*ptr != NULL) {
       blocks.insert({*ptr, Block(size, *ptr, true)});
     }
     return cudaSuccess;
   }

   // Returns the block at `ptr` to the cache. A NULL `ptr` is a no-op.
   // One event is recorded on each stream previously registered through
   // recordEvent(); the block becomes available immediately only when there
   // are no such streams. Returns the first CUDA error encountered.
   cudaError_t free(void* ptr)
   {
     std::lock_guard<std::mutex> lock(mutex);

     if (!ptr) {
       return cudaSuccess;
     }

     // process outstanding cuda events which may have occurred
     cudaError_t err = processEvents();
     if (err != cudaSuccess) {
       return err;
     }

     auto it = blocks.find(ptr);
     THAssert(it != blocks.end());

     Block& block = it->second;
     THAssert(block.allocated);

     // free (on valid memory) shouldn't fail, so mark unallocated before
     // we process the streams.
     block.allocated = false;

     // since the block has been deallocated, no point in keeping around the
     // streams, even in case of error.
     BlockStreamCleaner sc(block.streams);
     for (auto s = block.streams.begin(); s != block.streams.end(); ++s) {
       cudaEvent_t event;
       err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
       if (err != cudaSuccess) {
         return err;
       }

       err = cudaEventRecord(event, (*s) == NULL ? NULL : (*s)->stream);
       if (err != cudaSuccess) {
         // The event was created but never queued, so nothing else would
         // destroy it. Release it here and report the original error.
         cudaEventDestroy(event);
         return err;
       }

       // the block will not be re-used until all associated events have occurred
       block.event_count++;
       cuda_events.emplace_back(event, ptr);
     }
     if (block.event_count == 0) {
       // the block can be re-used if there are no outstanding cuda events
       available.insert(block);
     }
     return cudaSuccess;
   }

   // Registers `stream` against the block at `ptr` so that free() records an
   // event on it. Untracked pointers are ignored. A non-NULL stream is retained
   // the first time it is added to a block. Always returns cudaSuccess.
   cudaError_t recordEvent(void* ptr, THCStream *stream)
   {
     std::lock_guard<std::mutex> lock(mutex);

     auto it = blocks.find(ptr);
     if (it == blocks.end()) {
       // ignore events for untracked pointers
       return cudaSuccess;
     }

     Block& block = it->second;
     THAssert(block.allocated);
     auto res = block.streams.insert(stream);
     if (res.second == true && stream != NULL) {
       THCStream_retain(stream);
     }

     return cudaSuccess;
   }

   // Caller must hold `mutex`.
   cudaError_t processEvents()
   {
     // Process outstanding cudaEvents. Events that are completed are removed
     // from the queue, and the 'event_count' for the corresponding allocation
     // is decremented. Stops at the first event which has not been completed.
     // Since events on different devices or streams may occur out of order,
     // the processing of some events may be delayed.
     while (!cuda_events.empty()) {
       auto& e = cuda_events.front();
       cudaEvent_t event = e.first;

       cudaError_t err = cudaEventQuery(event);
       if (err == cudaErrorNotReady) {
         break;
       } else if (err != cudaSuccess) {
         return err;
       }
       err = cudaEventDestroy(event);
       if (err != cudaSuccess) {
         return err;
       }

       Block& block = blocks.at(e.second);
       block.event_count--;
       if (block.event_count == 0 && !block.allocated) {
         available.insert(block);
       }
       cuda_events.pop_front();
     }
     return cudaSuccess;
   }

   // Releases every block that is not currently allocated with cudaFreeHost,
   // and discards the event queue and the available set. Blocks that are
   // still allocated stay tracked. CUDA failures are reported through
   // THCudaCheckWarn.
   void emptyCache()
   {
     std::lock_guard<std::mutex> lock(mutex);

     // remove events for freed blocks
     for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
       cudaEvent_t event = it->first;
       Block& block = blocks.at(it->second);
       if (!block.allocated) {
         THCudaCheckWarn(cudaEventDestroy(event));
         block.event_count--;
       }
     }

     // all cuda_events have been processed
     cuda_events.clear();

     // clear list of available blocks
     available.clear();

     // free and erase non-allocated blocks
     for (auto it = blocks.begin(); it != blocks.end();) {
       Block& block = it->second;
       if (!block.allocated) {
         THCudaCheckWarn(cudaFreeHost(block.ptr));
         it = blocks.erase(it);
       } else {
         ++it;
       }
     }
   }
 };

 }  // namespace

 // File-local HostAllocator instance used by the THCCachingHostAllocator_*
 // functions in this file.
 static HostAllocator allocator;

 static void* THCCachingHostAllocator_malloc(void* ctx, ptrdiff_t size)
 {
   THAssert(size >= 0);
   void *ptr;
   THCudaCheck(allocator.malloc(&ptr, size));
   return ptr;
 }

 static void THCCachingHostAllocator_free(void* ctx, void* ptr)
 {
   allocator.free(ptr);
 }

 // Forwards `ptr` and `stream` to the file-level allocator's recordEvent()
 // and returns its status unchanged.
 cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)
 {
   return allocator.recordEvent(ptr, stream);
 }

 // Forwards to the file-level allocator's emptyCache().
 void THCCachingHostAllocator_emptyCache()
 {
   allocator.emptyCache();
 }

 // THAllocator instance wired to the caching malloc and free hooks above.
 // NOTE(review): the middle slot is left NULL; presumably it is the realloc
 // hook. Confirm against the THAllocator definition.
 THAllocator THCCachingHostAllocator = {
   &THCCachingHostAllocator_malloc,
   NULL,
   &THCCachingHostAllocator_free,
 };
	#include "THCCachingHostAllocator.h"

	#include <cuda_runtime_api.h>
	#include <deque>
	#include <mutex>
	#include <set>
	#include <stdint.h>
	#include <unordered_map>
	#include <unordered_set>
	#include <utility>


	namespace {

	// Plain (size, pointer) pair. The pointer defaults to NULL, so a value can
	// be built from a size alone.
	struct BlockSize
	{
	  size_t size; // number of bytes in the allocation
	  void* ptr;   // address of the host memory

	  BlockSize(size_t size, void* ptr=NULL)
	      : size(size)
	      , ptr(ptr)
	  {
	  }
	};

	// One host allocation: the inherited size/pointer pair plus an allocation
	// flag, a count of pending CUDA events, and a set of streams.
	struct Block : public BlockSize
	{
	  bool allocated;   // set while the block is handed out to a caller
	  int event_count;  // CUDA events recorded for this block and not yet retired
	  std::unordered_set<THCStream *> streams;

	  // A new block starts with no pending events and an empty stream set.
	  Block(size_t size, void* ptr, bool allocated)
	      : BlockSize(size, ptr)
	      , allocated(allocated)
	      , event_count(0)
	  {
	  }
	};

	struct BlockStreamCleaner {
	std::unordered_set<THCStream *> &streams;

	BlockStreamCleaner(std::unordered_set<THCStream *> &streams) : streams(streams) {}
	~BlockStreamCleaner() {
	for(auto it = streams.begin(); it != streams.end(); ++it) {
	if (*it != NULL) {
	THCStream_free(*it);
	}
	}
	streams.clear();
	}
	};
	// Orders BlockSize values by size, using the numeric pointer value to
	// break ties between equal sizes.
	static bool BlockComparator(const BlockSize& a, const BlockSize& b)
	{
	  return (a.size == b.size)
	      ? ((uintptr_t)a.ptr < (uintptr_t)b.ptr)
	      : (a.size < b.size);
	}

	// Caching allocator for host memory obtained from cudaHostAlloc.
	//
	// Freed blocks are kept and handed out again by malloc(). A freed block only
	// becomes available once every CUDA event recorded for it at free() time has
	// completed. malloc(), free(), recordEvent() and emptyCache() take `mutex`;
	// processEvents() does not lock and is called with the lock already held.
	struct HostAllocator
	{
	  typedef bool (*Comparison)(const BlockSize&, const BlockSize&);

	  // lock around all operations
	  std::mutex mutex;

	  // blocks by pointer
	  std::unordered_map<void*, Block> blocks;

	  // pointers that are ready to be allocated (event_count=0)
	  std::set<BlockSize, Comparison> available;

	  // outstanding cuda events, paired with the pointer of the block they guard
	  std::deque<std::pair<cudaEvent_t, void*>> cuda_events;

	  HostAllocator() : available(BlockComparator) {}

	  // Stores in *ptr a host block of at least `size` bytes.
	  // Reuses the smallest available cached block that is large enough, and
	  // falls back to cudaHostAlloc otherwise. Returns the first CUDA error
	  // encountered, or cudaSuccess.
	  cudaError_t malloc(void** ptr, size_t size)
	  {
	    std::lock_guard<std::mutex> lock(mutex);

	    // process outstanding cuda events which may have occurred
	    cudaError_t err = processEvents();
	    if (err != cudaSuccess) {
	      return err;
	    }

	    // search for the smallest block which can hold this allocation
	    BlockSize search_key(size);
	    auto it = available.lower_bound(search_key);
	    if (it != available.end()) {
	      Block& block = blocks.at(it->ptr);
	      THAssert(!block.allocated && block.event_count == 0);
	      block.allocated = true;
	      *ptr = block.ptr;
	      available.erase(it);
	      return cudaSuccess;
	    }

	    // note that cudaHostAlloc may not touch pointer if size is 0
	    *ptr = 0;

	    // allocate a new block if no cached allocation is found
	    err = cudaHostAlloc(ptr, size, cudaHostAllocDefault);
	    if (err != cudaSuccess) {
	      return err;
	    }

	    // Key the block by the allocated address (*ptr), not by the address of
	    // the caller's out-parameter. Do not track a NULL result: free(NULL)
	    // returns early, so such an entry could never be released.
	    if (*ptr != NULL) {
	      blocks.insert({*ptr, Block(size, *ptr, true)});
	    }
	    return cudaSuccess;
	  }

	  // Returns the block at `ptr` to the cache. A NULL `ptr` is a no-op.
	  // One event is recorded on each stream previously registered through
	  // recordEvent(); the block becomes available immediately only when there
	  // are no such streams. Returns the first CUDA error encountered.
	  cudaError_t free(void* ptr)
	  {
	    std::lock_guard<std::mutex> lock(mutex);

	    if (!ptr) {
	      return cudaSuccess;
	    }

	    // process outstanding cuda events which may have occurred
	    cudaError_t err = processEvents();
	    if (err != cudaSuccess) {
	      return err;
	    }

	    auto it = blocks.find(ptr);
	    THAssert(it != blocks.end());

	    Block& block = it->second;
	    THAssert(block.allocated);

	    // free (on valid memory) shouldn't fail, so mark unallocated before
	    // we process the streams.
	    block.allocated = false;

	    // since the block has been deallocated, no point in keeping around the
	    // streams, even in case of error.
	    BlockStreamCleaner sc(block.streams);
	    for (auto s = block.streams.begin(); s != block.streams.end(); ++s) {
	      cudaEvent_t event;
	      err = cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
	      if (err != cudaSuccess) {
	        return err;
	      }

	      // dereference the iterator to reach the stored THCStream pointer
	      err = cudaEventRecord(event, (*s) == NULL ? NULL : (*s)->stream);
	      if (err != cudaSuccess) {
	        // The event was created but never queued, so nothing else would
	        // destroy it. Release it here and report the original error.
	        cudaEventDestroy(event);
	        return err;
	      }

	      // the block will not be re-used until all associated events have occurred
	      block.event_count++;
	      cuda_events.emplace_back(event, ptr);
	    }
	    if (block.event_count == 0) {
	      // the block can be re-used if there are no outstanding cuda events
	      available.insert(block);
	    }
	    return cudaSuccess;
	  }

	  // Registers `stream` against the block at `ptr` so that free() records an
	  // event on it. Untracked pointers are ignored. A non-NULL stream is retained
	  // the first time it is added to a block. Always returns cudaSuccess.
	  cudaError_t recordEvent(void* ptr, THCStream *stream)
	  {
	    std::lock_guard<std::mutex> lock(mutex);

	    auto it = blocks.find(ptr);
	    if (it == blocks.end()) {
	      // ignore events for untracked pointers
	      return cudaSuccess;
	    }

	    Block& block = it->second;
	    THAssert(block.allocated);
	    auto res = block.streams.insert(stream);
	    if (res.second == true && stream != NULL) {
	      THCStream_retain(stream);
	    }

	    return cudaSuccess;
	  }

	  // Caller must hold `mutex`.
	  cudaError_t processEvents()
	  {
	    // Process outstanding cudaEvents. Events that are completed are removed
	    // from the queue, and the 'event_count' for the corresponding allocation
	    // is decremented. Stops at the first event which has not been completed.
	    // Since events on different devices or streams may occur out of order,
	    // the processing of some events may be delayed.
	    while (!cuda_events.empty()) {
	      auto& e = cuda_events.front();
	      cudaEvent_t event = e.first;

	      cudaError_t err = cudaEventQuery(event);
	      if (err == cudaErrorNotReady) {
	        break;
	      } else if (err != cudaSuccess) {
	        return err;
	      }
	      err = cudaEventDestroy(event);
	      if (err != cudaSuccess) {
	        return err;
	      }

	      Block& block = blocks.at(e.second);
	      block.event_count--;
	      if (block.event_count == 0 && !block.allocated) {
	        available.insert(block);
	      }
	      cuda_events.pop_front();
	    }
	    return cudaSuccess;
	  }

	  // Releases every block that is not currently allocated with cudaFreeHost,
	  // and discards the event queue and the available set. Blocks that are
	  // still allocated stay tracked. CUDA failures are reported through
	  // THCudaCheckWarn.
	  void emptyCache()
	  {
	    std::lock_guard<std::mutex> lock(mutex);

	    // remove events for freed blocks
	    for (auto it = cuda_events.begin(); it != cuda_events.end(); ++it) {
	      cudaEvent_t event = it->first;
	      Block& block = blocks.at(it->second);
	      if (!block.allocated) {
	        THCudaCheckWarn(cudaEventDestroy(event));
	        block.event_count--;
	      }
	    }

	    // all cuda_events have been processed
	    cuda_events.clear();

	    // clear list of available blocks
	    available.clear();

	    // free and erase non-allocated blocks
	    for (auto it = blocks.begin(); it != blocks.end();) {
	      Block& block = it->second;
	      if (!block.allocated) {
	        THCudaCheckWarn(cudaFreeHost(block.ptr));
	        it = blocks.erase(it);
	      } else {
	        ++it;
	      }
	    }
	  }
	};

	} // namespace

	// File-local HostAllocator instance used by the THCCachingHostAllocator_*
	// functions in this file.
	static HostAllocator allocator;

	static void* THCCachingHostAllocator_malloc(void* ctx, ptrdiff_t size)
	{
	THAssert(size >= 0);
	void *ptr;
	THCudaCheck(allocator.malloc(&ptr, size));
	return ptr;
	}

	static void THCCachingHostAllocator_free(void* ctx, void* ptr)
	{
	allocator.free(ptr);
	}

	// Forwards `ptr` and `stream` to the file-level allocator's recordEvent()
	// and returns its status unchanged. Both parameters are pointers, matching
	// HostAllocator::recordEvent(void*, THCStream*).
	cudaError_t THCCachingHostAllocator_recordEvent(void *ptr, THCStream *stream)
	{
	  return allocator.recordEvent(ptr, stream);
	}

	// Forwards to the file-level allocator's emptyCache().
	void THCCachingHostAllocator_emptyCache()
	{
	allocator.emptyCache();
	}

	// THAllocator instance wired to the caching malloc and free hooks above.
	// NOTE(review): the middle slot is left NULL; presumably it is the realloc
	// hook. Confirm against the THAllocator definition.
	THAllocator THCCachingHostAllocator = {
	&THCCachingHostAllocator_malloc,
	NULL,
	&THCCachingHostAllocator_free,
	};