c10/mobile/CPUCachingAllocator.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <algorithm>
 #include <deque>
 #include <memory>
 #include <mutex>

 #include <c10/core/CPUAllocator.h>
 #include <c10/util/Exception.h>
 #include <c10/util/SmallVector.h>
 #include <c10/util/flat_hash_map.h>

 /*
  * CPUCachingAllocator:
  * DISCLAIMER:
  *    This is subject to change (beta) and only supported on mobile builds.
  *    If code snippet such as in 'Usage pattern' is used outside of mobile
  *    build you will not observe the intended behavior.
  *    See below for more information.
  * Why?
  *    It has been observed that some mobile platforms, such as pixel 3, return
  *    memory aggressively to the system. This results in page faults in some
  * cases and ends up hurting performance. This caching allocator aims to address
  * that. Furthermore it also allows users to specify their own allocator by
  * implementing allocate/free virtual interfaces. What are the cons? There are
  * some cons that were observed where use of caching allocator led to worse
  * performance on some platforms. Reason being that the caching mechanism used
  * by this allocator left us worse off compared to the corresponding platform's
  *    tuned memory allocator. In that case it seemed better to not use this
  * allocator. Note there are some ideas to fix this in the works.
  *
  * Usage:
  * Usage pattern:
  * Instantiate and own the caching allocator.
  * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
  *   std::make_unique<c10::CPUCachingAllocator>();
  * Use caching allocator with a scoped guard at inference time.
  * {
  * WithCPUCachingAllocatorGuard(caching_allocator.get());
  * ... model.forward(...);
  * }
  */

 namespace c10 {

 class C10_API CPUCachingAllocator {
   /*
    * What it does:
    * Caches all the allocations carried out by this allocator.
    * Cache key is the size of the allocation.
    * If requested size is found in the cache returns the cached pointer.
    * What it does not do:
    * No speculative allocation for any future allocations.
    */
  private:
   inline void* allocate_and_cache(const size_t bytes);
   void free_cached();

  protected:
   // Invariants.
   // 1. If memory is ever allocated via this allocator then
   //    the pointer will exist in allocation_map_, unless the allocator
   //    returned the memory to OS via free_cached.
   //  1.1. Therefore even when the said memory is "freed" via this
   //       allocator (and thus cached), it will continue to stay
   //       in allocation_map_. Furthermore it will also exist in
   //       available_map_. Thus an allocated memory pointer can be in both
   //       allocation_map_ and available_map_ simultaneously.
   // 2. Memory pointer maybe removed from allocation_map_, when it
   //    is freed outside of the scope of this allocator, but was allocated
   //    by this allocator.
   // 3. Available map only contains that memory which was allocated
   //    by this allocator and subsequently freed by this allocator.
   // As a result of above invariants, allocated memory ptr cannot be in
   // available_map_ unless it is in allocation_map_ as well.
   ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
   static ska::flat_hash_map<void*, size_t> allocation_map_;
   // Since allocation_map, which is a global instance, is mutated/read via
   // all public APIs we need a global mutex.
   static std::mutex mutex_;

  public:
   static void record_free(void* ptr);
   virtual ~CPUCachingAllocator();
   // Checks the cache to see if allocation of size bytes can be found.
   // If so return cached memory, else
   // allocates memory, records it for caching and returns.
   virtual void* allocate(const size_t bytes);
   // Checks if the memory being freed is was marked for allocation by
   // an earlier call to allocate. If so cache the allocation.
   // Otherwise free.
   virtual void free(void* ptr);
 };

 CPUCachingAllocator* GetDefaultCPUCachingAllocator();

 bool ThreadLocalCachingAllocatorEnabled();
 CPUCachingAllocator* GetThreadLocalCachingAllocator();

 class C10_API WithCPUCachingAllocatorGuard {
  public:
   WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
   ~WithCPUCachingAllocatorGuard();

  private:
   CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
 };

 } // namespace c10
	#pragma once

	#include <algorithm>
	#include <deque>
	#include <memory>
	#include <mutex>

	#include <c10/core/CPUAllocator.h>
	#include <c10/util/Exception.h>
	#include <c10/util/SmallVector.h>
	#include <c10/util/flat_hash_map.h>

	/*
	* CPUCachingAllocator:
	* DISCLAIMER:
	* This is subject to change (beta) and only supported on mobile builds.
	* If code snippet such as in 'Usage pattern' is used outside of mobile
	* build you will not observe the intended behavior.
	* See below for more information.
	* Why?
	* It has been observed that some mobile platforms, such as pixel 3, return
	* memory aggressively to the system. This results in page faults in some
	* cases and ends up hurting performance. This caching allocator aims to address
	* that. Furthermore it also allows users to specify their own allocator by
	* implementing allocate/free virtual interfaces. What are the cons? There are
	* some cons that were observed where use of caching allocator led to worse
	* performance on some platforms. Reason being that the caching mechanism used
	* by this allocator left us worse off compared to the corresponding platform's
	* tuned memory allocator. In that case it seemed better to not use this
	* allocator. Note there are some ideas to fix this in the works.
	*
	* Usage:
	* Usage pattern:
	* Instantiate and own the caching allocator.
	* std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
	* std::make_unique<c10::CPUCachingAllocator>();
	* Use caching allocator with a scoped guard at inference time.
	* {
	* WithCPUCachingAllocatorGuard(caching_allocator.get());
	* ... model.forward(...);
	* }
	*/

	namespace c10 {

	class C10_API CPUCachingAllocator {
	/*
	* What it does:
	* Caches all the allocations carried out by this allocator.
	* Cache key is the size of the allocation.
	* If requested size is found in the cache returns the cached pointer.
	* What it does not do:
	* No speculative allocation for any future allocations.
	*/
	private:
	inline void* allocate_and_cache(const size_t bytes);
	void free_cached();

	protected:
	// Invariants.
	// 1. If memory is ever allocated via this allocator then
	// the pointer will exist in allocation_map_, unless the allocator
	// returned the memory to OS via free_cached.
	// 1.1. Therefore even when the said memory is "freed" via this
	// allocator (and thus cached), it will continue to stay
	// in allocation_map_. Furthermore it will also exist in
	// available_map_. Thus an allocated memory pointer can be in both
	// allocation_map_ and available_map_ simultaneously.
	// 2. Memory pointer maybe removed from allocation_map_, when it
	// is freed outside of the scope of this allocator, but was allocated
	// by this allocator.
	// 3. Available map only contains that memory which was allocated
	// by this allocator and subsequently freed by this allocator.
	// As a result of above invariants, allocated memory ptr cannot be in
	// available_map_ unless it is in allocation_map_ as well.
	ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
	static ska::flat_hash_map<void*, size_t> allocation_map_;
	// Since allocation_map, which is a global instance, is mutated/read via
	// all public APIs we need a global mutex.
	static std::mutex mutex_;

	public:
	static void record_free(void* ptr);
	virtual ~CPUCachingAllocator();
	// Checks the cache to see if allocation of size bytes can be found.
	// If so return cached memory, else
	// allocates memory, records it for caching and returns.
	virtual void* allocate(const size_t bytes);
	// Checks if the memory being freed is was marked for allocation by
	// an earlier call to allocate. If so cache the allocation.
	// Otherwise free.
	virtual void free(void* ptr);
	};

	CPUCachingAllocator* GetDefaultCPUCachingAllocator();

	bool ThreadLocalCachingAllocatorEnabled();
	CPUCachingAllocator* GetThreadLocalCachingAllocator();

	class C10_API WithCPUCachingAllocatorGuard {
	public:
	WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
	~WithCPUCachingAllocatorGuard();

	private:
	CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
	};

	} // namespace c10