| #include <ATen/cuda/CUDAContext.h> |
| #include <ATen/cuda/detail/DeviceThreadHandles.h> |
| |
| #include <c10/cuda/CUDACachingAllocator.h> |
| |
#include <cstdlib>
#include <map>
| #include <memory> |
| #include <regex> |
| #include <string> |
| #include <tuple> |
| |
| /** |
| * Note [hipblaslt handles] |
| * ~~~~~~~~~~~~~~~~~~~~~~~~ |
| * The cublas documentation states: |
| * cuBLAS handle (cublasHandle_t) encapsulates a cuBLASLt handle. |
| * Any valid cublasHandle_t can be used in place of cublasLtHandle_t with a simple cast. |
| * |
| * hipblaslt does not behave in this way. |
| * A hipblas handle does not encapsulate a hipblaslt handle. |
| * |
| * To work around this difference in behavior, a separate handle pool is available for ROCm builds. |
 * For CUDA builds, getCurrentCUDABlasLtHandle is simply an alias for getCurrentCUDABlasHandle,
 * whereas for ROCm builds it is a distinct function that reserves handles from the hipblaslt pool.
| */ |
| |
| namespace at::cuda { |
| |
| namespace { |
| |
| #if defined(USE_ROCM) && ROCM_VERSION >= 50700 |
| void createCublasLtHandle(cublasLtHandle_t *handle) { |
| TORCH_CUDABLAS_CHECK(cublasLtCreate(handle)); |
| } |
| |
| void destroyCublasLtHandle(cublasLtHandle_t handle) { |
| // this is because of something dumb in the ordering of |
| // destruction. Sometimes atexit, the cuda context (or something) |
| // would already be destroyed by the time this gets destroyed. It |
| // happens in fbcode setting. @colesbury and @soumith decided to not destroy |
| // the handle as a workaround. |
| // - Comments of @soumith copied from cuDNN handle pool implementation |
#ifndef NO_CUDNN_DESTROY_HANDLE
  cublasLtDestroy(handle);
#endif
| } |
| |
| using CuBlasLtPoolType = DeviceThreadHandlePool<cublasLtHandle_t, createCublasLtHandle, destroyCublasLtHandle>; |
| #endif |
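
// Map from (cublas handle, stream) to the cuBLAS workspace bound to that pair,
// so each handle/stream combination reuses a single allocation. Entries are
// created lazily in getCurrentCUDABlasHandle() and released by
// clearCublasWorkspaces().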
| |
| std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() { |
| static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>; |
| return instance; |
| } |
| |
| void createCublasHandle(cublasHandle_t *handle) { |
| TORCH_CUDABLAS_CHECK(cublasCreate(handle)); |
| } |
| |
| void destroyCublasHandle(cublasHandle_t handle) { |
| // this is because of something dumb in the ordering of |
| // destruction. Sometimes atexit, the cuda context (or something) |
| // would already be destroyed by the time this gets destroyed. It |
| // happens in fbcode setting. @colesbury and @soumith decided to not destroy |
| // the handle as a workaround. |
| // - Comments of @soumith copied from cuDNN handle pool implementation |
#ifndef NO_CUDNN_DESTROY_HANDLE
  cublasDestroy(handle);
#endif
| } |
| |
| using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle, destroyCublasHandle>; |
| |
| } // namespace |
| |
| void clearCublasWorkspaces() { |
#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12020
| cublas_handle_stream_to_workspace().clear(); |
| #endif |
| } |
| |
| size_t parseChosenWorkspaceSize() { |
| const char * val = getenv("CUBLAS_WORKSPACE_CONFIG"); |
| /* :4096:2:16:8 default, 32MiB for Hopper */ |
| cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties(); |
| const bool sm90 = properties != nullptr && properties->major == 9 && properties->minor == 0; |
| const size_t default_size = sm90 ? 4096 * 8 * 1024 : 4096 * 1024 * 2 + 16 * 1024 * 8; |
| |
| if (val) { |
| size_t total_size = 0; |
| const std::string config(val); |
| std::regex exp(":([0-9]+):([0-9]+)"); |
| std::sregex_iterator next(config.begin(), config.end(), exp); |
| std::sregex_iterator end; |
| if (next == end) { |
| TORCH_WARN("Could not parse CUBLAS_WORKSPACE_CONFIG, using default workspace size of ", default_size, " bytes."); |
| return default_size; |
| } |
| while (next != end) { |
| std::smatch match = *next; |
      TORCH_CHECK(match.size() == 3, "Expected CUBLAS_WORKSPACE_CONFIG match of size 3 (Format :SIZE:COUNT)");
| size_t curr_size = (size_t) std::stoi(match.str(1)); |
| size_t count = (size_t) std::stoi(match.str(2)); |
| total_size += curr_size * 1024 * count; |
| next++; |
| } |
| return total_size; |
| } else { |
| return default_size; |
| } |
| } |
| |
| size_t getChosenWorkspaceSize() { |
| size_t pool_size = parseChosenWorkspaceSize(); |
| return pool_size; |
| } |
| |
| at::DataPtr getNewWorkspace() { |
| return c10::cuda::CUDACachingAllocator::get()->allocate(getChosenWorkspaceSize()); |
| } |
| |
| cublasHandle_t getCurrentCUDABlasHandle() { |
| int device; |
| AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); |
| |
| // Thread local PoolWindows are lazily-initialized |
| // to avoid initialization issues that caused hangs on Windows. |
| // See: https://github.com/pytorch/pytorch/pull/22405 |
  // This thread-local unique_ptr will be destroyed when the thread terminates,
  // releasing its reserved handles back to the pool.
| |
| // Use a leaky singleton for the pool following standard practice around |
| // singletons: https://isocpp.org/wiki/faq/ctors#construct-on-first-use-v2 |
| static auto pool = std::shared_ptr<CuBlasPoolType>( |
| new CuBlasPoolType(), [](CuBlasPoolType* p) { |
| // Leak the memory. |
| }); |
| thread_local std::unique_ptr<CuBlasPoolType::PoolWindow> myPoolWindow( |
| pool->newPoolWindow()); |
| |
| auto handle = myPoolWindow->reserve(device); |
| auto stream = c10::cuda::getCurrentCUDAStream(); |
| TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream)); |
#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12020
  // cuBLAS versions prior to CUDA 12.2 need an explicitly allocated workspace
  // to avoid increasing memory usage during graph captures; from CUDA 12.2
  // onward, cuBLAS should no longer need one.
  // original issue: https://github.com/pytorch/pytorch/pull/83461
| cudaStream_t _stream = stream; |
| auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream)); |
| auto workspace_it = cublas_handle_stream_to_workspace().find(key); |
| if (workspace_it == cublas_handle_stream_to_workspace().end()) { |
| workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); |
| } |
| TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); |
| #endif |
| #if !defined(USE_ROCM) |
  // On CUDA >= 11 and architectures >= Ampere, cuBLAS can use TF32 to speed up
  // FP32 computations based on the value of the allow_tf32 flag.
  // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH.
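  // The allow_tf32 flag is set via at::globalContext().setAllowTF32CuBLAS(...)
  // (torch.backends.cuda.matmul.allow_tf32 in Python), and NoTF32Guard lets a
  // code region opt out of TF32 regardless of the global flag.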
| if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { |
| TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); |
| } else { |
| TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); |
| } |
| #endif |
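  // On ROCm, deterministic algorithms additionally require disabling hipBLAS
  // atomics, since atomic reductions do not guarantee a fixed accumulation
  // order. The flag checked below is set via
  // at::globalContext().setDeterministicAlgorithms()
  // (torch.use_deterministic_algorithms() in Python).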
| #if defined(USE_ROCM) |
| hipblasAtomicsMode_t hipblas_mode; |
| if (at::globalContext().deterministicAlgorithms()) { |
| hipblas_mode = HIPBLAS_ATOMICS_NOT_ALLOWED; |
| } else { |
| hipblas_mode = HIPBLAS_ATOMICS_ALLOWED; |
| } |
| TORCH_CUDABLAS_CHECK(hipblasSetAtomicsMode(handle, hipblas_mode)); |
| #endif |
| return handle; |
| } |
| |
| #if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) |
| cublasLtHandle_t getCurrentCUDABlasLtHandle() { |
| #ifdef USE_ROCM |
| int device; |
| AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); |
| |
| // Thread local PoolWindows are lazily-initialized |
| // to avoid initialization issues that caused hangs on Windows. |
| // See: https://github.com/pytorch/pytorch/pull/22405 |
  // This thread-local unique_ptr will be destroyed when the thread terminates,
  // releasing its reserved handles back to the pool.
| |
| // Use a leaky singleton for the pool following standard practice around |
| // singletons: https://isocpp.org/wiki/faq/ctors#construct-on-first-use-v2 |
| static auto pool = std::shared_ptr<CuBlasLtPoolType>( |
| new CuBlasLtPoolType(), [](CuBlasLtPoolType* p) { |
| // Leak the memory. |
| }); |
| thread_local std::unique_ptr<CuBlasLtPoolType::PoolWindow> myPoolWindow( |
| pool->newPoolWindow()); |
| |
| auto handle = myPoolWindow->reserve(device); |
| return handle; |
| #else |
| return reinterpret_cast<cublasLtHandle_t>(getCurrentCUDABlasHandle()); |
| #endif |
| } |
| #endif |
| |
| } // namespace at::cuda |