[ATen][CUDA][CUBLAS] cublasLtMatmul increase workspace_size (#120925)
According to the [cuBLAS API Reference](https://docs.nvidia.com/cuda/cublas/index.html#cublassetworkspace), the recommended workspace size is 32 MiB for Hopper and 4 MiB for all other architectures. This PR increases the default workspace size accordingly; small Ampere GPUs keep the previous 1 MiB default (see the discussion linked in the code comment below). I am not aware of a recommended workspace size for HIP, so I am leaving it unchanged there.
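For reference, here is a minimal standalone sketch of the resulting selection logic (values in KiB). It uses the plain CUDA runtime API instead of `at::cuda::getDeviceProperties`, and `defaultWorkspaceKiB` is a hypothetical name for illustration, not a function in ATen:

```cpp
// Standalone sketch of the workspace-size defaults (in KiB) this PR picks.
// Hypothetical helper; the real logic lives in _parseChosenWorkspaceSize().
#include <cuda_runtime.h>
#include <cstddef>

size_t defaultWorkspaceKiB(int device) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  if (prop.major == 8 && prop.totalGlobalMem / (1024ULL * 1024 * 1024) >= 24) {
    return 4096;   // large Ampere boards (>= 24 GiB): 4 MiB
  }
  if (prop.major >= 9) {
    return 32768;  // Hopper and newer: 32 MiB
  }
  return 1024;     // small Ampere and older GPUs: keep the 1 MiB default
}
```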
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120925
Approved by: https://github.com/eqy, https://github.com/malfet
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index d534ec5..46d73bc 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -183,13 +183,22 @@
static size_t _parseChosenWorkspaceSize() {
const char * val = getenv("CUBLASLT_WORKSPACE_SIZE");
+ size_t workspace_size = 1024; /* default size in KiB according to #73328 */
#ifdef USE_ROCM
if (!val) {
// accept either env var
val = getenv("HIPBLASLT_WORKSPACE_SIZE");
}
+#else
+ cudaDeviceProp* p = at::cuda::getDeviceProperties(c10::cuda::current_device());
+ // Keep workspace_size = 1024 for small Ampere GPUs
+ // See https://github.com/pytorch/pytorch/pull/120925#issuecomment-1977556485
+ if (p->major == 8 && p->totalGlobalMem / 1073741824 >= 24) {
+ workspace_size = 4096;
+ } else if (p->major >= 9) {
+ workspace_size = 32768;
+ }
#endif
- size_t workspace_size = 1024; /* default size in KiB according to #73328 */
if (val) {
try {
workspace_size = std::stoi(val);
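For context on how this value is consumed: the parsed size is in KiB and eventually bounds the cuBLASLt heuristic search in bytes. Below is a minimal sketch under that assumption; `chosenWorkspaceKiB` and `setWorkspaceLimit` are hypothetical names (and the sketch uses `std::stoul` where the ATen code above uses `std::stoi`), not the actual ATen helpers:

```cpp
// Sketch: env-var override in KiB, then handed to cuBLASLt as a byte limit.
#include <cublasLt.h>
#include <cstdint>
#include <cstdlib>
#include <string>

// Hypothetical helper mirroring the override in _parseChosenWorkspaceSize().
static size_t chosenWorkspaceKiB(size_t archDefaultKiB) {
  if (const char* val = std::getenv("CUBLASLT_WORKSPACE_SIZE")) {
    try {
      return std::stoul(val);  // the value is interpreted in KiB
    } catch (const std::exception&) {
      // fall back to the architecture default on a malformed value
    }
  }
  return archDefaultKiB;
}

// Hypothetical helper: caps the cuBLASLt heuristic search at the chosen size.
void setWorkspaceLimit(cublasLtMatmulPreference_t pref, size_t archDefaultKiB) {
  uint64_t workspaceBytes = chosenWorkspaceKiB(archDefaultKiB) * 1024;
  cublasLtMatmulPreferenceSetAttribute(
      pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
      &workspaceBytes, sizeof(workspaceBytes));
}
```

Users can still override the new defaults by setting the environment variable before process start, e.g. `CUBLASLT_WORKSPACE_SIZE=32768` for 32 MiB.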