disable LT interface (#74021)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74021

Disables cublasLt as it is buggy with CUDA 11.0. We'll re-enable it based on the CUDA version (some known bugs are fixed in 11.5).

Test Plan: Existing tests

Reviewed By: jspark1105

Differential Revision: D34775050

fbshipit-source-id: 3faf3d15a0d9e24ca99e3ab515d2bc73a4c51fb5
(cherry picked from commit ac50bfc0ad5b96794292aedaf41d8232bf18355e)
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index ec50994..fbf57f2 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -129,7 +129,7 @@
   at::ScalarType scalar_type = self.scalar_type();
   c10::MaybeOwned<Tensor> self_;
   if (&result != &self) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER)
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)
     // Strangely, if mat2 has only 1 row or column, we get
     // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
     // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
@@ -142,12 +142,6 @@
          scalar_type == at::ScalarType::Half ||
          scalar_type == at::ScalarType::BFloat16) &&
         mat2_sizes[0] > 1 && mat2_sizes[1] > 1;
-
-    // https://docs.nvidia.com/cuda/cublas/index.html#cublasLt-general-description
-    // Batch size > 65535 does not work in most cases.
-    if (mat1_sizes[0] > 65535) {
-      useLtInterface = false;
-    }
 #endif
     if (!useLtInterface) {
       self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");